Convert from Word Document to HTML

Python convert word to html

import os

import win32com.client

# Word runs as a separate COM server, so a relative file name is not
# guaranteed to be resolved against this script's working directory.
# Hand it absolute paths for both the input and the output.
source_path = os.path.abspath("demo.docx")
target_path = os.path.abspath("hey.html")

# WdSaveFormat.wdFormatHTML
WD_FORMAT_HTML = 8

doc = win32com.client.GetObject(source_path)
try:
    doc.SaveAs(FileName=target_path, FileFormat=WD_FORMAT_HTML)
finally:
    # Always release the document, even when the save fails, so Word does
    # not keep the file open and locked.
    doc.Close()

Convert Word document into HTML without losing original

The approach above turned out to be extremely slow on large documents, so I had to do it differently:

    /// <summary>
    /// Renders a Word document to a filtered-HTML file in the temp directory by
    /// injecting a VBA module into the document and running its
    /// <c>renderHTMLCopy</c> macro.
    /// </summary>
    /// <param name="doc">Document whose VBA project receives the temporary module.</param>
    /// <returns>Full path of the HTML file the macro was asked to write.</returns>
    /// <remarks>
    /// NOTE(review): the macro text lives in Properties.Resources.HTMLRenderer; if it
    /// works on Selection/ActiveDocument, <paramref name="doc"/> must be the active
    /// document when this is called - confirm against the callers.
    /// </remarks>
    public static string RenderHTMLFile(Document doc)
    {
        string fn = Path.GetTempPath() + TmpPrefix + GUID() + ".html";

        var vba = doc.VBProject;
        var module = vba.VBComponents.Add(Microsoft.Vbe.Interop.vbext_ComponentType.vbext_ct_StdModule);

        try
        {
            var code = Properties.Resources.HTMLRenderer;
            module.CodeModule.AddFromString(code);

            Word.Run("renderHTMLCopy", fn);
        }
        finally
        {
            // Remove the injected module again. Otherwise every call leaves another
            // module defining renderHTMLCopy in the document, and a second call on the
            // same document would find two macros with the same name.
            vba.VBComponents.Remove(module);
        }

        return fn;
    }

Where Properties.Resources.HTMLRenderer is a txt file with the following VB code:

Sub renderHTMLCopy(ByVal path As String)
    ' Copies the whole story of the current selection into a newly added
    ' document, saves that copy to "path" as filtered HTML, and closes the
    ' copy without saving changes.

    ' Select everything and put it on the clipboard.
    With Selection
        .WholeStory
        .Copy
    End With

    ' The new document becomes the active one; paste into it.
    Documents.Add
    Selection.PasteAndFormat wdPasteDefault

    ' Write the copy out as filtered HTML, then discard it.
    With ActiveDocument
        .SaveAs2 path, WdSaveFormat.wdFormatFilteredHTML
        .Close False
    End With
End Sub

The previous version took about 1500ms for a small document; this one renders the same document in roughly 400ms!

Convert Word to HTML then render HTML on webpage

We use http://www.aspose.com/ (I think the one we use is Aspose.Words) to perform a similar task, and it works quite well. (There is a cost involved.)

I would suggest that converting to HTML gives the worst rendition of the document.
One solution we use is to generate a JPEG image of the document and display that.

If you need to be able to perform operations like find and copy/pasting text - I would recommend converting the document to a .pdf, and displaying it inline, in whichever standard pdf viewer the client machine has installed.

Convert multiple Word documents to HTML files using VBA

Code:

Option Explicit

Sub ChangeDocsToTxtOrRTFOrHTML()
    ' Batch-converts every file in a user-chosen folder to TXT, RTF or HTML
    ' (with export to PDF in Word 2007, once the marked line below is enabled).
    '
    ' The user is prompted for the source folder and the target format.
    ' Converted files are written to a folder named <source folder> & "Converted".
    ' Leaving either prompt empty (or pressing Cancel) aborts the macro.
    Dim fs As Object
    Dim oFolder As Object
    Dim tFolder As Object
    Dim oFile As Object
    Dim d As Document
    Dim strDocName As String
    Dim intPos As Long
    Dim locFolder As String
    Dim targetPath As String
    Dim fileType As String
    Dim allowPdf As Boolean

    locFolder = InputBox("Enter the folder path to DOCs", "File Conversion", "C:\myDocs")
    If Len(locFolder) = 0 Then Exit Sub

    ' Application.Version is a string such as "12.0"; Val parses it
    ' independently of the regional decimal separator.
    allowPdf = (Val(Application.Version) >= 12)

    Do
        If allowPdf Then
            fileType = UCase(InputBox("Change DOC to TXT, RTF, HTML or PDF(2007+ only)", "File Conversion", "TXT"))
        Else
            fileType = UCase(InputBox("Change DOC to TXT, RTF, HTML", "File Conversion", "TXT"))
        End If
        ' InputBox returns "" on Cancel; without this exit the loop never ends.
        If Len(fileType) = 0 Then Exit Sub
    Loop Until fileType = "TXT" Or fileType = "RTF" Or fileType = "HTML" Or (allowPdf And fileType = "PDF")

    Set fs = CreateObject("Scripting.FileSystemObject")
    If Not fs.FolderExists(locFolder) Then
        MsgBox "Folder not found: " & locFolder, vbExclamation, "File Conversion"
        Exit Sub
    End If

    ' From here on the conversion is best effort: a file that fails is skipped.
    On Error Resume Next

    targetPath = locFolder & "Converted"
    If Not fs.FolderExists(targetPath) Then fs.CreateFolder targetPath
    If Not fs.FolderExists(targetPath) Then
        MsgBox "Could not create folder: " & targetPath, vbExclamation, "File Conversion"
        Exit Sub
    End If
    Set oFolder = fs.GetFolder(locFolder)
    Set tFolder = fs.GetFolder(targetPath)

    Application.ScreenUpdating = False
    For Each oFile In oFolder.Files
        ' Reset first so a failed Open cannot leave the previous document
        ' (or make the macro fall back to some other open document).
        Set d = Nothing
        Set d = Application.Documents.Open(oFile.Path)
        If Not d Is Nothing Then
            ' Strip the extension, if the name has one.
            strDocName = d.Name
            intPos = InStrRev(strDocName, ".")
            If intPos > 1 Then strDocName = Left(strDocName, intPos - 1)

            ' Relative names passed to SaveAs are resolved against this directory.
            ChangeFileOpenDirectory tFolder.Path
            Select Case fileType
                Case "TXT"
                    strDocName = strDocName & ".txt"
                    d.SaveAs FileName:=strDocName, FileFormat:=wdFormatText
                Case "RTF"
                    strDocName = strDocName & ".rtf"
                    d.SaveAs FileName:=strDocName, FileFormat:=wdFormatRTF
                Case "HTML"
                    strDocName = strDocName & ".html"
                    d.SaveAs FileName:=strDocName, FileFormat:=wdFormatFilteredHTML
                Case "PDF"
                    strDocName = strDocName & ".pdf"

                    ' *** Word 2007 users - remove the apostrophe at the start of the next line ***
                    'd.ExportAsFixedFormat OutputFileName:=strDocName, ExportFormat:=wdExportFormatPDF

            End Select
            d.Close SaveChanges:=wdDoNotSaveChanges
            ChangeFileOpenDirectory oFolder.Path
        End If
    Next oFile
    Application.ScreenUpdating = True
End Sub


Option Explicit

' Win32 Sleep: suspends the calling thread for dwMilliseconds milliseconds.
' VBA7 (Office 2010 and later, required on 64-bit Office) only accepts
' Declare statements marked PtrSafe; older hosts do not know the keyword.
#If VBA7 Then
Private Declare PtrSafe Sub Sleep Lib "kernel32" (ByVal dwMilliseconds As Long)
#Else
Private Declare Sub Sleep Lib "kernel32" (ByVal dwMilliseconds As Long)
#End If

Sub ConvertDocs()
    ' Batch-converts every file in a user-chosen folder to TXT, RTF, HTML or DOC
    ' (DOCX and PDF are offered on Word 2007+, once the marked lines below are enabled).
    '
    ' The user is prompted for the source folder and the target format.
    ' Converted files are written to a folder named <source folder> & "Converted".
    ' For every target format except HTML, linked pictures are embedded into the
    ' document before it is saved. The source files themselves are never saved.
    ' Leaving either prompt empty (or pressing Cancel) aborts the macro.
    Dim fs As Object
    Dim oFolder As Object
    Dim tFolder As Object
    Dim oFile As Object
    Dim d As Document
    Dim strDocName As String
    Dim intPos As Long
    Dim locFolder As String
    Dim targetPath As String
    Dim fileType As String
    Dim office2007 As Boolean
    Dim oField As Field
    Dim oIShape As InlineShape
    Dim oShape As Shape

    locFolder = InputBox("Enter the path to the folder with the documents to be converted", "File Conversion", "C:\myDocs")
    If Len(locFolder) = 0 Then Exit Sub

    ' Application.Version is a string such as "12.0"; Val parses it
    ' independently of the regional decimal separator.
    office2007 = (Val(Application.Version) >= 12)

    Do
        If office2007 Then
            fileType = UCase(InputBox("Enter one of the following formats (to convert to): TXT, RTF, HTML, DOC, DOCX or PDF", "File Conversion", "TXT"))
        Else
            fileType = UCase(InputBox("Enter one of the following formats (to convert to): TXT, RTF, HTML or DOC", "File Conversion", "TXT"))
        End If
        ' InputBox returns "" on Cancel; without this exit the loop never ends.
        If Len(fileType) = 0 Then Exit Sub
    Loop Until fileType = "TXT" Or fileType = "RTF" Or fileType = "HTML" Or fileType = "DOC" _
        Or (office2007 And (fileType = "PDF" Or fileType = "DOCX"))

    Set fs = CreateObject("Scripting.FileSystemObject")
    If Not fs.FolderExists(locFolder) Then
        MsgBox "Folder not found: " & locFolder, vbExclamation, "File Conversion"
        Exit Sub
    End If

    ' From here on the conversion is best effort: a step that fails is skipped.
    On Error Resume Next

    targetPath = locFolder & "Converted"
    If Not fs.FolderExists(targetPath) Then fs.CreateFolder targetPath
    If Not fs.FolderExists(targetPath) Then
        MsgBox "Could not create folder: " & targetPath, vbExclamation, "File Conversion"
        Exit Sub
    End If
    Set oFolder = fs.GetFolder(locFolder)
    Set tFolder = fs.GetFolder(targetPath)

    Application.ScreenUpdating = False
    For Each oFile In oFolder.Files
        ' Reset first so a failed Open cannot leave the previous document in d.
        Set d = Nothing
        Set d = Application.Documents.Open(oFile.Path)
        If Not d Is Nothing Then
            ' put the document into print view
            If fileType = "RTF" Or fileType = "DOC" Or fileType = "DOCX" Then
                With d.ActiveWindow.View
                    .ReadingLayout = False
                    .Type = wdPrintView
                End With
            End If

            ' try to embed linked images from fields, shapes and inline shapes into the document
            ' (for some reason this does not work for all images in all HTML files I've tested)
            If fileType <> "HTML" Then
                For Each oField In d.Fields
                    If oField.Type = wdFieldIncludePicture Then
                        EmbedLinkedPicture oField.LinkFormat, d
                    End If
                Next oField
                For Each oShape In d.Shapes
                    EmbedLinkedPicture oShape.LinkFormat, d
                Next oShape
                For Each oIShape In d.InlineShapes
                    EmbedLinkedPicture oIShape.LinkFormat, d
                Next oIShape
            End If

            ' Strip the extension, if the name has one.
            strDocName = d.Name
            intPos = InStrRev(strDocName, ".")
            If intPos > 1 Then strDocName = Left(strDocName, intPos - 1)

            ' Relative names passed to SaveAs are resolved against this directory.
            ChangeFileOpenDirectory tFolder.Path

            ' Check out these links for a comprehensive list of supported file formats and format constants:
            ' http://msdn.microsoft.com/en-us/library/microsoft.office.interop.word.wdsaveformat.aspx
            ' http://msdn.microsoft.com/en-us/library/office/bb238158.aspx
            ' (In the latter list you can see the values that the constants are associated with.
            ' Office 2003 only supported values up to wdFormatXML(=11). Values from wdFormatXMLDocument(=12)
            ' til wdFormatDocumentDefault(=16) were added in Office 2007, and wdFormatPDF(=17) and wdFormatXPS(=18)
            ' were added in Office 2007 SP2. Office 2010 added the various wdFormatFlatXML* formats and wdFormatOpenDocumentText.)
            Select Case fileType
                Case "TXT"
                    strDocName = strDocName & ".txt"
                    d.SaveAs FileName:=strDocName, FileFormat:=wdFormatText
                Case "RTF"
                    strDocName = strDocName & ".rtf"
                    d.SaveAs FileName:=strDocName, FileFormat:=wdFormatRTF
                Case "HTML"
                    strDocName = strDocName & ".html"
                    d.SaveAs FileName:=strDocName, FileFormat:=wdFormatFilteredHTML
                Case "DOC"
                    strDocName = strDocName & ".doc"
                    d.SaveAs FileName:=strDocName, FileFormat:=wdFormatDocument
                Case "DOCX"
                    strDocName = strDocName & ".docx"
                    ' *** Word 2007+ users - remove the apostrophe at the start of the next line ***
                    'd.SaveAs FileName:=strDocName, FileFormat:=wdFormatDocumentDefault
                Case "PDF"
                    strDocName = strDocName & ".pdf"
                    ' *** Word 2007 SP2+ users - remove the apostrophe at the start of the next line ***
                    'd.ExportAsFixedFormat OutputFileName:=strDocName, ExportFormat:=wdExportFormatPDF
            End Select

            ' Embedding pictures may have modified the document; never write
            ' that back to the source file and never prompt the user.
            d.Close SaveChanges:=wdDoNotSaveChanges
            ChangeFileOpenDirectory oFolder.Path
        End If
    Next oFile
    Application.ScreenUpdating = True
End Sub

' Embeds the picture behind one link into document d and breaks the link.
' Does nothing when there is no link or the picture is already saved with
' the document. The checks are separate statements because VBA's And does
' not short-circuit: testing a property of Nothing in the same expression
' raises an error.
Private Sub EmbedLinkedPicture(ByVal lf As LinkFormat, ByVal d As Document)
    If lf Is Nothing Then Exit Sub
    If lf.SavePictureWithDocument Then Exit Sub

    lf.SavePictureWithDocument = True
    Sleep 2000
    lf.BreakLink
    d.UndoClear
End Sub

Also, see this...

https://www.youtube.com/watch?v=4vFQV6RtYMM

Convert Word doc to HTML programmatically in Java

We use tm-extractors (http://mvnrepository.com/artifact/org.textmining/tm-extractors), and fall back to the commercial Aspose (http://www.aspose.com/). Both have native Java APIs.

Aspose.Words convert to html (only body content)

// Load the source document from the sample directory.
Document doc = new Document(MyDir + "inputdocx.docx");

// HTML save options; images are routed through the custom saving callback.
var options = new Aspose.Words.Saving.HtmlSaveOptions(SaveFormat.Html);
options.ImageSavingCallback = new HandleImageSaving();

// Convert only the body of the first section to an HTML string.
var firstBody = doc.FirstSection.Body;
String html = firstBody.ToString(options);

OpenXml Convert from Word document to HTML with Header

OpenXmlPowerTools ignores headers and footers when converting a docx-document to HTML, so they won't show up in the resulting HTML (you can browse the source code on github).

Perhaps it's because the concept of a 'page' doesn't apply to HTML, so there's no obvious equivalent to a document header.



Related Topics



Leave a reply



Submit