DOCX: Как преобразовать DOCX/DOC-файл в TXT-файл?
В этом разделе
Вот код на C#/VB.NET, который демонстрирует, как преобразовать DOCX/DOC-файл в TXT-файл:
/// <summary>
/// Converts DOCX file to a TXT file.
/// </summary>
/// <param name="docxFilePath">Path to the source DOCX file.</param>
/// <param name="txtFilePath">Path to the TXT file that receives the extracted text.</param>
/// <remarks>
/// For every page a "Page Number: N" header (N starts at 1) is written, followed by
/// the text content of the page, if any. Three line breaks separate consecutive pages;
/// no separator is written after the last page.
/// </remarks>
public void ConvertDocxToTxt(string docxFilePath, string txtFilePath)
{
    using (Vintasoft.Imaging.ImageCollection images = new Vintasoft.Imaging.ImageCollection())
    {
        // open DOCX file
        images.Add(docxFilePath);
        try
        {
            // 1-based number of the page that is currently processed
            int pageNumber = 1;
            // document text content
            System.Text.StringBuilder content = new System.Text.StringBuilder();
            // for each page of DOCX file
            foreach (Vintasoft.Imaging.VintasoftImage image in images)
            {
                // write page number
                content.AppendFormat("\tPage Number: {0}", pageNumber);
                content.AppendLine();
                content.AppendLine();

                // find text region metadata
                Vintasoft.Imaging.Metadata.TextRegionMetadata textRegionMetadata =
                    image.Metadata.MetadataTree.FindChildNode<Vintasoft.Imaging.Metadata.TextRegionMetadata>();
                // if current page has text content
                if (textRegionMetadata != null)
                {
                    // get text region
                    Vintasoft.Imaging.Text.TextRegion textRegion = textRegionMetadata.GetTextRegion();
                    // if text region has text content
                    if (textRegion != null && textRegion.TextContent != null)
                    {
                        // write page text content
                        content.Append(textRegion.TextContent);
                    }
                }

                // if current page is not the last page, add page separator;
                // pageNumber still holds the number of the current page here, so
                // the separator is also written between the last two pages
                if (pageNumber < images.Count)
                {
                    content.AppendLine();
                    content.AppendLine();
                    content.AppendLine();
                }

                // move to the next page only after the separator check
                pageNumber++;
            }

            // write DOCX file text content to a TXT file
            System.IO.File.WriteAllText(txtFilePath, content.ToString());
        }
        finally
        {
            // clear and dispose images
            images.ClearAndDisposeItems();
        }
    }
}
''' <summary>
''' Converts DOCX file to a TXT file.
''' </summary>
''' <param name="docxFilePath">Path to the source DOCX file.</param>
''' <param name="txtFilePath">Path to the TXT file that receives the extracted text.</param>
''' <remarks>
''' For every page a "Page Number: N" header (N starts at 1) is written, followed by
''' the text content of the page, if any. Three line breaks separate consecutive pages;
''' no separator is written after the last page.
''' </remarks>
Public Sub ConvertDocxToTxt(docxFilePath As String, txtFilePath As String)
    Using images As New Vintasoft.Imaging.ImageCollection()
        ' open DOCX file
        images.Add(docxFilePath)
        Try
            ' 1-based number of the page that is currently processed
            Dim pageNumber As Integer = 1
            ' document text content
            Dim content As New System.Text.StringBuilder()
            ' for each page of DOCX file
            For Each image As Vintasoft.Imaging.VintasoftImage In images
                ' write page number of the current page; the number is incremented
                ' at the end of the loop body, so the first page is reported as 1
                content.AppendFormat(vbTab & "Page Number: {0}", pageNumber)
                content.AppendLine()
                content.AppendLine()

                ' find text region metadata
                Dim textRegionMetadata As Vintasoft.Imaging.Metadata.TextRegionMetadata = image.Metadata.MetadataTree.FindChildNode(Of Vintasoft.Imaging.Metadata.TextRegionMetadata)()
                ' if current page has text content
                If textRegionMetadata IsNot Nothing Then
                    ' get text region
                    Dim textRegion As Vintasoft.Imaging.Text.TextRegion = textRegionMetadata.GetTextRegion()
                    ' if text region has text content
                    If textRegion IsNot Nothing AndAlso textRegion.TextContent IsNot Nothing Then
                        ' write page text content
                        content.Append(textRegion.TextContent)
                    End If
                End If

                ' if current page is not the last page, add page separator;
                ' pageNumber still holds the number of the current page here, so
                ' the separator is also written between the last two pages
                If pageNumber < images.Count Then
                    content.AppendLine()
                    content.AppendLine()
                    content.AppendLine()
                End If

                ' move to the next page only after the separator check
                pageNumber += 1
            Next

            ' write DOCX file text content to a TXT file
            System.IO.File.WriteAllText(txtFilePath, content.ToString())
        Finally
            ' clear and dispose images
            images.ClearAndDisposeItems()
        End Try
    End Using
End Sub