PDF: Поиск и извлечение текста в PDF документе

В этом разделе

Класс TextRegion предназначен для поиска и извлечения текста из всей PDF страницы или из области страницы. Текстовый регион, который представляет всю PDF страницу, можно получить с помощью свойства PdfPage.TextRegion. Текстовый регион, который представляет некоторую область PDF страницы, можно получить с помощью метода TextRegion.GetSubregion.

ВАЖНО! Все координаты, определяющие расположение текста на PDF странице, задаются в системе координат PDF страницы. Все размеры, определяющие размеры областей текста, задаются в единицах измерения PDF страницы. Информация о системе координат и единицах измерения PDF страницы доступна здесь.

Поиск по тексту

Класс TextRegion позволяет:

искать текст на всей странице или в определенной области страницы
искать текст с учетом регистра или без учета регистра
указать направление поиска текста
использовать регулярные выражения в поиске текста
определить пользовательский алгоритм поиска текста

Вот C#/VB.NET код, который демонстрирует, как найти текст на PDF странице:

/// <summary>
/// Finds the specified text on a page of PDF document using
/// a non-case sensitive search.
/// </summary>
/// <param name="document">PDF document that contains the page.</param>
/// <param name="pageIndex">Index of the page in the page collection of the document.</param>
/// <param name="text">Text to search for.</param>
/// <returns>
/// The result of TextRegion.FindText for a search started from index 0
/// (presumably null if the text is not found - the other samples of this
/// article check the result of FindText for null).
/// </returns>
public static Vintasoft.Imaging.Text.TextRegion FindTextOnPdfPage(
    Vintasoft.Imaging.Pdf.PdfDocument document,
    int pageIndex,
    string text)
{
    // specify that non-case sensitive text must be searched
    Vintasoft.Imaging.Text.TextSearchEngine searchEngine = 
        Vintasoft.Imaging.Text.TextSearchEngine.Create(text, true);
    // find text using the search engine, otherwise the non-case sensitive
    // setting specified above would not be applied to the search
    int startIndex = 0;
    return document.Pages[pageIndex].TextRegion.FindText(searchEngine, ref startIndex, false);
}

VB.NET

''' <summary>
''' Finds the specified text on a page of PDF document using
''' a non-case sensitive search.
''' </summary>
''' <param name="document">PDF document that contains the page.</param>
''' <param name="pageIndex">Index of the page in the page collection of the document.</param>
''' <param name="text">Text to search for.</param>
''' <returns>
''' The result of TextRegion.FindText for a search started from index 0
''' (presumably Nothing if the text is not found - the other samples of this
''' article check the result of FindText for Nothing).
''' </returns>
Public Shared Function FindTextOnPdfPage(document As Vintasoft.Imaging.Pdf.PdfDocument, pageIndex As Integer, text As String) As Vintasoft.Imaging.Text.TextRegion
    ' specify that non-case sensitive text must be searched
    Dim searchEngine As Vintasoft.Imaging.Text.TextSearchEngine = Vintasoft.Imaging.Text.TextSearchEngine.Create(text, True)
    ' find text using the search engine, otherwise the non-case sensitive
    ' setting specified above would not be applied to the search
    Dim startIndex As Integer = 0
    Return document.Pages(pageIndex).TextRegion.FindText(searchEngine, startIndex, False)
End Function

Вот C#/VB.NET код, который демонстрирует, как искать текст в PDF документе с помощью регулярного выражения:

/// <summary>
/// Writes to the console the text and rectangle of every digit sequence
/// found on the pages of PDF document.
/// </summary>
/// <param name="document">PDF document whose pages should be scanned for digits.</param>
public void SearchDigitsInTextOfPdfDocument(Vintasoft.Imaging.Pdf.PdfDocument document)
{
    System.Console.WriteLine("Searching the digits in text of PDF document is started.");

    for (int pageIndex = 0; pageIndex < document.Pages.Count; pageIndex++)
    {
        // get the page and create the regular expression that matches one or more digits
        Vintasoft.Imaging.Pdf.Tree.PdfPage page = document.Pages[pageIndex];
        System.Text.RegularExpressions.Regex digitsRegex =
            new System.Text.RegularExpressions.Regex(@"\d+");

        // search the digits on the page
        Vintasoft.Imaging.Text.TextRegion[] foundRegions =
            SimpleDigitsSearchOnPdfPage(page, digitsRegex);
        if (foundRegions == null)
            continue;

        // output information about each found text region
        foreach (Vintasoft.Imaging.Text.TextRegion foundRegion in foundRegions)
        {
            string message = string.Format("- Text={0}, Rectangle={1}",
                foundRegion.TextContent,
                foundRegion.Rectangle);
            System.Console.WriteLine(message);
        }
    }

    System.Console.WriteLine("Searching the digits in text of PDF document is finished.");
}

/// <summary>
/// Collects all text regions of PDF page that match the regular expression.
/// </summary>
/// <param name="page">PDF page whose text region is searched.</param>
/// <param name="regex">Regular expression that defines the text to search for.</param>
/// <returns>An array with the found text regions, in the order they were found.</returns>
public Vintasoft.Imaging.Text.TextRegion[] SimpleDigitsSearchOnPdfPage(
    Vintasoft.Imaging.Pdf.Tree.PdfPage page, 
    System.Text.RegularExpressions.Regex regex)
{
    System.Collections.Generic.List<Vintasoft.Imaging.Text.TextRegion> matches = 
        new System.Collections.Generic.List<Vintasoft.Imaging.Text.TextRegion>();
    // create the search engine that is based on the regular expression
    Vintasoft.Imaging.Text.TextSearchEngine engine = 
        Vintasoft.Imaging.Text.TextSearchEngine.Create(regex);

    int searchPosition = 0;
    while (true)
    {
        // search the next match; the position is passed by reference,
        // so FindText can change it
        Vintasoft.Imaging.Text.TextRegion match =
            page.TextRegion.FindText(engine, ref searchPosition, false);
        // stop when nothing more is found
        if (match == null)
            break;

        // save the result
        matches.Add(match);
        // shift the search position by the length of the found text
        searchPosition += match.TextContent.Length;
    }

    return matches.ToArray();
}

VB.NET

''' <summary>
''' Writes to the console the text and rectangle of every digit sequence
''' found on the pages of PDF document.
''' </summary>
''' <param name="document">PDF document whose pages should be scanned for digits.</param>
Public Sub SearchDigitsInTextOfPdfDocument(document As Vintasoft.Imaging.Pdf.PdfDocument)
    System.Console.WriteLine("Searching the digits in text of PDF document is started.")

    For pageIndex As Integer = 0 To document.Pages.Count - 1
        ' get the page and create the regular expression that matches one or more digits
        Dim page As Vintasoft.Imaging.Pdf.Tree.PdfPage = document.Pages(pageIndex)
        Dim digitsRegex As New System.Text.RegularExpressions.Regex("\d+")

        ' search the digits on the page
        Dim foundRegions As Vintasoft.Imaging.Text.TextRegion() = SimpleDigitsSearchOnPdfPage(page, digitsRegex)
        If foundRegions Is Nothing Then
            Continue For
        End If

        ' output information about each found text region
        For Each foundRegion As Vintasoft.Imaging.Text.TextRegion In foundRegions
            Dim message As String = String.Format("- Text={0}, Rectangle={1}", foundRegion.TextContent, foundRegion.Rectangle)
            System.Console.WriteLine(message)
        Next
    Next

    System.Console.WriteLine("Searching the digits in text of PDF document is finished.")
End Sub

''' <summary>
''' Collects all text regions of PDF page that match the regular expression.
''' </summary>
''' <param name="page">PDF page whose text region is searched.</param>
''' <param name="regex">Regular expression that defines the text to search for.</param>
''' <returns>An array with the found text regions, in the order they were found.</returns>
Public Function SimpleDigitsSearchOnPdfPage(page As Vintasoft.Imaging.Pdf.Tree.PdfPage, regex As System.Text.RegularExpressions.Regex) As Vintasoft.Imaging.Text.TextRegion()
    Dim matches As New System.Collections.Generic.List(Of Vintasoft.Imaging.Text.TextRegion)()
    ' create the search engine that is based on the regular expression
    Dim engine As Vintasoft.Imaging.Text.TextSearchEngine = Vintasoft.Imaging.Text.TextSearchEngine.Create(regex)

    Dim searchPosition As Integer = 0
    Do
        ' search the next match; the position is passed to FindText,
        ' which can change it
        Dim match As Vintasoft.Imaging.Text.TextRegion = page.TextRegion.FindText(engine, searchPosition, False)
        ' stop when nothing more is found
        If match Is Nothing Then
            Exit Do
        End If

        ' save the result
        matches.Add(match)
        ' shift the search position by the length of the found text
        searchPosition += match.TextContent.Length
    Loop

    Return matches.ToArray()
End Function

Вот C#/VB.NET код, который демонстрирует, как искать текст в PDF документе, используя определенный пользователем алгоритм поиска текста:

/// <summary>
/// Writes to the console the text and rectangle of every digit sequence
/// found on the pages of PDF document by the custom digits search.
/// </summary>
/// <param name="document">PDF document whose pages should be scanned for digits.</param>
public void SearchDigitsInTextOfPdfDocumentUsingTextSearchEngine(Vintasoft.Imaging.Pdf.PdfDocument document)
{
    System.Console.WriteLine("Searching the digits in text of PDF document.");

    for (int pageIndex = 0; pageIndex < document.Pages.Count; pageIndex++)
    {
        // search the digits on the page
        Vintasoft.Imaging.Text.TextRegion[] foundRegions =
            AdvancedDigitsSearchOnPdfPage(document.Pages[pageIndex]);
        if (foundRegions == null)
            continue;

        // output information about each found text region
        foreach (Vintasoft.Imaging.Text.TextRegion foundRegion in foundRegions)
        {
            string message = string.Format("- Text={0}, Rectangle={1}",
                foundRegion.TextContent,
                foundRegion.Rectangle);
            System.Console.WriteLine(message);
        }
    }

    System.Console.WriteLine("Searching the digits in text of PDF document is finished.");
}

/// <summary>
/// Collects all text regions of PDF page that are found by
/// the DigitsSearchEngine search engine.
/// </summary>
/// <param name="page">PDF page whose text region is searched.</param>
/// <returns>An array with the found text regions, in the order they were found.</returns>
public Vintasoft.Imaging.Text.TextRegion[] AdvancedDigitsSearchOnPdfPage(
    Vintasoft.Imaging.Pdf.Tree.PdfPage page)
{
    System.Collections.Generic.List<Vintasoft.Imaging.Text.TextRegion> matches = 
        new System.Collections.Generic.List<Vintasoft.Imaging.Text.TextRegion>();
    // create the custom search engine
    DigitsSearchEngine engine = new DigitsSearchEngine();

    int searchPosition = 0;
    while (true)
    {
        // search the next match; the position is passed by reference,
        // so FindText can change it
        Vintasoft.Imaging.Text.TextRegion match =
            page.TextRegion.FindText(engine, ref searchPosition, false);
        // stop when nothing more is found
        if (match == null)
            break;

        // save the result
        matches.Add(match);
        // shift the search position by the length of the found text
        searchPosition += match.TextContent.Length;
    }

    return matches.ToArray();
}

/// <summary>
/// Text search engine that searches a sequence of consecutive digits in a string.
/// </summary>
class DigitsSearchEngine : Vintasoft.Imaging.Text.TextSearchEngine
{

    /// <summary>
    /// Searches the first sequence of digits in the analyzed range of the source string.
    /// </summary>
    /// <param name="sourceString">Source string (string of PDF page) where text must be searched.</param>
    /// <param name="startIndex">The zero-based index, in the sourceString, where the analyzed range starts.</param>
    /// <param name="length">The number of characters, in the sourceString, to analyze.</param>
    /// <param name="rightToLeft">
    /// Indicates that the analyzed range should be scanned from its right end to its left end,
    /// i.e. the last sequence of digits in the range is returned.
    /// </param>
    /// <returns>
    /// Vintasoft.Imaging.Text.TextSearchResult object, created from the index of the first digit
    /// and the number of digits in the found sequence, if digits are found; otherwise, null.
    /// </returns>
    public override Vintasoft.Imaging.Text.TextSearchResult Find(
        string sourceString, int startIndex, int length, bool rightToLeft)
    {
        // the analyzed range is [rangeStart, rangeEnd)
        int rangeStart = startIndex;
        int rangeEnd = startIndex + length;

        // index of the first digit of sequence
        int startDigitIndex = -1;
        // index of the character after the last digit of sequence
        int endDigitIndex = -1;

        // if searching text from the right to the left
        if (rightToLeft)
        {
            // scanning must stop at the start of the analyzed range (not at index 0),
            // otherwise characters before startIndex would be analyzed too
            for (int index = rangeEnd - 1; index >= rangeStart; index--)
            {
                bool isDigit = char.IsDigit(sourceString[index]);
                if (isDigit && endDigitIndex == -1)
                {
                    // the rightmost digit of sequence is found
                    endDigitIndex = index + 1;
                }
                else if (!isDigit && endDigitIndex != -1)
                {
                    // the character before the sequence is found
                    startDigitIndex = index + 1;
                    break;
                }
            }
            // if the sequence reaches the start of the analyzed range
            if (endDigitIndex != -1 && startDigitIndex == -1)
            {
                startDigitIndex = rangeStart;
            }
        }
        // if searching text from the left to the right
        else
        {
            for (int index = rangeStart; index < rangeEnd; index++)
            {
                bool isDigit = char.IsDigit(sourceString[index]);
                if (isDigit && startDigitIndex == -1)
                {
                    // the leftmost digit of sequence is found
                    startDigitIndex = index;
                }
                else if (!isDigit && startDigitIndex != -1)
                {
                    // the character after the sequence is found
                    endDigitIndex = index;
                    break;
                }
            }
            // if the sequence reaches the end of the analyzed range
            if (startDigitIndex != -1 && endDigitIndex == -1)
            {
                endDigitIndex = rangeEnd;
            }
        }

        // if digit is not found
        if (startDigitIndex == -1)
        {
            return null;
        }

        // return the text search result
        return new Vintasoft.Imaging.Text.TextSearchResult(
            startDigitIndex, endDigitIndex - startDigitIndex);
    }
}

VB.NET

''' <summary>
''' Writes to the console the text and rectangle of every digit sequence
''' found on the pages of PDF document by the custom digits search.
''' </summary>
''' <param name="document">PDF document whose pages should be scanned for digits.</param>
Public Sub SearchDigitsInTextOfPdfDocumentUsingTextSearchEngine(document As Vintasoft.Imaging.Pdf.PdfDocument)
    System.Console.WriteLine("Searching the digits in text of PDF document.")

    For pageIndex As Integer = 0 To document.Pages.Count - 1
        ' search the digits on the page
        Dim foundRegions As Vintasoft.Imaging.Text.TextRegion() = AdvancedDigitsSearchOnPdfPage(document.Pages(pageIndex))
        If foundRegions Is Nothing Then
            Continue For
        End If

        ' output information about each found text region
        For Each foundRegion As Vintasoft.Imaging.Text.TextRegion In foundRegions
            Dim message As String = String.Format("- Text={0}, Rectangle={1}", foundRegion.TextContent, foundRegion.Rectangle)
            System.Console.WriteLine(message)
        Next
    Next

    System.Console.WriteLine("Searching the digits in text of PDF document is finished.")
End Sub

''' <summary>
''' Collects all text regions of PDF page that are found by
''' the DigitsSearchEngine search engine.
''' </summary>
''' <param name="page">PDF page whose text region is searched.</param>
''' <returns>An array with the found text regions, in the order they were found.</returns>
Public Function AdvancedDigitsSearchOnPdfPage(page As Vintasoft.Imaging.Pdf.Tree.PdfPage) As Vintasoft.Imaging.Text.TextRegion()
    Dim matches As New System.Collections.Generic.List(Of Vintasoft.Imaging.Text.TextRegion)()
    ' create the custom search engine
    Dim engine As New DigitsSearchEngine()

    Dim searchPosition As Integer = 0
    Do
        ' search the next match; the position is passed to FindText,
        ' which can change it
        Dim match As Vintasoft.Imaging.Text.TextRegion = page.TextRegion.FindText(engine, searchPosition, False)
        ' stop when nothing more is found
        If match Is Nothing Then
            Exit Do
        End If

        ' save the result
        matches.Add(match)
        ' shift the search position by the length of the found text
        searchPosition += match.TextContent.Length
    Loop

    Return matches.ToArray()
End Function

''' <summary>
''' Text search engine that searches a sequence of consecutive digits in a string.
''' </summary>
Private Class DigitsSearchEngine
    Inherits Vintasoft.Imaging.Text.TextSearchEngine

    ''' <summary>
    ''' Searches the first sequence of digits in the analyzed range of the source string.
    ''' </summary>
    ''' <param name="sourceString">Source string (string of PDF page) where text must be searched.</param>
    ''' <param name="startIndex">The zero-based index, in the sourceString, where the analyzed range starts.</param>
    ''' <param name="length">The number of characters, in the sourceString, to analyze.</param>
    ''' <param name="rightToLeft">
    ''' Indicates that the analyzed range should be scanned from its right end to its left end,
    ''' i.e. the last sequence of digits in the range is returned.
    ''' </param>
    ''' <returns>
    ''' Vintasoft.Imaging.Text.TextSearchResult object, created from the index of the first digit
    ''' and the number of digits in the found sequence, if digits are found; otherwise, Nothing.
    ''' </returns>
    Public Overrides Function Find(sourceString As String, startIndex As Integer, length As Integer, rightToLeft As Boolean) As Vintasoft.Imaging.Text.TextSearchResult
        ' the analyzed range is [rangeStart, rangeEnd)
        Dim rangeStart As Integer = startIndex
        Dim rangeEnd As Integer = startIndex + length

        ' index of the first digit of sequence
        Dim startDigitIndex As Integer = -1
        ' index of the character after the last digit of sequence
        Dim endDigitIndex As Integer = -1

        If rightToLeft Then
            ' if searching text from the right to the left:
            ' scanning must stop at the start of the analyzed range (not at index 0),
            ' otherwise characters before startIndex would be analyzed too
            For index As Integer = rangeEnd - 1 To rangeStart Step -1
                Dim isDigit As Boolean = Char.IsDigit(sourceString(index))
                If isDigit AndAlso endDigitIndex = -1 Then
                    ' the rightmost digit of sequence is found
                    endDigitIndex = index + 1
                ElseIf Not isDigit AndAlso endDigitIndex <> -1 Then
                    ' the character before the sequence is found
                    startDigitIndex = index + 1
                    Exit For
                End If
            Next
            ' if the sequence reaches the start of the analyzed range
            If endDigitIndex <> -1 AndAlso startDigitIndex = -1 Then
                startDigitIndex = rangeStart
            End If
        Else
            ' if searching text from the left to the right
            For index As Integer = rangeStart To rangeEnd - 1
                Dim isDigit As Boolean = Char.IsDigit(sourceString(index))
                If isDigit AndAlso startDigitIndex = -1 Then
                    ' the leftmost digit of sequence is found
                    startDigitIndex = index
                ElseIf Not isDigit AndAlso startDigitIndex <> -1 Then
                    ' the character after the sequence is found
                    endDigitIndex = index
                    Exit For
                End If
            Next
            ' if the sequence reaches the end of the analyzed range
            If startDigitIndex <> -1 AndAlso endDigitIndex = -1 Then
                endDigitIndex = rangeEnd
            End If
        End If

        ' if digit is not found
        If startDigitIndex = -1 Then
            Return Nothing
        End If

        ' return the text search result
        Return New Vintasoft.Imaging.Text.TextSearchResult(startDigitIndex, endDigitIndex - startDigitIndex)
    End Function
End Class

Извлечение текста

Класс TextRegion позволяет извлекать:

текст со всей страницы - TextRegion.TextContent
текст из региона страницы - TextRegion.GetSubregion
строку из указанного региона - TextRegion.GetLineSubregion
слово из указанного региона - TextRegion.GetWordSubregion

При извлечении текста из области страницы необходимо указать, как именно он должен быть извлечен. SDK позволяет извлекать текст:

полными текстовыми строками, т.е. извлекаются все строки текста, которые полностью или частично находятся в заданной области.
строго из указанной области, т.е. извлекается только тот текст, который находится в указанной области.

По умолчанию текст извлекается полными текстовыми строками.

Вот C#/VB.NET код, который демонстрирует, как извлечь весь текст из всей PDF страницы:

/// <summary>
/// Returns the text content of the text region of the specified PDF page.
/// </summary>
/// <param name="document">PDF document that contains the page.</param>
/// <param name="pageIndex">Index of the page in the page collection of the document.</param>
/// <returns>The value of the TextContent property of the page text region.</returns>
public static string ExtractTextFromPdfPage(Vintasoft.Imaging.Pdf.PdfDocument document, int pageIndex)
{
    // get the page
    Vintasoft.Imaging.Pdf.Tree.PdfPage page = document.Pages[pageIndex];
    // get the text of the page
    string pageText = page.TextRegion.TextContent;
    return pageText;
}

VB.NET

''' <summary>
''' Returns the text content of the text region of the specified PDF page.
''' </summary>
''' <param name="document">PDF document that contains the page.</param>
''' <param name="pageIndex">Index of the page in the page collection of the document.</param>
''' <returns>The value of the TextContent property of the page text region.</returns>
Public Shared Function ExtractTextFromPdfPage(document As Vintasoft.Imaging.Pdf.PdfDocument, pageIndex As Integer) As String
    ' get the page
    Dim page As Vintasoft.Imaging.Pdf.Tree.PdfPage = document.Pages(pageIndex)
    ' get the text of the page
    Dim pageText As String = page.TextRegion.TextContent
    Return pageText
End Function

Также класс TextRegion позволяет извлекать текст из PDF страницы в виде древовидной структуры, т.е. можно получить область, представляющую весь текст всей страницы - PdfPage.TextRegion, затем все текстовые строки - TextRegion.Lines, затем все символы текстовой строки - TextRegionLine.Symbols.

Отправить отзыв