VintaSoft Imaging .NET SDK 14.0: Документация для .NET разработчика
В этом разделе
    PDF: Поиск и извлечение текста в PDF документе
    В этом разделе
    Класс TextRegion предназначен для поиска и извлечения текста из всей PDF страницы или из области страницы. Текстовый регион, который представляет всю PDF страницу, можно получить с помощью свойства PdfPage.TextRegion. Текстовый регион, который представляет некоторую область PDF страницы, можно получить с помощью метода TextRegion.GetSubregion.


    ВАЖНО! Все координаты, определяющие расположение текста на PDF странице, задаются в системе координат PDF страницы. Все размеры, определяющие размеры областей текста, задаются в единицах измерения PDF страницы. Информация о системе координат и единицах измерения PDF страницы доступна здесь.


    Поиск по тексту

    Класс TextRegion позволяет:
    Вот C#/VB.NET код, который демонстрирует, как найти текст на PDF странице:
    /// <summary>
    /// Finds the first occurrence of the specified text on a page of PDF document
    /// using a search engine created with the case-insensitive flag.
    /// </summary>
    /// <param name="document">PDF document that contains the page.</param>
    /// <param name="pageIndex">Index of the page in <c>document.Pages</c>.</param>
    /// <param name="text">Text to search for.</param>
    /// <returns>The value returned by <c>TextRegion.FindText</c> for the first match.</returns>
    public static Vintasoft.Imaging.Text.TextRegion FindTextOnPdfPage(
        Vintasoft.Imaging.Pdf.PdfDocument document,
        int pageIndex,
        string text)
    {
        // specify that non-case sensitive text must be searched
        Vintasoft.Imaging.Text.TextSearchEngine searchEngine =
            Vintasoft.Imaging.Text.TextSearchEngine.Create(text, true);

        // find text; the search engine (not the raw string) is passed,
        // otherwise the case-insensitive setting above is never applied
        int startIndex = 0;
        return document.Pages[pageIndex].TextRegion.FindText(searchEngine, ref startIndex, false);
    }
    
    ''' <summary>
    ''' Finds the first occurrence of the specified text on a page of PDF document
    ''' using a search engine created with the case-insensitive flag.
    ''' </summary>
    ''' <param name="document">PDF document that contains the page.</param>
    ''' <param name="pageIndex">Index of the page in <c>document.Pages</c>.</param>
    ''' <param name="text">Text to search for.</param>
    ''' <returns>The value returned by <c>TextRegion.FindText</c> for the first match.</returns>
    Public Shared Function FindTextOnPdfPage(document As Vintasoft.Imaging.Pdf.PdfDocument, pageIndex As Integer, text As String) As Vintasoft.Imaging.Text.TextRegion
        ' specify that non-case sensitive text must be searched
        Dim searchEngine As Vintasoft.Imaging.Text.TextSearchEngine = Vintasoft.Imaging.Text.TextSearchEngine.Create(text, True)

        ' find text; the search engine (not the raw string) is passed,
        ' otherwise the case-insensitive setting above is never applied
        Dim startIndex As Integer = 0
        Return document.Pages(pageIndex).TextRegion.FindText(searchEngine, startIndex, False)
    End Function
    


    Вот C#/VB.NET код, который демонстрирует, как искать текст в PDF документе с помощью регулярного выражения:
    /// <summary>
    /// Searches every page of PDF document for digit sequences (regular expression <c>\d+</c>)
    /// and writes the text and rectangle of each found region to the console.
    /// </summary>
    /// <param name="document">PDF document where digits should be searched.</param>
    public void SearchDigitsInTextOfPdfDocument(Vintasoft.Imaging.Pdf.PdfDocument document)
    {
        System.Console.WriteLine("Searching the digits in text of PDF document is started.");

        // the pattern is the same for every page, so the regular expression is created only once
        System.Text.RegularExpressions.Regex digitsRegex =
            new System.Text.RegularExpressions.Regex(@"\d+");

        for (int pageIndex = 0; pageIndex < document.Pages.Count; pageIndex++)
        {
            Vintasoft.Imaging.Text.TextRegion[] textRegions =
                SimpleDigitsSearchOnPdfPage(document.Pages[pageIndex], digitsRegex);
            if (textRegions == null)
                continue;

            // output each found region of the current page
            foreach (Vintasoft.Imaging.Text.TextRegion foundRegion in textRegions)
            {
                System.Console.WriteLine("- Text={0}, Rectangle={1}",
                    foundRegion.TextContent,
                    foundRegion.Rectangle);
            }
        }

        System.Console.WriteLine("Searching the digits in text of PDF document is finished.");
    }
    
    /// <summary>
    /// Collects all text regions of PDF page that match the specified regular expression.
    /// </summary>
    /// <param name="page">PDF page where text should be searched.</param>
    /// <param name="regex">Regular expression which defines the searching text.</param>
    /// <returns>An array of text regions on PDF page where text was found; empty if nothing was found.</returns>
    public Vintasoft.Imaging.Text.TextRegion[] SimpleDigitsSearchOnPdfPage(
        Vintasoft.Imaging.Pdf.Tree.PdfPage page,
        System.Text.RegularExpressions.Regex regex)
    {
        System.Collections.Generic.List<Vintasoft.Imaging.Text.TextRegion> foundRegions =
            new System.Collections.Generic.List<Vintasoft.Imaging.Text.TextRegion>();
        Vintasoft.Imaging.Text.TextSearchEngine engine =
            Vintasoft.Imaging.Text.TextSearchEngine.Create(regex);

        // position from which the next search begins; passed by reference to FindText
        // NOTE(review): presumably FindText moves it to the index of the found text - confirm
        int searchPosition = 0;
        while (true)
        {
            // search the next match
            Vintasoft.Imaging.Text.TextRegion foundRegion =
                page.TextRegion.FindText(engine, ref searchPosition, false);

            // no more matches on this page
            if (foundRegion == null)
                break;

            // remember the match
            foundRegions.Add(foundRegion);
            // shift the search position past the found text
            searchPosition += foundRegion.TextContent.Length;
        }

        return foundRegions.ToArray();
    }
    
    ''' <summary>
    ''' Searches every page of PDF document for digit sequences (regular expression <c>\d+</c>)
    ''' and writes the text and rectangle of each found region to the console.
    ''' </summary>
    ''' <param name="document">PDF document where digits should be searched.</param>
    Public Sub SearchDigitsInTextOfPdfDocument(document As Vintasoft.Imaging.Pdf.PdfDocument)
        System.Console.WriteLine("Searching the digits in text of PDF document is started.")

        ' the pattern is the same for every page, so the regular expression is created only once
        Dim digitsRegex As New System.Text.RegularExpressions.Regex("\d+")

        For pageIndex As Integer = 0 To document.Pages.Count - 1
            Dim textRegions As Vintasoft.Imaging.Text.TextRegion() = SimpleDigitsSearchOnPdfPage(document.Pages(pageIndex), digitsRegex)
            If textRegions IsNot Nothing Then
                ' output each found region of the current page
                For Each foundRegion As Vintasoft.Imaging.Text.TextRegion In textRegions
                    System.Console.WriteLine("- Text={0}, Rectangle={1}", foundRegion.TextContent, foundRegion.Rectangle)
                Next
            End If
        Next

        System.Console.WriteLine("Searching the digits in text of PDF document is finished.")
    End Sub
    
    ''' <summary>
    ''' Collects all text regions of PDF page that match the specified regular expression.
    ''' </summary>
    ''' <param name="page">PDF page where text should be searched.</param>
    ''' <param name="regex">Regular expression which defines the searching text.</param>
    ''' <returns>An array of text regions on PDF page where text was found; empty if nothing was found.</returns>
    Public Function SimpleDigitsSearchOnPdfPage(page As Vintasoft.Imaging.Pdf.Tree.PdfPage, regex As System.Text.RegularExpressions.Regex) As Vintasoft.Imaging.Text.TextRegion()
        Dim foundRegions As New System.Collections.Generic.List(Of Vintasoft.Imaging.Text.TextRegion)()
        Dim engine As Vintasoft.Imaging.Text.TextSearchEngine = Vintasoft.Imaging.Text.TextSearchEngine.Create(regex)

        ' position from which the next search begins; passed by reference to FindText
        ' NOTE(review): presumably FindText moves it to the index of the found text - confirm
        Dim searchPosition As Integer = 0
        Do
            ' search the next match
            Dim foundRegion As Vintasoft.Imaging.Text.TextRegion = page.TextRegion.FindText(engine, searchPosition, False)

            ' no more matches on this page
            If foundRegion Is Nothing Then
                Exit Do
            End If

            ' remember the match
            foundRegions.Add(foundRegion)
            ' shift the search position past the found text
            searchPosition += foundRegion.TextContent.Length
        Loop

        Return foundRegions.ToArray()
    End Function
    


    Вот C#/VB.NET код, который демонстрирует, как искать текст в PDF документе, используя определенный пользователем алгоритм поиска текста:
    /// <summary>
    /// Searches every page of PDF document for digits using the custom search engine
    /// and writes the text and rectangle of each found region to the console.
    /// </summary>
    /// <param name="document">PDF document where digits should be searched.</param>
    public void SearchDigitsInTextOfPdfDocumentUsingTextSearchEngine(Vintasoft.Imaging.Pdf.PdfDocument document)
    {
        System.Console.WriteLine("Searching the digits in text of PDF document.");

        int pageIndex = 0;
        while (pageIndex < document.Pages.Count)
        {
            // find all digit regions on the current page
            Vintasoft.Imaging.Text.TextRegion[] pageResults =
                AdvancedDigitsSearchOnPdfPage(document.Pages[pageIndex]);

            if (pageResults != null)
            {
                // output each found region
                foreach (Vintasoft.Imaging.Text.TextRegion foundRegion in pageResults)
                {
                    System.Console.WriteLine("- Text={0}, Rectangle={1}",
                        foundRegion.TextContent,
                        foundRegion.Rectangle);
                }
            }

            pageIndex++;
        }

        System.Console.WriteLine("Searching the digits in text of PDF document is finished.");
    }
    
    /// <summary>
    /// Collects all text regions of PDF page that are found by <c>DigitsSearchEngine</c>.
    /// </summary>
    /// <param name="page">PDF page where digits should be searched.</param>
    /// <returns>An array of text regions on PDF page where text was found; empty if nothing was found.</returns>
    public Vintasoft.Imaging.Text.TextRegion[] AdvancedDigitsSearchOnPdfPage(
        Vintasoft.Imaging.Pdf.Tree.PdfPage page)
    {
        System.Collections.Generic.List<Vintasoft.Imaging.Text.TextRegion> foundRegions =
            new System.Collections.Generic.List<Vintasoft.Imaging.Text.TextRegion>();
        DigitsSearchEngine engine = new DigitsSearchEngine();

        // position from which the next search begins; passed by reference to FindText
        // NOTE(review): presumably FindText moves it to the index of the found text - confirm
        int searchPosition = 0;
        while (true)
        {
            // search the next match
            Vintasoft.Imaging.Text.TextRegion foundRegion =
                page.TextRegion.FindText(engine, ref searchPosition, false);

            // no more matches on this page
            if (foundRegion == null)
                break;

            // remember the match
            foundRegions.Add(foundRegion);
            // shift the search position past the found text
            searchPosition += foundRegion.TextContent.Length;
        }

        return foundRegions.ToArray();
    }
    
    /// <summary>
    /// Class for searching the digits in text of PDF page.
    /// </summary>
    class DigitsSearchEngine : Vintasoft.Imaging.Text.TextSearchEngine
    {

        /// <summary>
        /// Searches the first run of consecutive digit characters (as defined by
        /// <see cref="char.IsDigit(char)"/>) inside the analyzed part of the source string.
        /// The analyzed part is the half-open range
        /// [<paramref name="startIndex"/>, <paramref name="startIndex"/> + <paramref name="length"/>);
        /// the returned run never extends outside of it.
        /// </summary>
        /// <param name="sourceString">Source string (string of PDF page) where text must be searched.</param>
        /// <param name="startIndex">The zero-based index, in the sourceString, from which text must be searched.</param>
        /// <param name="length">The number of characters, in the sourceString, to analyze.</param>
        /// <param name="rightToLeft">Indicates that text should be searched from right to left.</param>
        /// <returns>
        /// Vintasoft.Imaging.Text.TextSearchResult object, created from the start index
        /// and the length of the found digit run, if digits are found; otherwise, null.
        /// </returns>
        public override Vintasoft.Imaging.Text.TextSearchResult Find(
            string sourceString, int startIndex, int length, bool rightToLeft)
        {
            // boundaries of the analyzed part of the source string: [windowStart, windowEnd)
            int windowStart = startIndex;
            int windowEnd = startIndex + length;

            // boundaries of the found digit run: [startDigitIndex, endDigitIndex); -1 means "not found yet"
            int startDigitIndex = -1;
            int endDigitIndex = -1;

            // if searching text from the right to the left
            if (rightToLeft)
            {
                // scan backward but stop at the window start: characters before
                // startIndex are outside the analyzed part and must not be matched
                for (int index = windowEnd - 1; index >= windowStart; index--)
                {
                    bool isDigit = char.IsDigit(sourceString[index]);
                    if (isDigit && endDigitIndex == -1)
                    {
                        // the first digit met is the last digit of the run
                        endDigitIndex = index + 1;
                    }
                    else if (!isDigit && endDigitIndex != -1)
                    {
                        // the first non-digit after the run marks the beginning of the run
                        startDigitIndex = index + 1;
                        break;
                    }
                }
                // the run reaches the beginning of the analyzed part
                if (endDigitIndex != -1 && startDigitIndex == -1)
                    startDigitIndex = windowStart;
            }
            // if searching text from the left to the right
            else
            {
                for (int index = windowStart; index < windowEnd; index++)
                {
                    bool isDigit = char.IsDigit(sourceString[index]);
                    if (isDigit && startDigitIndex == -1)
                    {
                        // the first digit met is the first digit of the run
                        startDigitIndex = index;
                    }
                    else if (!isDigit && startDigitIndex != -1)
                    {
                        // the first non-digit after the run marks the end of the run
                        endDigitIndex = index;
                        break;
                    }
                }
                // the run reaches the end of the analyzed part
                if (startDigitIndex != -1 && endDigitIndex == -1)
                    endDigitIndex = windowEnd;
            }

            // if digit is not found
            if (startDigitIndex == -1)
                return null;

            // return the text search result
            return new Vintasoft.Imaging.Text.TextSearchResult(
                startDigitIndex, endDigitIndex - startDigitIndex);
        }
    }
    
    ''' <summary>
    ''' Searches every page of PDF document for digits using the custom search engine
    ''' and writes the text and rectangle of each found region to the console.
    ''' </summary>
    ''' <param name="document">PDF document where digits should be searched.</param>
    Public Sub SearchDigitsInTextOfPdfDocumentUsingTextSearchEngine(document As Vintasoft.Imaging.Pdf.PdfDocument)
        System.Console.WriteLine("Searching the digits in text of PDF document.")

        Dim pageIndex As Integer = 0
        While pageIndex < document.Pages.Count
            ' find all digit regions on the current page
            Dim pageResults As Vintasoft.Imaging.Text.TextRegion() = AdvancedDigitsSearchOnPdfPage(document.Pages(pageIndex))

            If pageResults IsNot Nothing Then
                ' output each found region
                For Each foundRegion As Vintasoft.Imaging.Text.TextRegion In pageResults
                    System.Console.WriteLine("- Text={0}, Rectangle={1}", foundRegion.TextContent, foundRegion.Rectangle)
                Next
            End If

            pageIndex += 1
        End While

        System.Console.WriteLine("Searching the digits in text of PDF document is finished.")
    End Sub
    
    ''' <summary>
    ''' Collects all text regions of PDF page that are found by <c>DigitsSearchEngine</c>.
    ''' </summary>
    ''' <param name="page">PDF page where digits should be searched.</param>
    ''' <returns>An array of text regions on PDF page where text was found; empty if nothing was found.</returns>
    Public Function AdvancedDigitsSearchOnPdfPage(page As Vintasoft.Imaging.Pdf.Tree.PdfPage) As Vintasoft.Imaging.Text.TextRegion()
        Dim foundRegions As New System.Collections.Generic.List(Of Vintasoft.Imaging.Text.TextRegion)()
        Dim engine As New DigitsSearchEngine()

        ' position from which the next search begins; passed by reference to FindText
        ' NOTE(review): presumably FindText moves it to the index of the found text - confirm
        Dim searchPosition As Integer = 0
        Do
            ' search the next match
            Dim foundRegion As Vintasoft.Imaging.Text.TextRegion = page.TextRegion.FindText(engine, searchPosition, False)

            ' no more matches on this page
            If foundRegion Is Nothing Then
                Exit Do
            End If

            ' remember the match
            foundRegions.Add(foundRegion)
            ' shift the search position past the found text
            searchPosition += foundRegion.TextContent.Length
        Loop

        Return foundRegions.ToArray()
    End Function
    
    ''' <summary>
    ''' Class for searching the digits in text of PDF page.
    ''' </summary>
    Private Class DigitsSearchEngine
        Inherits Vintasoft.Imaging.Text.TextSearchEngine

        ''' <summary>
        ''' Searches the first run of consecutive digit characters (as defined by Char.IsDigit)
        ''' inside the analyzed part of the source string.
        ''' The analyzed part is the half-open range [startIndex, startIndex + length);
        ''' the returned run never extends outside of it.
        ''' </summary>
        ''' <param name="sourceString">Source string (string of PDF page) where text must be searched.</param>
        ''' <param name="startIndex">The zero-based index, in the sourceString, from which text must be searched.</param>
        ''' <param name="length">The number of characters, in the sourceString, to analyze.</param>
        ''' <param name="rightToLeft">Indicates that text should be searched from right to left.</param>
        ''' <returns>
        ''' Vintasoft.Imaging.Text.TextSearchResult object, created from the start index
        ''' and the length of the found digit run, if digits are found; otherwise, null.
        ''' </returns>
        Public Overrides Function Find(sourceString As String, startIndex As Integer, length As Integer, rightToLeft As Boolean) As Vintasoft.Imaging.Text.TextSearchResult
            ' boundaries of the analyzed part of the source string: [windowStart, windowEnd)
            Dim windowStart As Integer = startIndex
            Dim windowEnd As Integer = startIndex + length

            ' boundaries of the found digit run: [startDigitIndex, endDigitIndex); -1 means "not found yet"
            Dim startDigitIndex As Integer = -1
            Dim endDigitIndex As Integer = -1

            If rightToLeft Then
                ' searching text from the right to the left:
                ' scan backward but stop at the window start, because characters before
                ' startIndex are outside the analyzed part and must not be matched
                For index As Integer = windowEnd - 1 To windowStart Step -1
                    Dim isDigit As Boolean = Char.IsDigit(sourceString(index))
                    If isDigit AndAlso endDigitIndex = -1 Then
                        ' the first digit met is the last digit of the run
                        endDigitIndex = index + 1
                    ElseIf Not isDigit AndAlso endDigitIndex <> -1 Then
                        ' the first non-digit after the run marks the beginning of the run
                        startDigitIndex = index + 1
                        Exit For
                    End If
                Next
                ' the run reaches the beginning of the analyzed part
                If endDigitIndex <> -1 AndAlso startDigitIndex = -1 Then
                    startDigitIndex = windowStart
                End If
            Else
                ' searching text from the left to the right
                For index As Integer = windowStart To windowEnd - 1
                    Dim isDigit As Boolean = Char.IsDigit(sourceString(index))
                    If isDigit AndAlso startDigitIndex = -1 Then
                        ' the first digit met is the first digit of the run
                        startDigitIndex = index
                    ElseIf Not isDigit AndAlso startDigitIndex <> -1 Then
                        ' the first non-digit after the run marks the end of the run
                        endDigitIndex = index
                        Exit For
                    End If
                Next
                ' the run reaches the end of the analyzed part
                If startDigitIndex <> -1 AndAlso endDigitIndex = -1 Then
                    endDigitIndex = windowEnd
                End If
            End If

            ' if digit is not found
            If startDigitIndex = -1 Then
                Return Nothing
            End If

            ' return the text search result
            Return New Vintasoft.Imaging.Text.TextSearchResult(startDigitIndex, endDigitIndex - startDigitIndex)
        End Function
    End Class
    



    Извлечение текста

    Класс TextRegion позволяет извлекать:
    При извлечении текста из области страницы необходимо указать, как именно он должен быть извлечен. SDK позволяет извлекать текст: По умолчанию текст извлекается полными текстовыми строками.

    Вот C#/VB.NET код, который демонстрирует, как извлечь весь текст из всей PDF страницы:
    /// <summary>
    /// Returns the text content of the text region that represents the whole PDF page.
    /// </summary>
    /// <param name="document">PDF document that contains the page.</param>
    /// <param name="pageIndex">Index of the page in <c>document.Pages</c>.</param>
    /// <returns>Value of <c>TextRegion.TextContent</c> of the page.</returns>
    public static string ExtractTextFromPdfPage(Vintasoft.Imaging.Pdf.PdfDocument document, int pageIndex)
    {
        // get the requested page
        Vintasoft.Imaging.Pdf.Tree.PdfPage page = document.Pages[pageIndex];
        // get the text of the region that covers the whole page
        string pageText = page.TextRegion.TextContent;
        return pageText;
    }
    
    ''' <summary>
    ''' Returns the text content of the text region that represents the whole PDF page.
    ''' </summary>
    ''' <param name="document">PDF document that contains the page.</param>
    ''' <param name="pageIndex">Index of the page in <c>document.Pages</c>.</param>
    ''' <returns>Value of <c>TextRegion.TextContent</c> of the page.</returns>
    Public Shared Function ExtractTextFromPdfPage(document As Vintasoft.Imaging.Pdf.PdfDocument, pageIndex As Integer) As String
        ' get the requested page
        Dim page As Vintasoft.Imaging.Pdf.Tree.PdfPage = document.Pages(pageIndex)
        ' get the text of the region that covers the whole page
        Dim pageText As String = page.TextRegion.TextContent
        Return pageText
    End Function
    


    Также класс TextRegion позволяет извлекать текст из PDF страницы в виде древовидной структуры, т.е. можно получить область, представляющую весь текст всей страницы - PdfPage.TextRegion, затем все текстовые строки - TextRegion.Lines, затем все символы текстовой строки - TextRegionLine.Symbols.