Ниже приведён код на C# и VB.NET, который демонстрирует, как фильтровать результаты распознавания.
''' <summary>
''' Recognizes text in every image loaded from the specified file,
''' removes words with low confidence from recognized text and
''' returns recognized text.
''' </summary>
''' <param name="filename">The name of the file containing image to OCR.</param>
''' <returns>
''' The filtered text of all recognized images; the text of each image
''' is followed by a line terminator.
''' </returns>
Public Function RecognizeTextAndFilterRecognitionResult(filename As String) As String
    ' words whose confidence is below this value are removed from the result
    Const MIN_CONFIDENCE As Single = 75F

    ' create image collection
    Using images As New Vintasoft.Imaging.ImageCollection()
        Try
            ' add images from file to image collection
            images.Add(filename)

            ' create tesseract OCR engine
            Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
                ' initialize the engine for English text
                Dim settings As New Vintasoft.Imaging.Ocr.OcrEngineSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English)
                tesseractOcr.Init(settings)

                ' accumulates the filtered text of all images
                Dim result As New System.Text.StringBuilder()

                ' for each image in image collection
                For Each image As Vintasoft.Imaging.VintasoftImage In images
                    ' recognize the image
                    Dim page As Vintasoft.Imaging.Ocr.Results.OcrPage = tesseractOcr.Recognize(image)

                    ' get all words in recognized text
                    Dim words As Vintasoft.Imaging.Ocr.Results.OcrObject() =
                        page.GetObjects(Vintasoft.Imaging.Ocr.OcrObjectType.Word)

                    ' select the words whose confidence is less than minimum confidence
                    Dim lowConfidenceWords As Vintasoft.Imaging.Ocr.Results.OcrObject() =
                        System.Array.FindAll(words,
                            Function(word As Vintasoft.Imaging.Ocr.Results.OcrObject) word.Confidence < MIN_CONFIDENCE)

                    ' remove the low-confidence words from the page and validate recognition results
                    Dim editor As New Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(page)
                    editor.RemoveObjects(lowConfidenceWords)
                    editor.ValidateResults()

                    ' add the remaining text of the page, followed by a line terminator
                    result.AppendLine(page.GetText())
                Next

                ' return result
                Return result.ToString()
            End Using
        Finally
            ' Dispose images and clear image collection on every exit path, so that
            ' the loaded images are released even when loading or recognition throws.
            ' NOTE(review): assumes disposing the collection alone does not dispose
            ' the images it holds - confirm against the ImageCollection documentation.
            images.ClearAndDisposeItems()
        End Try
    End Using
End Function
/// <summary>
/// Recognizes text in every image loaded from the specified file,
/// removes words with low confidence from recognized text and
/// returns recognized text.
/// </summary>
/// <param name="filename">The name of the file containing image to OCR.</param>
/// <returns>
/// The filtered text of all recognized images; the text of each image
/// is followed by a line terminator.
/// </returns>
public string RecognizeTextAndFilterRecognitionResult(string filename)
{
    // words whose confidence is below this value are removed from the result
    const float MIN_CONFIDENCE = 75.0f;

    // create image collection
    using (Vintasoft.Imaging.ImageCollection images = new Vintasoft.Imaging.ImageCollection())
    {
        try
        {
            // add images from file to image collection
            images.Add(filename);

            // create tesseract OCR engine
            using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr =
                new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
            {
                // initialize the engine for English text
                Vintasoft.Imaging.Ocr.OcrEngineSettings settings =
                    new Vintasoft.Imaging.Ocr.OcrEngineSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English);
                tesseractOcr.Init(settings);

                // accumulates the filtered text of all images
                System.Text.StringBuilder result = new System.Text.StringBuilder();

                // for each image in image collection
                foreach (Vintasoft.Imaging.VintasoftImage image in images)
                {
                    // recognize the image
                    Vintasoft.Imaging.Ocr.Results.OcrPage page = tesseractOcr.Recognize(image);

                    // get all words in recognized text
                    Vintasoft.Imaging.Ocr.Results.OcrObject[] words =
                        page.GetObjects(Vintasoft.Imaging.Ocr.OcrObjectType.Word);

                    // select the words whose confidence is less than minimum confidence
                    Vintasoft.Imaging.Ocr.Results.OcrObject[] lowConfidenceWords =
                        System.Array.FindAll(words, word => word.Confidence < MIN_CONFIDENCE);

                    // remove the low-confidence words from the page and validate recognition results
                    Vintasoft.Imaging.Ocr.Results.OcrResultsEditor editor =
                        new Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(page);
                    editor.RemoveObjects(lowConfidenceWords);
                    editor.ValidateResults();

                    // add the remaining text of the page, followed by a line terminator
                    result.AppendLine(page.GetText());
                }

                // return result
                return result.ToString();
            }
        }
        finally
        {
            // Dispose images and clear image collection on every exit path, so that
            // the loaded images are released even when loading or recognition throws.
            // NOTE(review): assumes disposing the collection alone does not dispose
            // the images it holds - confirm against the ImageCollection documentation.
            images.ClearAndDisposeItems();
        }
    }
}
Целевые платформы: .NET 9; .NET 8; .NET 7; .NET 6; .NET Framework 4.8, 4.7, 4.6, 4.5, 4.0, 3.5