OCR: Как получить изображение от TWAIN сканера и создать searchable PDF документ?
В этом разделе
Вот C#/VB.NET код, который демонстрирует, как получить изображение от TWAIN сканера, распознать текст в отсканированном изображении и создать searchable PDF документ на основе результатов распознавания текста:
class ScanAndConvertToSearchablePdfDcoument
{
    /// <summary>
    /// Acquires images from the default TWAIN device,
    /// recognizes text on each acquired image and
    /// saves the result as a searchable PDF document.
    /// </summary>
    /// <param name="language">OCR language used to create the OCR engine settings.</param>
    /// <param name="outputPdfFilename">Filename the PDF document is created with.</param>
    /// <exception cref="System.InvalidOperationException">
    /// Thrown if the TWAIN device manager does not provide a default device.
    /// </exception>
    /// <remarks>
    /// VintaSoft Twain .NET SDK (Vintasoft.Twain.dll),
    /// VintaSoft Imaging .NET SDK (Standard) (Vintasoft.Imaging.dll),
    /// VintaSoft Document Cleanup .NET Plug-in (Vintasoft.Imaging.DocCleanup.dll),
    /// VintaSoft OCR .NET Plug-in (Vintasoft.Imaging.Ocr.dll, Vintasoft.Imaging.Ocr.Tesseract.dll),
    /// VintaSoft PDF .NET Plug-in (Vintasoft.Imaging.Pdf.dll, Vintasoft.Imaging.Pdf.Ocr.dll)
    /// and VintaSoft PDF .NET Plug-in (Writer) (Vintasoft.Pdf.dll) are necessary
    /// for executing this sample.
    /// </remarks>
    public static void ScanImagesAndSaveAsSearchablePdfDocument(
        Vintasoft.Imaging.Ocr.OcrLanguage language,
        string outputPdfFilename)
    {
        System.Console.WriteLine("Create TWAIN device manager...");
        using (Vintasoft.Twain.DeviceManager deviceManager =
            new Vintasoft.Twain.DeviceManager())
        {
            // create PDF document
            using (Vintasoft.Imaging.Pdf.PdfDocument pdfDocument =
                new Vintasoft.Imaging.Pdf.PdfDocument(outputPdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14))
            {
                // create PDF document builder that places the page image over the recognized text
                Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder documentBuilder =
                    new Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(pdfDocument);
                documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto;
                documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText;

                System.Console.WriteLine("Create Tesseract OCR engine...");
                using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr =
                    new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
                {
                    // create OCR engine manager and the settings shared by all pages
                    Vintasoft.Imaging.Ocr.OcrEngineManager engineManager =
                        new Vintasoft.Imaging.Ocr.OcrEngineManager(tesseractOcr);
                    Vintasoft.Imaging.Ocr.OcrEngineSettings settings =
                        new Vintasoft.Imaging.Ocr.OcrEngineSettings(language);

                    System.Console.WriteLine("Open TWAIN device manager...");
                    deviceManager.Open();

                    Vintasoft.Twain.Device device = deviceManager.DefaultDevice;
                    // NOTE(review): presumably DefaultDevice is null when no TWAIN device is
                    // available - fail with a clear message instead of a NullReferenceException
                    if (device == null)
                        throw new System.InvalidOperationException("Default TWAIN device is not found.");

                    Vintasoft.Twain.AcquireModalState acquireState;
                    do
                    {
                        System.Console.WriteLine("Acquire image from scanner...");
                        acquireState = device.AcquireModal();
                        if (acquireState == Vintasoft.Twain.AcquireModalState.ImageAcquired)
                        {
                            try
                            {
                                // create VintasoftImage
                                using (Vintasoft.Imaging.VintasoftImage image =
                                    new Vintasoft.Imaging.VintasoftImage(device.AcquiredImage.GetAsVintasoftBitmap(), true))
                                {
                                    // preprocess image
                                    // BorderClear, Despeckle, Deskew, Segmentation
                                    System.Console.WriteLine("Preprocess the image...");
                                    Vintasoft.Imaging.ImageProcessing.Document.OcrPreprocessingCommand preprocessing =
                                        new Vintasoft.Imaging.ImageProcessing.Document.OcrPreprocessingCommand();
                                    // binarization step is switched off
                                    preprocessing.Binarization = null;
                                    preprocessing.ExecuteInPlace(image);

                                    // recognize image using the text regions found by the preprocessing
                                    System.Console.WriteLine("Recognize the image...");
                                    Vintasoft.Imaging.Ocr.Results.OcrPage page = engineManager.Recognize(image, settings,
                                        preprocessing.SegmentationTextRegions);

                                    // add page to PDF document
                                    System.Console.WriteLine("Add page to PDF document...");
                                    documentBuilder.AddPage(image, page);
                                }
                            }
                            finally
                            {
                                // dispose the acquired image even if preprocessing,
                                // recognition or page creation has failed
                                device.AcquiredImage.Dispose();
                            }
                        }
                    }
                    while (acquireState != Vintasoft.Twain.AcquireModalState.None);

                    System.Console.WriteLine("Save changes in PDF document...");
                    pdfDocument.SaveChanges();
                }
            }
        }
    }
}
Class ScanAndConvertToSearchablePdfDcoument
    ''' <summary>
    ''' Acquires images from the default TWAIN device,
    ''' recognizes text on each acquired image and
    ''' saves the result as a searchable PDF document.
    ''' </summary>
    ''' <param name="language">OCR language used to create the OCR engine settings.</param>
    ''' <param name="outputPdfFilename">Filename the PDF document is created with.</param>
    ''' <exception cref="System.InvalidOperationException">
    ''' Thrown if the TWAIN device manager does not provide a default device.
    ''' </exception>
    ''' <remarks>
    ''' VintaSoft Twain .NET SDK (Vintasoft.Twain.dll),
    ''' VintaSoft Imaging .NET SDK (Standard) (Vintasoft.Imaging.dll),
    ''' VintaSoft Document Cleanup .NET Plug-in (Vintasoft.Imaging.DocCleanup.dll),
    ''' VintaSoft OCR .NET Plug-in (Vintasoft.Imaging.Ocr.dll, Vintasoft.Imaging.Ocr.Tesseract.dll),
    ''' VintaSoft PDF .NET Plug-in (Vintasoft.Imaging.Pdf.dll, Vintasoft.Imaging.Pdf.Ocr.dll)
    ''' and VintaSoft PDF .NET Plug-in (Writer) (Vintasoft.Pdf.dll) are necessary
    ''' for executing this sample.
    ''' </remarks>
    Public Shared Sub ScanImagesAndSaveAsSearchablePdfDocument(language As Vintasoft.Imaging.Ocr.OcrLanguage, outputPdfFilename As String)
        System.Console.WriteLine("Create TWAIN device manager...")
        Using deviceManager As New Vintasoft.Twain.DeviceManager()
            ' create PDF document
            Using pdfDocument As New Vintasoft.Imaging.Pdf.PdfDocument(outputPdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14)
                ' create PDF document builder that places the page image over the recognized text
                Dim documentBuilder As New Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(pdfDocument)
                documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto
                documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText

                System.Console.WriteLine("Create Tesseract OCR engine...")
                Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
                    ' create OCR engine manager and the settings shared by all pages
                    Dim engineManager As New Vintasoft.Imaging.Ocr.OcrEngineManager(tesseractOcr)
                    Dim settings As New Vintasoft.Imaging.Ocr.OcrEngineSettings(language)

                    System.Console.WriteLine("Open TWAIN device manager...")
                    deviceManager.Open()

                    Dim device As Vintasoft.Twain.Device = deviceManager.DefaultDevice
                    ' NOTE(review): presumably DefaultDevice is Nothing when no TWAIN device is
                    ' available - fail with a clear message instead of a NullReferenceException
                    If device Is Nothing Then
                        Throw New System.InvalidOperationException("Default TWAIN device is not found.")
                    End If

                    Dim acquireState As Vintasoft.Twain.AcquireModalState
                    Do
                        System.Console.WriteLine("Acquire image from scanner...")
                        acquireState = device.AcquireModal()
                        If acquireState = Vintasoft.Twain.AcquireModalState.ImageAcquired Then
                            Try
                                ' create VintasoftImage
                                Using image As New Vintasoft.Imaging.VintasoftImage(device.AcquiredImage.GetAsVintasoftBitmap(), True)
                                    ' preprocess image
                                    ' BorderClear, Despeckle, Deskew, Segmentation
                                    System.Console.WriteLine("Preprocess the image...")
                                    Dim preprocessing As New Vintasoft.Imaging.ImageProcessing.Document.OcrPreprocessingCommand()
                                    ' binarization step is switched off
                                    preprocessing.Binarization = Nothing
                                    preprocessing.ExecuteInPlace(image)

                                    ' recognize image using the text regions found by the preprocessing
                                    System.Console.WriteLine("Recognize the image...")
                                    Dim page As Vintasoft.Imaging.Ocr.Results.OcrPage = engineManager.Recognize(image, settings, preprocessing.SegmentationTextRegions)

                                    ' add page to PDF document
                                    System.Console.WriteLine("Add page to PDF document...")
                                    documentBuilder.AddPage(image, page)
                                End Using
                            Finally
                                ' dispose the acquired image even if preprocessing,
                                ' recognition or page creation has failed
                                device.AcquiredImage.Dispose()
                            End Try
                        End If
                    Loop While acquireState <> Vintasoft.Twain.AcquireModalState.None

                    System.Console.WriteLine("Save changes in PDF document...")
                    pdfDocument.SaveChanges()
                End Using
            End Using
        End Using
    End Sub
End Class