Распознавание текста с помощью .NET приложения для Linux
Категория: Imaging; OCR; .NET; Linux
23 декабря 2022


dotnet new console --framework net6.0
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<RootNamespace>ConsoleApp1</RootNamespace>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="SkiaSharp" Version="2.88.0" />
<PackageReference Include="SkiaSharp.NativeAssets.Linux" Version="2.88.0" />
<PackageReference Include="Vintasoft.Imaging" Version="12.1.5.1" />
<PackageReference Include="Vintasoft.Imaging.Drawing.SkiaSharp" Version="12.1.5.1" />
<PackageReference Include="Vintasoft.Imaging.DocCleanup" Version="7.1.5.1" />
<PackageReference Include="Vintasoft.Imaging.Ocr" Version="7.1.5.1" />
<PackageReference Include="Vintasoft.Imaging.Ocr.Tesseract" Version="7.1.5.1" />
<PackageReference Include="Vintasoft.Imaging.Pdf" Version="9.1.5.1" />
<PackageReference Include="Vintasoft.Imaging.Pdf.Ocr" Version="9.1.5.1" />
<PackageReference Include="Vintasoft.Shared" Version="3.3.1.1" />
</ItemGroup>
<ItemGroup>
<Content Include="OCR.tif">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
</ItemGroup>
</Project>

namespace ConsoleApp1
{
class Program
{
static void Main(string[] args)
{
Vintasoft.Imaging.ImagingGlobalSettings.Register("%EVAL_LIC_USER_NAME%", "%EVAL_LIC_USER_EMAIL%", "%EVAL_LIC_DATE%", "%EVAL_LIC_REG_CODE%");
string imageFilePath = "OCR.tif";
string tesseractOcrPath = "TesseractOCR";
// create the OCR engine
using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr(tesseractOcrPath))
{
// specify that OCR engine will recognize English text
Vintasoft.Imaging.Ocr.OcrLanguage language = Vintasoft.Imaging.Ocr.OcrLanguage.English;
// create the OCR engine settings
Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings settings = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings(language);
// initialize the OCR engine
tesseractOcr.Init(settings);
// load an image with text
using (Vintasoft.Imaging.VintasoftImage image = new Vintasoft.Imaging.VintasoftImage(imageFilePath))
{
// preprocess image before text recognition
// remove noise from image
Vintasoft.Imaging.ImageProcessing.Document.DespeckleCommand despeckleCommand = new Vintasoft.Imaging.ImageProcessing.Document.DespeckleCommand();
despeckleCommand.ExecuteInPlace(image);
// remove lines from image
Vintasoft.Imaging.ImageProcessing.Document.LineRemovalCommand lineRemovalCommand = new Vintasoft.Imaging.ImageProcessing.Document.LineRemovalCommand();
lineRemovalCommand.ExecuteInPlace(image);
// specify an image with text
tesseractOcr.SetImage(image);
// recognize text in image
Vintasoft.Imaging.Ocr.Results.OcrPage ocrResult = tesseractOcr.Recognize();
// create PDF document
using (Vintasoft.Imaging.Pdf.PdfDocument pdfDocument = new Vintasoft.Imaging.Pdf.PdfDocument("OCR.pdf", Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14))
{
// create PDF document builder
Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder documentBuilder = new Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(pdfDocument);
documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto;
documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText;
// add OCR result to the PDF document
documentBuilder.AddPage(image, ocrResult);
// save changes in PDF document
pdfDocument.SaveChanges();
}
// clear the image
tesseractOcr.ClearImage();
}
// shutdown the OCR engine
tesseractOcr.Shutdown();
}
}
}
}



dotnet build Recognize_Text_In_Image.csproj

dotnet ./Recognize_Text_In_Image.dll