环境搭建
参考:
Tesseract OCR : .NET Tesseract OCR - 掘金 (juejin.cn)
O2S : .NET 以O2S方式 PDF 转图片 - 掘金 (juejin.cn)
代码整合
创建 OCRHelper.cs
using O2S.Components.PDFRender4NET;
using System.Drawing;
using System.Text.RegularExpressions;
using Tesseract;
namespace OCR_8
{
public static class OCRHelper
{
// Scratch directory that receives the rendered page images. The suffix comes from
// GuidTo16String() and is evaluated once, by this static field initializer.
private static string imagePath = Environment.CurrentDirectory + "/ocr_file/image_" + GuidTo16String();
// Path handed to the TesseractEngine constructor; hard-coded to a local drive.
private static string tesseractPath = @"Z:\.net_project\OCR_8\OCR_8\tesseract";
/// <summary>
/// Runs OCR over a PDF file: prepares the scratch image directory, renders the
/// PDF pages into it, and returns the text recognised from the rendered images.
/// </summary>
/// <param name="path">Path of the PDF file to read.</param>
/// <returns>The recognised text, as returned by <c>PngToText</c>.</returns>
public static string Scan(string path)
{
    CreateDir();
    PDFToPng(path);

    string recognisedText = PngToText();
    return recognisedText;
}
/// <summary>
/// Builds a 16-character lowercase hexadecimal string from a freshly generated GUID.
/// </summary>
/// <remarks>
/// The 16 GUID bytes are multiplied together (each incremented by one), the
/// current tick count is subtracted, and the 64-bit result is formatted as hex.
/// </remarks>
/// <returns>A string of exactly 16 hexadecimal digits.</returns>
public static string GuidTo16String()
{
    long product = 1;

    // The product routinely exceeds 64 bits. Wrap explicitly so the method does not
    // throw OverflowException when the project is compiled with overflow checking.
    unchecked
    {
        foreach (byte b in Guid.NewGuid().ToByteArray())
        {
            product *= b + 1;
        }

        product -= DateTime.Now.Ticks;
    }

    // "x16" left-pads with zeros, so the result is always 16 characters long.
    return product.ToString("x16");
}
/// <summary>
/// Writes the given text to <c>ocr_file/text/result.txt</c> under the current
/// working directory.
/// </summary>
/// <param name="result">Text to write; a line terminator is appended.</param>
/// <remarks>Failures are reported on the console and are not rethrown.</remarks>
private static void SaveText(string result)
{
    string textDir = Environment.CurrentDirectory + "/ocr_file" + "/text";
    try
    {
        // Nothing else in this class creates the target directory, so make sure it
        // exists instead of failing on the first write.
        Directory.CreateDirectory(textDir);

        // The using declaration closes the writer even when WriteLine throws.
        using StreamWriter sw = new StreamWriter(textDir + "/result.txt");
        sw.WriteLine(result);
    }
    catch (Exception e)
    {
        Console.WriteLine("Exception: " + e.Message);
    }
    finally
    {
        Console.WriteLine("Executing finally block.");
    }
}
/// <summary>
/// Runs OCR on every file in the scratch image directory in page order, deletes
/// the directory, and returns the combined text after whitespace normalisation.
/// </summary>
/// <returns>The recognised text of all pages, passed through <c>handleResult</c>.</returns>
private static string PngToText()
{
    DirectoryInfo directory = new DirectoryInfo(imagePath);
    FileInfo[] files = directory.GetFiles();

    // GetFiles() makes no ordering guarantee, and a plain name sort would place
    // "10.Png" before "2.Png". Order by the numeric page number in the file name
    // so the text comes out in reading order.
    Array.Sort(files, (left, right) =>
    {
        int byPage = PageNumberOf(left).CompareTo(PageNumberOf(right));
        return byPage != 0 ? byPage : string.CompareOrdinal(left.Name, right.Name);
    });

    string[] pageTexts = new string[files.Length];
    try
    {
        for (int i = 0; i < files.Length; i++)
        {
            Console.WriteLine("current is:" + (i + 1));
            pageTexts[i] = ScanPng(files[i].FullName);
        }
    }
    finally
    {
        // Remove the scratch images even when OCR fails part-way through.
        try
        {
            Directory.Delete(imagePath, true);
        }
        catch (Exception e)
        {
            Console.WriteLine("The process failed: {0}", e.Message);
        }
    }

    // A single Concat avoids re-copying the accumulated text for every page.
    return handleResult(string.Concat(pageTexts));

    // Files whose name is not a plain number sort after all numbered pages.
    static int PageNumberOf(FileInfo file)
    {
        string stem = Path.GetFileNameWithoutExtension(file.Name);
        return int.TryParse(stem, out int page) ? page : int.MaxValue;
    }
}
/// <summary>
/// Collapses every run of whitespace in the text into a single space and trims
/// both ends.
/// </summary>
/// <param name="result">Raw text to normalise.</param>
/// <returns>The normalised text.</returns>
private static string handleResult(string result)
{
    string collapsed = Regex.Replace(result, @"\s{1,}", " ", RegexOptions.IgnoreCase);
    return collapsed.Trim();
}
/// <summary>
/// Runs Tesseract with the "eng" language over a single image file and returns
/// the recognised text.
/// </summary>
/// <param name="path">Path of the image file to recognise.</param>
/// <returns>The text returned by <c>Page.GetText()</c>.</returns>
private static string ScanPng(string path)
{
    using TesseractEngine engine = new TesseractEngine(tesseractPath, "eng");

    // Pix and Page are disposable as well; releasing them here keeps their
    // resources from piling up across the pages of a document.
    using Pix pix = Pix.LoadFromFile(path);
    using Page page = engine.Process(pix);

    return page.GetText();
}
/// <summary>
/// Renders the PDF at <paramref name="path"/> into PNG files inside the scratch
/// image directory, at the highest <c>Definition</c> level.
/// </summary>
/// <remarks>
/// The requested page range is 1 to 99, so pages beyond the 99th are not converted.
/// </remarks>
/// <param name="path">Path of the PDF file to render.</param>
private static void PDFToPng(string path)
{
    // The callee appends the file name directly, hence the trailing separator.
    string outputDir = imagePath + "\\";

    PDFTranImgHelp.ConvertPDF2Image(
        pdfInputPath: path,
        imageOutputPath: outputDir,
        imageName: "",
        startPageNum: 1,
        endPageNum: 99,
        imageFormat: System.Drawing.Imaging.ImageFormat.Png,
        definition: Definition.Ten);
}
/// <summary>
/// Ensures the scratch image directory exists.
/// </summary>
private static void CreateDir() => CreateDirByPath(imagePath);
/// <summary>
/// Creates the directory at <paramref name="path"/> unless it already exists,
/// reporting the outcome on the console.
/// </summary>
/// <remarks>Exceptions are written to the console and are not rethrown.</remarks>
/// <param name="path">Directory to create.</param>
private static void CreateDirByPath(string path)
{
    try
    {
        if (!Directory.Exists(path))
        {
            Directory.CreateDirectory(path);
            Console.WriteLine("The directory was created successfully at {0}.", Directory.GetCreationTime(path));
        }
        else
        {
            Console.WriteLine("That path exists already.");
        }
    }
    catch (Exception failure)
    {
        Console.WriteLine("The process failed: {0}", failure.ToString());
    }
}
/// <summary>
/// Image sharpness levels from 1 to 10. <c>ConvertPDF2Image</c> multiplies the
/// numeric value by 56 and passes the product to <c>GetPageImage</c>.
/// </summary>
public enum Definition
{
    One = 1,
    Two,
    Three,
    Four,
    Five,
    Six,
    Seven,
    Eight,
    Nine,
    Ten
}
/// <summary>
/// Helper that renders pages of a PDF document to image files.
/// </summary>
public class PDFTranImgHelp
{
    /// <summary>
    /// Converts a range of pages of a PDF document into image files, one file per page.
    /// </summary>
    /// <param name="pdfInputPath">Path of the PDF file.</param>
    /// <param name="imageOutputPath">
    /// Output directory; created when missing. File names are appended to this value
    /// directly, so it must end with a directory separator.
    /// </param>
    /// <param name="imageName">Prefix of each generated file name; the page number and the format name follow it.</param>
    /// <param name="startPageNum">First page to convert (1-based). Values below 1 are treated as 1.</param>
    /// <param name="endPageNum">Last page to convert (1-based, inclusive). Values above the page count are clamped to it.</param>
    /// <param name="imageFormat">Format the images are saved in.</param>
    /// <param name="definition">Image sharpness; a larger value gives a sharper image.</param>
    public static void ConvertPDF2Image(
        string pdfInputPath,
        string imageOutputPath,
        string imageName,
        int startPageNum,
        int endPageNum,
        System.Drawing.Imaging.ImageFormat imageFormat,
        Definition definition)
    {
        PDFFile pdfFile = PDFFile.Open(pdfInputPath);
        try
        {
            if (!Directory.Exists(imageOutputPath))
            {
                Directory.CreateDirectory(imageOutputPath);
            }

            // Put the bounds in ascending order first. The previous swap assigned
            // startPageNum to endPageNum after overwriting it, so a reversed range
            // collapsed to a single page.
            if (startPageNum > endPageNum)
            {
                (startPageNum, endPageNum) = (endPageNum, startPageNum);
            }

            // Clamp after ordering so a swapped bound cannot fall outside the document.
            if (startPageNum <= 0)
            {
                startPageNum = 1;
            }

            if (endPageNum > pdfFile.PageCount)
            {
                endPageNum = pdfFile.PageCount;
            }

            // Page numbers are 1-based here, while GetPageImage takes a 0-based index.
            for (int i = startPageNum; i <= endPageNum; i++)
            {
                using Bitmap pageImage = pdfFile.GetPageImage(i - 1, 56 * (int)definition);
                pageImage.Save(imageOutputPath + imageName + i.ToString() + "." + imageFormat.ToString(), imageFormat);
            }
        }
        finally
        {
            // Release the document even when rendering or saving a page fails.
            pdfFile.Dispose();
        }
    }
}
}
}
注意: 代码中的 tesseractPath 是写死的本机路径,使用前请改为自己机器上 Tesseract 语言数据所在的目录。
测试
OCRHelper.Scan(path)
: 根据 PDF 文件的路径,识别并返回其中的文本内容(方法本身不打印,需要由调用方输出返回值)