.NET PDF 转文字（Tesseract OCR + O2S）

飞宇千虹

已于 2023-09-07 21:31:53 修改

阅读量105

点赞数

文章标签： .net pdf ocr c# 开发语言

于 2023-09-06 00:14:42 首次发布

本文链接：https://blog.csdn.net/weixin_45924250/article/details/132712951

版权

环境搭建

参考：

Tesseract OCR : .NET Tesseract OCR - 掘金 (juejin.cn)

O2S : .NET 以O2S方式 PDF 转图片 - 掘金 (juejin.cn)

代码整合

创建 OCRHelper.cs



using O2S.Components.PDFRender4NET;
using System.Drawing;
using System.Text.RegularExpressions;
using Tesseract;

namespace OCR_8
{
    public static class OCRHelper
    {
        // Working folder for the rendered page images. The suffix comes from GuidTo16String() and is
        // evaluated once, when the static field is initialized.
        private static string imagePath = Environment.CurrentDirectory + "/ocr_file/image_" + GuidTo16String();

        // Data path handed to the TesseractEngine constructor.
        // NOTE(review): machine-specific absolute path — adjust for the local environment.
        private static string tesseractPath = @"Z:\.net_project\OCR_8\OCR_8\tesseract";

        /// <summary>
        /// Converts the PDF at <paramref name="path"/> to page images, runs OCR on them,
        /// and returns the recognized text.
        /// </summary>
        /// <param name="path">Path of the PDF file to process.</param>
        /// <returns>The whitespace-normalized text produced by <c>PngToText</c>.</returns>
        public static string Scan(string path)
        {
            // Step 1: make sure the image working folder exists.
            CreateDir();

            // Step 2: render the PDF pages into that folder.
            PDFToPng(path);

            // Step 3: OCR the rendered pages and hand back the combined text.
            string recognizedText = PngToText();
            return recognizedText;
        }

        /// <summary>
        /// Builds a 16-character lowercase hexadecimal string from a freshly generated GUID.
        /// </summary>
        /// <returns>
        /// Exactly 16 hexadecimal characters taken from the first 8 bytes of a new GUID.
        /// </returns>
        public static string GuidTo16String()
        {
            // The previous implementation multiplied all GUID bytes into a long, which silently
            // overflowed (and would throw in a checked build), could collapse to 0, and produced
            // fewer than 16 characters whenever the result was positive. Reading 8 GUID bytes
            // directly avoids the arithmetic entirely.
            long guidBits = BitConverter.ToInt64(Guid.NewGuid().ToByteArray(), 0);

            // "x16" pads positive values to 16 digits; negative values already format as
            // 16 two's-complement hex digits.
            return guidBits.ToString("x16");
        }

        /// <summary>
        /// Writes <paramref name="result"/> as a single line to
        /// <c>ocr_file/text/result.txt</c> under the current directory.
        /// Failures are reported on the console and not rethrown.
        /// </summary>
        /// <param name="result">Text to write to the result file.</param>
        private static void SaveText(string result)
        {
            string basePath = Environment.CurrentDirectory + "/ocr_file";
            string textDir = basePath + "/text";

            try
            {
                // The writer cannot create missing folders, so make sure the target folder exists.
                Directory.CreateDirectory(textDir);

                // The using declaration closes the file even if WriteLine throws; the original
                // only closed it on the success path.
                using StreamWriter sw = new StreamWriter(textDir + "/result.txt");
                sw.WriteLine(result);
            }
            catch (Exception e)
            {
                Console.WriteLine("Exception: " + e.Message);
            }
            finally
            {
                Console.WriteLine("Executing finally block.");
            }
        }

        /// <summary>
        /// Runs OCR on every file in the image working folder, in page order, concatenates the
        /// recognized text, deletes the working folder, and returns the normalized text.
        /// </summary>
        /// <returns>The combined page text after whitespace normalization by <c>handleResult</c>.</returns>
        private static string PngToText()
        {
            // Page images are named by page number (e.g. "1.Png"); anything that does not parse
            // as a number sorts last.
            static int PageNumberOf(FileInfo file) =>
                int.TryParse(Path.GetFileNameWithoutExtension(file.Name), out int page) ? page : int.MaxValue;

            string combined;
            try
            {
                DirectoryInfo directory = new DirectoryInfo(imagePath);
                FileInfo[] files = directory.GetFiles();

                // GetFiles() does not guarantee any order, and a name-based order would put
                // "10.Png" before "2.Png"; sort numerically so the text follows the page order.
                Array.Sort(files, (left, right) =>
                {
                    int byPage = PageNumberOf(left).CompareTo(PageNumberOf(right));
                    return byPage != 0 ? byPage : string.CompareOrdinal(left.Name, right.Name);
                });

                // Collect per-page text and join once instead of growing a string in the loop.
                string[] pageTexts = new string[files.Length];
                for (int i = 0; i < files.Length; i++)
                {
                    Console.WriteLine("current is:" + (i + 1));
                    pageTexts[i] = ScanPng(files[i].FullName);
                }
                combined = string.Concat(pageTexts);
            }
            finally
            {
                // Remove the temporary images even when OCR fails part-way through.
                try
                {
                    Directory.Delete(imagePath, true);
                }
                catch (Exception e)
                {
                    Console.WriteLine("The process failed: {0}", e.Message);
                }
            }
            return handleResult(combined);
        }

        /// <summary>
        /// Collapses every run of whitespace in <paramref name="result"/> into a single space
        /// and trims the ends.
        /// </summary>
        /// <param name="result">Raw text to normalize.</param>
        /// <returns>The normalized text.</returns>
        private static string handleResult(string result)
        {
            string collapsed = Regex.Replace(result, @"\s{1,}", " ", RegexOptions.IgnoreCase);
            return collapsed.Trim();
        }

        /// <summary>
        /// Runs Tesseract OCR with the "eng" language data on one image file.
        /// </summary>
        /// <param name="path">Path of the image file to recognize.</param>
        /// <returns>The text Tesseract reports for the image.</returns>
        private static string ScanPng(string path)
        {
            // The engine, the loaded image and the result page are all disposable. The original
            // only disposed the engine; using declarations release all three (page, then image,
            // then engine) on both the success and the exception path.
            using TesseractEngine engine = new TesseractEngine(tesseractPath, "eng");
            using Pix pix = Pix.LoadFromFile(path);
            using Page page = engine.Process(pix);
            return page.GetText();
        }

        /// <summary>
        /// Renders the pages of the PDF at <paramref name="path"/> as PNG files into the image
        /// working folder.
        /// </summary>
        /// <param name="path">Path of the PDF file to render.</param>
        private static void PDFToPng(string path)
        {
            // Requested page range. NOTE(review): pages after 99 are not rendered — confirm this
            // cap is intended.
            const int firstPage = 1;
            const int lastPage = 99;

            // Use the platform separator; a hard-coded "\\" is only a directory separator on
            // Windows and would become part of the file name elsewhere.
            string outputFolder = imagePath + Path.DirectorySeparatorChar;

            PDFTranImgHelp.ConvertPDF2Image(
                path,
                outputFolder,
                "",
                firstPage,
                lastPage,
                System.Drawing.Imaging.ImageFormat.Png,
                Definition.Ten);
        }



        /// <summary>
        /// Ensures the image working folder exists.
        /// </summary>
        private static void CreateDir() => CreateDirByPath(imagePath);

        /// <summary>
        /// Creates the directory at <paramref name="path"/> unless it already exists, reporting
        /// the outcome on the console. Exceptions are printed and not rethrown.
        /// </summary>
        /// <param name="path">Directory to create.</param>
        private static void CreateDirByPath(string path)
        {
            try
            {
                if (!Directory.Exists(path))
                {
                    // Directory is missing: create it and report its creation time.
                    Directory.CreateDirectory(path);
                    Console.WriteLine("The directory was created successfully at {0}.", Directory.GetCreationTime(path));
                }
                else
                {
                    Console.WriteLine("That path exists already.");
                }
            }
            catch (Exception failure)
            {
                Console.WriteLine("The process failed: {0}", failure.ToString());
            }
        }

        /// <summary>
        /// Image sharpness level from 1 to 10; <c>ConvertPDF2Image</c> multiplies the numeric
        /// value by 56 when requesting a page image.
        /// </summary>
        public enum Definition
        {
            One = 1,
            Two = 2,
            Three = 3,
            Four = 4,
            Five = 5,
            Six = 6,
            Seven = 7,
            Eight = 8,
            Nine = 9,
            Ten = 10
        }
        /// <summary>
        /// Helper that renders PDF pages to image files through O2S PDFRender4NET.
        /// </summary>
        public class PDFTranImgHelp
        {
            /// <summary>
            /// Converts a range of pages of a PDF document into image files.
            /// </summary>
            /// <param name="pdfInputPath">Path of the PDF file.</param>
            /// <param name="imageOutputPath">
            /// Output folder; it is concatenated directly with the file name, so it must end
            /// with a directory separator. Created if it does not exist.
            /// </param>
            /// <param name="imageName">Prefix of the generated image file names; the page number is appended.</param>
            /// <param name="startPageNum">First page to convert (1-based). Values below 1 are raised to 1.</param>
            /// <param name="endPageNum">Last page to convert (1-based, inclusive). Values above the page count are lowered to it.</param>
            /// <param name="imageFormat">Image format to save; its name is also used as the file extension.</param>
            /// <param name="definition">Sharpness level; larger values request a sharper image.</param>
            public static void ConvertPDF2Image(
                    string pdfInputPath,
                    string imageOutputPath,
                    string imageName,
                    int startPageNum,
                    int endPageNum,
                    System.Drawing.Imaging.ImageFormat imageFormat,
                    Definition definition)
            {
                PDFFile pdfFile = PDFFile.Open(pdfInputPath);
                try
                {
                    if (!Directory.Exists(imageOutputPath))
                    {
                        Directory.CreateDirectory(imageOutputPath);
                    }

                    // A reversed range is swapped first. The original swap assigned
                    // endPageNum = startPageNum after overwriting startPageNum, which lost the
                    // upper bound and collapsed the range to a single page.
                    if (startPageNum > endPageNum)
                    {
                        (startPageNum, endPageNum) = (endPageNum, startPageNum);
                    }

                    // Clamp after swapping so both bounds end up inside 1..PageCount. If the
                    // document has no pages the range becomes empty and the loop does not run.
                    if (startPageNum <= 0)
                    {
                        startPageNum = 1;
                    }
                    if (endPageNum > pdfFile.PageCount)
                    {
                        endPageNum = pdfFile.PageCount;
                    }

                    // Convert each page; GetPageImage takes a 0-based page index.
                    for (int i = startPageNum; i <= endPageNum; i++)
                    {
                        // Disposed at the end of each iteration, even if Save throws.
                        using Bitmap pageImage = pdfFile.GetPageImage(i - 1, 56 * (int)definition);
                        pageImage.Save(imageOutputPath + imageName + i.ToString() + "." + imageFormat.ToString(), imageFormat);
                    }
                }
                finally
                {
                    // Release the PDF handle on both the success and the failure path.
                    pdfFile.Dispose();
                }
            }
        }

    }


}

注意：

测试

OCRHelper.Scan(path)：根据 PDF 文件的路径，将其转换为图片并进行 OCR 识别，返回识别出的文本内容（方法本身不打印结果，需由调用方自行输出）

飞宇千虹

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
.NET PDF 转文字（Tesseract OCR + O2S）

环境搭建参考：Tesseract OCR :.NET Tesseract OCR - 掘金 (juejin.cn)O2S :.NET 以O2S方式 PDF 转图片 - 掘金 (juejin.cn)代码整合创建 OCRHelper.cs```using O2S.Components.PDFRender4NET;using System.Drawing;using ...
复制链接

扫一扫