C# tessnet2.Tesseract 识别图片验证码内容 OCR

文字识别(Optical Character Recognition,OCR)

腾讯云文字识别(Optical Character Recognition,OCR)
阿里云文字识别
微信小程序 OCR工具   小程序身份证 OCR 识别

使用开源库tesseract,地址:https://github.com/tesseract-ocr/tesseract
tesseract训练的语言包:https://github.com/tesseract-ocr/tessdata
有专门针对C#的封装:https://github.com/charlesw/tesseract
C# Demo地址:https://github.com/charlesw/tesseract-samples

1、HttpWebRequest发送Web请求,获取流文件
2、将文件下载保存到本地
3、读取刚才下载的图片文件
4、NuGet Package Manager -> Tessnet2(NuGet.Tessnet2) -> Install
5、tessnet2.Tesseract识别图片内容

6、下载 tesseract 语言包:https://github.com/tesseract-ocr/tessdata,复制粘贴到项目根目录(存放目录需与代码中 ocr.Init 第一个参数指向的目录一致,本例为 Server.MapPath(@"\\tmpe"))

using Petro.Web.App_Start;
using Petro.Web.CoreLibrary;
using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Net;
using System.Text;
using System.Web.Mvc;

namespace Petro.Web.Controllers
{
    /// <summary>
    /// MVC controller that downloads a captcha image, stores it under the site's
    /// <c>\upload\</c> folder and recognises its text with tessnet2.
    /// </summary>
    public class HomeController : Controller
    {
        // Address of the captcha image that Tesseract() downloads.
        private readonly string url = "http://www.***.com/Image.aspx?ucode=ucode";

        /// <summary>
        /// Downloads the captcha image from <c>url</c>, saves it to the upload folder
        /// and runs OCR on it.
        /// </summary>
        /// <returns>
        /// JSON of the form <c>{ Code, Msg, Data: { Code, FilePath } }</c>.
        /// <c>Code</c> is 200 on success and 500 when download, save or OCR failed;
        /// <c>Data.Code</c> is the recognised text (empty when nothing was recognised).
        /// </returns>
        public ActionResult Tesseract()
        {
            string code = string.Empty;
            int status = 200;
            string message = "Success";
            string fileName = @"\upload\" + GenerateTimeStamp() + ".jpg";

            try
            {
                string pathName = Server.MapPath(fileName);

                // The class-level url is used here; a local of the same name would hide it.
                byte[] imageBytes = DownloadBytes(url);

                // Save the downloaded image so FilePath in the response points at a real file.
                string directory = Path.GetDirectoryName(pathName);
                if (!string.IsNullOrEmpty(directory))
                {
                    Directory.CreateDirectory(directory);
                }
                System.IO.File.WriteAllBytes(pathName, imageBytes);

                // OCR works on the in-memory bytes; re-reading the file just written is unnecessary.
                code = Recognize(imageBytes);
            }
            catch (Exception ex)
            {
                // Report the failure instead of silently answering "Success" with an empty code.
                System.Diagnostics.Trace.TraceError("Captcha recognition failed: " + ex);
                status = 500;
                message = "Recognition failed";
            }

            var resultJson = new
            {
                Code = status,
                Msg = message,
                Data = new
                {
                    Code = code,
                    FilePath = fileName
                },
            };
            return Json(resultJson, JsonRequestBehavior.AllowGet);
        }

        /// <summary>
        /// Returns the number of milliseconds elapsed since 1970-01-01 00:00:00 (UTC clock)
        /// as a decimal string. Used to build a unique file name.
        /// </summary>
        public string GenerateTimeStamp()
        {
            TimeSpan ts = DateTime.UtcNow - new DateTime(1970, 1, 1, 0, 0, 0, 0);
            return Convert.ToInt64(ts.TotalMilliseconds).ToString();
        }

        /// <summary>Issues a GET request and returns the complete response body.</summary>
        /// <param name="address">Absolute URL to download.</param>
        private static byte[] DownloadBytes(string address)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(address);
            request.Method = "GET";

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (MemoryStream buffer = new MemoryStream())
            {
                // CopyTo reads until end of stream, so it does not depend on
                // ContentLength (which is -1 when the server does not send it).
                body.CopyTo(buffer);
                return buffer.ToArray();
            }
        }

        /// <summary>Pre-processes the image and returns the text of the first recognised word.</summary>
        /// <param name="imageBytes">Encoded image data as downloaded.</param>
        /// <returns>The first word's text, or an empty string when OCR found no words.</returns>
        private string Recognize(byte[] imageBytes)
        {
            // The stream must stay open while the bitmap created from it is in use.
            using (MemoryStream imageStream = new MemoryStream(imageBytes))
            using (Bitmap source = (Bitmap)Bitmap.FromStream(imageStream))
            {
                // Grayscale + noise removal via the project's UnCodebase helper
                // (call order kept exactly as in the original listing).
                Bitmap bitmap = (Bitmap)source.Clone();
                UnCodebase ud = new UnCodebase(bitmap);
                bitmap = ud.GrayByPixels("");
                ud.ClearNoise(128, 2);

                tessnet2.Tesseract ocr = new tessnet2.Tesseract();
                // Restrict recognition to digits and ASCII letters.
                ocr.SetVariable("tessedit_char_whitelist", "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
                ocr.Init(Server.MapPath(@"\\tmpe"), "eng", false);
                List<tessnet2.Word> result = ocr.DoOCR(bitmap, Rectangle.Empty);

                // Guard against an empty result instead of indexing blindly.
                return (result != null && result.Count > 0) ? result[0].Text : string.Empty;
            }
        }
    }
}

*
*
====================【PaddleOCRSharp】====================
1、PaddleOCRSharp.cs

using PaddleOCRSharp;
using System.Drawing;

namespace Web
{
    /// <summary>
    /// Thin wrapper around a single shared <see cref="PaddleOCREngine"/> that returns
    /// the recognised text for several kinds of image input.
    /// </summary>
    public class PaddleOCREngineHandler
    {
        // One engine shared by every handler instance; built when the type is first used.
        public static PaddleOCREngine engine = CreateOCRParameter();

        /// <summary>
        /// Builds a <see cref="PaddleOCREngine"/> with the parameters used by this sample.
        /// </summary>
        /// <returns>A newly constructed engine (null model config is passed to the constructor).</returns>
        public static PaddleOCREngine CreateOCRParameter()
        {
            OCRParameter oCRParameter = new OCRParameter();
            oCRParameter.numThread = 6; // number of prediction threads
            // The sample's own guidance: use 0 for web deployment (otherwise errors occur)
            // and also when memory use is high. This handler is called from a web
            // controller, so mkldnn is turned off here.
            oCRParameter.Enable_mkldnn = 0;
            oCRParameter.cls = 1; // run text-direction classification
            // NOTE(review): the original comment described det as "orientation detection
            // for 180-degree rotation", identical to use_angle_cls below; it presumably
            // toggles text detection instead - confirm against the library docs.
            oCRParameter.det = 1;
            oCRParameter.use_angle_cls = 1; // orientation detection, for 180-degree rotated text
            oCRParameter.det_db_score_mode = 1; // text regions as polylines rather than rectangles (per original note)
            oCRParameter.UnClipRatio = 8.6f;

            oCRParameter.MaxSideLen = 960;

            OCRModelConfig config = null;
            return new PaddleOCREngine(config, oCRParameter);
        }

        /// <summary>
        /// Recognises text from encoded image bytes.
        /// </summary>
        /// <param name="imagebyte">Image file content.</param>
        /// <returns>The recognised text, or an empty string when the engine returns no result.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="imagebyte"/> is null.</exception>
        public string OCR2Str(byte[] imagebyte)
        {
            if (imagebyte == null)
            {
                throw new System.ArgumentNullException(nameof(imagebyte));
            }
            return TextOf(engine.DetectText(imagebyte));
        }

        /// <summary>
        /// Recognises text from an image file on disk.
        /// </summary>
        /// <param name="imagefile">Path of the image file.</param>
        /// <returns>The recognised text, or an empty string when the engine returns no result.</returns>
        /// <exception cref="System.ArgumentException"><paramref name="imagefile"/> is null or empty.</exception>
        public string OCR2Str(string imagefile)
        {
            if (string.IsNullOrEmpty(imagefile))
            {
                throw new System.ArgumentException("Image path must not be null or empty.", nameof(imagefile));
            }
            return TextOf(engine.DetectText(imagefile));
        }

        /// <summary>
        /// Recognises text from an in-memory image.
        /// </summary>
        /// <param name="image">Image to recognise.</param>
        /// <returns>The recognised text, or an empty string when the engine returns no result.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="image"/> is null.</exception>
        public string OCR2Str(Image image)
        {
            if (image == null)
            {
                throw new System.ArgumentNullException(nameof(image));
            }
            return TextOf(engine.DetectText(image));
        }

        /// <summary>
        /// Recognises text from a Base64-encoded image.
        /// </summary>
        /// <param name="imagebase64">Base64 string of the image content.</param>
        /// <returns>The recognised text, or an empty string when the engine returns no result.</returns>
        /// <exception cref="System.ArgumentException"><paramref name="imagebase64"/> is null or empty.</exception>
        public string DetectTextBase64(string imagebase64)
        {
            if (string.IsNullOrEmpty(imagebase64))
            {
                throw new System.ArgumentException("Base64 image data must not be null or empty.", nameof(imagebase64));
            }
            return TextOf(engine.DetectTextBase64(imagebase64));
        }

        // Defensive unwrap: avoids a NullReferenceException should the engine hand back null.
        private static string TextOf(OCRResult ocrResult)
        {
            return ocrResult == null ? string.Empty : ocrResult.Text;
        }
    }
}

2、Controller 调用

using Microsoft.AspNetCore.Mvc;
using Microsoft.AspNetCore.Mvc.Rendering;
using System.Net;
using System.Text;

namespace Web.Controllers
{
    /// <summary>
    /// API controller whose GET endpoint downloads a captcha image, saves it to disk
    /// and returns the text recognised by <c>PaddleOCREngineHandler</c>.
    /// </summary>
    [ApiController]
    [Route("[controller]")]
    public class WeatherForecastController : ControllerBase
    {
        /// <summary>
        /// Downloads the captcha, stores it as a .jpg and runs OCR on the saved file.
        /// </summary>
        /// <returns>
        /// The recognised text with '-', '_' and spaces removed; an empty string when
        /// download, save or recognition fails.
        /// </returns>
        [HttpGet]
        public string Get()
        {
            string code = string.Empty;
            // NOTE(review): the trailing ')' in the query string looks like a typo - confirm.
            string url = "http://www.******.cn/Login/VerifyCode?id=1)";
            string fileName = @"\upload\" + GenerateTimeStamp() + ".jpg";
            string pathName = @"D:\MyFile\新技术\爬虫" + (fileName);

            try
            {
                byte[] imageBytes = DownloadBytes(url);

                // Make sure the target folder exists before writing the image.
                var directory = Path.GetDirectoryName(pathName);
                if (!string.IsNullOrEmpty(directory))
                {
                    Directory.CreateDirectory(directory);
                }
                System.IO.File.WriteAllBytes(pathName, imageBytes);

                // Fall back to empty text so the clean-up below never runs on null.
                code = new PaddleOCREngineHandler().OCR2Str(pathName) ?? string.Empty;
            }
            catch (Exception ex)
            {
                // Best effort: the endpoint still answers with an empty string,
                // but the failure is recorded instead of being discarded.
                System.Diagnostics.Trace.TraceError("Captcha recognition failed: " + ex);
            }
            return code.Replace("-", "").Replace("_", "").Replace(" ", "").Trim();
        }

        /// <summary>
        /// Returns the number of milliseconds elapsed since 1970-01-01 00:00:00 (UTC clock)
        /// as a decimal string. Used to build a unique file name.
        /// </summary>
        /// <remarks>
        /// Marked <c>[NonAction]</c>: this is a helper, and a public method on a routed
        /// controller would otherwise be treated as an action on the controller route.
        /// </remarks>
        [NonAction]
        public string GenerateTimeStamp()
        {
            TimeSpan ts = DateTime.UtcNow - new DateTime(1970, 1, 1, 0, 0, 0, 0);
            return Convert.ToInt64(ts.TotalMilliseconds).ToString();
        }

        /// <summary>Issues a GET request and returns the complete response body.</summary>
        /// <param name="address">Absolute URL to download.</param>
        private static byte[] DownloadBytes(string address)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(address);
            request.Method = "GET";

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (MemoryStream buffer = new MemoryStream())
            {
                // CopyTo reads until end of stream, so it does not depend on
                // ContentLength (which is -1 when the server does not send it).
                body.CopyTo(buffer);
                return buffer.ToArray();
            }
        }
    }
}

3、WinForm

using PaddleOCRSharp;
using System;
using System.IO;
using System.Windows.Forms;

namespace OCR
{
    /// <summary>
    /// Form with a button that lets the user pick an image file and shows the text
    /// recognised by PaddleOCRSharp in <c>textBox1</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        // Built on the first click and reused afterwards, so the engine is not
        // reconstructed for every image.
        private PaddleOCREngine _engine;

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Asks for an image file, runs OCR on its bytes and displays the recognised text.
        /// Does nothing when the dialog is cancelled; shows an error box when reading
        /// the file or recognising it fails.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            string selectedFile;
            using (OpenFileDialog ofd = new OpenFileDialog())
            {
                // "*.tif" added: the original filter listed "*.tiff" twice.
                ofd.Filter = "*.*|*.bmp;*.jpg;*.jpeg;*.tif;*.tiff;*.png";
                if (ofd.ShowDialog() != DialogResult.OK) return;
                selectedFile = ofd.FileName;
            }

            try
            {
                byte[] imagebyte = File.ReadAllBytes(selectedFile);

                if (_engine == null)
                {
                    _engine = CreateEngine();
                }
                OCRResult ocrResult = _engine.DetectText(imagebyte);

                textBox1.Text = ocrResult != null ? ocrResult.Text : string.Empty;
            }
            catch (Exception ex)
            {
                // Surface the problem to the user rather than letting the handler fail unhandled.
                MessageBox.Show(this, ex.Message, "OCR", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
        }

        /// <summary>Creates the OCR engine with the parameters used by this sample.</summary>
        private static PaddleOCREngine CreateEngine()
        {
            // OCR parameters
            OCRParameter oCRParameter = new OCRParameter();
            oCRParameter.numThread = 6; // number of prediction threads
            // Per the sample's note: set to 0 for web deployment or when memory use is high.
            oCRParameter.Enable_mkldnn = 1;
            oCRParameter.cls = 1; // run text-direction classification
            // NOTE(review): the original comment described det as "orientation detection
            // for 180-degree rotation", identical to use_angle_cls below - confirm its meaning.
            oCRParameter.det = 1;
            oCRParameter.use_angle_cls = 1; // orientation detection, for 180-degree rotated text
            oCRParameter.det_db_score_mode = 1; // text regions as polylines rather than rectangles (per original note)
            oCRParameter.UnClipRatio = 8.6f;

            oCRParameter.MaxSideLen = 960;
            OCRModelConfig config = null;
            // Initialise the OCR engine
            return new PaddleOCREngine(config, oCRParameter);
        }
    }
}

*
*
*

  • 1
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 3
    评论
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值