文字识别(Optical Character Recognition,OCR)
腾讯云文字识别(Optical Character Recognition,OCR)
阿里云文字识别
微信小程序 OCR工具 小程序身份证 OCR 识别
使用开源库tesseract,地址:https://github.com/tesseract-ocr/tesseract
tesseract训练的语言包:https://github.com/tesseract-ocr/tessdata
有专门针对C#的封装:https://github.com/charlesw/tesseract
C# Demo地址:https://github.com/charlesw/tesseract-samples
1、HttpWebRequest发送Web请求,获取流文件
2、将文件下载保存到本地
3、读取刚才下载的图片文件
4、NuGet Package Manager -> Tessnet2(NuGet.Tessnet2) -> Install
5、tessnet2.Tesseract识别图片内容
6、下载 tesseract 语言包:https://github.com/tesseract-ocr/tessdata,复制粘贴到项目根目录
using Petro.Web.App_Start;
using Petro.Web.CoreLibrary;
using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Net;
using System.Text;
using System.Web.Mvc;
namespace Petro.Web.Controllers
{
/// <summary>
/// MVC controller that downloads an image over HTTP, stores it under the site's
/// upload folder and recognizes the text in it with tessnet2.
/// </summary>
public class HomeController : Controller
{
    // Address of the image that Tesseract() downloads and recognizes.
    string url = "http://www.***.com/Image.aspx?ucode=ucode";

    /// <summary>
    /// Downloads the image at <c>url</c>, saves it as a time-stamped .jpg under
    /// <c>\upload\</c>, runs OCR on it and reports the outcome as JSON.
    /// </summary>
    /// <returns>
    /// JSON of the form <c>{ Code, Msg, Data: { Code, FilePath } }</c>.
    /// <c>Code</c> is 200 and <c>Msg</c> is "Success" when every step completed;
    /// otherwise <c>Code</c> is 500, <c>Msg</c> is "Failed" and <c>Data.Code</c> is empty.
    /// </returns>
    public ActionResult Tesseract()
    {
        string code = string.Empty;
        string fileName = @"\upload\" + GenerateTimeStamp() + ".jpg";
        int status = 200;
        string message = "Success";

        try
        {
            // Use the controller-level url. The previous version declared an empty
            // local with the same name, so the request could never be created.
            byte[] imageBytes = DownloadBytes(url);

            string pathName = Server.MapPath(fileName);
            // WriteAllBytes does not create missing folders.
            System.IO.Directory.CreateDirectory(System.IO.Path.GetDirectoryName(pathName));
            System.IO.File.WriteAllBytes(pathName, imageBytes);

            code = Recognize(imageBytes);
        }
        catch (Exception ex)
        {
            // Keep the endpoint best-effort (no exception reaches the client),
            // but record the failure and report it instead of claiming success.
            System.Diagnostics.Trace.TraceError("Tesseract OCR failed: {0}", ex);
            status = 500;
            message = "Failed";
        }

        var resultJson = new
        {
            Code = status,
            Msg = message,
            Data = new
            {
                Code = code,
                FilePath = fileName
            },
        };
        return Json(resultJson, JsonRequestBehavior.AllowGet);
    }

    /// <summary>
    /// Returns the number of milliseconds elapsed since 1970-01-01 00:00:00 (UTC clock)
    /// as a decimal string.
    /// </summary>
    /// <returns>The millisecond timestamp, e.g. usable as a file name.</returns>
    public string GenerateTimeStamp()
    {
        TimeSpan ts = DateTime.UtcNow - new DateTime(1970, 1, 1, 0, 0, 0, 0);
        return Convert.ToInt64(ts.TotalMilliseconds).ToString();
    }

    // Issues a GET request and returns the complete response body.
    private static byte[] DownloadBytes(string address)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(address);
        request.Method = "GET";

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (MemoryStream buffer = new MemoryStream())
        {
            // Copy until end of stream rather than sizing a buffer from
            // ContentLength, which is -1 when the server does not send the header.
            body.CopyTo(buffer);
            return buffer.ToArray();
        }
    }

    // Pre-processes the image and returns the text of the first recognized word ("" if none).
    private string Recognize(byte[] imageBytes)
    {
        using (MemoryStream imageStream = new MemoryStream(imageBytes))
        using (Bitmap decoded = (Bitmap)Bitmap.FromStream(imageStream))
        using (Bitmap working = (Bitmap)decoded.Clone())
        {
            Bitmap bitmap = null;
            try
            {
                UnCodebase ud = new UnCodebase(working);
                bitmap = ud.GrayByPixels("");
                ud.ClearNoise(128, 2);

                tessnet2.Tesseract ocr = new tessnet2.Tesseract();
                // Restrict recognition to digits and ASCII letters.
                ocr.SetVariable("tessedit_char_whitelist", "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
                // NOTE(review): this must resolve to the folder holding the "eng"
                // language data; the literal is kept as-is, confirm it is correct.
                ocr.Init(Server.MapPath(@"\\tmpe"), "eng", false);

                List<tessnet2.Word> result = ocr.DoOCR(bitmap, Rectangle.Empty);
                return (result != null && result.Count > 0) ? result[0].Text : string.Empty;
            }
            finally
            {
                // GrayByPixels may hand back a new bitmap; release it if so.
                if (bitmap != null && !ReferenceEquals(bitmap, working))
                {
                    bitmap.Dispose();
                }
            }
        }
    }
}
}
*
*
====================【PaddleOCRSharp】====================
1、PaddleOCRSharp.cs
using PaddleOCRSharp;
using System.Drawing;
namespace Web
{
/// <summary>
/// Thin wrapper around a single shared <see cref="PaddleOCREngine"/> that exposes
/// text recognition for several kinds of image input.
/// </summary>
public class PaddleOCREngineHandler
{
    /// <summary>Engine instance shared by every handler; built once by <see cref="CreateOCRParameter"/>.</summary>
    public static PaddleOCREngine engine = CreateOCRParameter();

    /// <summary>
    /// Builds a <see cref="PaddleOCREngine"/> from a fixed set of OCR parameters
    /// and a null model configuration.
    /// </summary>
    /// <returns>The newly created engine.</returns>
    public static PaddleOCREngine CreateOCRParameter()
    {
        var settings = new OCRParameter
        {
            // Number of concurrent prediction threads.
            numThread = 6,
            // NOTE(review): the original note recommends 0 for web deployments
            // (errors otherwise) and when memory use is high; the value stays 1 here.
            Enable_mkldnn = 1,
            // Whether to run text-direction classification.
            cls = 1,
            // NOTE(review): the original note describes this as direction detection for
            // 180-degree rotation, identical to use_angle_cls below; confirm what det controls.
            det = 1,
            // Whether to enable direction detection, used for text rotated by 180 degrees.
            use_angle_cls = 1,
            // Whether text regions are described by polylines instead of rectangles.
            det_db_score_mode = 1,
            UnClipRatio = 8.6f,
            MaxSideLen = 960,
        };

        // Typed null so the (OCRModelConfig, OCRParameter) constructor is selected.
        OCRModelConfig modelConfig = null;
        return new PaddleOCREngine(modelConfig, settings);
    }

    /// <summary>
    /// Recognizes text in an image supplied as raw bytes.
    /// </summary>
    /// <param name="imagebyte">Encoded image content.</param>
    /// <returns>The recognized text.</returns>
    public string OCR2Str(byte[] imagebyte) => engine.DetectText(imagebyte).Text;

    /// <summary>
    /// Recognizes text in an image stored on disk.
    /// </summary>
    /// <param name="imagefile">Path of the image file.</param>
    /// <returns>The recognized text.</returns>
    public string OCR2Str(string imagefile) => engine.DetectText(imagefile).Text;

    /// <summary>
    /// Recognizes text in an in-memory <see cref="Image"/>.
    /// </summary>
    /// <param name="image">Image to recognize.</param>
    /// <returns>The recognized text.</returns>
    public string OCR2Str(Image image) => engine.DetectText(image).Text;

    /// <summary>
    /// Recognizes text in an image supplied as a Base64 string.
    /// </summary>
    /// <param name="imagebase64">Base64-encoded image content.</param>
    /// <returns>The recognized text.</returns>
    public string DetectTextBase64(string imagebase64) => engine.DetectTextBase64(imagebase64).Text;
}
}
2、Controller 调用
using Microsoft.AspNetCore.Mvc;
using Microsoft.AspNetCore.Mvc.Rendering;
using System.Net;
using System.Text;
namespace Web.Controllers
{
/// <summary>
/// API controller that downloads a verification-code image, stores it on disk and
/// returns the text PaddleOCR recognizes in it.
/// </summary>
[ApiController]
[Route("[controller]")]
public class WeatherForecastController : ControllerBase
{
    /// <summary>
    /// Downloads the verification-code image, saves it as a time-stamped .jpg and runs OCR on it.
    /// </summary>
    /// <returns>
    /// The recognized text with '-', '_' and spaces removed, or an empty string when
    /// the download, the save or the recognition fails.
    /// </returns>
    [HttpGet]
    public string Get()
    {
        string code = string.Empty;
        // NOTE(review): the trailing ')' looks like a typo; kept unchanged, confirm the real address.
        string url = "http://www.******.cn/Login/VerifyCode?id=1)";
        string fileName = @"\upload\" + GenerateTimeStamp() + ".jpg";
        string pathName = @"D:\MyFile\新技术\爬虫" + (fileName);

        try
        {
            byte[] imageBytes = DownloadBytes(url);

            // WriteAllBytes does not create missing folders.
            System.IO.Directory.CreateDirectory(System.IO.Path.GetDirectoryName(pathName));
            System.IO.File.WriteAllBytes(pathName, imageBytes);

            // Guard against a null result so the clean-up below cannot throw.
            code = new PaddleOCREngineHandler().OCR2Str(pathName) ?? string.Empty;
        }
        catch (Exception ex)
        {
            // Stay best-effort (callers still get an empty string), but leave a trace of the failure.
            System.Diagnostics.Trace.TraceError("Verification-code OCR failed: {0}", ex);
        }

        return code.Replace("-", "").Replace("_", "").Replace(" ", "").Trim();
    }

    /// <summary>
    /// Returns the number of milliseconds elapsed since 1970-01-01 00:00:00 (UTC clock)
    /// as a decimal string.
    /// </summary>
    /// <returns>The millisecond timestamp, e.g. usable as a file name.</returns>
    public string GenerateTimeStamp()
    {
        TimeSpan ts = DateTime.UtcNow - new DateTime(1970, 1, 1, 0, 0, 0, 0);
        return Convert.ToInt64(ts.TotalMilliseconds).ToString();
    }

    // Issues a GET request and returns the complete response body.
    private static byte[] DownloadBytes(string address)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(address);
        request.Method = "GET";

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (MemoryStream buffer = new MemoryStream())
        {
            // Copy until end of stream rather than sizing a buffer from
            // ContentLength, which is -1 when the server does not send the header.
            body.CopyTo(buffer);
            return buffer.ToArray();
        }
    }
}
}
3、WinForm 调用
using PaddleOCRSharp;
using System;
using System.IO;
using System.Windows.Forms;
namespace OCR
{
/// <summary>
/// Form that lets the user pick an image file and shows the text PaddleOCR recognizes in it.
/// </summary>
public partial class Form1 : Form
{
    // Created on the first recognition and reused afterwards, so the engine is
    // not rebuilt (and leaked) on every click.
    private PaddleOCREngine _engine;

    public Form1()
    {
        InitializeComponent();
        // Release the engine together with the form.
        FormClosed += OnFormClosedReleaseEngine;
    }

    /// <summary>
    /// Asks for an image file, runs OCR on its bytes and writes the recognized text to textBox1.
    /// Does nothing when the dialog is cancelled.
    /// </summary>
    private void button1_Click(object sender, EventArgs e)
    {
        string selectedPath;
        using (OpenFileDialog ofd = new OpenFileDialog())
        {
            // "*.tif" replaces a duplicated "*.tiff" entry so both TIFF extensions are offered.
            ofd.Filter = "*.*|*.bmp;*.jpg;*.jpeg;*.tif;*.tiff;*.png";
            if (ofd.ShowDialog() != DialogResult.OK) return;
            selectedPath = ofd.FileName;
        }

        byte[] imagebyte = File.ReadAllBytes(selectedPath);

        if (_engine == null)
        {
            _engine = CreateEngine();
        }

        OCRResult ocrResult = _engine.DetectText(imagebyte);
        textBox1.Text = ocrResult.Text;
    }

    // Builds the OCR engine with the fixed parameter set used by this form.
    private static PaddleOCREngine CreateEngine()
    {
        OCRParameter oCRParameter = new OCRParameter();
        oCRParameter.numThread = 6; // number of concurrent prediction threads
        // The original note recommends 0 for web deployments and when memory use is high.
        oCRParameter.Enable_mkldnn = 1;
        oCRParameter.cls = 1; // whether to run text-direction classification
        // NOTE(review): the original note gives det the same description as
        // use_angle_cls (direction detection for 180-degree rotation); confirm.
        oCRParameter.det = 1;
        oCRParameter.use_angle_cls = 1; // direction detection for text rotated by 180 degrees
        oCRParameter.det_db_score_mode = 1; // polylines instead of rectangles for text regions
        oCRParameter.UnClipRatio = 8.6f;
        oCRParameter.MaxSideLen = 960;

        OCRModelConfig config = null;
        return new PaddleOCREngine(config, oCRParameter);
    }

    // Disposes the cached engine when the form closes.
    private void OnFormClosedReleaseEngine(object sender, FormClosedEventArgs e)
    {
        if (_engine != null)
        {
            _engine.Dispose();
            _engine = null;
        }
    }
}
}
*
*
*