利用MODI作图像中文字提取

30 篇文章 1 订阅
1 篇文章 0 订阅

由于MODI组件是Office 2003自带的一个组件,所以在开发的机器中要安装Office 2003以上版本并选择安装MODI组件。在开发过程中,要外部引用MODI组件。在这里我们使用C#来进行MODI程序的开发和示例。
1、添加MODI组件
我们首先需要把MODI引用添加到工程文件中。添加引用时选择COM组件库,可以看到Microsoft Office Document Imaging 12.0 Type Library组件(对应文件为MDIVWCTL.DLL)。添加成功后,可以在VS2013 C#的解决方案资源管理器的“引用”一栏中看到MODI项,这就说明MODI添加成功了。
2、利用MODI组件中的Document对象
Document对象是MODI中最重要的一个对象,它提供了图片的引入、扫描等重要的方法。在这里我们首先创建一个MODI中的doc对象的一个实例:
MODI.Document doc = new MODI.Document();
然后把需要处理的文档图片准备好。图片的格式可以是TIFF、BMP、PNG或者JPEG。当然,我们有必要先对图片进行一些必要的处理,尽量让图片干净、清晰。然后可以利用doc对象中的Create()方法引入图片文档:doc.Create(img_Path);其中img_Path为图片文档的路径。
在开发时需要引用如下图:
在这里插入图片描述
相关代码:
C_Upload代码

/// <summary>
/// Receives uploaded archives of scanned images, runs MODI OCR over individual images,
/// writes the recognized text to log files and exports the parsed fields to an Excel workbook.
/// </summary>
public class C_Upload : System.Web.UI.Page
    {
        /// <summary>Number of calls made to <see cref="ChildThread"/> on this instance.</summary>
        int l = 0;

        /// <summary>Field labels searched for in the OCR text; also used as the Excel header row.</summary>
        readonly string[] strTitle = new string[] { "企业注册号", "企业名称", "类型", "住所", "法定代表人", "成立时间", "注册资本", "营业期限", "经营范围", "登记机关", "核准时间" };

        /// <summary>
        /// One formatted record per processed image, in the order <see cref="ChildThread"/> was called.
        /// An empty string marks an image whose recognition failed.
        /// </summary>
        string[] strContent;

        /// <summary>Rows passed to <see cref="ExcelWriter1"/> by <see cref="changeerwei"/>.</summary>
        string[][] strC;

        /// <summary>Value of the "企业名称" field from the most recent <see cref="ReplaceT"/> call.</summary>
        string filename = "";

        /// <summary>
        /// Saves the posted archive under /UploadFiles/, decompresses it and removes the archive.
        /// </summary>
        /// <param name="filename">Not used by the current implementation.</param>
        /// <returns>
        /// 1 - no file was posted; 2 - the file is not a .rar or .zip archive;
        /// 3 - uploaded and decompressed successfully; 4 - decompression failed.
        /// </returns>
        public int UploadFile(string filename)
        {
            // NOTE(review): this control is created here rather than taken from the page, so it is not
            // bound to the current request and PostedFile is expected to be null - confirm the intended
            // source of the posted file. The null check below replaces a NullReferenceException with
            // the documented "no file" result.
            FileUpload FileUpload1 = new FileUpload();
            var posted = FileUpload1.PostedFile;
            if (posted == null || string.IsNullOrEmpty(posted.FileName))
            {
                return 1;
            }

            string filePath = Server.MapPath("/UploadFiles/");

            // Keep only the leaf name: some browsers send the full client path, and directory
            // components must never influence where the file is written.
            string fileName = Path.GetFileName(posted.FileName);
            if (string.IsNullOrEmpty(fileName))
            {
                return 1;
            }

            // Compare the real extension, ignoring case, instead of searching for "rar"/"zip"
            // anywhere in the name.
            string extension = Path.GetExtension(fileName);
            bool isArchive = string.Equals(extension, ".rar", StringComparison.OrdinalIgnoreCase)
                || string.Equals(extension, ".zip", StringComparison.OrdinalIgnoreCase);
            if (!isArchive)
            {
                return 2;
            }

            string path = Path.Combine(filePath, fileName);
            // Folder the archive is expected to unpack into: the archive path without its extension.
            string dirName = Path.Combine(filePath, Path.GetFileNameWithoutExtension(fileName));
            FileUpload1.SaveAs(path);

            try
            {
                Other other = new Other();
                if (!other.DecompressionZipOrRar(path, filePath))
                {
                    return 4;
                }

                if (Directory.Exists(dirName))
                {
                    other.MoveToParentDirectory(dirName);
                }
                return 3;
            }
            finally
            {
                // The archive is only a transport container; remove it even when decompression throws.
                File.Delete(path);
            }
        }

        /// <summary>
        /// Writes the header row from <c>strTitle</c> followed by <paramref name="myData"/> into a
        /// new workbook and saves it.
        /// </summary>
        /// <param name="myData">Rows to write, starting at worksheet row 2. Null rows are skipped.</param>
        /// <param name="Path">Destination path of the workbook.</param>
        /// <returns>Always 1; failures surface as exceptions.</returns>
        public int ExcelWriter1(String[][] myData, String Path)
        {
            object missing = System.Reflection.Missing.Value;
            Microsoft.Office.Interop.Excel.Application excel = new Microsoft.Office.Interop.Excel.Application();
            Microsoft.Office.Interop.Excel.Workbook workbook = null;
            try
            {
                excel.DisplayAlerts = false;
                workbook = excel.Application.Workbooks.Add(true);

                for (int column = 0; column < strTitle.Length; column++)
                {
                    excel.Cells[1, column + 1] = strTitle[column];
                }

                for (int row = 0; row < myData.Length; row++)
                {
                    String[] cells = myData[row];
                    if (cells == null)
                    {
                        continue;
                    }
                    for (int column = 0; column < cells.Length; column++)
                    {
                        // Excel indices are 1-based and row 1 holds the header.
                        excel.Cells[row + 2, column + 1] = cells[column];
                    }
                }

                workbook.SaveAs(Path);
            }
            finally
            {
                // Always shut Excel down and release the COM wrappers; otherwise a failed export
                // leaves an EXCEL.EXE process running.
                try
                {
                    if (workbook != null)
                    {
                        workbook.Close(false, missing, missing);
                        System.Runtime.InteropServices.Marshal.ReleaseComObject(workbook);
                    }
                }
                finally
                {
                    excel.Quit();
                    System.Runtime.InteropServices.Marshal.ReleaseComObject(excel);
                }
            }
            return 1;
        }

        /// <summary>
        /// Runs OCR on a single image, formats the recognized fields, stores the record for
        /// <see cref="changeerwei"/> and writes the result (or the failure message) to a log file.
        /// </summary>
        /// <param name="pathstr">Path of the image to recognize. Nothing is done when it is null or empty.</param>
        public void ChildThread(string pathstr)
        {
            filename = "";
            string img_Path = pathstr;
            l++;

            if (String.IsNullOrEmpty(img_Path))
            {
                return;
            }

            StringBuilder report = new StringBuilder();
            string record = string.Empty;
            try
            {
                record = ReplaceT(RecognizeText(img_Path));
                report.Append("==" + img_Path + "=="); report.Append("\r\n");
                report.Append(record);
            }
            catch (Exception ex)
            {
                // Recognition problems are reported in the log file instead of aborting the batch.
                report.Append("转换失败!详情:" + ex.Message);
            }

            // changeerwei reads strContent, which nothing else fills. Append one entry per image
            // (empty on failure) so the entries stay in call order.
            int count = strContent == null ? 0 : strContent.Length;
            Array.Resize(ref strContent, count + 1);
            strContent[count] = record;

            setText(report.ToString(), filename);
        }

        /// <summary>
        /// Parses the records collected by <see cref="ChildThread"/> into rows, exports them to
        /// c:\抓取.xls and deletes the files found in the given folder.
        /// </summary>
        /// <param name="pathstr">Virtual path of the folder whose files are counted and then deleted.</param>
        public void changeerwei(string pathstr)
        {
            // NOTE(review): row i is filled from the i-th recognized record, which assumes ChildThread
            // was called for these files in the order returned here - confirm against the caller.
            string[] fileNames = Directory.GetFiles(Server.MapPath(pathstr));
            strC = new string[fileNames.Length][];
            for (int i = 0; i < fileNames.Length; i++)
            {
                // Placeholder values stay in place for any field that cannot be parsed.
                strC[i] = new string[] { "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11" };
                string record = (strContent != null && i < strContent.Length) ? strContent[i] : null;
                FillRow(strC[i], record);
            }

            // Kept from the original; presumably gives outstanding work time to finish - TODO confirm.
            Thread.Sleep(5000);
            string paths = @"c:\抓取.xls";

            ExcelWriter1(strC, paths);

            for (int i = 0; i < fileNames.Length; i++)
            {
                System.IO.File.Delete(fileNames[i]);
            }

            // The source files are gone, so their records must not be matched against a later batch.
            strContent = null;
        }

        /// <summary>
        /// Copies the field values of one formatted record (as produced by <see cref="ReplaceT"/>)
        /// into <paramref name="row"/>. Fields that are missing leave the existing cell untouched.
        /// </summary>
        private static void FillRow(string[] row, string record)
        {
            const string separator = "\r\n\r\n";
            if (string.IsNullOrEmpty(record))
            {
                return;
            }

            int firstBreak = record.IndexOf(separator, StringComparison.Ordinal);
            if (firstBreak < 0)
            {
                return;
            }

            // First field: text between the first colon and the first blank line.
            int colon = record.IndexOf(':');
            int valueLength = firstBreak - colon - 1;
            if (valueLength < 0)
            {
                return;
            }
            row[0] = record.Substring(colon + 1, valueLength);

            // Remaining fields: one "label:value" section per blank-line-separated chunk.
            // The bound is the row length (11 cells, indices 0..10), not a hard-coded 11 inclusive.
            string[] sections = record.Split(new string[] { separator }, StringSplitOptions.None);
            for (int j = 1; j < row.Length && j < sections.Length; j++)
            {
                string section = sections[j];
                row[j] = section.Substring(section.IndexOf(':') + 1);
            }
        }

        /// <summary>
        /// Opens the image with MODI, runs Simplified Chinese OCR and returns the concatenated
        /// layout text of every page image.
        /// </summary>
        private string RecognizeText(string imagePath)
        {
            StringBuilder text = new StringBuilder();
            MODI.Document doc = new MODI.Document();
            bool opened = false;
            try
            {
                doc.Create(imagePath);
                opened = true;
                doc.OCR(GetLanuageType("2052"), true, true);
                for (int i = 0; i < doc.Images.Count; i++)
                {
                    MODI.Image image = (MODI.Image)doc.Images[i];
                    text.Append(image.Layout.Text);
                }
            }
            finally
            {
                // Close without saving and release the COM wrapper so the image file is not
                // kept open after recognition (changeerwei deletes these files later).
                try
                {
                    if (opened)
                    {
                        doc.Close(false);
                    }
                }
                finally
                {
                    System.Runtime.InteropServices.Marshal.ReleaseComObject(doc);
                }
            }
            return text.ToString();
        }

        /// <summary>
        /// Maps a numeric language code given as text to the MODI OCR language.
        /// Unknown codes fall back to English.
        /// </summary>
        private MODI.MiLANGUAGES GetLanuageType(string sValue)
        {
            switch (sValue)
            {
                case "2052":
                    return MODI.MiLANGUAGES.miLANG_CHINESE_SIMPLIFIED;
                case "5":
                    return MODI.MiLANGUAGES.miLANG_CZECH;
                case "6":
                    return MODI.MiLANGUAGES.miLANG_DANISH;
                case "7":
                    return MODI.MiLANGUAGES.miLANG_GERMAN;
                case "8":
                    return MODI.MiLANGUAGES.miLANG_GREEK;
                case "9":
                    return MODI.MiLANGUAGES.miLANG_ENGLISH;
                case "10":
                    return MODI.MiLANGUAGES.miLANG_SPANISH;
                case "11":
                    return MODI.MiLANGUAGES.miLANG_FINNISH;
                case "12":
                    return MODI.MiLANGUAGES.miLANG_FRENCH;
                case "14":
                    return MODI.MiLANGUAGES.miLANG_HUNGARIAN;
                case "16":
                    return MODI.MiLANGUAGES.miLANG_ITALIAN;
                case "17":
                    return MODI.MiLANGUAGES.miLANG_JAPANESE;
                case "18":
                    return MODI.MiLANGUAGES.miLANG_KOREAN;
                case "19":
                    return MODI.MiLANGUAGES.miLANG_DUTCH;
                case "20":
                    return MODI.MiLANGUAGES.miLANG_NORWEGIAN;
                case "21":
                    return MODI.MiLANGUAGES.miLANG_POLISH;
                case "22":
                    return MODI.MiLANGUAGES.miLANG_PORTUGUESE;
                case "25":
                    return MODI.MiLANGUAGES.miLANG_RUSSIAN;
                case "29":
                    return MODI.MiLANGUAGES.miLANG_SWEDISH;
                case "31":
                    return MODI.MiLANGUAGES.miLANG_TURKISH;
                case "1028":
                    return MODI.MiLANGUAGES.miLANG_CHINESE_TRADITIONAL;
                default:
                    return MODI.MiLANGUAGES.miLANG_ENGLISH;
            }
        }

        /// <summary>
        /// Turns raw OCR text into a record of "label:value" entries, one per label in
        /// <c>strTitle</c>, each followed by a blank line, and ends it with a separator rule.
        /// Also stores the "企业名称" value in the <c>filename</c> field.
        /// </summary>
        /// <param name="sb">Raw OCR text; null is treated as empty.</param>
        /// <returns>The formatted record. Labels without a recognized value get an empty value.</returns>
        private string ReplaceT(string sb)
        {
            // Strip the known labels so that only ":value" chunks remain, then split on the colons.
            string text = sb ?? string.Empty;
            foreach (string title in strTitle)
            {
                text = text.Replace(title, "");
            }
            string[] parts = text.Split(':');

            StringBuilder ReStr = new StringBuilder();
            ReStr.Append("\r\n");
            for (int i = 0; i < strTitle.Length; i++)
            {
                // parts[0] is whatever preceded the first colon, so the value for label i is parts[i + 1].
                // When OCR found fewer fields, emit an empty value so every record keeps the same shape.
                string value = i + 1 < parts.Length ? parts[i + 1] : string.Empty;

                ReStr.Append(strTitle[i]);
                ReStr.Append(":");
                ReStr.Append(value);
                ReStr.Append("\r\n"); ReStr.Append("\r\n");
                if (strTitle[i] == "企业名称")
                {
                    filename = value;
                }
            }
            ReStr.Append("\r\n"); ReStr.Append("\r\n");
            ReStr.Append("==========================================================================================");
            ReStr.Append("\r\n"); ReStr.Append("\r\n");
            return ReStr.ToString();
        }

        /// <summary>
        /// Writes <paramref name="content"/> to "log\&lt;name&gt;&lt;timestamp&gt;.txt" under the
        /// current directory, creating the log folder when needed.
        /// </summary>
        /// <param name="content">Text to write.</param>
        /// <param name="filename">Prefix for the file name; characters not allowed in file names are removed.</param>
        private void setText(string content, string filename)
        {
            string logDirectory = Path.Combine(System.Environment.CurrentDirectory, "log");
            Directory.CreateDirectory(logDirectory);

            // Single timestamp with a 24-hour clock: "hh" would give morning and afternoon runs the same name.
            string stamp = DateTime.Now.ToString("yyyyMMddHHmmss");
            string FilePath = Path.Combine(logDirectory, SanitizeFileName(filename) + stamp + ".txt");

            using (FileStream fs = new FileStream(FilePath, FileMode.Create, FileAccess.Write))
            using (StreamWriter writer = new StreamWriter(fs))
            {
                writer.Write(content);
            }
        }

        /// <summary>
        /// Removes characters that cannot appear in a file name (including the line breaks that
        /// OCR text usually carries) and trims surrounding whitespace.
        /// </summary>
        private static string SanitizeFileName(string name)
        {
            if (string.IsNullOrEmpty(name))
            {
                return string.Empty;
            }

            char[] invalid = Path.GetInvalidFileNameChars();
            StringBuilder cleaned = new StringBuilder(name.Length);
            foreach (char c in name)
            {
                if (Array.IndexOf(invalid, c) < 0)
                {
                    cleaned.Append(c);
                }
            }
            return cleaned.ToString().Trim();
        }

    }

调用代码:

   /// <summary>
   /// Lets the user pick any file and then lists every file of that file's folder in listBox1.
   /// </summary>
   private void button1_Click(object sender, EventArgs e)
        {
            if (this.openFileDialog1.ShowDialog() != DialogResult.OK)
            {
                return;
            }

            // The dialog already returns a full path, so the folder is taken from it directly.
            // The previous code prefixed the folder with the content of textBox2, which produced
            // an invalid path whenever that box was not empty.
            string img_Path = System.IO.Path.GetDirectoryName(this.openFileDialog1.FileName);
            if (string.IsNullOrEmpty(img_Path))
            {
                return;
            }

            this.listBox1.Text = img_Path;
            foreach (string file in System.IO.Directory.GetFiles(img_Path))
            {
                this.listBox1.Items.Add(file);
            }
        }

        /// <summary>
        /// Runs recognition for every entry currently shown in listBox1.
        /// </summary>
        private void button2_Click(object sender, EventArgs e)
        {
            // A single recognizer instance handles the whole list.
            C_Upload recognizer = new C_Upload();
            foreach (object entry in this.listBox1.Items)
            {
                string imagePath = entry.ToString();
                recognizer.ChildThread(imagePath);
            }
        }

3、系统不安装Office的情况下调用MODI
实现思路步骤:
(1)搞定MODI组件所需文件
(2)完成与MODI的COM组件相关的注册表项安装(添加)
(3)完成与Office相关的注册表项安装(添加)
3.1 MODI文件列表
MODI需要的支撑文件有:接口层:MDIVWCTL.DLL,MSPCORE.DLL,MSPGIMME.DLL,MSO.DLL,LATIN1.SHP等,数据层:JFONT.DAT,LOOKUP.DAT,OCRHC.DAT,OCRVC.DAT,TWGB32.DLL,SCCODE.UNI,SCPRINT.DAT,SCPRINT2.DAT等。
3.2 与MODI的COM组件相关的注册表项添加
COM相关就是与MODI的COM组件相关的注册表项,这个直接用regsvr32导入即可:启动命令行,进入MODI安装文件夹,执行下面的命令:
regsvr32 MDIVWCTL.DLL
regsvr32 MSPCORE.DLL
即可完成MODI COM组件的注册。
3.3 与Office相关的注册表项安装(添加)
[HKEY_CLASSES_ROOT\Installer\Components\61BA386016BD0C340BBEAC273D84FD5F]
"2052"=hex(7):76,00,55,00,70,00,41,00,56,00,53,00,2e,00,7d,00,58,00,25,00,21,\
  00,21,00,21,00,21,00,21,00,4d,00,4b,00,4b,00,53,00,6b,00,4f,00,43,00,52,00,\
  5f,00,32,00,30,00,35,00,32,00,3c,00,00,00,00,00
[HKEY_CLASSES_ROOT\Installer\Features\00002109F10040800000000000F01FEC]
"OCR_2052"=""
[HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows\CurrentVersion\Installer\UserData\S-1-5-18\Products\00002109F10040800000000000F01FEC\Features]
"OCR_2052"="%mEMae,7q9*DXdU@EPi="
[HKEY_CLASSES_ROOT\Installer\Products\00002109710000000000000000F01FEC]
[HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows\CurrentVersion\Installer\UserData\S-1-5-18\Components\3F745FF6A76FF2F4797DB74FC7B3FD8B]
"00002109710000000000000000F01FEC"="C:\\Program Files\\Common Files\\Microsoft Shared\\MODI\\12.0\\XPAGE3C.DLL"
部分核心代码如下:

curpath = System.IO.Directory.GetCurrentDirectory();
            // Folder next to the application that carries the redistributed MODI files.
            curpath = curpath + "\\所必须的DLL";
            propath = System.Environment.GetEnvironmentVariable("ProgramFiles");
            soupath = propath + @"\Common Files\microsoft shared\MODI";
            // CreateDirectory does nothing when the folder already exists.
            Directory.CreateDirectory(soupath);
            regpath = soupath;

            // Despite their names, decfile is the copy SOURCE and soufile the DESTINATION.
            string soufile, decfile;
            decfile = curpath + @"\1、接口层\MDIVWCTL.DLL";
            soufile = regpath + @"\MDIVWCTL.DLL";
            File.Copy(decfile, soufile, true);
            decfile = curpath + @"\1、接口层\MSO.DLL";
            soufile = regpath + @"\MSO.DLL";
            File.Copy(decfile, soufile, true);

            // Register the COM server by starting regsvr32 directly. Writing commands to the
            // standard input of cmd.exe throws unless the stream is redirected, and the target
            // folder contains spaces, so the DLL path is passed as one quoted argument.
            string command = "\"" + regpath + @"\MDIVWCTL.DLL" + "\"";
            using (Process p = new Process())
            {
                p.StartInfo.FileName = "regsvr32.exe";
                p.StartInfo.Arguments = command;
                p.StartInfo.UseShellExecute = false;
                p.Start();
                // Wait so that the restart prompt only appears after registration has finished.
                p.WaitForExit();
            }

            if (MessageBox.Show("OCR文件拷贝已经完成,请重启计算机。", "提示", MessageBoxButtons.YesNo) == DialogResult.Yes)
            {
                // The prompt asks for a restart, so use -r (reboot) rather than -s (power off).
                command = "shutdown -r -t 5";
                Process p5 = new Process();
                p5.StartInfo.FileName = "cmd.exe";
                p5.StartInfo.Arguments = "/c " + command;
                p5.Start();
            }

以上是我使用MODI对图像文件中文字的提取。希望对大家有帮助。
下载地址:源码及资源库下载

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

刘二光

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值