爬虫信息后ocr识别

之前博客写了使用PhantomJS爬取信息。

上篇链接:https://blog.csdn.net/qq_15351029/article/details/109305125

因为我爬取的网站可能做了一点恶心爬虫的操作,将部分关键信息设置为图片,因此在爬取的时候是将图片转为base64进行编码存储的。

使用Java程序调用爬虫脚本时,会先把数据爬取出来,再读取并写入数据库。但以base64形式存储图片占用的资源较多,也不方便后续做统计查询,因此需要使用OCR把图片识别成文本。我爬取的网站比较简单,这里仅展示用OCR识别简单图片的做法;更复杂的图片识别等以后遇到相应场景再补充。

使用ocr有几种方式

  • 可以使用java调用TesseractOcr的命令方式进行识别
  • 可以使用tess4j进行识别

我这里采用第二种方式。

实现思路是先将base64编码的图片解码并写入服务器本地的图片文件,然后对该图片文件进行OCR识别。
为了减少服务器资源浪费,我这里设置了在识别之后可以删除生成的服务器图片。

依赖:

<dependency>
			<groupId>net.sourceforge.tess4j</groupId>
			<artifactId>tess4j</artifactId>
			<version>3.2.1</version>
			<exclusions>
				<exclusion>
					<groupId>com.sun.jna</groupId>
					<artifactId>jna</artifactId>
				</exclusion>
			</exclusions>
		</dependency>

代码


import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import sun.misc.BASE64Decoder;

import java.io.*;

/**
 * 描述:
 * OCR识别
 *
 * @author jimmy
 * @create 2020-11-04 19:17
 */
public class TesseractOcrUtil {

    private static final Logger log = LoggerFactory.getLogger(TesseractOcrUtil.class);

    static BASE64Decoder decoder = new sun.misc.BASE64Decoder();

    /**
     * base64转图片本地
     * @param imgStr
     * @param path 文件对应文件夹
     * @param fileName 文件名
     * @return
     */
    public static boolean generateImage(String imgStr, String path, String fileName) {
        log.info("TesseractOcrUtil generateImage path:" + path + "\n fileName :" + fileName + "\n imgStr :" + imgStr);
        if (imgStr == null)
            return false;
        try {

            File file = new File(path);
            // 若文件夹不存在则创建文件夹
            if(!file.exists()){
                file.mkdir();
            }

            // 解密
            byte[] b = decoder.decodeBuffer(imgStr);
            // 处理数据
            for (int i = 0; i < b.length; ++i) {
                if (b[i] < 0) {
                    b[i] += 256;
                }
            }
            String fullPath = path + fileName;
            log.info("TesseractOcrUtil generateImage fullPath:" + fullPath);

            OutputStream out = new FileOutputStream(fullPath);
            out.write(b);
            out.flush();
            out.close();
            return true;
        } catch (Exception e) {
            log.error("TesseractOcrUtil generateImage ex:", e);
            return false;
        }
    }


    public static String ocrImg(String path, String fileName, boolean delFlag){
       return TesseractOcrUtil.ocrImg(path + fileName,delFlag);
    }

    /**
     * OCR 识别图片并返回String
     * @param fullPath 文件全路径
     * @param delFlag boolean 是否删除文件 true 删除 false 否
     * @return
     */
    public static String ocrImg(String fullPath, boolean delFlag)  {

        log.info("ocrImg fullPath:" + fullPath + "  delFlag:" + delFlag);
        ITesseract instance = new Tesseract();
       /* instance.setDatapath(System.getProperty("user.dir") + "\\tessdata"); // 语言库位置
        // chi_sim:简体中文,eng:英文
        instance.setLanguage("eng");*/
        String result = null;
        try {
            File file = new File(fullPath);
            result = instance.doOCR(file);
            log.info("ocrImg result:" + result);
            if(delFlag){
                file.delete();
                log.info("ocrImg file delete " );
            }
        } catch (TesseractException e) {
            log.error("ocrImg ex ",e );
        }
        return result;
    }

    /**
     * test
     * @param args
     */
    public static void main(String[] args) {
        String realEndTime = "iVBORw0KGgoAAAANSUhEUgAAAGMAAAAmCAYAAADQriVKAAAACXBIWXMAAA7EAAAOxAGVKw4bAAAF80lEQVRoge2aa0xTZxjH/4okgiAUFamQMBUx6gcQFOQ6J5chOBVRwAFTt0ycwBCNWRYvuEWEmrh5wbKNWxFF2aKoMPwAlYtuJi4iGO4qo4ZLMsTQ1hallWdfxnHlWsqOOx/O71uf933/7zn9nfP2vG2nERGBhxNM/78PgOctvAwOwcvgELwMDsHL4BC8DA7By+AQvAwOwcvgELwMDjHjXU3U3NwCuUIOxyVLIBAIdNpqa+ugVqtHjHF2doKpqemU5tU3u6q6GjdvFGOQBhESEgx/P78ROWXl5TiwPwnTp7NzDbMuo7W1FQcPfgXZs2cAACMjI8TH7cWuXTuZPse++Rbt7e0jxv5ceBmOjo56zdP74gVCQ8OgUCgglZZhjpWV3tlV1dU4ciQZSUmJGHg9gMOHj8Jk5kx4eXkBALRaLZKTjyFs6xbWRAAsy1AolPhibzwsLGYj/0IerKyskJWdjTNnz8HBwQE+Pt5MX39/P8THx+mMt12wQO+5RKKTIBoctW2i7Pz8i9i1cwdCN28GADxta8ONm8WMDIkkD8bGxoiOitL7eAyBVRnFJSXo7e1FhjiduQqTjx5BXV0dLl66pCPD3Nwc79nbGzRPVXU1KiurEBkZgQsX8ke0T5Td0dGJhQsXMq/t7Gzx5MlTAEB3dzeysnMgFqezelcALH+ANzU1wdzcbMRS4+XphQcPajA4OPqVPBnUajVSUlIRG7t7UnfSv7G0tER3dzfzuqOjE0IbGwBAmugk/P394LJy5ZSPdSJYf5rq7381omZqagKtVouenh6m1tfXh0ePHqFdJptU/ukzZzHb3Bw7PokZs89E2RtCgpErycOdO3dRWnoLxcUliIjYhqrqatTU1ODA/qRJHZOhsLpMLVu2DCUlv6JcKmWeTrRaLaTS2wCA/ldvRVVUVKKiohIAYG1tjUOHvsb7vr7j5tfW1uHq1WuQ5OZgxoyxT2Wi7OjoKLweGMB5cQbMzGYhLe0Eli5ditAtYUhISIBAIEB/fz96enpgZ2fH3nJFLKJUKunDoGByX+NJ358+QwUFlykqOobWB4eQk7MLdXV1ERFRS0sLtbW1UVNTM10rKqKAwCByXeVGLS0tY2ZrNBoK3RJGqWkipnblSiE5ObvQ895epmZINhFR+nkxRUR+TEREmZlZ5LrKjbx9fMk/IJAaGxun8raMCasyiIg6OjopLj6BXFe5kaeXD6WmiSgnJ5ecV7qSWq0edUzr48fk5OxCItHJMXPF4gwKCAwilUrF1EaTYUi2TCaj1W5rqL6+gerq6miNhxf92d5ORETHj6dQVHTMRKdtEKzvM2xtFyD93FmdWkrKCQiFNjAxMRl1zBIHB8ybNxedXV2jtsvlcmTn5EIgsERcXAJT/+ufz6DExH3w9vbGntjdk84GgNQ0ETZt/AgrVixHRsYP8PTwYJ7GwiPCER4eCa1WO+7SaAjvbAc+xODgIO7c/Q2+vj7j9pHLFRBYCkZtNzIy0tk0DtHQ0IjOzk44OTlh8aJFBmWXlZWjqakZorQ0AMBLlQompm8vGmNjYwDAmzdv/nMZrC9Tw8nMzCIX19XU1tZGRERyuYL6+vp0+qSfF5OTswtJpbeZ2rWiIpJI8sbNHr5M6Zs9hEqlooDAILp+/YbOvOuDQ2hgYICIiAoKLtPWbRGTOGP9Yf3O+PSzz7F2rS/MZpnh93v3UF4uRdK+RGaTJZPJsDt2Dzw9PDBn7hw0Njaivr4BIcHBWLfuAyZHLM7A8+e9CA/fNubyNhx9s4fIyPgRQqENNm3ayNQ2hISgsPAXbN8eBXt7e9z/4z6+O3Vqiu/K6EwjYvd/U/EJX6K+vgEajQYODosRExOt8yWcQqGERCJBc0srlEol5s+3hp/fOqwPCtLJqXn4EK
qXKp1d+3BKS2/hp8ws5ElyYGFhoXf2EOVSKVYsXw6hUKhT12g0qKysglKphLu7O2xtDdtcTgTrMnj0h/89g0PwMjgEL4ND8DI4BC+DQ/AyOAQvg0PwMjgEL4ND8DI4BC+DQ/AyOMTfrTP0mXBpZdAAAAAASUVORK5CYII=";

        String flyCode = "3U8221";
        String flyDate = "2020-10-24";
        String type = "realEndTime";
        String fileName = flyCode + "_" + flyDate + "_" + type + ".png";

        String path = "c:\\tmp\\";
        String linuxPath = "/app/temp/ocr/";
        String realPath = "";

        if (System.getProperty("os.name").toUpperCase().indexOf("WINDOWS") != -1) {
            realPath = path;
        } else {
            realPath = linuxPath;
        }

        TesseractOcrUtil util = new TesseractOcrUtil();
        try {
            // 本地生成文件
            util.generateImage(realEndTime, realPath, fileName);
            // ocr识别
            String realTime =  util.ocrImg(realPath + fileName, false);
            System.out.println(realTime);
        }catch (Exception e){

            System.out.println(e);
        }
        System.out.println("---------");
    }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值