百度云文字识别OCR【表格】

1、我转换的是表格,谢天谢地有个接口叫:

表格文字识别接口

自动识别表格线及表格内容,结构化输出表头、表尾及每个单元格的文字内容。

表格文字识别接口为异步接口,分为两个API:提交请求接口、获取结果接口。

需要注意以下几点:

1、每天只有50次的免费使用量

2、特别注意:表格【边框】必须有【实线】,excel的那种线也是不可识别的

3、背景干净最好,这样识别率比较高

4、数字中的“、”和“,”是分不清的
5、长图片不可识别

我实现的代码,有大神觉得不好的地方,万望指出:

 

@Controller
public class OCRController extends BaseController {

	private static final Log logger = LogFactory.getLog(OCRController.class);

 

	/**
	 * Sends an image to the SDK's {@code basicAccurateGeneral} call with
	 * orientation detection switched on.
	 *
	 * @param client OCR client used to issue the request
	 * @param image  image to recognize, given as a local file path
	 * @return the JSON response exactly as returned by the SDK
	 */
	public JSONObject sample(AipOcr client, String image) {
		// Optional request parameters. "recognize_granularity" and
		// "vertexes_location" were tried by the author and left disabled.
		HashMap<String, String> requestOptions = new HashMap<String, String>();
		requestOptions.put("detect_direction", "true");
		return client.basicAccurateGeneral(image, requestOptions);
	}

	/**
	 * Sends an image to the SDK's {@code form} call without any optional
	 * parameters.
	 *
	 * @param client OCR client used to issue the request
	 * @param image  image to recognize, given as a local file path
	 * @return the JSON response exactly as returned by the SDK
	 */
	public JSONObject sample1(AipOcr client, String image) {
		// The SDK expects an options map even when nothing is set.
		HashMap<String, String> noOptions = new HashMap<String, String>();
		return client.form(image, noOptions);
	}

	/**
	 * Submits an image to the SDK's {@code tableRecognitionAsync} call without
	 * any optional parameters.
	 *
	 * @param client OCR client used to issue the request
	 * @param image  image to recognize, given as a local file path
	 * @return the JSON response exactly as returned by the SDK
	 */
	public static JSONObject sample2(AipOcr client, String image) {
		// The SDK expects an options map even when nothing is set.
		HashMap<String, String> noOptions = new HashMap<String, String>();
		// The author's notes also mention a variant that passes the image as a
		// byte array instead of a path; it is not used here.
		return client.tableRecognitionAsync(image, noOptions);
	}

	/**
	 * Fetches the table-recognition result for a previously submitted request,
	 * asking for it in JSON form.
	 *
	 * @param client    OCR client used to issue the request
	 * @param requestId identifier of the submitted recognition request
	 * @return the JSON response exactly as returned by the SDK
	 */
	public static JSONObject sample3(AipOcr client, String requestId) {
		HashMap<String, String> resultOptions = new HashMap<String, String>();
		resultOptions.put("result_type", "json");
		return client.tableResultGet(requestId, resultOptions);
	}

 
	/**
	 * Manual entry point: obtains a request id for a hard-coded local image and
	 * runs it through {@code imageTransformation}, printing progress markers.
	 *
	 * @param args unused
	 * @throws JSONException propagated from the result-processing calls
	 */
	public static void main(String[] args) throws JSONException {
		// Build an OCR client and set its connect / socket timeouts.
		// NOTE(review): this client is not passed to any of the calls below.
		AipOcr ocrClient = new AipOcr(APP_ID, API_KEY, SECRET_KEY);
		ocrClient.setConnectionTimeoutInMillis(2000);
		ocrClient.setSocketTimeoutInMillis(60000);

		String imagePath = "d:\\a2.png";
		Map<String, Object> context = new HashMap<String, Object>();

		String submittedId = getReqestId(context, imagePath);
		System.out.println(submittedId);
		System.out.println("===================1==============");

		net.sf.json.JSONArray bodyRows = new net.sf.json.JSONArray();
		net.sf.json.JSONArray headerRows = new net.sf.json.JSONArray();
		imageTransformation(bodyRows, headerRows, submittedId, "20190101-天马.png");

		System.out.println("==================2===============");
	}

 
	/**
	 * Draws a black rectangle outline onto the image stored at {@code path} and
	 * writes the result back to the same path, JPEG-encoded.
	 * <p>
	 * Any I/O failure (missing file, unsupported format, write error) is caught
	 * and its stack trace printed; nothing is thrown to the caller.
	 *
	 * @param path location of the image file; it is overwritten in place
	 */
	private void drawRect(String path) {
		File imageFile = new File(path);
		try {
			Image src = javax.imageio.ImageIO.read(imageFile);
			if (src == null) {
				// ImageIO.read returns null when no registered reader can decode the file.
				throw new IOException("Unsupported or unreadable image: " + path);
			}
			int width = src.getWidth(null);
			int height = src.getHeight(null);

			BufferedImage image = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
			Graphics graphics = image.getGraphics();
			try {
				graphics.drawImage(src, 0, 0, width, height, null);
				// Author's idea for later: since the background is clean, convert to
				// grayscale, threshold/binarize, erode with a large kernel and take the
				// bounding rectangle of the dark region's contour.

				graphics.setColor(Color.BLACK);
				// NOTE(review): with x = 1 and width - 1 the right edge lands at
				// x = width, one pixel outside the image; confirm whether
				// (0, 0, width - 1, height - 1) was intended.
				graphics.drawRect(1, 0, width - 1, height - 1);
			} finally {
				// Release the native resources held by the graphics context.
				graphics.dispose();
			}

			// ImageIO opens and closes the output stream itself, so nothing can leak
			// here, and no JDK-internal JPEG encoder class is required.
			if (!javax.imageio.ImageIO.write(image, "jpg", imageFile)) {
				throw new IOException("No JPEG writer available for: " + path);
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

  
 
	/**
	 * Deletes the temporary image files that belong to one upload.
	 * <p>
	 * The file name is cut at its last '.'; the single character in front of the
	 * dot is dropped to obtain the base name. The base name plus extension is
	 * always deleted; when {@code name} is {@code "0"} the variants with "2" and
	 * "1" appended to the base name are deleted first.
	 *
	 * @param pic  only written to the log by this method
	 * @param name selector; {@code "0"} additionally removes the "2" and "1" variants
	 */
	private void delFile(String pic, String name) {
		logger.info("============================删除原文件3个:" + pic + "=========================");
		// NOTE(review): both values are empty placeholders in this listing, so the
		// method currently deletes nothing; they presumably need to be derived from
		// the upload being cleaned up — confirm against the real source.
		String prepareForUpload = "";
		String filename = "";

		// Split at the last dot. The previous split(".") treated "." as a regex
		// matching every character and therefore always produced an empty array.
		int dot = filename.lastIndexOf('.');
		if (dot < 1 || dot == filename.length() - 1) {
			return;
		}
		String base = filename.substring(0, dot - 1);
		// Keep the dot so the rebuilt names are real file names ("x2.png", not "x2png").
		String extension = filename.substring(dot);

		if ("0".equals(name)) {
			filename = base + "2" + extension;
			logger.info("============================路径名+文件2号图=删除文件:" + filename + "=========================");
			UploadHelper.delTempFile(filename, prepareForUpload);
			filename = base + "1" + extension;
			logger.info("============================路径名+文件1号图=删除文件:" + filename + "=========================");
			UploadHelper.delTempFile(filename, prepareForUpload);
		}
		filename = base + extension;
		logger.info("============================路径名+文件原图=删除文件:" + filename + "=========================");
		UploadHelper.delTempFile(filename, prepareForUpload);
	}

	 

	/**
	 * Fetches the recognition result for {@code requestId}, turns it into a cell
	 * list and fills {@code headerArr} with the header row.
	 *
	 * @param ja        accepted but not used by this method
	 * @param headerArr receives the header entry built by {@code getHeader}
	 * @param requestId request identifier handed to {@code toRequestOCR}
	 * @param fileName  file name handed to {@code getList}
	 * @return a map that contains {@code "code" -> "数据为空"} when no result data
	 *         came back, and is empty otherwise
	 * @throws JSONException propagated from {@code toRequestOCR}
	 */
	private static Map<String, Object> imageTransformation(net.sf.json.JSONArray ja, net.sf.json.JSONArray headerArr, String requestId, String fileName)
			throws JSONException {
		Map<String, Object> outcome = new HashMap<String, Object>();
		String resultData = toRequestOCR(requestId);

		if (!StringUtils.isBlank(resultData)) {
			List<OcrDTO> cells = getList(resultData, fileName);
			getHeader(headerArr, cells);
			return outcome;
		}

		outcome.put("code", "数据为空");
		return outcome;
	}

	/**
	 * Builds the table body: one JSON array per distinct column, appended to
	 * {@code ja}, then flags the entries that {@code isWrong} reports.
	 * <p>
	 * Side effect: {@code list} is modified in place — after the call it holds
	 * only the first element seen for each column value.
	 *
	 * @param ja   output array; receives one array of {row, name, flag} objects per
	 *             remaining element of {@code list}
	 * @param list all cells; reduced in place to one element per column
	 * @return the list returned by the last {@code isWrong} call that was made
	 *         (the first non-empty one if any), or an empty list when {@code ja}
	 *         has no entries
	 */
	private static List<String> getBody(net.sf.json.JSONArray ja, List<OcrDTO> list) {
		logger.info("=======================获取主数据===========================");
		// Snapshot of every cell, taken before list is thinned out below.
		List<OcrDTO> t = new ArrayList<OcrDTO>();
		for (int i = 0; i < list.size(); i++) {
			t.add(list.get(i));
		}
		// Remove later elements that share a column with an earlier one; iterating j
		// downwards keeps the indices valid while removing.
		// NOTE(review): the == / != comparisons on getColumn()/getRow() are only
		// value comparisons if those getters return primitive int — confirm.
		for (int i = 0; i < list.size() - 1; i++) {
			for (int j = list.size() - 1; j > i; j--) {
				if (list.get(j).getColumn() == (list.get(i).getColumn())) {
					list.remove(j);
				}
			}
		}

		// NOTE(review): a single JSONObject instance is reused for every cell; this
		// is only correct if JSONArray.add stores a copy — confirm for this library.
		net.sf.json.JSONObject column = new net.sf.json.JSONObject();
		for (int i = 0; i < list.size(); i++) {
			net.sf.json.JSONArray rowJsonArray = new net.sf.json.JSONArray();
			Map<String, Integer> m = new HashMap<String, Integer>();

			for (int j = 0; j < t.size(); j++) {
				// Same column as the representative element, but a different row.
				if (list.get(i).getColumn() == t.get(j).getColumn() && list.get(i).getRow() != t.get(j).getRow()) {
					column.put("row", t.get(j).getRow());
					column.put("name", t.get(j).getWord());
					column.put("flag", 0);
					// NOTE(review): the lookup key uses t.get(0) while the stored key
					// uses t.get(j); looks like t.get(j) was intended for de-duplication.
					if (!m.containsKey("row" + t.get(0).getRow())) {
						m.put("row" + t.get(j).getRow(), t.get(j).getRow());
						rowJsonArray.add(column);
					}
				}
			}
			ja.add(rowJsonArray);
		}
		List<String> wrong = new ArrayList<String>();
		for (int i = 0; i < ja.size(); i++) {
			net.sf.json.JSONArray arr = (net.sf.json.JSONArray) ja.get(i);
			wrong = isWrong(arr);
			logger.info("=========================for:wrong" + wrong + "============================");
			if (CollectionUtils.isNotEmpty(wrong)) {
				// Each entry is parsed as an index into arr; that element gets flag = 1.
				for (int j = 0; j < wrong.size(); j++) {
					net.sf.json.JSONObject obj = (net.sf.json.JSONObject) arr.get(Integer.parseInt(wrong.get(j)));
					obj.put("flag", 1);
				}

				// Only the first array with a non-empty result is processed.
				break;
			}
		}
		logger.info("=========================wrong" + wrong + "============================");
		return wrong;

	}

	/**
	 * Collects the header row and appends it to {@code headerArr}.
	 * <p>
	 * The words of all cells whose row is 0 are gathered, in list order, into a
	 * JSON array that is stored under the key {@code "head"} of a new JSON object.
	 *
	 * @param headerArr output array; receives exactly one {@code {"head": [...]}} object
	 * @param list      cells to scan
	 */
	private static void getHeader(net.sf.json.JSONArray headerArr, List<OcrDTO> list) {
		net.sf.json.JSONArray titles = new net.sf.json.JSONArray();
		for (OcrDTO cell : list) {
			if (cell.getRow() != 0) {
				continue;
			}
			titles.add(cell.getWord());
		}

		net.sf.json.JSONObject wrapper = new net.sf.json.JSONObject();
		wrapper.put("head", titles);
		headerArr.add(wrapper);
	}

	/**
	 * Parses the recognition result into a list of cells, appends two synthetic
	 * columns derived from the file name, and sorts everything by column, then row.
	 * <p>
	 * The JSON is expected to hold a {@code "forms"} array whose entries carry a
	 * {@code "body"} array of objects with {@code "row"}, {@code "column"} and
	 * {@code "word"}. When several forms are present, only the last one's body is
	 * used. The two synthetic columns are placed at {@code maxColumn + 2}
	 * (header "日期", value: file-name part before the first '-') and
	 * {@code maxColumn + 3} (header "类目", value: second '-'-separated part with
	 * its extension removed). One synthetic cell is added per cell found in
	 * column 0.
	 *
	 * @param str      recognition result as a JSON string
	 * @param fileName file name of the form {@code <date>-<category>.<ext>};
	 *                 missing parts yield empty values instead of an exception
	 * @return the sorted cell list; empty when there is no form or no body
	 */
	private static List<OcrDTO> getList(String str, String fileName) {
		List<OcrDTO> list = new ArrayList<OcrDTO>();
		net.sf.json.JSONObject root = net.sf.json.JSONObject.fromObject(str);
		net.sf.json.JSONArray forms = root.getJSONArray("forms");
		if (forms.size() == 0) {
			// Nothing recognized; previously this path ended in a NullPointerException.
			return list;
		}
		net.sf.json.JSONObject lastForm = (net.sf.json.JSONObject) forms.get(forms.size() - 1);
		net.sf.json.JSONArray body = lastForm.getJSONArray("body");

		int firstColumnCells = 0; // number of cells in column 0, i.e. rows to extend
		int maxColumn = 0;
		for (int i = 0; i < body.size(); i++) {
			net.sf.json.JSONObject cell = (net.sf.json.JSONObject) body.get(i);
			int col = parseFirstIndex(cell.getString("column"));
			int row = parseFirstIndex(cell.getString("row"));
			list.add(new OcrDTO(row, col, cell.getString("word")));

			if (col == 0) {
				firstColumnCells += 1;
			}
			// Track the true maximum; the old code overwrote it with every cell's
			// column, so the last cell decided where the extra columns went.
			if (col > maxColumn) {
				maxColumn = col;
			}
		}

		// Derive the two extra values from "<date>-<category>.<ext>".
		String[] parts = fileName == null ? new String[0] : fileName.split("-");
		String dateWord = parts.length > 0 ? parts[0] : "";
		String categoryWord = "";
		if (parts.length > 1) {
			int dot = parts[1].lastIndexOf('.');
			categoryWord = dot >= 0 ? parts[1].substring(0, dot) : parts[1];
		}

		for (int i = 0; i < firstColumnCells; i++) {
			// Row 0 carries the column titles, every other row the value.
			list.add(new OcrDTO(i, maxColumn + 2, i == 0 ? "日期" : dateWord));
			list.add(new OcrDTO(i, maxColumn + 3, i == 0 ? "类目" : categoryWord));
		}

		Collections.sort(list, new Comparator<OcrDTO>() {

			@Override
			public int compare(OcrDTO o1, OcrDTO o2) {
				// Read into int locals so the comparison is by value whatever the
				// getter's declared type, and avoid subtraction-based comparison.
				int c1 = o1.getColumn();
				int c2 = o2.getColumn();
				if (c1 != c2) {
					return c1 < c2 ? -1 : 1;
				}
				int r1 = o1.getRow();
				int r2 = o2.getRow();
				return r1 < r2 ? -1 : (r1 == r2 ? 0 : 1);
			}
		});

		return list;
	}

	/**
	 * Parses the first index out of a bracketed value such as {@code "[3]"}; when
	 * the brackets hold several comma-separated indices, only the first is used.
	 */
	private static int parseFirstIndex(String bracketed) {
		String inner = bracketed.substring(1, bracketed.length() - 1);
		int comma = inner.indexOf(',');
		if (comma >= 0) {
			inner = inner.substring(0, comma);
		}
		return Integer.parseInt(inner.trim());
	}

	/**
	 * Requests the table-recognition result for a submitted task and returns its
	 * result data.
	 * <p>
	 * {@code requestId} has the form {@code "<account> <id>"}: the part before the
	 * space selects which credential set is used ("1".."4", anything else uses the
	 * default set) and the part after it is the id sent to the service. If the
	 * first response has no {@code "result"}, the request is retried via
	 * {@code isb}; polling until completion is delegated to {@code cycleRequest}.
	 *
	 * @param requestId account selector and request id separated by one space
	 * @return the result data string, or {@code null} when the id is malformed,
	 *         the service reports an error, or no result is available
	 * @throws JSONException if a response cannot be serialized or read
	 */
	private static String toRequestOCR(String requestId) throws JSONException {
		String[] parts = requestId == null ? new String[0] : requestId.split(" ");
		if (parts.length < 2) {
			// Previously this ended in an ArrayIndexOutOfBounds/NullPointerException.
			logger.info("=====================malformed requestId:" + requestId + "========================");
			return null;
		}

		// Build only the client that is actually needed.
		String account = parts[0];
		AipOcr client;
		if ("1".equals(account)) {
			client = new AipOcr(APP_ID1, API_KEY1, SECRET_KEY1);
		} else if ("2".equals(account)) {
			client = new AipOcr(APP_ID2, API_KEY2, SECRET_KEY2);
		} else if ("3".equals(account)) {
			client = new AipOcr(APP_ID3, API_KEY3, SECRET_KEY3);
		} else if ("4".equals(account)) {
			client = new AipOcr(APP_ID4, API_KEY4, SECRET_KEY4);
		} else {
			client = new AipOcr(APP_ID, API_KEY, SECRET_KEY);
		}

		String taskId = parts[1];
		JSONObject res = sample3(client, taskId);
		if (!res.has("result")) {
			res = isb(client, taskId);
		}

		// Take the snapshot AFTER the retry. It used to be taken from the first
		// response, so a transient error there made cycleRequest return null even
		// though the retry succeeded and the task was already finished.
		String err = res.toString(2);

		if (res.toString().contains("error_msg")) {
			logger.info("=====================err" + err + "========================");
			return null;
		}
		if (!res.has("result")) {
			return null;
		}
		return cycleRequest(taskId, client, err, res);
	}

	/**
	 * Re-requests a result after a pause, retrying while the response contains
	 * {@code "error_msg"}.
	 * <p>
	 * Waits 5 seconds and asks once. If that response contains
	 * {@code "error_msg"}, it keeps waiting 5 seconds and asking again, for at
	 * most 6 further attempts. An interruption during a wait is only printed;
	 * the request is still made.
	 *
	 * @param client    OCR client used for the requests
	 * @param requestId id of the submitted recognition request
	 * @return the last response received, which may still contain {@code "error_msg"}
	 */
	private static JSONObject isb(AipOcr client, String requestId) {
		try {
			Thread.sleep(5000);
		} catch (InterruptedException e) {
			e.printStackTrace();
		}
		JSONObject latest = sample3(client, requestId);
		if (!latest.toString().contains("error_msg")) {
			return latest;
		}

		for (int attempt = 1;; attempt++) {
			try {
				Thread.sleep(5000);
			} catch (InterruptedException e) {
				e.printStackTrace();
			}
			latest = sample3(client, requestId);
			logger.info("假如报错,循环请求" + latest.toString());
			// Stop on the first clean response, or once the 6th retry has been made.
			if (!latest.toString().contains("error_msg") || attempt > 5) {
				break;
			}
		}
		return latest;
	}

	/**
	 * Polls the service until the task reports "已完成" (finished), an error
	 * response arrives, or the attempt limit is exceeded, then returns the
	 * {@code "result_data"} string.
	 *
	 * @param requestId id of the submitted recognition request
	 * @param client    OCR client used for the follow-up requests
	 * @param err       text checked for {@code "error_msg"} after the loop; it is
	 *                  replaced inside the loop only on an error response, on
	 *                  completion detected after a re-request, or on hitting the limit
	 * @param res       initial response; must contain a {@code "result"} object
	 * @return the {@code "result_data"} value of the latest result object, or
	 *         {@code null} if {@code err} contains {@code "error_msg"} or no
	 *         {@code "result_data"} is present
	 * @throws JSONException if an expected key (e.g. {@code "result"} or
	 *                       {@code "ret_msg"}) is missing
	 */
	private static String cycleRequest(String requestId, AipOcr client, String err, JSONObject res) throws JSONException {
		JSONObject json = res.getJSONObject("result");
		logger.info("=====================假如未完成,继续请求err" + err + "========================");
		int num = 0;
		while (true) {
			num += 1;
			// Read unconditionally here; throws if "ret_msg" is absent.
			String str = json.getString("ret_msg");
			if ("已完成".equals(str)) {
				// Already finished: err keeps whatever value the caller passed in.
				break;
			}
			// Wait 10 seconds before asking again; an interruption is only printed.
			try {
				Thread.sleep(10000);
			} catch (InterruptedException e) {
				e.printStackTrace();
			}
			res = sample3(client, requestId);
			// Error response: remember it and stop polling.
			if (res.toString().contains("error_msg")) {
				err = res.toString();
				break;
			}

			if (res.has("result"))
				json = res.getJSONObject("result");
			// Successful response: inspect the task state.
			if (json.has("ret_msg")) {
				str = json.getString("ret_msg");
				if ("未开始".equals(str) || "进行中".equals(str)) {
					// Not started / in progress: wait another 10 seconds.
					// NOTE(review): together with the sleep at the top of the loop this
					// gives roughly 20 seconds between requests — confirm that is intended.
					logger.info("===============进行中未开始err=" + str + "=========================");
					try {
						Thread.sleep(10000);
					} catch (InterruptedException e) {
						e.printStackTrace();
					}
				} else if ("已完成".equals(str)) {
					err = res.toString();
					break;
				}

			}
			// Give up once the 9th re-request has been made.
			if (num > 8) {
				err = res.toString();
				break;
			}
			logger.info("=====================while最后一行假如未完成,继续请求ret_msg" + str + "========================");
		}
		if (err.contains("error_msg")) {
			logger.info("=====================err" + err + "========================");
			return null;
		}

		String str = null;
		if (json.has("result_data")) {
			str = json.getString("result_data");
			return str;
		}

		// No "result_data" (for example when the attempt limit was hit): str is still null.
		return str;
	}
}


 

 

 

评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值