/**
 * HTTP helper used to call the Aliyun OCR API.
 *
 * <p>Every request method creates a new client through {@code wrapClient}, builds the target URL
 * with {@code buildUrl}, copies the supplied headers onto the request and returns the raw
 * response. The response is returned without being consumed or closed here.
 *
 * <p>The {@code method} parameter of the public methods is kept for signature compatibility but
 * is not read; the HTTP verb is determined by the method that is called.
 *
 * @author Albert
 * @date 2020-03-15
 * @version 1.0
 */
public class HttpUtils {

    /**
     * Sends a GET request.
     *
     * @param host    scheme and host, e.g. {@code https://example.com}
     * @param path    request path appended to the host; may be blank
     * @param method  unused
     * @param headers request headers; may be {@code null}
     * @param querys  query parameters; may be {@code null}
     * @return the response returned by the HTTP client
     * @throws Exception if the URL cannot be built or the request fails
     */
    public static HttpResponse doGet(String host, String path, String method,
            Map<String, String> headers,
            Map<String, String> querys)
            throws Exception {
        HttpClient httpClient = wrapClient(host);
        HttpGet request = new HttpGet(buildUrl(host, path, querys));
        for (Map.Entry<String, String> header : nullSafe(headers).entrySet()) {
            request.addHeader(header.getKey(), header.getValue());
        }
        return httpClient.execute(request);
    }

    /**
     * Sends a POST request with a URL-encoded form body.
     *
     * @param host    scheme and host
     * @param path    request path appended to the host; may be blank
     * @param method  unused
     * @param headers request headers; may be {@code null}
     * @param querys  query parameters; may be {@code null}
     * @param bodys   form fields; when {@code null} no entity is attached
     * @return the response returned by the HTTP client
     * @throws Exception if the URL cannot be built or the request fails
     */
    public static HttpResponse doPost(String host, String path, String method,
            Map<String, String> headers,
            Map<String, String> querys,
            Map<String, String> bodys)
            throws Exception {
        HttpClient httpClient = wrapClient(host);
        HttpPost request = new HttpPost(buildUrl(host, path, querys));
        for (Map.Entry<String, String> header : nullSafe(headers).entrySet()) {
            request.addHeader(header.getKey(), header.getValue());
        }
        if (bodys != null) {
            List<NameValuePair> pairs = new ArrayList<NameValuePair>();
            for (Map.Entry<String, String> field : bodys.entrySet()) {
                pairs.add(new BasicNameValuePair(field.getKey(), field.getValue()));
            }
            UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(pairs, "utf-8");
            formEntity.setContentType("application/x-www-form-urlencoded; charset=UTF-8");
            request.setEntity(formEntity);
        }
        return httpClient.execute(request);
    }

    /**
     * Sends a POST request with a string body.
     *
     * @param host    scheme and host
     * @param path    request path appended to the host; may be blank
     * @param method  unused
     * @param headers request headers; may be {@code null}
     * @param querys  query parameters; may be {@code null}
     * @param body    request body, sent as UTF-8; a blank body attaches no entity
     * @return the response returned by the HTTP client
     * @throws Exception if the URL cannot be built or the request fails
     */
    public static HttpResponse doPost(String host, String path, String method,
            Map<String, String> headers,
            Map<String, String> querys,
            String body)
            throws Exception {
        HttpClient httpClient = wrapClient(host);
        HttpPost request = new HttpPost(buildUrl(host, path, querys));
        for (Map.Entry<String, String> header : nullSafe(headers).entrySet()) {
            request.addHeader(header.getKey(), header.getValue());
        }
        if (StringUtils.isNotBlank(body)) {
            request.setEntity(new StringEntity(body, "utf-8"));
        }
        return httpClient.execute(request);
    }

    /**
     * Sends a POST request with a binary body.
     *
     * @param host    scheme and host
     * @param path    request path appended to the host; may be blank
     * @param method  unused
     * @param headers request headers; may be {@code null}
     * @param querys  query parameters; may be {@code null}
     * @param body    request body; when {@code null} no entity is attached
     * @return the response returned by the HTTP client
     * @throws Exception if the URL cannot be built or the request fails
     */
    public static HttpResponse doPost(String host, String path, String method,
            Map<String, String> headers,
            Map<String, String> querys,
            byte[] body)
            throws Exception {
        HttpClient httpClient = wrapClient(host);
        HttpPost request = new HttpPost(buildUrl(host, path, querys));
        for (Map.Entry<String, String> header : nullSafe(headers).entrySet()) {
            request.addHeader(header.getKey(), header.getValue());
        }
        if (body != null) {
            request.setEntity(new ByteArrayEntity(body));
        }
        return httpClient.execute(request);
    }

    /**
     * Sends a PUT request with a string body.
     *
     * @param host    scheme and host
     * @param path    request path appended to the host; may be blank
     * @param method  unused
     * @param headers request headers; may be {@code null}
     * @param querys  query parameters; may be {@code null}
     * @param body    request body, sent as UTF-8; a blank body attaches no entity
     * @return the response returned by the HTTP client
     * @throws Exception if the URL cannot be built or the request fails
     */
    public static HttpResponse doPut(String host, String path, String method,
            Map<String, String> headers,
            Map<String, String> querys,
            String body)
            throws Exception {
        HttpClient httpClient = wrapClient(host);
        HttpPut request = new HttpPut(buildUrl(host, path, querys));
        for (Map.Entry<String, String> header : nullSafe(headers).entrySet()) {
            request.addHeader(header.getKey(), header.getValue());
        }
        if (StringUtils.isNotBlank(body)) {
            request.setEntity(new StringEntity(body, "utf-8"));
        }
        return httpClient.execute(request);
    }

    /**
     * Sends a PUT request with a binary body.
     *
     * @param host    scheme and host
     * @param path    request path appended to the host; may be blank
     * @param method  unused
     * @param headers request headers; may be {@code null}
     * @param querys  query parameters; may be {@code null}
     * @param body    request body; when {@code null} no entity is attached
     * @return the response returned by the HTTP client
     * @throws Exception if the URL cannot be built or the request fails
     */
    public static HttpResponse doPut(String host, String path, String method,
            Map<String, String> headers,
            Map<String, String> querys,
            byte[] body)
            throws Exception {
        HttpClient httpClient = wrapClient(host);
        HttpPut request = new HttpPut(buildUrl(host, path, querys));
        for (Map.Entry<String, String> header : nullSafe(headers).entrySet()) {
            request.addHeader(header.getKey(), header.getValue());
        }
        if (body != null) {
            request.setEntity(new ByteArrayEntity(body));
        }
        return httpClient.execute(request);
    }

    /**
     * Sends a DELETE request.
     *
     * @param host    scheme and host
     * @param path    request path appended to the host; may be blank
     * @param method  unused
     * @param headers request headers; may be {@code null}
     * @param querys  query parameters; may be {@code null}
     * @return the response returned by the HTTP client
     * @throws Exception if the URL cannot be built or the request fails
     */
    public static HttpResponse doDelete(String host, String path, String method,
            Map<String, String> headers,
            Map<String, String> querys)
            throws Exception {
        HttpClient httpClient = wrapClient(host);
        HttpDelete request = new HttpDelete(buildUrl(host, path, querys));
        for (Map.Entry<String, String> header : nullSafe(headers).entrySet()) {
            request.addHeader(header.getKey(), header.getValue());
        }
        return httpClient.execute(request);
    }

    /**
     * Returns the given header map, or an empty map when it is {@code null}, so that callers
     * without custom headers do not trigger a {@code NullPointerException}.
     */
    private static Map<String, String> nullSafe(Map<String, String> headers) {
        return headers != null ? headers : java.util.Collections.<String, String>emptyMap();
    }

    /**
     * Concatenates host, path and query string.
     *
     * <p>For each query entry: a non-blank key is written as-is, followed by {@code =} and the
     * URL-encoded value when the value is non-blank; an entry with a blank key and a non-blank
     * value contributes the raw value only; an entry with both parts blank is skipped.
     *
     * @param host   scheme and host, appended unchanged
     * @param path   request path; ignored when blank
     * @param querys query parameters; may be {@code null}
     * @return the assembled URL
     * @throws UnsupportedEncodingException if UTF-8 encoding is not available
     */
    private static String buildUrl(String host, String path, Map<String, String> querys)
            throws UnsupportedEncodingException {
        StringBuilder url = new StringBuilder();
        url.append(host);
        if (!StringUtils.isBlank(path)) {
            url.append(path);
        }
        if (querys != null) {
            StringBuilder query = new StringBuilder();
            for (Map.Entry<String, String> entry : querys.entrySet()) {
                String key = entry.getKey();
                String value = entry.getValue();
                boolean hasKey = !StringUtils.isBlank(key);
                boolean hasValue = !StringUtils.isBlank(value);
                if (!hasKey && !hasValue) {
                    // Nothing to emit; deciding this before writing the separator avoids a
                    // dangling or doubled "&" in the query string.
                    continue;
                }
                if (query.length() > 0) {
                    query.append("&");
                }
                if (hasKey) {
                    query.append(key);
                    if (hasValue) {
                        query.append("=");
                        query.append(URLEncoder.encode(value, "utf-8"));
                    }
                } else {
                    query.append(value);
                }
            }
            if (query.length() > 0) {
                url.append("?").append(query);
            }
        }
        return url.toString();
    }

    /**
     * Creates a new client and, for {@code https://} hosts, installs the custom SSL scheme
     * configured by {@code sslClient}.
     */
    private static HttpClient wrapClient(String host) {
        HttpClient httpClient = new DefaultHttpClient();
        if (host.startsWith("https://")) {
            sslClient(httpClient);
        }
        return httpClient;
    }

    /**
     * Registers an "https" scheme on port 443 whose socket factory accepts every certificate
     * and every host name.
     *
     * <p>NOTE(review): the trust manager below performs no certificate validation and host name
     * verification is disabled, so connections are open to man-in-the-middle interception.
     * Confirm whether this is still required; if the target endpoint presents a certificate
     * from a public CA, this whole method can likely be dropped in favour of the default
     * trust settings.
     *
     * @throws RuntimeException if the TLS context cannot be created or initialised
     */
    private static void sslClient(HttpClient httpClient) {
        try {
            SSLContext ctx = SSLContext.getInstance("TLS");
            X509TrustManager trustAll = new X509TrustManager() {
                public X509Certificate[] getAcceptedIssuers() {
                    // The X509TrustManager contract requires a non-null array.
                    return new X509Certificate[0];
                }

                public void checkClientTrusted(X509Certificate[] xcs, String str) {
                    // intentionally accepts everything (see method note)
                }

                public void checkServerTrusted(X509Certificate[] xcs, String str) {
                    // intentionally accepts everything (see method note)
                }
            };
            ctx.init(null, new TrustManager[] { trustAll }, null);
            SSLSocketFactory ssf = new SSLSocketFactory(ctx);
            ssf.setHostnameVerifier(SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
            ClientConnectionManager ccm = httpClient.getConnectionManager();
            SchemeRegistry registry = ccm.getSchemeRegistry();
            registry.register(new Scheme("https", 443, ssf));
        } catch (KeyManagementException ex) {
            throw new RuntimeException(ex);
        } catch (NoSuchAlgorithmException ex) {
            throw new RuntimeException(ex);
        }
    }
}
/**
* OCR识别 返回字符串显示
*
* @author Albert
* @date 2020-03-16
* @version 1.0
*
*/
public class OcrAliyunBO implements OcrAliyunIBPO {
private String OcrPath = PropertiesUtils.readValue("aliyun.ocr.albert.OcrPath");
// "C:\\temp\\";
/**
* OCR识别 返回字符串显示
*/
public List<String[]> AliyunOcr(File file) {
List<String[]> list = new ArrayList<String[]>();
// 2、印刷文字识别-表格识别/OCR文字识别
String host = PropertiesUtils.readValue("aliyun.ocr.albert.host");
String path = PropertiesUtils.readValue("aliyun.ocr.albert.path");
// 输入appcode
String appcode = PropertiesUtils.readValue("aliyun.ocr.albert.appcode");
String format= PropertiesUtils.readValue("aliyun.ocr.albert.format");
String imgFile = file.getAbsolutePath();
Boolean is_old_format = false;// 如果文档的输入中含有inputs字段,设置为True, 否则设置为False
// 请根据线上文档修改configure字段
JSONObject configObj = new JSONObject();
/*
* 参数说明: 1. format 输出格式:html/json/xlsx; 2. finance 是否使用财务报表模型: true/false; 3.
* dir_assure 图片方向是否确定是正向的: true(确定)/false(不确定) 4. line_less:是否无线条:
* true(无线条,或者只有横线没有竖线)/false(有线条) 5. skip_detection:
* 是否跳过检测,如果没有检测到表格,可以设置"skip_detection”:true
*/
configObj.put("format", format);
configObj.put("finance", false);
configObj.put("dir_assure", false);
String config_str = configObj.toString();
String method = "POST";
Map<String, String> headers = new HashMap<String, String>();
// 最后在header中的格式(中间是英文空格)为Authorization:APPCODE 83359fd73fe94948385f570e3c139105
headers.put("Authorization", "APPCODE " + appcode);
headers.put("Content-Type", "application/json; charset=UTF-8");
Map<String, String> querys = new HashMap<String, String>();
// 拼装请求body的json字符串
JSONObject requestObj = new JSONObject();
String bodys = null;
// 对图像进行base64编码
String imgBase64 = "";
try {
imgBase64 = AliyunUtils.changeToBase64(imgFile);
} catch (Exception e2) {
// TODO Auto-generated catch block
e2.printStackTrace();
}
try {
if (is_old_format) {
JSONObject obj = new JSONObject();
obj.put("image", getParam(50, imgBase64));
if (config_str.length() > 0) {
obj.put("configure", getParam(50, config_str));
}
JSONArray inputArray = new JSONArray();
inputArray.add(obj);
requestObj.put("inputs", inputArray);
} else {
requestObj.put("image", imgBase64);
if (config_str.length() > 0) {
requestObj.put("configure", config_str);
}
}
} catch (JSONException e) {
e.printStackTrace();
}
bodys = requestObj.toString();
try {
HttpResponse response = HttpUtils.doPost(host, path, method, headers, querys, bodys);
int stat = response.getStatusLine().getStatusCode();
if (stat != 200) {
System.out.println("Http code: " + stat);
System.out.println("http header error msg: " + response.getFirstHeader("X-Ca-Error-Message"));
System.out.println("Http body error msg:" + EntityUtils.toString(response.getEntity()));
if (stat == 413) {
String message[] = { "erro", "请求资源超过限制:大小不超过5M,最短边至少15px,最长边最大4096px,支持jpg/png/bmp格式" };
list.add(message);
} else {
String message[] = { "erro", "请求参数输入不正确!" };
list.add(message);
}
return list;
}
String res = EntityUtils.toString(response.getEntity());
JSONObject res_obj = JSON.parseObject(res);
Long fileName = System.currentTimeMillis();
if (is_old_format) {
JSONArray outputArray = res_obj.getJSONArray("outputs");
String output = outputArray.getJSONObject(0).getJSONObject("outputValue").getString("dataValue");
JSONObject out = JSON.parseObject(output);
// System.out.println(out.toJSONString());
} else {
// System.out.println(res_obj.toJSONString());
String tmp_base64path = (OcrPath + fileName).concat(".")+format;
File tmp_base64file = new File(tmp_base64path);
if (!tmp_base64file.exists()) {
tmp_base64file.getParentFile().mkdirs();
}
tmp_base64file.createNewFile();
// write
FileWriter fw = new FileWriter(tmp_base64file, true);
BufferedWriter bw = new BufferedWriter(fw);
bw.write(res_obj.getString("tables"));
bw.flush();
bw.close();
fw.close();
list = readTextData(tmp_base64path, true);
}
} catch (Exception e) {
e.printStackTrace();
}
return list;
}
public static JSONObject getParam(int type, String dataValue) {
JSONObject obj = new JSONObject();
try {
obj.put("dataType", type);
obj.put("dataValue", dataValue);
} catch (JSONException e) {
e.printStackTrace();
}
return obj;
}
}
/**
* 读取识别之后的数据(HTML文件)
*
* @param txtFilePath
* @param isDelete
* 解析完成之后是否删除文件
* @return
*/
private List<String[]> readTextData(String txtFilePath, boolean isDelete) {
// isDelete = false;// 测试时使用(保留识别结果原件)
List<String[]> list = new ArrayList<String[]>();
try {
String code = "UTF-8";
File file = new File(txtFilePath);
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), code));
String lineTxt = null;
while ((lineTxt = br.readLine()) != null) {
System.out.println(lineTxt);
String text1 = lineTxt.replace("<tr><td colspan=\"1\" rowspan=\"1\">", "");
String text2 = text1.replace("<tr><td colspan=\"2\" rowspan=\"1\">", "");
String text3 = text2.replace("<tr><td colspan=\"2\" rowspan=\"2\">", "");
String text4 = text3.replace("</td><td colspan=\"2\" rowspan=\"1\">",
"</td><td colspan=\"1\" rowspan=\"1\">");
String[] split = text4.split("</td><td colspan=\"1\" rowspan=\"1\">");
list.add(split);
// System.out.println(split.length);
}
br.close();
if (isDelete) {
if (!file.delete()) {
System.out.println("文件删除失败!" + txtFilePath);
}
}
} catch (Exception e) {
e.printStackTrace();
}
return list;
}
}
/**
 * Utility for reading the OCR configuration file {@code ocr-config.properties} from the
 * classpath. The file is loaded again on every call.
 *
 * @author Albert
 * @date 2020-03-16
 * @version 1.0
 */
public class PropertiesUtils {
    /** Name of the classpath resource holding the OCR settings. */
    private static final String OCR_CONFIG = "ocr-config.properties";

    /**
     * Looks up the value for a key in the OCR configuration.
     *
     * @param key property name; surrounding whitespace is trimmed
     * @return the configured value, or {@code null} when the key is {@code null}, the key is
     *         not present, or the configuration could not be loaded
     */
    public static String readValue(String key) {
        if (key == null) {
            return null;
        }
        return getOcrProperties().getProperty(key.trim());
    }

    /**
     * Returns all entries of the OCR configuration as a string map.
     *
     * @return a new map of property names to values; empty when nothing could be loaded
     */
    public static Map<String, String> getPropertiesMap() {
        Properties properties = getOcrProperties();
        Map<String, String> map = new HashMap<String, String>();
        for (Map.Entry<Object, Object> entry : properties.entrySet()) {
            // Properties never holds null keys or values, so toString() is safe here.
            map.put(entry.getKey().toString(), entry.getValue().toString());
        }
        return map;
    }

    /**
     * Loads {@code ocr-config.properties} from the classpath, decoding it as UTF-8.
     *
     * @return the loaded properties; empty when the resource is missing or cannot be read
     *         (the problem is reported on standard error)
     */
    public static Properties getOcrProperties() {
        Properties props = new Properties();
        ClassLoader cl = PropertiesUtils.class.getClassLoader();
        InputStream inputStream = cl != null
                ? cl.getResourceAsStream(OCR_CONFIG)
                : ClassLoader.getSystemResourceAsStream(OCR_CONFIG);
        if (inputStream == null) {
            // getResourceAsStream returns null for a missing resource; without this check the
            // reader constructor below would fail with a NullPointerException.
            System.err.println("Configuration file not found on classpath: " + OCR_CONFIG);
            return props;
        }
        try {
            props.load(new InputStreamReader(inputStream, "UTF-8"));
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                inputStream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return props;
    }
}
# Configuration for the Aliyun OCR integration
# File: ocr-config.properties
# Created by Albert, 2020-03-16
# OCR interface production properties
# Host and path of the OCR endpoint; the two values are concatenated to form the request URL
aliyun.ocr.albert.host=https\://form.market.alicloudapi.com
aliyun.ocr.albert.path=/api/predict/ocr_table_parse
# AppCode sent in the "Authorization: APPCODE <appcode>" request header (placeholder value)
aliyun.ocr.albert.appcode=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
# Path prefix under which the temporary result file is written
aliyun.ocr.albert.OcrPath=C\:\\temp\\
# Output format requested from the service; also used as the result file extension
aliyun.ocr.albert.format=html