引入百度AI识别验证码
要识别验证码,可以使用开源的OCR识别引擎Tesseract(Java中可以调用):它对简单、工整的字体识别率较高,但字符稍微倾斜或者图片复杂一点就识别不好。
百度AI识别,一天免费500次调用,很赞。
案例添加到github上,有兴趣可以看看: crawler
1.首先在控制台进入文字识别模块,添加一个应用,拿到API Key和Secret Key,然后用它们获取access_token。下面是官方识别本地图片的示例代码:
/**
 * Sends a local image file to the Baidu "webimage" OCR endpoint and returns the raw response.
 *
 * <p>Important: the helper classes used here (FileUtil, Base64Util, HttpUtil, GsonUtils)
 * have to be downloaded from:
 * <ul>
 *   <li>https://ai.baidu.com/file/658A35ABAB2D404FBF903F64D47C1F72</li>
 *   <li>https://ai.baidu.com/file/C8D81F3301E24D2892968F09AE1AD6E2</li>
 *   <li>https://ai.baidu.com/file/544D677F5D4E4F17B4122FBD60DB82B3</li>
 *   <li>https://ai.baidu.com/file/470B3ACCA3FE43788B5A963BF0B625F3</li>
 * </ul>
 *
 * @return the response body returned by {@code HttpUtil.post}, or {@code null} if any step throws
 */
public static String webImage() {
    final String endpoint = "https://aip.baidubce.com/rest/2.0/ocr/v1/webimage";
    try {
        // Placeholder: replace with the path of the local image to recognise.
        byte[] rawImage = FileUtil.readFileByBytes("[本地文件路径]");

        // The image is Base64-encoded and then URL-encoded before being sent as a form field.
        String requestBody = "image=" + URLEncoder.encode(Base64Util.encode(rawImage), "UTF-8");

        // Placeholder token. Access tokens expire, so production code should cache the
        // token and fetch a new one only after it has expired.
        String token = "[调用鉴权接口获取的token]";

        String response = HttpUtil.post(endpoint, token, requestBody);
        System.out.println(response);
        return response;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}
2.官方示例识别的是本地图片。放到我的取数项目里,我想要的是直接识别登录页的验证码图片,所以需要先根据url把图片读取下来,转成byte[] imgData,再交给百度去识别。
/**
 * Downloads the image at {@code url} and returns it re-encoded as JPEG bytes.
 *
 * <p>Failures are reported as {@code null} rather than thrown: exceptions raised while
 * downloading or converting are caught and their stack trace is printed.
 *
 * @param client HTTP client used to execute the GET request
 * @param url    image URL
 * @return the JPEG-encoded image, or {@code null} when the response status is not 200,
 *         the response has no body, the content cannot be decoded as an image, the image
 *         cannot be encoded as JPEG, or an exception occurs
 * @throws IOException kept in the signature for existing callers; exceptions raised inside
 *                     the method body are caught and reported as {@code null}
 */
public static byte[] getPicByte(CloseableHttpClient client, String url) throws IOException {
    HttpGet get = new HttpGet(url);
    try {
        HttpResponse response = client.execute(get);
        if (HttpStatus.SC_OK != response.getStatusLine().getStatusCode()) {
            return null;
        }
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return null;
        }

        // ImageIO.read returns null (it does not throw) when no registered reader
        // understands the stream, e.g. when the server answered with an HTML error page.
        java.awt.image.BufferedImage image = ImageIO.read(entity.getContent());
        if (image == null) {
            return null;
        }

        // JPEG has no alpha channel, and the JPEG writer may refuse images that carry one.
        // Flatten such images onto a white background first.
        if (image.getColorModel().hasAlpha()) {
            java.awt.image.BufferedImage flattened = new java.awt.image.BufferedImage(
                    image.getWidth(), image.getHeight(), java.awt.image.BufferedImage.TYPE_INT_RGB);
            java.awt.Graphics2D graphics = flattened.createGraphics();
            try {
                graphics.drawImage(image, 0, 0, java.awt.Color.WHITE, null);
            } finally {
                graphics.dispose();
            }
            image = flattened;
        }

        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        // ImageIO.write returns false when no suitable writer exists; in that case the
        // buffer would be empty, so report a failure instead of returning zero bytes.
        if (!ImageIO.write(image, "jpg", buffer)) {
            return null;
        }
        return buffer.toByteArray();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Always abort the request once the body has been consumed or a failure occurred.
        get.abort();
    }
    return null;
}
/**
 * Downloads a captcha image and recognises it, making at most 3 attempts.
 *
 * <p>Each attempt downloads the image again, sends it to {@code WebImage.webImage2} and
 * extracts the recognised text from the response with {@code StringUtil.getValue}
 * (presumably the text between the {@code words": "} and {@code "}} markers; confirm against
 * that helper). An attempt is accepted only when the extracted text has exactly
 * {@code length} characters. An attempt whose download or recognition step yields nothing
 * is skipped.
 *
 * @param client HTTP client used for the download and the recognition request
 * @param picUrl URL of the captcha image
 * @param length expected number of characters in the captcha
 * @return the recognised captcha, or an empty string when no attempt produced a result of
 *         the expected length
 * @throws Exception if one of the called helpers throws
 */
public static String getVerifyCode(CloseableHttpClient client, String picUrl, int length) throws Exception {
    for (int attempt = 0; attempt < 3; attempt++) {
        byte[] imgData = WebUtil.getPicByte(client, picUrl);
        if (imgData == null || imgData.length == 0) {
            // Download or conversion failed; there is nothing to recognise on this attempt.
            continue;
        }

        String response = WebImage.webImage2(client, imgData);
        if (response == null) {
            // The recognition request failed; try again with a fresh image.
            continue;
        }

        String verifyCode = StringUtil.getValue(response, "words\": \"", "\"}", 1);
        if (verifyCode != null && verifyCode.length() == length) {
            return verifyCode;
        }
    }
    return "";
}
/**
 * Recognises a captcha that is expected to be 4 characters long.
 *
 * <p>Delegates to {@code getVerifyCode(client, picUrl, 4)}.
 *
 * @param client HTTP client passed through to the three-argument overload
 * @param picUrl URL of the captcha image
 * @return whatever the three-argument overload returns for a length of 4
 * @throws Exception if the three-argument overload throws
 */
public static String getVerifyCode(CloseableHttpClient client, String picUrl) throws Exception {
    final int defaultLength = 4;
    return getVerifyCode(client, picUrl, defaultLength);
}
/**
 * Sends in-memory image bytes to the recognition endpoint and returns the raw response.
 *
 * @param client  HTTP client handed to {@code HttpUtil.post}
 * @param imgData image content as a byte array
 * @return the response returned by {@code HttpUtil.post}, or {@code null} if any step throws
 */
public static String webImage2(CloseableHttpClient client, byte[] imgData) {
    // Request URL, taken from the AUTH_URL constant.
    final String endpoint = AUTH_URL;
    try {
        // Base64-encode the bytes, then URL-encode the result for use as a form field.
        String encodedImage = URLEncoder.encode(Base64Util.encode(imgData), "UTF-8");
        String requestBody = "image=" + encodedImage;

        // The token is requested from AuthService on every call. Access tokens expire,
        // so the token should be cached and fetched again only after expiry.
        String token = AuthService.getAuth();

        String response = HttpUtil.post(client, endpoint, token, requestBody);
        System.out.println(response);
        return response;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}
3.还要修改下官方的AuthService:自己用的话需要管理access_token的有效期,我把token和获取时间一起保存在本地的token.txt里,过期后再重新获取。
/**
 * Returns an access token, reusing the one cached in {@code token.txt} while it is fresh.
 *
 * <p>The cache file holds {@code <token>@@<epoch millis when fetched>}. When the cache is
 * missing, unreadable or older than {@code EXPIRE_TIME}, a new token is requested from
 * {@code AUTH_HOST} and the cache file is rewritten.
 *
 * @param ak API Key of the application
 * @param sk Secret Key of the application
 * @return the access token (without the cache timestamp), or {@code null} when a new token
 *         could not be obtained
 */
public static String getAuth(String ak, String sk) {
    String cachedToken = readCachedToken();
    if (cachedToken != null) {
        return cachedToken;
    }

    // Token endpoint followed by its three query parameters.
    String getAccessTokenUrl = AUTH_HOST
            // 1. grant_type is a fixed parameter
            + "grant_type=client_credentials"
            // 2. API Key obtained from the console
            + "&client_id=" + ak
            // 3. Secret Key obtained from the console
            + "&client_secret=" + sk;

    HttpURLConnection connection = null;
    try {
        URL realUrl = new URL(getAccessTokenUrl);
        connection = (HttpURLConnection) realUrl.openConnection();
        connection.setRequestMethod("GET");
        connection.connect();

        // Dump the response headers to stderr for diagnostics.
        Map<String, List<String>> headers = connection.getHeaderFields();
        for (Map.Entry<String, List<String>> header : headers.entrySet()) {
            System.err.println(header.getKey() + "--->" + header.getValue());
        }

        // Read the whole response body; the reader is closed even when reading fails.
        StringBuilder result = new StringBuilder();
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        }

        // The response body is deliberately not printed: it contains the access token.
        JSONObject jsonObject = JSONObject.fromObject(result.toString());
        String accessToken = jsonObject.getString("access_token");

        saveToken(accessToken);
        // Return the bare token; the "@@timestamp" suffix belongs only in the cache file.
        return accessToken;
    } catch (Exception e) {
        System.err.printf("获取token失败!");
        e.printStackTrace(System.err);
    } finally {
        if (connection != null) {
            connection.disconnect();
        }
    }
    return null;
}

/** Returns the token cached in token.txt if it is still fresh, otherwise null. */
private static String readCachedToken() {
    File file = new File("token.txt");
    if (!file.exists()) {
        return null;
    }
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(new FileInputStream(file), "UTF-8"))) {
        StringBuilder content = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            content.append(line);
        }
        String[] parts = content.toString().split("@@");
        if (parts.length > 1 && !parts[0].isEmpty()) {
            // EXPIRE_TIME is compared against the elapsed time in milliseconds.
            long age = new Date().getTime() - Long.parseLong(parts[1]);
            if (age < EXPIRE_TIME) {
                return parts[0];
            }
        }
    } catch (Exception e) {
        // An unreadable or corrupt cache is not fatal; the caller fetches a new token.
        e.printStackTrace();
    }
    return null;
}

/** Writes the token and the current time to token.txt; a write failure is only reported. */
private static void saveToken(String token) {
    String record = token + "@@" + new Date().getTime();
    try (FileOutputStream out = new FileOutputStream(new File("token.txt"))) {
        out.write(record.getBytes("UTF-8"));
        out.flush();
    } catch (IOException e) {
        // Caching is best-effort: the freshly fetched token is still usable.
        e.printStackTrace();
    }
}