爬虫要爬取网站,很重要的一步是识别验证码,只有识别验证码之后才可以进行之后的操作。
识别验证码
下载验证码
下载比较好解决,方式很多。这里不多说,先保存多张验证码。
随便打开一张图片,发现背景图有黑点,会干扰识别。
用百度 OCR 接口识别这张图片,得到的结果是“h千”,明显不正确。
降噪
怎么去除背后的黑点?先降噪。
系统需要先安装 Python 运行环境,并安装 PIL 模块。
二值化
# 二值化处理
def two_value(parentPath, name, format):
    """Binarize one image and save the result in the ``two`` sub-directory.

    The source image is read from ``parentPath`` and the 1-bit result is
    written to ``<parentPath>/two/<name><format>``; the directory is created
    when missing.

    :param parentPath: directory containing the source image
    :param name: image file name without its extension
    :param format: file extension including the leading dot (it is appended
        to ``name`` as-is)
    """
    # Open the source image and convert it to 8-bit grayscale.
    image = Image.open(os.path.join(parentPath, name + format))
    lim = image.convert('L')

    # Lookup table over the 256 gray levels: levels below the threshold map
    # to 0 (black in a mode '1' image), every other level maps to 1 (white).
    threshold = 32
    table = [0 if level < threshold else 1 for level in range(256)]
    bim = lim.point(table, '1')

    twopath = os.path.join(parentPath, "two")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(twopath, exist_ok=True)
    bim.save(os.path.join(twopath, name + format))
去除单独的黑色像素点
# 去除单独的黑色像素点
def descrambler(parentPath, name, format):
    """Whiten isolated dark pixels of a binarized image.

    Reads ``<parentPath>/two/<name><format>`` and writes the cleaned image to
    ``<parentPath>/rec/<name><format>``; the ``rec`` directory is created
    when missing.

    A pixel is whitened when its own value is below 50 and none of its four
    direct neighbours (top, left, bottom, right) has a value below 10.

    :param parentPath: directory containing the ``two`` sub-directory
    :param name: image file name without its extension
    :param format: file extension including the leading dot
    """
    im = Image.open(os.path.join(parentPath, "two", name + format))
    data = im.getdata()
    w, h = im.size
    black_colour = 10  # a neighbour counts as black below this value

    # The outermost rows/columns are skipped so all four neighbours exist.
    for x in range(1, w - 1):
        for y in range(1, h - 1):
            mid_pixel = data[w * y + x]  # value of the pixel under test
            if mid_pixel >= 50:
                continue
            neighbours = (
                data[w * (y - 1) + x],    # top
                data[w * y + (x - 1)],    # left
                data[w * (y + 1) + x],    # bottom
                data[w * y + (x + 1)],    # right
            )
            black_point = sum(1 for pixel in neighbours if pixel < black_colour)
            if black_point < 1:
                im.putpixel((x, y), 255)

    recpath = os.path.join(parentPath, "rec")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(recpath, exist_ok=True)
    im.save(os.path.join(recpath, name + format))
这里边界上没有黑点,所以跟二值化后的图片没什么区别。
切割图片
# 切割图片
def smartSliceImg(parentPath, name, format, count=4, p_w=3):
    """Cut a cleaned captcha image into ``count`` vertical slices.

    Reads ``<parentPath>/rec/<name><format>`` and writes the slices to
    ``<parentPath>/<name>/<i><format>`` for ``i`` in ``0 .. count - 1``; the
    output directory is created when missing.

    The nominal cut positions are multiples of ``int(width / count)``. Around
    each nominal position the columns in ``[pos - p_w, pos + p_w)`` are
    inspected and the cut is placed on the first column with the fewest
    pixels equal to 0.

    :param parentPath: directory containing the ``rec`` sub-directory
    :param name: image file name without its extension
    :param format: file extension including the leading dot
    :param count: number of slices (characters) to cut the image into
    :param p_w: how many columns on each side of a nominal cut position are
        inspected; with ``p_w <= 0`` the nominal position itself is used
    :return: None
    """
    img = Image.open(os.path.join(parentPath, "rec", name + format))
    w, h = img.size
    pixdata = img.load()
    eachWidth = int(w / count)
    beforeX = 0

    namepath = os.path.join(parentPath, name)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(namepath, exist_ok=True)

    for i in range(count):
        nextXOri = (i + 1) * eachWidth
        allBCount = []
        for x in range(nextXOri - p_w, nextXOri + p_w):
            # Clamp the candidate column into the image.
            x = max(0, min(x, w - 1))
            b_count = sum(1 for y in range(h) if pixdata[x, y] == 0)
            allBCount.append({'x_pos': x, 'count': b_count})

        if allBCount:
            # min() returns the first minimal entry, i.e. the same column a
            # stable ascending sort would put first.
            nextX = min(allBCount, key=lambda e: e['count'])['x_pos']
        else:
            # No candidate columns (p_w <= 0): cut at the nominal position.
            nextX = min(nextXOri, w)

        # NOTE(review): the last slice also ends at the chosen column rather
        # than at the right image edge, so trailing columns are dropped --
        # confirm this is intended.
        box = (beforeX, 0, nextX, h)
        img.crop(box).save(os.path.join(namepath, str(i) + format))
        beforeX = nextX
Java调用Python
pom.xml
<!-- https://mvnrepository.com/artifact/org.python/jython-standalone -->
<dependency>
<groupId>org.python</groupId>
<artifactId>jython-standalone</artifactId>
<version>2.7.1</version>
</dependency>
Java调用Python
/**
 * Runs the external Python image-processing script and echoes its output.
 *
 * <p>The script {@code D:\python\pyImage.py} is started with three arguments:
 * {@code path} followed by a backslash, {@code name} and {@code format}. Every
 * line the child process prints is forwarded to {@code System.out}, and the
 * method blocks until the process has exited.
 *
 * @param path   directory of the image; a trailing backslash is appended
 * @param name   image file name without extension
 * @param format image file extension, passed to the script unchanged
 */
public static void doPython(String path, String name, String format) {
    String[] args1 = new String[]{"python", "D:\\python\\pyImage.py", path + "\\", name, format};
    ProcessBuilder builder = new ProcessBuilder(args1);
    // Merge stderr into stdout so the child cannot block on an undrained
    // error pipe and Python tracebacks become visible in the output.
    builder.redirectErrorStream(true);
    try {
        Process pr = builder.start();
        try (BufferedReader in = new BufferedReader(new InputStreamReader(
                pr.getInputStream()))) {
            String line;
            while ((line = in.readLine()) != null) {
                System.out.println(line);
            }
        }
        pr.waitFor();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers can still observe it.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    }
}
识别验证码
pom.xml
<!-- https://mvnrepository.com/artifact/net.sourceforge.tess4j/tess4j -->
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>4.2.2</version>
</dependency>
<dependency>
<groupId>com.baidu.aip</groupId>
<artifactId>java-sdk</artifactId>
<version>4.8.0</version>
</dependency>
CaptchaUtil.java
/**
 * Captcha helpers: pre-processing through the external Python script and
 * recognition through Tess4J or Baidu OCR, either on the whole image or on
 * the individual character slices.
 */
public class CaptchaUtil {
    /** Number of slice images read by the single-character recognizers. */
    private static final int CAPTCHA_LENGTH = 4;

    /** A complete captcha: exactly four ASCII letters or digits. */
    private static final Pattern CAPTCHA_PATTERN = Pattern.compile("[0-9a-zA-Z]{4}");

    /** A single captcha character: one ASCII letter or digit. */
    private static final Pattern SINGLE_CHAR_PATTERN = Pattern.compile("[0-9a-zA-Z]");

    /** Characters stripped from raw OCR output; compiled once instead of per call. */
    private static final Pattern SPECIAL_CHARS = Pattern.compile(
            "[\n`~!@#$%^&*()+=|{}':;',\\[\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。, 、?§£¥€@&№]");

    private static ITesseract instance = new Tesseract();

    static {
        File tessDataFolder = LoadLibs.extractTessResources("tessdata");
        // Location of the training data.
        instance.setDatapath(tessDataFolder.getAbsolutePath());
        // Recognition language: "chi_sim" for Simplified Chinese, "eng" for English.
        instance.setLanguage("eng");
    }

    /**
     * Pre-processes a captcha image by handing it to the Python script.
     *
     * <p>The file name is split at its last dot into a base name and an
     * extension (including the dot); both are passed to
     * {@code RuntimeFunction.doPython} together with the parent directory.
     * Directories and names without a usable extension are rejected with an
     * error message.
     *
     * @param fileName path of the image to process
     */
    public static void pichandle(String fileName) {
        // The image to recognize.
        File file = new File(fileName);
        if (file.isDirectory()) {
            System.out.println("错误路径");
            return;
        }
        String fileSmartName = file.getName();
        // Split at the LAST dot so names such as "a.b.png" keep their real extension.
        int dot = fileSmartName.lastIndexOf('.');
        if (dot < 0 || dot == fileSmartName.length() - 1) {
            System.out.println("错误路径");
            return;
        }
        String name = fileSmartName.substring(0, dot);
        String format = fileSmartName.substring(dot);
        String parentPath = file.getParent();
        if (parentPath == null) {
            // A bare file name has no parent; use the current directory
            // instead of passing the text "null" to the script.
            parentPath = ".";
        }
        RuntimeFunction.doPython(parentPath, name, format);
    }

    /**
     * Recognizes the captcha slice by slice with Tess4J.
     *
     * @param filePath prefix of the slice files; slice {@code i} is read from
     *                 {@code filePath + i + format}
     * @param format   file extension of the slices, including the dot
     * @return the concatenated result when it is a valid captcha, otherwise
     *         the empty string
     */
    public static String singleTess(String filePath, String format) {
        StringBuilder captchaBuilder = new StringBuilder();
        for (int i = 0; i < CAPTCHA_LENGTH; i++) {
            File slice = new File(filePath + i + format);
            captchaBuilder.append(removeSpecial(doOcr(slice)));
        }
        String code = captchaBuilder.toString();
        System.out.println("singleTess : " + code);
        return entiretyVerifyRes(code) ? code : "";
    }

    /**
     * Recognizes the captcha slice by slice with Baidu OCR.
     *
     * @param filePath prefix of the slice files; slice {@code i} is read from
     *                 {@code filePath + i + format}
     * @param format   file extension of the slices, including the dot
     * @return the concatenated result when it is a valid captcha, otherwise
     *         the empty string
     */
    private static String singleOCR(String filePath, String format) {
        StringBuilder captchaBuilder = new StringBuilder();
        for (int i = 0; i < CAPTCHA_LENGTH; i++) {
            String fileSliceName = filePath + i + format;
            captchaBuilder.append(removeSpecial(BaiduOCRUtil.textRecognition(fileSliceName)));
        }
        String code = captchaBuilder.toString();
        System.out.println("singleOCR : " + code);
        return entiretyVerifyRes(code) ? code : "";
    }

    /**
     * Recognizes the whole captcha image with Tess4J.
     *
     * @param fileName path of the image
     * @return the recognized text when it is a valid captcha, otherwise the
     *         empty string
     */
    public static String entiretyTess(String fileName) {
        String res = removeSpecial(doOcr(new File(fileName)));
        System.out.println(" entiretyTess : " + res);
        return entiretyVerifyRes(res) ? res : "";
    }

    /**
     * Recognizes the whole captcha image with Baidu OCR.
     *
     * @param fileName path of the image
     * @return the recognized text when it is a valid captcha, otherwise the
     *         empty string
     */
    public static String entiretyOCR(String fileName) {
        String res = removeSpecial(BaiduOCRUtil.textRecognition(fileName));
        System.out.println(" entiretyOCR : " + res);
        return entiretyVerifyRes(res) ? res : "";
    }

    /**
     * Checks whether a recognition result is a complete captcha.
     *
     * @param res text to check, may be {@code null}
     * @return {@code true} when {@code res} is exactly four ASCII letters or digits
     */
    private static boolean entiretyVerifyRes(String res) {
        return res != null && CAPTCHA_PATTERN.matcher(res).matches();
    }

    /**
     * Checks whether a recognition result is a single captcha character.
     *
     * @param res text to check, may be {@code null}
     * @return {@code true} when {@code res} is exactly one ASCII letter or digit
     */
    private static boolean singleVerityRes(String res) {
        return res != null && SINGLE_CHAR_PATTERN.matcher(res).matches();
    }

    /**
     * Removes punctuation, symbols, spaces and line feeds from OCR output and
     * trims the result.
     *
     * @param res raw OCR output, may be {@code null}
     * @return the cleaned text; the empty string when {@code res} is {@code null}
     */
    private static String removeSpecial(String res) {
        if (res == null) {
            // NOTE(review): presumably an OCR backend can return null on
            // failure; treat that as "nothing recognized" rather than throwing.
            return "";
        }
        return SPECIAL_CHARS.matcher(res).replaceAll("").trim();
    }

    /**
     * Runs Tess4J on one file.
     *
     * @param file image to recognize
     * @return the recognized text, or the empty string when Tesseract fails
     */
    private static String doOcr(File file) {
        String result = "";
        try {
            result = instance.doOCR(file);
        } catch (TesseractException e) {
            e.printStackTrace();
        }
        return result;
    }
}
登录
/**
 * Posts the login form, trying up to three captcha recognizers in turn.
 *
 * <p>The captcha image is pre-processed once, then recognized with, in order:
 * Tess4J on the whole image, Baidu OCR on the whole image, and Tess4J on the
 * character slices. A recognizer that yields no valid code is skipped without
 * posting. The form is posted with each valid code until the server answers
 * with HTTP 302, which is treated as a successful login, or until the
 * attempts are used up. The cookie store is cleared afterwards.
 */
private static void loginPost() {
    createCookie();
    HttpPost post = new HttpPost(LOGIN_URL);
    post.setHeader("Host", "url");
    post.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0");
    post.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    post.setHeader("Accept-Language", "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2");
    post.setHeader("Accept-Encoding", "gzip, deflate, br");
    post.setHeader("Content-Type", "application/x-www-form-urlencoded");
    post.setHeader("Connection", "keep-alive");
    post.setHeader("Referer", UrlContent.index);
    post.setHeader("Cookie", REQUEST_COOKIE);
    post.setHeader("Upgrade-Insecure-Requests", "1");
    boolean loginSucess = false;
    // Remaining attempts; the value also selects the recognizer:
    // 3 = Tess4J whole image, 2 = Baidu OCR whole image, 1 = Tess4J slices.
    int n = 3;
    try {
        CaptchaUtil.pichandle(UrlContent.filePath + PIC_NAME);
        String[] names = PIC_NAME.split("\\.");
        String name = names[0];
        String format = "." + names[1];
        String code = "";
        while (!loginSucess && n > 0) {
            // Obtain the captcha text.
            if (n == 3) {
                code = CaptchaUtil.entiretyTess(UrlContent.filePath + "rec\\" + PIC_NAME);
                TESS_NUM++;
                System.out.println(n + " : " + code);
            }
            if (code == null || code.isEmpty() || n == 2) {
                if (n == 3) {
                    // The first recognizer produced nothing: use up its attempt.
                    n--;
                }
                code = CaptchaUtil.entiretyOCR(UrlContent.filePath + "rec\\" + PIC_NAME);
                OCR_NUM++;
                System.out.println(n + " : " + code);
            }
            if (code == null || code.isEmpty() || n == 1) {
                if (n == 2) {
                    // The second recognizer produced nothing: use up its attempt.
                    n--;
                }
                code = CaptchaUtil.singleTess(UrlContent.filePath + name + "\\", format);
                SINGLE_NUM++;
                System.out.println(n + " : " + code);
            }
            // Compare by content; reference comparison against "" only works by accident.
            if (code != null && !code.isEmpty()) {
                POST_PARAMS.put("verifyCode", code);
                // Assemble the form parameters.
                List<NameValuePair> paramList = new ArrayList<>();
                Set<String> keySet = POST_PARAMS.keySet();
                for (String key : keySet) {
                    paramList.add(new BasicNameValuePair(key, POST_PARAMS.get(key)));
                }
                UrlEncodedFormEntity requestEntity = new UrlEncodedFormEntity(paramList, Charsets.UTF_8);
                post.setEntity(requestEntity);
                Thread.sleep(2000L);
                // Close every response so its connection is released before the next attempt.
                // NOTE(review): assumes handle302 does not keep the response for later use -- confirm.
                try (CloseableHttpResponse response = httpClient.execute(post)) {
                    if (302 == response.getStatusLine().getStatusCode()) {
                        SUCCESS_NUM++;
                        System.out.println("登陆成功");
                        loginSucess = true;
                        handle302(response);
                    } else {
                        System.out.println("登陆失败");
                    }
                }
            }
            n--;
        }
        cookieStore.clear();
        System.out.println("END");
    } catch (IOException e) {
        e.printStackTrace();
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers can still observe it.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    }
}