Java+python识别验证码

最新推荐文章于 2023-01-03 10:16:07 发布

一只拖后腿的程序猿

最新推荐文章于 2023-01-03 10:16:07 发布

阅读量2.4k

点赞数 1

分类专栏： java

本文链接：https://blog.csdn.net/xu622/article/details/97557780

版权

java 专栏收录该内容

11 篇文章 0 订阅

订阅专栏

爬虫要爬取网站，很重要的一步是识别验证码，只有识别验证码之后才可以进行之后的操作。

识别验证码

下载验证码

下载比较好解决，方式很多。这里不多说，先保存多张验证码。
在这里插入图片描述
随便打开一张图片，发现背景图有黑点，会干扰识别。

用百度 OCR 接口识别这张图片，得到的结果是“h千”，识别并不准确。

降噪

怎么去除背后的黑点？先降噪。
系统需要先安装 Python 运行环境，并安装 PIL（Pillow）模块。

二值化

# 二值化处理
def two_value(parentPath, name, format):
    """Binarize one image and store the result in the ``two`` sub-folder.

    The input is read from ``parentPath + name + format``. Paths are built by
    plain string concatenation, so ``parentPath`` has to end with a path
    separator already.

    :param parentPath: folder that contains the image (with trailing separator)
    :param name: file name without extension
    :param format: file extension including the leading dot
    :return: None; the binarized image is written to ``parentPath + "two"``
    """
    # Gray levels below the cutoff map to 0, every other level maps to 1.
    cutoff = 32
    lookup = [0 if level < cutoff else 1 for level in range(256)]

    gray = Image.open(parentPath + name + format).convert('L')
    binary = gray.point(lookup, '1')

    # Create the output folder on first use.
    out_dir = parentPath + "two"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    binary.save(out_dir + "\\" + name + format)

在这里插入图片描述

去除单独的黑色像素点

# 去除单独的黑色像素点
def descrambler(parentPath, name, format):
    """Remove isolated dark pixels from the binarized image.

    Reads ``parentPath + "two\\" + name + format``, visits every pixel that is
    not on the outer one-pixel border, and paints a dark pixel with 255 when
    none of its four direct neighbours (up, left, down, right) is dark. The
    result is saved to the ``rec`` sub-folder.

    :param parentPath: folder that contains the ``two`` sub-folder
        (with trailing separator)
    :param name: file name without extension
    :param format: file extension including the leading dot
    :return: None; the cleaned image is written to ``parentPath + "rec\\"``
    """
    im = Image.open(parentPath + "two\\" + name + format)
    pixels = im.getdata()  # flat sequence, indexed as width * y + x
    width, height = im.size
    dark_limit = 10  # a neighbour below this value counts as dark

    for x in range(1, width - 1):
        for y in range(1, height - 1):
            # Only dark centre pixels are candidates for removal.
            if pixels[width * y + x] >= 50:
                continue

            neighbours = (
                pixels[width * (y - 1) + x],  # up
                pixels[width * y + (x - 1)],  # left
                pixels[width * (y + 1) + x],  # down
                pixels[width * y + (x + 1)],  # right
            )
            # No dark neighbour at all: treat the pixel as noise.
            if not any(value < dark_limit for value in neighbours):
                im.putpixel((x, y), 255)

    # Create the output folder on first use.
    out_dir = parentPath + "rec"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    im.save(parentPath + "rec\\" + name + format)

这里边界上没有黑点，所以跟二值化后的图片没什么区别。
在这里插入图片描述

切割图片

# 切割图片
def smartSliceImg(parentPath, name, format, count=4, p_w=3):
    """Cut the cleaned image into ``count`` vertical slices.

    The image ``parentPath + "rec\\" + name + format`` is divided into
    ``count`` equal-width parts. Around each nominal cut position the columns
    from ``cut - p_w`` up to (not including) ``cut + p_w`` are examined, and
    the cut is moved to the first column with the fewest zero-valued pixels.
    Slice ``i`` is saved as ``parentPath + name + "\\" + str(i) + format``.

    :param parentPath: folder that contains the ``rec`` sub-folder
        (with trailing separator)
    :param name: file name without extension; also the output folder name
    :param format: file extension including the leading dot
    :param count: number of slices to produce
    :param p_w: half-width, in pixels, of the window searched around each cut
    :return: None; the slices are written to disk
    """
    img = Image.open(parentPath + "rec\\" + name + format)
    width, height = img.size
    pixels = img.load()
    slice_width = int(width / count)

    # Create the per-image output folder on first use.
    out_dir = parentPath + name
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    left = 0
    for index in range(count):
        nominal_cut = (index + 1) * slice_width

        # Collect (column, zero-pixel count) for every column in the window.
        candidates = []
        for raw_x in range(nominal_cut - p_w, nominal_cut + p_w):
            # Clamp the column into the image.
            col = max(min(raw_x, width - 1), 0)
            dark = sum(1 for y in range(height) if pixels[col, y] == 0)
            candidates.append((col, dark))

        # The sort is stable, so ties keep the left-most candidate first.
        right = sorted(candidates, key=lambda item: item[1])[0][0]

        img.crop((left, 0, right, height)).save(out_dir + "\\" + str(index) + format)
        left = right

在这里插入图片描述

Java调用Python

pom.xml

<!-- https://mvnrepository.com/artifact/org.python/jython-standalone -->
<!-- Maven dependency org.python:jython-standalone, pinned to version 2.7.1 -->
<dependency>
    <groupId>org.python</groupId>
    <artifactId>jython-standalone</artifactId>
    <version>2.7.1</version>
</dependency>

Java调用Python

/**
 * Runs the external script {@code D:\python\pyImage.py} with the system
 * {@code python} executable and echoes everything it prints.
 *
 * <p>The script receives three arguments: {@code path} with a trailing
 * backslash appended, {@code name} and {@code format}. The call blocks until
 * the process has exited. I/O failures are printed, not rethrown.
 *
 * @param path   folder containing the image, without trailing separator
 * @param name   image file name without extension
 * @param format image file extension including the leading dot
 */
public static void doPython(String path, String name, String format) {
    String[] command = new String[]{"python", "D:\\python\\pyImage.py", path + "\\", name, format};
    try {
        // Merge stderr into stdout: only one stream is drained below, and an
        // undrained stderr pipe can fill up and block the child process. It
        // also makes Python tracebacks visible in the console output.
        Process pr = new ProcessBuilder(command).redirectErrorStream(true).start();
        try (BufferedReader in = new BufferedReader(new InputStreamReader(
                pr.getInputStream()))) {
            String line;
            while ((line = in.readLine()) != null) {
                System.out.println(line);
            }
        }
        pr.waitFor();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (InterruptedException e) {
        // Keep the interrupt visible to the caller instead of swallowing it.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    }
}

识别验证码

pom.xml

<!-- https://mvnrepository.com/artifact/net.sourceforge.tess4j/tess4j -->
<dependency>
    <groupId>net.sourceforge.tess4j</groupId>
    <artifactId>tess4j</artifactId>
    <version>4.2.2</version>
</dependency>

<!-- Maven dependency com.baidu.aip:java-sdk, pinned to version 4.8.0 -->
<dependency>
    <groupId>com.baidu.aip</groupId>
    <artifactId>java-sdk</artifactId>
    <version>4.8.0</version>
</dependency>

CaptchaUtil.java

/**
 * Captcha recognition helpers built on Tess4J and {@code BaiduOCRUtil}.
 *
 * <p>Every public recognition method returns either a string of exactly four
 * characters from {@code [0-9a-zA-Z]} or the empty string when the recognised
 * text does not have that shape.
 */
public class CaptchaUtil {
    /** Number of per-character slice files (0..3) read by the single-character methods. */
    private static final int SLICE_COUNT = 4;

    /** A complete captcha: exactly four ASCII letters or digits. */
    private static final Pattern CAPTCHA_PATTERN = Pattern.compile("[0-9a-zA-Z]{4}");

    /** A single captcha character: one ASCII letter or digit. */
    private static final Pattern SINGLE_CHAR_PATTERN = Pattern.compile("[0-9a-zA-Z]");

    /** Characters stripped from raw OCR output before validation. */
    private static final Pattern SPECIAL_CHARS = Pattern.compile(
            "[\n`~!@#$%^&*()+=|{}':;',\\[\\].<>/?~！@#￥%……&*（）——+|{}【】‘；：”“’。， 、？§£¥€＠＆№]");

    private static final ITesseract instance = new Tesseract();

    static {
        File tessDataFolder = LoadLibs.extractTessResources("tessdata");
        // Location of the trained data.
        instance.setDatapath(tessDataFolder.getAbsolutePath());
        // Language: "chi_sim" for Simplified Chinese, "eng" for English.
        instance.setLanguage("eng");
    }

    /**
     * Passes an image file to {@code RuntimeFunction.doPython} as parent
     * folder, base name and extension.
     *
     * <p>Prints an error and returns without doing anything when
     * {@code fileName} is a directory or when the file name does not contain
     * a base name and an extension separated by a dot.
     *
     * @param fileName path of the image to process
     */
    public static void pichandle(String fileName) {
        File file = new File(fileName);
        if (file.isDirectory()) {
            System.out.println("错误路径");
            return;
        }

        String[] names = file.getName().split("\\.");
        if (names.length < 2) {
            // Without an extension names[1] does not exist; reject the path
            // instead of failing with ArrayIndexOutOfBoundsException.
            System.out.println("错误路径");
            return;
        }
        String name = names[0];
        String format = "." + names[1];

        String parentPath = file.getParent();
        if (parentPath == null) {
            // A bare file name has no parent component; resolve it against
            // the working directory so the literal "null" is never passed on.
            parentPath = file.getAbsoluteFile().getParent();
        }
        RuntimeFunction.doPython(parentPath, name, format);
    }

    /**
     * Recognises the captcha one character at a time with Tess4J.
     *
     * <p>Reads the files {@code filePath + i + format} for i = 0..3 and
     * concatenates the cleaned OCR results.
     *
     * @param filePath folder holding the slices, including the trailing separator
     * @param format   file extension including the leading dot
     * @return the four-character code, or "" when validation fails
     */
    public static String singleTess(String filePath, String format) {
        StringBuilder captchaBuilder = new StringBuilder();
        for (int i = 0; i < SLICE_COUNT; i++) {
            File slice = new File(filePath + i + format);
            captchaBuilder.append(removeSpecial(doOcr(slice)));
        }
        String code = captchaBuilder.toString();
        System.out.println("singleTess : " + code);
        return entiretyVerifyRes(code) ? code : "";
    }

    /**
     * Recognises the captcha one character at a time with Baidu OCR.
     *
     * <p>Reads the files {@code filePath + i + format} for i = 0..3 and
     * concatenates the cleaned OCR results.
     *
     * @param filePath folder holding the slices, including the trailing separator
     * @param format   file extension including the leading dot
     * @return the four-character code, or "" when validation fails
     */
    private static String singleOCR(String filePath, String format) {
        StringBuilder captchaBuilder = new StringBuilder();
        for (int i = 0; i < SLICE_COUNT; i++) {
            String sliceName = filePath + i + format;
            captchaBuilder.append(removeSpecial(BaiduOCRUtil.textRecognition(sliceName)));
        }
        String code = captchaBuilder.toString();
        System.out.println("singleOCR : " + code);
        return entiretyVerifyRes(code) ? code : "";
    }

    /**
     * Recognises the whole captcha image with Tess4J.
     *
     * @param fileName path of the image
     * @return the four-character code, or "" when validation fails
     */
    public static String entiretyTess(String fileName) {
        String res = removeSpecial(doOcr(new File(fileName)));
        System.out.println(" entiretyTess : " + res);
        return entiretyVerifyRes(res) ? res : "";
    }

    /**
     * Recognises the whole captcha image with Baidu OCR.
     *
     * @param fileName path of the image
     * @return the four-character code, or "" when validation fails
     */
    public static String entiretyOCR(String fileName) {
        String res = removeSpecial(BaiduOCRUtil.textRecognition(fileName));
        System.out.println(" entiretyOCR : " + res);
        return entiretyVerifyRes(res) ? res : "";
    }

    /**
     * Checks that a recognised string is a complete captcha.
     *
     * @param res candidate string, may be null
     * @return true when {@code res} is exactly four ASCII letters or digits
     */
    private static boolean entiretyVerifyRes(String res) {
        // An empty string can never match {4}, so the null check is enough.
        return res != null && CAPTCHA_PATTERN.matcher(res).matches();
    }

    /**
     * Checks that a recognised string is a single captcha character.
     *
     * @param res candidate string, may be null
     * @return true when {@code res} is exactly one ASCII letter or digit
     */
    private static boolean singleVerityRes(String res) {
        return res != null && SINGLE_CHAR_PATTERN.matcher(res).matches();
    }

    /**
     * Strips newlines, punctuation and other special characters from OCR output.
     *
     * @param res raw OCR text, may be null
     * @return the cleaned and trimmed text; "" when {@code res} is null
     */
    private static String removeSpecial(String res) {
        if (res == null) {
            return "";
        }
        return SPECIAL_CHARS.matcher(res).replaceAll("").trim();
    }

    /**
     * Runs Tess4J on one file.
     *
     * @param file image to recognise
     * @return the raw OCR text, or "" when Tess4J throws
     */
    private static String doOcr(File file) {
        String result = "";
        try {
            result = instance.doOCR(file);
        } catch (TesseractException e) {
            // Best effort: report the failure and let the caller see "".
            e.printStackTrace();
        }
        return result;
    }
}

登录

/**
 * Posts the login form, trying up to three captcha recognition strategies.
 *
 * <p>The counter {@code n} starts at 3 and selects the strategy:
 * <ul>
 *   <li>n == 3: whole-image Tess4J ({@code entiretyTess})</li>
 *   <li>n == 2: whole-image Baidu OCR ({@code entiretyOCR})</li>
 *   <li>n == 1: per-character Tess4J ({@code singleTess})</li>
 * </ul>
 * When a strategy yields an empty code, the next one is tried within the same
 * iteration. A non-empty code is submitted as {@code verifyCode}; an HTTP 302
 * answer counts as a successful login. After every iteration {@code n} is
 * decremented, and the loop stops on success or when {@code n} reaches 0.
 */
private static void loginPost() {
    createCookie();
    HttpPost post = new HttpPost(LOGIN_URL);
    post.setHeader("Host", "url");
    post.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0");
    post.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    post.setHeader("Accept-Language", "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2");
    post.setHeader("Accept-Encoding", "gzip, deflate, br");
    post.setHeader("Content-Type", "application/x-www-form-urlencoded");
    post.setHeader("Connection", "keep-alive");
    post.setHeader("Referer", UrlContent.index);
    post.setHeader("Cookie", REQUEST_COOKIE);
    post.setHeader("Upgrade-Insecure-Requests", "1");

    boolean loginSucess = false;
    int n = 3;
    try {
        // Pre-process the downloaded captcha image.
        CaptchaUtil.pichandle(UrlContent.filePath + PIC_NAME);
        String[] names = PIC_NAME.split("\\.");
        String name = names[0];
        String format = "." + names[1];
        String code = "";
        while (!loginSucess && n > 0) {
            // Strategy 1: whole image, Tess4J.
            if (n == 3) {
                code = CaptchaUtil.entiretyTess(UrlContent.filePath + "rec\\" + PIC_NAME);
                TESS_NUM++;
                System.out.println(n + " : " + code);
            }
            // Strategy 2: whole image, Baidu OCR.
            // Strings are compared by content here, never with == / !=.
            if (code == null || code.isEmpty() || n == 2) {
                if (n == 3) {
                    n--;
                }
                code = CaptchaUtil.entiretyOCR(UrlContent.filePath + "rec\\" + PIC_NAME);
                OCR_NUM++;
                System.out.println(n + " : " + code);
            }
            // Strategy 3: per-character slices, Tess4J.
            if (code == null || code.isEmpty() || n == 1) {
                if (n == 2) {
                    n--;
                }
                code = CaptchaUtil.singleTess(UrlContent.filePath + name + "\\", format);
                SINGLE_NUM++;
                System.out.println(n + " : " + code);
            }
            if (code != null && !code.isEmpty()) {
                POST_PARAMS.put("verifyCode", code);
                // Build the form parameters.
                List<NameValuePair> paramList = new ArrayList<NameValuePair>();
                Set<String> keySet = POST_PARAMS.keySet();
                for (String key : keySet) {
                    paramList.add(new BasicNameValuePair(key, POST_PARAMS.get(key)));
                }
                UrlEncodedFormEntity requestEntity = new UrlEncodedFormEntity(paramList, Charsets.UTF_8);
                post.setEntity(requestEntity);
                Thread.sleep(2000L);
                // Close every response so a failed attempt does not keep its
                // connection checked out while the next attempt runs.
                try (CloseableHttpResponse response = httpClient.execute(post)) {
                    if (302 == response.getStatusLine().getStatusCode()) {
                        SUCCESS_NUM++;
                        System.out.println("登陆成功");
                        loginSucess = true;
                        handle302(response);
                    } else {
                        System.out.println("登陆失败");
                    }
                }
            }
            n--;
        }
        cookieStore.clear();
        System.out.println("END");
    } catch (IOException e) {
        e.printStackTrace();
    } catch (InterruptedException e) {
        // Keep the interrupt visible to the caller instead of swallowing it.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    }
}