Java 模拟爬虫登录带传统图片验证码的业务系统

1、使用的是tess4j识别验证码;

2、使用jsoup模拟浏览器登录请求。

package com.test.tess;

import com.alibaba.fastjson.JSONObject;
import com.fasterxml.jackson.core.SerializableString;
import com.jst.tess.constants.Constants;
import com.jst.tess.util.FileUtils;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import org.apache.struts2.ServletActionContext;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.web.context.request.RequestContextHolder;
import org.springframework.web.context.request.ServletRequestAttributes;
import sun.net.www.http.HttpClient;

import javax.servlet.ServletException;
import javax.servlet.ServletRequest;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import javax.servlet.http.HttpSession;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Scripted login to a captcha-protected web application.
 *
 * <p>jsoup issues the HTTP requests and Tess4J reads the captcha image. The session id obtained
 * by {@link #login()} is kept in a static field so that {@link #getList()} and
 * {@link #getOne(String)} can reuse it; they log in again when no session is stored or when the
 * response contains an element with id {@code verifycode} (presumably the captcha field of the
 * login form — confirm against the target application).
 */
public class test4 extends HttpServlet{

    // Login page; fetching it is what hands out the session cookie.
    private static final String BASE_URL = "http://192.168.0.20:8080/test/login.jsp";

    // Directory where the downloaded captcha image is stored.
    private static final String VER_CODE_PATH = "D:\\img\\codeimg";

    // Captcha image endpoint.
    private static final String CODE_IMG_URL = "http://192.168.0.20:8080/test/login/getCode.do";

    // Login form target.
    private static final String LOGIN_URL = "http://192.168.0.20:8080/test/login/login.do";

    // Logout endpoint.
    private static final String LOGOUT_URL = "http://192.168.0.20:8080/test/login/logout.do";

    // Test data: list endpoint.
    private static final String LIST_URL = "http://192.168.0.20:8080/test/testList/getList.do";

    // Test data: detail endpoint.
    private static final String GET_ONE_URL = "http://192.168.0.20:8080/test/testView/view.do";

    // NOTE(review): credentials are hard-coded in source; they belong in external configuration.
    private static final String USER_NAME = "test";

    // Sent as the "loginkey" form field. NOTE(review): looks like a pre-hashed value — confirm.
    private static final String PASS_WORD = "96af831e99ef1788b04c84d0a7782e855d700d4d6e7938722cfbcbaa";

    // Id of an element that only exists on the landing page after a successful login.
    private static final String INDEX_PAGE_MARKER_ID = "index-menu";

    // Id whose presence in a response means the session was not accepted.
    private static final String CAPTCHA_FIELD_ID = "verifycode";

    private static final String SESSION_COOKIE = "JSESSIONID";

    private static final String CAPTCHA_FILE = "codeimg.jpg";

    private static final String DENOISED_CAPTCHA_FILE = "codetmpimgtmp.jpg";

    // OCR on captchas fails regularly, so re-login is retried, but never without bound.
    private static final int MAX_LOGIN_ATTEMPTS = 3;

    // Compiled once; removes every whitespace run.
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    // Session id shared by all static calls; empty or null means "not logged in".
    private static String baseSessions ="";

    /**
     * Entry point for manual experiments. It performs no action by default because every
     * operation of this class talks to the live system; call {@code login()},
     * {@code getList()} or {@code getOne("9")} from here when trying things out.
     */
    public static void main(String[] args) throws IOException, TesseractException {
        // Intentionally empty.
    }

    /**
     * Logs in with explicit credentials and then requests the first page of the list endpoint,
     * printing its text.
     *
     * @param url        login page whose response carries the session cookie
     * @param user       user name, sent as form field {@code username}
     * @param pwd        password, sent as form field {@code loginkey}
     * @param tess4jpath working directory: the captcha is stored in its {@code codeimg}
     *                   sub-directory, and it is used as the Tesseract data path when no
     *                   {@code tessdata} directory is found on the classpath
     * @return the cookies returned by the login page, or {@code null} when any step failed
     */
    public Map<String, String> login(String url, String user, String pwd, String tess4jpath) {
        System.out.println("begin:");
        // NOTE(review): these endpoints are fixed and independent of the url argument.
        final String captchaEndpoint = "http://192.168.0.37:8080/test/login/getCode.do";
        final String loginEndpoint = "http://192.168.0.37:8080/test/login/login.do";
        final String listEndpoint = "http://192.168.0.37:8080/test/testList/getList.do";

        Map<String, String> cookies = null;
        try {
            Connection.Response loginPage = Jsoup.connect(url).method(Connection.Method.GET).execute();
            cookies = loginPage.cookies();
            String session = requireSession(cookies);
            System.out.println("sessions=" + session);

            String captchaDir = new File(tess4jpath, "codeimg").getPath();
            String code = recognizeCaptcha(captchaEndpoint, session, captchaDir, tess4jpath);

            Map<String, String> form = new HashMap<>();
            form.put("username", user);
            form.put("loginkey", pwd);
            form.put("verifycode", code);
            Jsoup.connect(loginEndpoint).cookie(SESSION_COOKIE, session).data(form).post();

            // jsoup form data is String-to-String, so the paging values are sent as text.
            Map<String, String> query = new HashMap<>();
            query.put("page", "1");
            query.put("rows", "10");
            query.put("sort", "checkDate");
            query.put("order", "desc");
            Document list = Jsoup.connect(listEndpoint).cookie(SESSION_COOKIE, session).data(query).post();
            System.out.println(list.body().text());
        } catch (IOException | TesseractException | RuntimeException e) {
            cookies = null;
            e.printStackTrace();
        }
        System.out.println("map:" + cookies);
        return cookies;
    }

    /**
     * Fetches the list endpoint with the stored session, logging in first when necessary.
     *
     * @return the text of the response body; an empty string when no usable session could be
     *         established; the exception's {@code toString()} when a request failed
     */
    public static String getList()  {
        try {
            Document page = postWithSession(LIST_URL, new HashMap<String, String>(), 0);
            if (page == null) {
                return "";
            }
            String text = page.body().text();
            System.out.println(text);
            return text;
        } catch (IOException e) {
            System.out.println("进入异常!");
            return e.toString();
        }
    }

    /**
     * Fetches one record from the detail endpoint with the stored session, logging in first
     * when necessary. The page and the text of every {@code td} cell are printed.
     *
     * @param id record id, sent as form field {@code id}
     * @return the text of the response body; an empty string when no usable session could be
     *         established; the exception's {@code toString()} when a request failed
     */
    public static String getOne(String id){
        try {
            Map<String, String> params = new HashMap<>();
            params.put("id", id);
            Document page = postWithSession(GET_ONE_URL, params, 10000);
            if (page == null) {
                return "";
            }
            System.out.println(page);
            for (Element cell : page.select("td")) {
                System.out.println(cell.text());
            }
            String text = page.body().text();
            System.out.println(text);
            return text;
        } catch (IOException e) {
            System.out.println("进入异常!" + e.toString());
            return e.toString();
        }
    }

    /**
     * Logs in with the built-in account and remembers the session for later calls.
     *
     * @return a JSON object with {@code code} "200" and the session id in {@code sessionId} on
     *         success, or {@code code} "999" and a diagnostic text in {@code sessionId} otherwise
     */
    public static JSONObject login(){
        JSONObject result = new JSONObject();
        try {
            Connection.Response loginPage = Jsoup.connect(BASE_URL).method(Connection.Method.GET).execute();
            String session = requireSession(loginPage.cookies());
            System.out.println("sessions=" + session);

            String code = recognizeCaptcha(CODE_IMG_URL, session, VER_CODE_PATH, null);

            Map<String, String> form = new HashMap<>();
            form.put("username", USER_NAME);
            form.put("loginkey", PASS_WORD);
            form.put("verifycode", code);
            Document landing = Jsoup.connect(LOGIN_URL).cookie(SESSION_COOKIE, session).data(form).post();
            System.out.println("response:" + landing);

            // A missing marker is the normal "login rejected" case, not an error.
            Element marker = landing.getElementById(INDEX_PAGE_MARKER_ID);
            if (marker != null) {
                System.out.println("进入了首页!");
                baseSessions = session;
                result.put("code", "200");
                result.put("sessionId", session);
            } else {
                result.put("code", "999");
                result.put("sessionId", "未成功进入首页" + landing);
                System.out.println("未成功进入首页!" + marker);
                baseSessions = null;
            }
        } catch (Exception e) {
            result.put("code", "999");
            result.put("sessionId", e.toString());
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Posts to the logout endpoint for the given session and prints the outcome.
     *
     * @param sessionId value of the session cookie to log out
     */
    public static void logout(String sessionId){
        try {
            Jsoup.connect(LOGOUT_URL).cookie(SESSION_COOKIE, sessionId).post();
            System.out.println("注销成功!");
        } catch (IOException e) {
            System.out.println("进入异常!");
        }
    }

    /**
     * Removes every whitespace character (spaces, tabs, line breaks) from a string.
     *
     * @param str text to clean, may be {@code null}
     * @return {@code str} without whitespace, or an empty string when {@code str} is {@code null}
     */
    public static String replaceBlank(String str) {
        if (str == null) {
            return "";
        }
        return WHITESPACE.matcher(str).replaceAll("");
    }

    /** Extracts the session cookie, failing with a clear message when the server sent none. */
    private static String requireSession(Map<String, String> cookies) throws IOException {
        String session = cookies.get(SESSION_COOKIE);
        if (session == null) {
            throw new IOException("login page did not return a " + SESSION_COOKIE + " cookie");
        }
        return session;
    }

    /**
     * Posts {@code params} to {@code url} with the stored session, logging in again (a bounded
     * number of times) while the server does not accept the session.
     *
     * @param timeoutMillis request timeout; values {@code <= 0} keep jsoup's default
     * @return the response page, or {@code null} when no session was accepted
     */
    private static Document postWithSession(String url, Map<String, String> params, int timeoutMillis)
            throws IOException {
        for (int attempt = 0; ; attempt++) {
            String session = baseSessions;
            System.out.println("session:" + session);
            if (session != null && !session.isEmpty()) {
                Connection request = Jsoup.connect(url)
                        .cookie(SESSION_COOKIE, session)
                        .data(params)
                        .ignoreContentType(true); // the endpoint may answer with non-HTML content
                if (timeoutMillis > 0) {
                    request.timeout(timeoutMillis);
                }
                Document page = request.post();
                if (page.body() != null && page.getElementById(CAPTCHA_FIELD_ID) == null) {
                    System.out.println("不需要重新登录!");
                    return page;
                }
            }
            if (attempt >= MAX_LOGIN_ATTEMPTS) {
                return null;
            }
            System.out.println("需要重新登录!");
            login();
        }
    }

    /**
     * Downloads the captcha for {@code session}, stores it in {@code imageDir}, and returns the
     * recognised text with all whitespace removed.
     */
    private static String recognizeCaptcha(String captchaUrl, String session, String imageDir,
                                           String fallbackDataPath)
            throws IOException, TesseractException {
        byte[] imageBytes = Jsoup.connect(captchaUrl)
                .cookie(SESSION_COOKIE, session)
                .ignoreContentType(true)
                .execute()
                .bodyAsBytes();
        FileUtils.creatDir(imageDir);
        FileUtils.saveImg(imageBytes, imageDir, CAPTCHA_FILE);

        String originalImage = new File(imageDir, CAPTCHA_FILE).getPath();
        String denoisedImage = new File(imageDir, DENOISED_CAPTCHA_FILE).getPath();
        FileUtils.removeBackground(originalImage, denoisedImage);

        ITesseract ocr = new Tesseract();
        // English is the default language; call setLanguage("chi_sim") for the Chinese pack.
        ocr.setDatapath(resolveTessdataPath(fallbackDataPath));
        // NOTE(review): OCR reads the original image; the denoised copy is written but unused.
        // Kept as-is because switching the input changes recognition results — confirm intent.
        String code = replaceBlank(ocr.doOCR(new File(originalImage)));
        System.out.println("codeLength:" + code.length() + ",code:" + code);
        return code;
    }

    /**
     * Locates the {@code tessdata} directory on the classpath.
     *
     * @param fallbackPath path returned when the classpath has no such directory; may be null
     */
    private static String resolveTessdataPath(String fallbackPath) throws IOException {
        URL resource = ClassLoader.getSystemResource("tessdata");
        if (resource == null) {
            if (fallbackPath != null) {
                return fallbackPath;
            }
            throw new IOException("tessdata directory not found on the classpath");
        }
        try {
            // Going through a URI decodes escapes such as %20 and works on every platform.
            return new File(resource.toURI()).getPath();
        } catch (java.net.URISyntaxException | IllegalArgumentException e) {
            throw new IOException("tessdata is not a plain directory: " + resource, e);
        }
    }
}

使用的工具类:

package com.jst.tess.util;

import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

/**
 * File and image helpers for the captcha login flow: directory creation, captcha clean-up and
 * saving raw bytes to disk. All methods are best-effort: failures are printed, not thrown.
 */
public class FileUtils {

    /** Pixels whose R+G+B sum is at or above this value are forced to white. */
    private static final int WHITE_THRESHOLD = 400;

    /**
     * Creates the directory {@code path} together with any missing parents.
     * Does nothing when something already exists at that path.
     *
     * @param path directory to create
     */
    public static void creatDir(String path) {
        File dir = new File(path);
        if (!dir.exists()) {
            dir.mkdirs();
        }
    }

    /**
     * Reduces a captcha image to pure black and white and writes it as a JPEG.
     *
     * <p>Bright pixels (R+G+B at or above the threshold) are whitened first. Then every pixel
     * whose colour sum equals that of pixel (1, 1) becomes black and all others white, and the
     * same split is applied once more using pixel (width-1, 1) as the reference. Images
     * narrower or shorter than two pixels have no such reference pixels and are only whitened.
     *
     * @param imgUrl path of the image to read
     * @param resUrl path of the JPEG to write; missing parent directories are created
     */
    public static void removeBackground(String imgUrl, String resUrl) {
        try {
            BufferedImage img = ImageIO.read(new File(imgUrl));
            if (img == null) {
                // ImageIO.read signals "no decoder for this content" with null.
                System.err.println("removeBackground: cannot decode image " + imgUrl);
                return;
            }
            int width = img.getWidth();
            int height = img.getHeight();

            // One pass is enough: a whitened pixel stays above the threshold.
            for (int x = 0; x < width; x++) {
                for (int y = 0; y < height; y++) {
                    if (rgbSum(img.getRGB(x, y)) >= WHITE_THRESHOLD) {
                        img.setRGB(x, y, Color.WHITE.getRGB());
                    }
                }
            }

            if (width > 1 && height > 1) {
                // Using every column 1..width-1 in turn as the reference gives the same image
                // as using only the first and the last: once the image is two-valued, a split
                // either changes nothing (reference already black) or inverts it, and after
                // each split the reference pixel is black.
                splitByReference(img, 1);
                splitByReference(img, width - 1);
            }

            File target = new File(resUrl);
            File parent = target.getParentFile();
            if (parent != null && !parent.exists()) {
                parent.mkdirs();
            }
            if (!ImageIO.write(img, "jpg", target)) {
                System.err.println("removeBackground: no JPEG writer accepted the image for " + resUrl);
            }
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes {@code imgdata} to {@code filePath/filename}, creating the directory if needed.
     *
     * @param imgdata  bytes to write
     * @param filePath target directory
     * @param filename name of the file inside {@code filePath}
     */
    public static void saveImg(byte[] imgdata, String filePath, String filename) {
        File dir = new File(filePath);
        if (!dir.exists()) {
            dir.mkdirs();
        }
        File target = new File(filePath + File.separator + filename);
        try (FileOutputStream fos = new FileOutputStream(target);
             BufferedOutputStream bos = new BufferedOutputStream(fos)) {
            bos.write(imgdata);
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
        }
    }

    /** Paints pixels black when their colour sum equals that of pixel (refX, 1), else white. */
    private static void splitByReference(BufferedImage img, int refX) {
        // Read the reference before the loop: the loop overwrites that pixel too.
        int reference = rgbSum(img.getRGB(refX, 1));
        int black = Color.BLACK.getRGB();
        int white = Color.WHITE.getRGB();
        for (int x = 0; x < img.getWidth(); x++) {
            for (int y = 0; y < img.getHeight(); y++) {
                img.setRGB(x, y, rgbSum(img.getRGB(x, y)) == reference ? black : white);
            }
        }
    }

    /** Returns red + green + blue of a packed RGB value, ignoring alpha. */
    private static int rgbSum(int rgb) {
        return ((rgb >> 16) & 0xFF) + ((rgb >> 8) & 0xFF) + (rgb & 0xFF);
    }
}

部分代码参考自:Java识别验证码和图像处理_梁康h的博客-CSDN博客

Java 爬虫之识别图片验证码后登录_JavaBigADog的博客-CSDN博客

  • 0
    点赞
  • 11
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值