关闭

使用httpclient获取其他网站数据(含解析验证码)

标签: 验证码httpclientpython
148人阅读 评论(0) 收藏 举报
分类:
    **使用httpclient获取其他网站数据**

使用httpclient模拟浏览器请求网站加载个人诉讼记录信息接口;
总结:1. 系统上线后如果运行在 Linux 上,则通过调用 python 命令来识别验证码:先将验证码图片保存到本地,识别完成后再删除;这需要一个 python 脚本,代码附在文末。
2.如果是在windows系统上运行该系统,提供了一个OCR的封装类,直接调用即可识别验证码;这里需要使用一个工具类,地址:http://download.csdn.net/download/qq_23339149/9617921

接口类:    
import com.alibaba.fastjson.JSONObject;
import com.aweb.platform.util.StringUtils;
import com.dbn.sysmodule.util.IdcardUtils;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Value;

import java.io.*;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Created by warming on 2016/8/31 with IntelliJ IDEA.
 */
/**
 * Fetches a person's litigation records from an external court-records site by
 * imitating a browser session with commons-httpclient: load the index page,
 * download the captcha image, recognise it with an external python script, then
 * post the search form and convert the resulting HTML table to JSON.
 *
 * Created by warming on 2016/8/31 with IntelliJ IDEA.
 */
public class OuterInformationServiceImpl implements com.dbn.remote.service.OuterInformationService {
    private static final Logger log = Logger.getLogger(OuterInformationServiceImpl.class);

    /** Text contained in the result page when the submitted captcha was rejected. */
    private static final String CAPTCHA_ERROR_MARKER = "验证码错误";
    /** How many times the search is re-submitted after a rejected captcha. */
    private static final int MAX_CAPTCHA_RETRIES = 5;
    /** Charset used to URL-encode the form fields (explicit instead of the platform default). */
    private static final String FORM_CHARSET = "UTF-8";

    @Value("${zhiXingIndexUrl}")
    private String zhiXingIndexUrl;
    @Value("${verificationCodeUrl}")
    private String verificationCodeUrl;
    @Value("${zhiXingSearchUrl}")
    private String zhiXingSearchUrl;
    @Value("${zhiXingSearchUserAgent}")
    private String zhiXingSearchUserAgent;

    /**
     * Queries the litigation records of one person.
     *
     * @param pName   person name; the query is skipped when {@code StringUtils.checkStr} rejects it
     * @param cardNum ID card number; the query is skipped when {@code IdcardUtils.validateCard} rejects it
     * @return the records as a JSON string, {@code "查询失败"} when the captcha was still rejected
     *         after all retries, or {@code null} when the input was rejected or no table was found
     * @throws Exception wrapping any failure that occurs while talking to the remote site
     */
    public String getPersonLitigationRecords(String pName, String cardNum) throws Exception {
        String jsonStr = null;
        try {
            if (StringUtils.checkStr(pName) && IdcardUtils.validateCard(cardNum)) {
                HttpClient client = new HttpClient();
                loadIndex(client); // simulate loading the index page first
                String htmlResponse = getRecords(client, pName, cardNum);
                int count = 0;
                // Re-submit while the captcha is rejected. The condition is evaluated on the
                // latest response, so the result of the final retry is honoured too.
                while (htmlResponse.contains(CAPTCHA_ERROR_MARKER) && count < MAX_CAPTCHA_RETRIES) {
                    count++;
                    htmlResponse = getRecords(client, pName, cardNum);
                }
                log.info("验证码解析错误次数:" + count);
                if (htmlResponse.contains(CAPTCHA_ERROR_MARKER)) {
                    return "查询失败";
                }
                jsonStr = getJsonStrByHtml(htmlResponse);
                log.info("查询结果:" + jsonStr);
            }
        } catch (Exception e) {
            log.error("查询个人诉讼记录错误!", e);
            throw new Exception("查询个人诉讼记录错误!", e);
        }
        return jsonStr;
    }

    /**
     * Downloads a fresh captcha, recognises it and submits the search form once.
     *
     * @return the body of the search response (empty string when the response has no body)
     * @throws Exception when the captcha cannot be downloaded or stored, or the search request fails
     */
    private String getRecords(HttpClient client, String pName, String cardNum) throws Exception {
        byte[] captchaImage = downloadCaptcha(client);
        if (captchaImage == null) {
            throw new IOException("captcha response had no body");
        }

        // The python script works on files, so the image is stored under /tmp first;
        // exec() removes the file again once recognition has finished.
        String fileName = "/tmp/" + System.currentTimeMillis() + getRandom() + ".jpeg";
        try (FileOutputStream fout = new FileOutputStream(fileName)) {
            fout.write(captchaImage);
        } catch (IOException e) {
            log.error("将验证码写入本地失败!", e);
            new File(fileName).delete(); // do not leave a partial image behind
            throw e;
        }

        // exec() terminates every output line with '\n' and returns null on failure;
        // normalise that before the value is placed into the form body.
        String rawCode = exec(fileName);
        String code = rawCode == null ? "" : rawCode.trim();
        log.info("解析验证码为::" + code);

        // Windows alternative (no python needed): feed the image bytes to
        // ParseJPEG_withOCR.getRecogniseStr(new ByteArrayInputStream(captchaImage)).

        PostMethod post = new PostMethod(zhiXingSearchUrl);
        try {
            post.setRequestHeader("Content-Type", "application/x-www-form-urlencoded");
            post.setRequestHeader("Referer", zhiXingIndexUrl);
            post.addRequestHeader("User-Agent", zhiXingSearchUserAgent);
            // Every value is URL-encoded: OCR output in particular may contain '&', '=' or spaces.
            post.setRequestBody("searchCourtName=" + URLEncoder.encode("全国法院(包含地方各级法院)", FORM_CHARSET)
                    + "&selectCourtId=1&selectCourtArrange=1&pname=" + URLEncoder.encode(pName, FORM_CHARSET)
                    + "&cardNum=" + URLEncoder.encode(cardNum, FORM_CHARSET)
                    + "&j_captcha=" + URLEncoder.encode(code, FORM_CHARSET));
            client.executeMethod(post);
            return readResponseBody(post);
        } finally {
            post.releaseConnection();
        }
    }

    /** Requests the captcha image and returns its raw bytes. */
    private byte[] downloadCaptcha(HttpClient client) throws IOException {
        GetMethod get = new GetMethod(verificationCodeUrl);
        try {
            get.addRequestHeader("User-Agent", zhiXingSearchUserAgent);
            client.executeMethod(get);
            return get.getResponseBody();
        } finally {
            get.releaseConnection();
        }
    }

    /** Reads the whole response body of an executed request into a string. */
    private String readResponseBody(PostMethod post) throws IOException {
        InputStream is = post.getResponseBodyAsStream();
        if (is == null) {
            log.info("获取查询返回页面失败!");
            return "";
        }
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try {
            byte[] buffer = new byte[4096];
            int read;
            while ((read = is.read(buffer)) != -1) {
                baos.write(buffer, 0, read);
            }
        } catch (IOException e) {
            // A truncated page must not be mistaken for a successful result.
            log.error("获取查询返回页面失败!", e);
            throw e;
        } finally {
            is.close();
        }
        // NOTE(review): decodes with the platform default charset, as before - confirm the
        // site's encoding matches the deployment's default.
        return baos.toString();
    }

    /**
     * Converts the table found in the HTML page to JSON of the form
     * {@code {"data":[{header: cell, ...}, ...]}}.
     *
     * The first {@code tr} is treated as the header row, and the last header column is
     * not copied (presumably an action/link column - TODO confirm).
     *
     * @return the JSON string, or {@code null} when the page is empty or contains no table cells
     */
    private String getJsonStrByHtml(String htmlResponse) throws Exception {
        if (!StringUtils.checkStr(htmlResponse)) {
            return null;
        }
        org.jsoup.nodes.Document doc = Jsoup.parse(htmlResponse);
        Elements trs = doc.getElementsByTag("tr");
        Elements ths = doc.getElementsByTag("th");
        Elements tds = doc.getElementsByTag("td");
        int trsSize = trs.size(); // number of rows, header included
        int thsSize = ths.size(); // number of header columns
        int tdsSize = tds.size(); // number of data cells
        if (trsSize <= 0 || thsSize <= 0 || tdsSize <= 0) {
            return null;
        }

        List<Object> list = new ArrayList<>();
        for (int row = 0; row < trsSize - 1; row++) {
            // Stop at the first row whose cells are not all present instead of
            // failing with an IndexOutOfBoundsException on an irregular table.
            int lastCellOfRow = thsSize * row + (thsSize - 2);
            if (thsSize > 1 && lastCellOfRow >= tdsSize) {
                log.warn("table has fewer cells than expected, stopping at row " + row);
                break;
            }
            Map<String, Object> map = new HashMap<>();
            for (int col = 0; col < thsSize - 1; col++) {
                map.put(ths.get(col).text(), tds.get(thsSize * row + col).text());
            }
            list.add(map);
        }
        JSONObject data = new JSONObject();
        data.put("data", list);
        return data.toJSONString();
    }

    /** Simulates a browser loading the index page before the search is issued. */
    private void loadIndex(HttpClient client) throws IOException {
        GetMethod get = new GetMethod(zhiXingIndexUrl);
        try {
            get.addRequestHeader("User-Agent", zhiXingSearchUserAgent);
            client.executeMethod(get);
            log.info("首页加载完成");
        } finally {
            get.releaseConnection();
        }
    }

    /**
     * Runs {@code python /tmp/captcha.py <fileName>} and returns what the script printed.
     * The image file is deleted afterwards, whether or not recognition succeeded.
     *
     * @param fileName path of the captcha image to recognise
     * @return the script's standard output with every line terminated by {@code '\n'},
     *         or {@code null} when the script could not be run
     */
    public String exec(String fileName) {
        log.info("验证文件名称:" + fileName);
        try {
            // Arguments are passed as a list so the path is never re-tokenised on whitespace.
            ProcessBuilder builder = new ProcessBuilder("python", "/tmp/captcha.py", fileName);
            // Forward stderr to the JVM's stderr so a chatty script cannot block on a full pipe.
            builder.redirectError(ProcessBuilder.Redirect.INHERIT);
            Process process = builder.start();
            StringBuilder sb = new StringBuilder();
            try (BufferedReader br = new BufferedReader(new InputStreamReader(process.getInputStream()))) {
                String line;
                while ((line = br.readLine()) != null) {
                    log.debug(line);
                    sb.append(line).append("\n");
                }
            }
            process.waitFor();
            return sb.toString();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // keep the interrupt visible to the caller
            log.error("captcha recognition was interrupted", e);
        } catch (Exception e) {
            log.error("captcha recognition failed", e);
        } finally {
            // Delete the generated captcha image.
            if (fileName != null) {
                File captchaFile = new File(fileName);
                if (captchaFile.exists() && !captchaFile.delete()) {
                    log.warn("could not delete captcha file " + fileName);
                }
            }
        }
        return null;
    }

    /**
     * Returns a random three-digit number.
     *
     * @return a value in the range 100..999 (inclusive)
     */
    public int getRandom() {
        return 100 + (int) (Math.random() * 900);
    }
}

工具类:

import com.asprise.util.ocr.OCR;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;


public class ParseJPEG_withOCR {
    public static String getRecogniseStr(InputStream imageFile) {
        String s = "";
        try {
            BufferedImage image = ImageIO.read(imageFile);
            int width = image.getTileWidth();
            int height = image.getTileHeight();
            image = image.getSubimage(0, 0, width, height);
            s = new OCR().recognizeEverything(image);
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println(" 图片识别失败! ");
        }
        return s;
    }
    public static String getRecogniseStrByFile(File imageFile) {
        String s = "";
        try {
            BufferedImage image = ImageIO.read(imageFile);
            int width = image.getTileWidth();
            int height = image.getTileHeight();
            image = image.getSubimage(0, 0, width, height);
            s = new OCR().recognizeEverything(image);
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println(" 图片识别失败! ");
        }
        return s;
    }

    public static void main(String[] args) {
//        for (int i = 0; i < 100; i++) {
//            String code = getRecogniseStrByFile(new File("D:\\pic\\download/" + i + ".jpeg"));
//            System.out.println(code);
//        }
    }

}

python脚本(文件名为 captcha.py,保存在 Linux 的 /tmp 目录下,与验证码图片位于同一目录):执行命令 python /tmp/captcha.py <验证码图片路径> 即可输出识别结果;pytesseract 类库可自行搜索下载:

from PIL import Image
import sys
import pytesseract

def output(imgfile):
    """Print the text recognised in the captcha image at ``imgfile``.

    The image is converted to greyscale (mode ``'L'``) and handed to
    pytesseract with the ``-psm 7`` option; the result goes to stdout.
    """
    grayscale = Image.open(imgfile).convert('L')
    recognised = pytesseract.image_to_string(grayscale, config='-psm 7')
    print(recognised)


if __name__ == "__main__":
    # The path of the image to recognise is the first command-line argument.
    output(sys.argv[1])
1
0

查看评论
* 以上用户言论只代表其个人观点,不代表CSDN网站的观点或立场
    个人资料
    • 访问:6727次
    • 积分:390
    • 等级:
    • 排名:千里之外
    • 原创:31篇
    • 转载:1篇
    • 译文:1篇
    • 评论:1条
    最新评论