package com.alipay.dtcrawler;
import com.alipay.dtcrawler.core.service.page.Html;
import com.alipay.zdal.common.lang.StringUtil;
import jodd.http.HttpRequest;
import jodd.http.HttpResponse;
import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.io.FileUtils;
import sun.misc.BASE64Decoder;
import sun.misc.BASE64Encoder;
import us.codecraft.webmagic.selector.Selectable;
import javax.imageio.ImageIO;
import javax.script.ScriptException;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Created by guanzhi on 2015/12/22.
*/
public class CourtExec {
// Base64 codec instances. encoder is used by encodeImage; decoder is only referenced by
// commented-out code in the visible file.
static BASE64Encoder encoder = new BASE64Encoder();
static BASE64Decoder decoder = new BASE64Decoder();
// Header values sent with outgoing requests (see getMethod, doPostSearch and saveData).
private static final String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
private static final String host = "zhixing.court.gov.cn";
private static
final String user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.10 Safari/537.36";
//private static final String user_agent = "Mozilla/5.0 (MSIE 9.0; qdesk 2.4.1266.203; Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko";
private static final String cache_control = "no-cache";
private static final String accept_language = "zh-CN,zh;q=0.8";
private static final String accept_encoding = "gzip, deflate, sdch";
// Despite its name this is used as a request URL: getCookie fetches it and doPostSearch
// posts the search form to it. It is never sent as a Referer header in the visible code.
private static final String Referer = "http://www.XX.com";
// Counter incremented by Consumer after each processed name; starts at 1.
private static int court_number = 1;
// Sleep time in milliseconds used by doProcess after a RequestException; doPostSearch
// resets it to 0 when a search request returns HTTP 200.
private static int court_time =0;
/**
 * Entry point: starts one {@link Producer} that feeds names from the input file into a
 * bounded queue and ten {@link Consumer}s that process them, then waits for the consumers.
 *
 * <p>Expected arguments:
 * <ol start="0">
 *   <li>path of the input file (one name per line)</li>
 *   <li>directory for captcha images</li>
 *   <li>path of the output file that receives the name ids</li>
 *   <li>path of the file that stores the last line number read (used to resume)</li>
 * </ol>
 *
 * @param args the four arguments described above
 * @throws Exception declared for compatibility; failures are caught and printed
 */
public static void main(String[] args) throws Exception {
    // Validate the arguments before touching args[3]; otherwise a missing argument
    // surfaces as an ArrayIndexOutOfBoundsException instead of the usage message.
    if (args.length != 4) {
        System.out.println("请检查是否缺少设置参数!");
        return;
    }
    ExecutorService service = null;
    try {
        // Load the resume position, creating the record file on first run.
        File file = new File(args[3]);
        if (!file.exists()) {
            file.createNewFile();
        }
        String last_number_line = FileUtils.readFileToString(file);
        int line = 0;
        if (StringUtil.isNotBlank(last_number_line)) {
            // trim() so that a trailing newline in the record file does not break parsing.
            line = Integer.parseInt(last_number_line.trim());
        }
        // Thread pool: 1 producer + 10 consumers.
        service = Executors.newFixedThreadPool(11);
        // Bounded queue: the producer blocks once 10 names are waiting.
        BlockingQueue<String> basket = new LinkedBlockingQueue<String>(10);
        List<Callable<Integer>> callers = new ArrayList<Callable<Integer>>();
        CourtExec courtExec = new CourtExec();
        Producer producer = courtExec.new Producer(basket, args[0], args[3], line);
        Future<Integer> re = service.submit(producer);
        for (int i = 0; i < 10; i++) {
            callers.add(courtExec.new Consumer(basket, re, args[1], args[2]));
        }
        service.invokeAll(callers);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Always stop the pool, even when invokeAll fails, so that the non-daemon
        // worker threads cannot keep the JVM alive.
        if (service != null) {
            service.shutdownNow();
        }
    }
}
/**
 * Re-encodes an image file as JPEG and returns the bytes as a Base64 string.
 *
 * @param imgFile image file to read
 * @return the Base64 text of the JPEG bytes, or {@code null} when {@code ImageIO}
 *         cannot decode the file as an image
 * @throws IOException if reading the file or writing the JPEG fails
 */
public static String encodeImage(File imgFile) throws IOException {
    final BufferedImage image = ImageIO.read(imgFile);
    // ImageIO.read yields null when no registered reader understands the file.
    if (image == null) {
        return null;
    }
    final ByteArrayOutputStream jpegBytes = new ByteArrayOutputStream();
    ImageIO.write(image, "jpg", jpegBytes);
    return encoder.encode(jpegBytes.toByteArray());
}
/**
* 图片解码
* @param imgCode
* @return
*/
/* public static String decodeImage(String imgCode) throws IOException {
byte[] img = decoder.decodeBuffer(imgCode);
ByteArrayInputStream bais = new ByteArrayInputStream(img);
BufferedImage bi1 = ImageIO.read(bais);
File w2 = new File(“d://11.jpg”);
ImageIO.write(bi1, “jpg”, w2);
}*/
/**
 * Downloads the captcha image and stores it as {@code <img_url>/<i>.jpg}.
 *
 * @param imgUrl  URL of the captcha image
 * @param i       number used as the file name of the saved image
 * @param img_url directory the image is written to
 * @return a map with the single key {@code "imgUrl"} holding the path of the saved file
 * @throws Exception if the request fails, the response has no body, or the file cannot be written
 */
public static Map<String, String> getImage(String imgUrl, int i, String img_url) throws Exception {
    Map<String, String> map = new HashMap<String, String>();
    HttpClient httpClient = new HttpClient();
    GetMethod request = getMethod(imgUrl, null);
    String saveImg = img_url + "/" + i + ".jpg";
    try {
        httpClient.executeMethod(request);
        // try-with-resources tolerates a null resource, so the null check happens inside.
        try (InputStream inputStream = request.getResponseBodyAsStream()) {
            if (inputStream == null) {
                throw new IOException("captcha response has no body: " + imgUrl);
            }
            saveFile(inputStream, saveImg);
        }
    } finally {
        // Hand the connection back to the HttpClient connection manager; without this
        // every call leaks one connection.
        request.releaseConnection();
    }
    map.put("imgUrl", saveImg);
    return map;
}
/**
 * Calls the captcha recognition service with a Base64-encoded image.
 *
 * <p>The response body is parsed as comma-separated {@code key:value} pairs: the first
 * pair carries the status and the second the recognised code.
 *
 * @param imgcode Base64 text of the captcha image
 * @return the recognised code when the status value is {@code "1"}; {@code null} when
 *         {@code imgcode} is {@code null}, the body is blank or malformed, or the status
 *         is anything else
 */
public static String postData(String imgcode) {
    if (imgcode == null) {
        return null;
    }
    String encodedImage;
    try {
        // Base64 text contains '+', '/', '=' and line breaks, which must be escaped
        // before being placed in a query string.
        encodedImage = java.net.URLEncoder.encode(imgcode, "UTF-8");
    } catch (UnsupportedEncodingException e) {
        // UTF-8 is a charset every JVM is required to support.
        throw new IllegalStateException("UTF-8 is not supported", e);
    }
    HttpResponse response = HttpRequest.get("http://120.0.0.1:8080/getVerifyCode?img_content=" + encodedImage + "&img_format=jpeg&char_length=5&char_type=2&variable=true&variable_length=4").send();
    String body = response.body();
    if (StringUtil.isBlank(body)) {
        return null;
    }
    // Guard each split so that an unexpected body yields null instead of an
    // ArrayIndexOutOfBoundsException.
    String[] fields = body.split(",");
    if (fields.length < 2) {
        return null;
    }
    String[] status = fields[0].split(":");
    String[] code = fields[1].split(":");
    if (status.length < 2 || code.length < 2 || !"1".equals(status[1])) {
        return null;
    }
    return code[1];
}
/**
 * Creates a GET request for {@code url} carrying this crawler's fixed request headers.
 *
 * @param url    target URL
 * @param cookie value for the {@code Cookie} header; the header is omitted when blank
 * @return the prepared, not yet executed request
 */
public static GetMethod getMethod(String url, String cookie) {
    final GetMethod request = new GetMethod(url);
    request.setRequestHeader("Accept", accept);
    request.setRequestHeader("Accept-Encoding", accept_encoding);
    request.setRequestHeader("Accept-Language", accept_language);
    request.setRequestHeader("Cache-Control", cache_control);
    final boolean cookieMissing = StringUtil.isBlank(cookie);
    if (!cookieMissing) {
        request.setRequestHeader("Cookie", cookie);
    }
    request.setRequestHeader("Host", host);
    request.setRequestHeader("User-Agent", user_agent);
    return request;
}
/**
 * Copies everything readable from {@code inputStream} into the file {@code saveImg},
 * replacing any existing content.
 *
 * <p>The input stream is not closed here; it stays owned by the caller.
 *
 * @param inputStream source of the bytes
 * @param saveImg     path of the file to write
 * @throws Exception if the file cannot be opened or a read/write fails
 */
private static void saveFile(InputStream inputStream, String saveImg) throws Exception {
    // try-with-resources closes the file even when read or write throws.
    try (FileOutputStream fos = new FileOutputStream(new File(saveImg))) {
        byte[] buf = new byte[1024];
        int len;
        while ((len = inputStream.read(buf)) != -1) {
            fos.write(buf, 0, len);
        }
        fos.flush();
    }
}
//获取人名
class Producer implements Callable<Integer> {
private BlockingQueue<String> basket;
private String file_path;
private int line_number;
private String record_line;
public Producer(BlockingQueue<String> basket, String file_path,String record_line, int line_number) {
this.basket = basket;
this.file_path = file_path;
this.line_number = line_number;
this.record_line = record_line;
}
@Override
public Integer call() throws Exception {
BufferedReader br = new BufferedReader(new FileReader(file_path));
String str = null;
int i = 0;
while ((str = br.readLine()) != null) {
try {
i++;
if (line_number>21 && i>=line_number-21){
basket.put(str);
File writer = new File(record_line);
FileUtils.writeStringToFile(writer, i + "", false);
}else if (line_number<=21){
basket.put(str);
File writer = new File(record_line);
FileUtils.writeStringToFile(writer, i + "", false);
}
} catch (InterruptedException e) {
e.printStackTrace();
}
}
br.close();
return 0;
}
}
/**
 * Takes names from the shared queue and processes each one with
 * {@link #doProcess(String, String, String)} until the producer has finished and the
 * queue is empty.
 */
class Consumer implements Callable<Integer> {
    private final BlockingQueue<String> basket;
    private final Future<Integer> re;
    private final String img_url;
    private final String save_data_url;

    /**
     * @param basket        queue shared with the producer
     * @param re            future of the producer task, used to detect end of input
     * @param img_url       directory for captcha images, passed through to doProcess
     * @param save_data_url output file path, passed through to doProcess
     */
    public Consumer(BlockingQueue<String> basket, Future<Integer> re, String img_url, String save_data_url) {
        this.basket = basket;
        this.re = re;
        this.img_url = img_url;
        this.save_data_url = save_data_url;
    }

    /**
     * Drains the queue.
     *
     * @return always 0
     * @throws Exception if processing fails or the thread is interrupted while waiting
     */
    @Override
    public Integer call() throws Exception {
        // Check isDone() before the queue: once the producer is done no more items can
        // arrive, so the emptiness check that follows cannot miss a late put().
        while (!re.isDone() || !basket.isEmpty()) {
            // Timed poll: wait briefly for work instead of spinning on an empty queue.
            String name = basket.poll(100, TimeUnit.MILLISECONDS);
            if (StringUtil.isNotBlank(name)) {
                doProcess(img_url, save_data_url, name);
                // court_number is a static shared by all consumer threads and ++ is not
                // atomic, so serialise the increment.
                synchronized (CourtExec.class) {
                    court_number++;
                }
            }
        }
        return 0;
    }
}
/**
 * Looks up one name and appends every id found to the output file, retrying until the
 * lookup succeeds.
 *
 * <p>Each attempt fetches a fresh cookie, runs the search with an empty captcha code
 * (the captcha download/recognition step via getImage, encodeImage and postData is
 * currently disabled) and writes one id per line.
 *
 * <p>Retry policy: a {@code RequestException} (raised for HTTP 404) increases the shared
 * back-off by 5 minutes, capped at 50 minutes, and sleeps for that long; any other
 * exception sleeps 10 seconds. Retries run in a loop rather than by recursion so that a
 * long outage cannot overflow the stack.
 *
 * @param img_url       directory for captcha images (unused while the captcha step is disabled)
 * @param save_data_url path of the output file the ids are appended to
 * @param name          name to search for
 * @throws Exception if the thread is interrupted while sleeping between attempts
 */
public static void doProcess(String img_url, String save_data_url, String name) throws Exception {
    while (true) {
        try {
            // Obtain a fresh cookie for this attempt.
            String cookie = getCookie(Referer);
            // Post the search and collect the ids from all result pages.
            List<String> nameIdList = doPostSearch(cookie, "", name);
            File writer = new File(save_data_url);
            if (!writer.exists()) {
                writer.createNewFile();
            }
            for (String nameId : nameIdList) {
                FileUtils.writeStringToFile(writer, nameId + "\n", true);
            }
            return;
        } catch (RequestException re) {
            System.out.println(re);
            // Add 5 minutes per failure, never exceeding 50 minutes. Work on a local
            // copy so the sleep uses the value computed here.
            int backoff = Math.min(court_time + 300000, 3000000);
            court_time = backoff;
            Thread.sleep(backoff);
        } catch (Exception e) {
            e.printStackTrace();
            // Sleep 10 seconds before the next attempt.
            Thread.sleep(10000);
        }
    }
}
/**
 * Requests {@code url} and returns the cookies the server set, formatted for a
 * {@code Cookie} request header.
 *
 * <p>Each cookie is rendered as {@code name=value;} and one extra {@code ;} is appended
 * at the end, so a non-empty result ends in {@code ;;}. That format is kept unchanged.
 *
 * @param url URL to request
 * @return the concatenated cookies followed by {@code ;} (just {@code ";"} when none were set)
 * @throws IOException     if the request fails
 * @throws ScriptException never thrown by the current implementation; kept in the
 *                         signature for compatibility
 */
private static String getCookie(String url) throws IOException, ScriptException {
    HttpClient httpClient = new HttpClient();
    GetMethod request = getMethod(url, null);
    try {
        httpClient.executeMethod(request);
    } finally {
        // Only the cookies held in the client state are needed; release the connection
        // so that it is not leaked.
        request.releaseConnection();
    }
    StringBuilder cookieStr = new StringBuilder();
    for (Cookie cookie : httpClient.getState().getCookies()) {
        cookieStr.append(cookie.getName()).append('=').append(cookie.getValue()).append(';');
    }
    return cookieStr.append(';').toString();
}
/**
 * Posts the search form for {@code name} and collects the record ids from every result page.
 *
 * <p>The first response supplies the page count (read from the {@code totalPage = N;}
 * text in the body); pages 2..N are then requested one by one. A successful first page
 * resets the shared back-off time to 0.
 *
 * @param cookie  value of the {@code Cookie} header to send
 * @param imgCode captcha code submitted with pages 2 and later
 * @param name    name to search for
 * @return the ids found on all pages, in page order; blank ids are skipped
 * @throws Exception {@code RequestException} when a page answers HTTP 404;
 *                   {@code IOException} when a page still fails after the retry limit
 */
public static List<String> doPostSearch(String cookie, String imgCode, String name) throws Exception {
    List<String> nameIdList = new ArrayList<>();
    // NOTE(review): page 1 always submits the literal 7709 as j_captcha and ignores
    // imgCode, while later pages submit imgCode. Kept as found — confirm which is intended.
    String firstBody = fetchSearchPage(cookie, name, 7709, 1);
    // The request succeeded, so reset the back-off time.
    court_time = 0;
    int totalPage = 0;
    String total = getMatchStr(firstBody, "totalPage\\s=\\s(\\d+);");
    if (StringUtil.isNotBlank(total)) {
        totalPage = Integer.parseInt(total);
    }
    collectNameIds(firstBody, nameIdList);
    // Remaining pages, starting from the second.
    for (int page = 2; page <= totalPage; page++) {
        collectNameIds(fetchSearchPage(cookie, name, imgCode, page), nameIdList);
    }
    return nameIdList;
}

/**
 * Posts the search form for one result page and returns the response body, trying a
 * bounded number of times instead of looping forever on a failing status.
 */
private static String fetchSearchPage(String cookie, String name, Object captcha, int page) throws Exception {
    final int maxAttempts = 3;
    int status = -1;
    for (int attempt = 0; attempt < maxAttempts; attempt++) {
        HttpRequest request = HttpRequest.post(Referer).timeout(60000)
                .header("Cookie", cookie).header("User-Agent", user_agent);
        if (page <= 1) {
            request = request.form(
                    "searchCourtName", "全国法院(包含地方各级法院)",
                    "selectCourtId", "1",
                    "selectCourtArrange", "1",
                    "pname", name,
                    "cardNum", "",
                    "j_captcha", captcha);
        } else {
            request = request.form(
                    "currentPage", page,
                    "selectCourtId", "1",
                    "selectCourtArrange", "1",
                    "pname", name,
                    "cardNum", "",
                    "j_captcha", captcha);
        }
        HttpResponse response = request.send();
        status = response.statusCode();
        if (status == 404) {
            throw new RequestException("404");
        }
        if (status == 200) {
            return response.body();
        }
    }
    throw new IOException("search request for page " + page + " returned HTTP " + status
            + " after " + maxAttempts + " attempts");
}

/**
 * Extracts the numeric {@code id="..."} of every result-table row except the first
 * (which is skipped, as in the original loop) and appends the non-blank ones.
 */
private static void collectNameIds(String body, List<String> nameIdList) {
    List<Selectable> rows = Html.create(body).$("#Resultlist").xpath("//tbody/tr").nodes();
    for (int row = 1; row < rows.size(); row++) {
        String nameId = getMatchStr(rows.get(row).toString(), "id=\"(\\d+)\"");
        if (StringUtil.isNotBlank(nameId)) {
            nameIdList.add(nameId);
        }
    }
}
/**
 * Fetches the detail page of every id and appends it to the output file as
 * {@code [data_url:<url>;<body>]}, one record per line.
 *
 * <p>Each id is tried a bounded number of times. Failures are printed rather than
 * silently swallowed, and an id that keeps failing is reported and skipped so that one
 * bad record cannot stall the whole run.
 *
 * @param nameIdList    ids whose detail pages should be stored
 * @param cookie        value of the {@code Cookie} header to send
 * @param save_data_url path of the output file the records are appended to
 * @throws Exception declared for compatibility; per-id failures are handled internally
 */
public static void saveData(List<String> nameIdList, String cookie, String save_data_url) throws Exception {
    final int maxAttempts = 3;
    File writer = new File(save_data_url);
    for (String nameId : nameIdList) {
        String getUrl = "http://www.xx.com/detail?id=" + nameId;
        boolean saved = false;
        for (int attempt = 1; attempt <= maxAttempts && !saved; attempt++) {
            try {
                HttpResponse get = HttpRequest.get(getUrl).timeout(60000)
                        .header("Cookie", cookie).header("User-Agent", user_agent)
                        .send();
                if (get.statusCode() == 200) {
                    // The body is re-decoded from ISO-8859-1 bytes as UTF-8, as before.
                    String name_data = "[data_url:" + getUrl + ";"
                            + new String(get.body().getBytes("iso-8859-1"), "utf-8").trim() + "]";
                    if (!writer.exists()) {
                        writer.createNewFile();
                    }
                    FileUtils.writeStringToFile(writer, name_data + "\n", true);
                    saved = true;
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        if (!saved) {
            System.out.println("saveData: giving up on " + getUrl + " after " + maxAttempts + " attempts");
        }
    }
}
/**
 * Returns capture group 1 of the last match of {@code rgex} in {@code obj}.
 *
 * @param obj  text to search
 * @param rgex regular expression containing at least one capture group
 * @return group 1 of the final match, or the empty string when nothing matches
 */
public static String getMatchStr(String obj, String rgex) {
    final Matcher scanner = Pattern.compile(rgex).matcher(obj);
    String lastGroup = "";
    // Walk through every match; later matches overwrite earlier ones.
    for (boolean found = scanner.find(); found; found = scanner.find()) {
        lastGroup = scanner.group(1);
    }
    return lastGroup;
}
}