1.业务需求:
从指定外网抓点货,冷启动
2.站点分析:
.限制IP…
.需要登录……
.对登录账号有抓取频率限制……….
.允许的抓取频率过低,稍一超出就直接跳验证码页面…………..
.验证码长度、模样(纯数字&字母数字混合)TM不固定………………..
“我们能不能不抓了?”
“不行!必须得抓…”
“……”
这么说,此前写的爬虫,多线程、生产者—>消费者 并发抓取压根行不通。多线程毫无意义。
3.使用技术:
1.HttpClient
:读取指定URL网页内容
2.Jsoup
:解析所要的页面数据——省得写恶心的正则表达式
3.Swing
:绘制用户操作界面
4.Tess4J
:自动识别验证码(http://tess4j.sourceforge.net/)
5.Exe4J
:生成可独立运行的exe程序——给每人机器安装一个,大家一起监控抓~
4.实现要点:
1.代理IP
从一些网站上抓取代理IP,并检测是否可以使用,如下:
package com.ydj.zhuaqu.proxy;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
import org.apache.commons.collections.map.LRUMap;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import com.ydj.common.kit.MyLog;
/**
*
* @author : Ares.yi
* @createTime : 2014-11-10 上午11:13:42
* @version : 1.0
* @description :
*
*/
public class ProxyIpPool {
/**设置最多IP数*/
private static final int MAX_IP = 100;
/**设置最少IP数(最好控制和外部使用线程数一致)*/
@SuppressWarnings("unused")
private static final int MIN_IP = 10;
// public static ConcurrentHashMap<Integer,Integer> canUseIPs = new ConcurrentHashMap<Integer,Integer>();
public static List<ProxyIp> canUseIpList = Collections.synchronizedList(new ArrayList<ProxyIp>(MAX_IP));
private static LRUMap notCanUseIPsTemp = new LRUMap(2000);
/**每次抓取IP数*/
private static final int NUM = 20;
private static final String ORDER_ID = "904557733280949";
private static final String KDL_URL = "http://dev.kuaidaili.com/api/getproxy?orderid="+ORDER_ID+"&num="+NUM+"&quality=1&an_ha=1&dedup=1&format=json";
private ProxyIpPool(){
}
/**
* 启动抓取代理IP线程
*
* @author : Ares.yi
* @createTime : 2015年10月29日 下午5:58:54
*/
public static void startCrawl(){
final int period = 3;
ScheduledExecutorService scheduledExecutorService = Executors.newScheduledThreadPool(1);
scheduledExecutorService.scheduleAtFixedRate(new Runnable() {
int i = 0 ;
@Override
public void run() {
produceIP(i);
i++;
}
}, 1, period,TimeUnit.MINUTES);
}
private static void produceIP(int i){
int currentSize = canUseIpList.size();
if( currentSize >= MAX_IP){
MyLog.logInfo(i+":current proxyPool size is:"+currentSize+",no need crawl new ip.NotCanUseIPsTemp size is:"+notCanUseIPsTemp.size());
return ;
}
JSONArray ips = getIPFromKuaiDaiLi();
produceIP(ips);
MyLog.logInfo(i+":current proxyPool size is:"+canUseIpList.size()+",notCanUseIPsTemp size is:"+notCanUseIPsTemp.size());
}
private static void produceIP(JSONArray ips){
if(ips == null || ips.isEmpty()){
return ;
}
for(int i = 0 ;i < ips.size() ;i++ ){
Object one = ips.get(i);
String s[] = one.toString().split(":");
String ip = s[0];
int port = Integer.valueOf(s[1]);
ProxyIp proxyIp = new ProxyIp(ip, port);
if(isCanUse(ip, port)){
addIP(proxyIp);
}else{
removeIP(proxyIp);
}
}
}
public static ProxyIp useOneProxyIp(){
if(canUseIpList.isEmpty()){
MyLog.logInfo(Thread.currentThread().getName()+" useOneProxyIp,but proxyPool is empty,need to wait 2 min crawl IP.");
try {
Thread.sleep(2 * 60 * 1000);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
Collections.sort(canUseIpList);
ProxyIp proxyIp = canUseIpList.remove(0);
proxyIp.useThis();
return proxyIp;
}
public static void returnProxyIp(ProxyIp proxyIp){
proxyIp.setUseing(false);
canUseIpList.add(proxyIp);
return ;
}
/**
* 从快代理网站获取代理IP
* @return
*
* @author : Ares.yi
* @createTime : 2015年10月29日 下午2:36:05
*/
private static JSONArray getIPFromKuaiDaiLi(){
JSONArray ips = new JSONArray();
HttpClient client = new HttpClient();
GetMethod method = new GetMethod(KDL_URL);
HttpMethodParams param = method.getParams();
param.setContentCharset("UTF-8");
try {
client.executeMethod(method);
String res = method.getResponseBodyAsString();
JSONObject json = JSONObject.fromObject(res);
if(json != null && json.containsKey("data")){
ips = json.getJSONObject("data").getJSONArray("proxy_list");
MyLog.logInfo(ips);
}
} catch (Exception e) {
e.printStackTrace();
}
return ips;
}
/**
* 从更多的网站获取代理IP
* @return
*
* @author : Ares.yi
* @createTime : 2015年10月29日 下午2:46:40
*/
@SuppressWarnings("unused")
private static JSONArray getIPFromXXX(){
JSONArray ips = new JSONArray();
HttpClient client = new HttpClient();
GetMethod method = new GetMethod("XXX");
HttpMethodParams param = method.getParams();
param.setContentCharset("UTF-8");
try {
client.executeMethod(method);
String res = method.getResponseBodyAsString();
JSONObject json = JSONObject.fromObject(res);
if(json != null && json.containsKey("data")){
ips = json.getJSONObject("data").getJSONArray("proxy_list");
MyLog.logInfo(ips);
}
} catch (Exception e) {
e.printStackTrace();
}
return ips;
}
/**
* 检测代理IP是否可用
*
* @param ip
* @param port
* @return
*
* @author : Ares.yi
* @createTime : 2015年10月29日 下午2:37:22
*/
private static boolean isCanUse(String ip,int port){
if(port < 0 ){
return false;
}
if(notCanUseIPsTemp.containsKey(ip)){
MyLog.logInfo(ip+":"+port+" can't use again.");
return false;
}
if(!checkIp(ip, port)){
return false;
}
return checkIpUseTargetSite(ip, port);
}
/**
* 检测代理IP是否可用
*
* @param ip
* @param port
* @return
*
* @author : Ares.yi
* @createTime : 2015年10月29日 下午12:35:28
*/
private static boolean checkIp(String ip,int port){
Socket server = null;
try {
server = new Socket();
InetSocketAddress address = new InetSocketAddress(ip,port);
server.connect(address, 3000);
MyLog.logInfo(ip+":"+port+" is ok!");
return true;
}catch (UnknownHostException e) {
//e.printStackTrace();
MyLog.logInfo(ip+":"+port+" is wrong!");
} catch (IOException e) {
//e.printStackTrace();
MyLog.logInfo(ip+":"+port+" is wrong!!");
}
return false;
}
/**
* 到目标网站准确检测代理IP是否可用
*
* @param ip
* @param port
* @return
*
* @author : Ares.yi
* @createTime : 2015年10月29日 下午12:06:03
*/
private static boolean checkIpUseTargetSite(String ip,int port){
HttpClientBuilder httpClientBuilder = HttpClientBuilder.create();
CloseableHttpClient closeableHttpClient = httpClientBuilder.build();
HttpHost proxy = new HttpHost(ip,port, "http");
RequestConfig config = RequestConfig.custom().setConnectTimeout(3000).setSocketTimeout(3000).setProxy(proxy).build();
HttpGet httpGet = new HttpGet("http://www.autozi.com/partCategory.html/");
httpGet.setConfig(config);
try {
CloseableHttpResponse response = closeableHttpClient.execute(httpGet);
HttpEntity httpentity = response.getEntity();
String html = EntityUtils.toString(httpentity, "UTF-8");
if(Jsoup.parse(html).select("div[class=header fix]").first() != null){
return true;
}
} catch (Exception exc){
// exc.printStackTrace();
MyLog.logError(exc.getMessage());
}
return false;
}
public static void removeIP(ProxyIp proxyIp){
canUseIpList.remove(proxyIp);
notCanUseIPsTemp.put(proxyIp.getIp(),proxyIp.getPort());
}
public static void addIP(ProxyIp proxyIp){
canUseIpList.add(proxyIp);
notCanUseIPsTemp.remove(proxyIp.getIp());
}
/**
* 测试使用代理IP
*
* @author : Ares.yi
* @createTime : 2015年10月29日 下午6:00:16
*/
private static void testUseProxyIp(){
ExecutorService threadPool = Executors.newFixedThreadPool(10);
for(int i=0 ;i <20 ;i++){
final int flag = i;
threadPool.execute(new Runnable() {
@Override
public void run() {
ProxyIp proxyIp = useOneProxyIp();
MyLog.logInfo(flag+" job "+Thread.currentThread().getName()+" get proxyIp is : "+proxyIp.toString());
long millis = new Random().nextInt(10) * 1000;
try {
Thread.sleep(millis);//每个线程随机sleep N秒,模拟线程在工作
} catch (InterruptedException e) {
e.printStackTrace();
}
returnProxyIp(proxyIp);
MyLog.logInfo(flag+" job "+Thread.currentThread().getName()+" use proxyIp is : "+proxyIp.toString()+",work use time "+millis+" end and return to pool.");
}
});
}
}
}
使用代理IP:
/**
 * Fetches the content of a page through an HTTP proxy.
 *
 * <p>Connect and socket timeouts are 3 seconds. The client and the response are always
 * released, whichever path the method takes.
 *
 * @param url address of the page to fetch
 * @param proxyIp proxy host
 * @param proxyPort proxy port
 * @return the response body decoded as UTF-8; {@code "cannot connect"} if the request fails or
 *         the response has no entity; an empty string if the body cannot be read
 * @throws ParseException declared for compatibility with existing callers
 * @throws IOException declared for compatibility with existing callers
 *
 * @author : Ares.yi
 * @createTime : 2015-10-30 09:55:21
 */
public static String getHtml(String url,String proxyIp,int proxyPort) throws ParseException, IOException {
    CloseableHttpClient closeableHttpClient = HttpClientBuilder.create().build();
    CloseableHttpResponse response = null;
    try {
        HttpHost proxy = new HttpHost(proxyIp, proxyPort, "http");
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(3000)
                .setSocketTimeout(3000)
                .setProxy(proxy)
                .build();
        // NOTE(review): the request is sent as POST although the method name and the log
        // message say "get" - confirm which verb the target site expects.
        HttpPost request = new HttpPost(url);
        request.setConfig(config);

        try {
            response = closeableHttpClient.execute(request);
        } catch (Exception exc) {
            exc.printStackTrace();
            System.out.println("get请求失败!");
            return "cannot connect";
        }

        HttpEntity httpEntity = response.getEntity();
        if (httpEntity == null) {
            return "cannot connect";
        }
        try {
            return EntityUtils.toString(httpEntity, "UTF-8");
        } catch (Exception excep) {
            // Report which page failed and why instead of dropping the cause.
            System.out.println(url);
            excep.printStackTrace();
            return "";
        }
    } finally {
        if (response != null) {
            try {
                response.close();
            } catch (IOException ignored) {
                // The result is already decided; a failed close must not replace it.
            }
        }
        try {
            closeableHttpClient.close();
        } catch (IOException ignored) {
            // Same reasoning as above.
        }
    }
}
2.模拟登录
提取登录Cookie和User-Agent:
代码片段,如下:
/**
 * Sends a form POST carrying the stored login cookie and user agent.
 *
 * @param url address to post to
 * @param parameterMap form fields to send
 * @param charSet charset used to encode the form body
 * @return {@code "SUCCESS"} if the response has a non-empty {@code Location} header;
 *         otherwise whatever {@code printResponse} returns for the response;
 *         an empty string if the request fails with an I/O error
 * @throws UnsupportedEncodingException if {@code charSet} is not supported
 */
public static String postRequest(String url,
        Map<String, String> parameterMap, String charSet)
        throws UnsupportedEncodingException {
    // Build the request before creating the client: these steps can throw, and a client
    // created earlier would then never be closed.
    UrlEncodedFormEntity postEntity = new UrlEncodedFormEntity(getParam(parameterMap), charSet);
    HttpPost httpPost = new HttpPost(url);
    httpPost.setEntity(postEntity);
    // NOTE(review): the HOST header is fixed to sec.1688.com regardless of url.
    httpPost.addHeader("HOST", "sec.1688.com");
    httpPost.addHeader("User-Agent", Constant.userAgent);
    httpPost.addHeader("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    httpPost.addHeader("Cookie", Constant.cookie);
    MyLog.logInfo("request line:" + httpPost.getRequestLine());

    CloseableHttpClient client = HttpClients.createDefault();
    try {
        HttpResponse httpResponse = client.execute(httpPost);
        Header header = httpResponse.getFirstHeader("Location");
        if (header != null && Toolbox.isNotEmpty(header.getValue())) {
            // A Location header is treated as the sign that the post was accepted.
            MyLog.logInfo("location:" + header.getValue());
            return "SUCCESS";
        }
        return printResponse(httpResponse);
    } catch (IOException e) {
        MyLog.logError("post request to " + url + " failed: " + e);
    } finally {
        try {
            client.close();
        } catch (IOException ignored) {
            // The result is already decided; nothing useful can be done about a failed close.
        }
    }
    return "";
}
3.验证码
获取输入验证码页面信息:
/**
 * Extracts the hidden form fields and the captcha session id from a captcha page.
 *
 * @param url address recorded in the returned form data
 * @param checkCodePageHtml HTML of the captcha page
 * @return the parsed form data, or {@code null} if {@code checkCodePageHtml} is empty
 */
public static Ali1688CheckCodeFormData getCheckCodeFormData(String url,String checkCodePageHtml){
    if (Toolbox.isEmptyString(checkCodePageHtml)) {
        return null;
    }
    Document doc = Jsoup.parse(checkCodePageHtml);
    String action = doc.select("input[name=action]").attr("value");
    String event_submit_do_query = doc.select("input[name=event_submit_do_query]").attr("value");
    String smPolicy = doc.select("input[name=smPolicy]").attr("value");
    String smReturn = doc.select("input[name=smReturn]").attr("value");
    String smApp = doc.select("input[name=smApp]").attr("value");
    String smCharset = doc.select("input[name=smCharset]").attr("value");
    String smTag = doc.select("input[name=smTag]").attr("value");
    String smSign = doc.select("input[name=smSign]").attr("value");
    String identity = doc.select("input[name=identity]").attr("value");
    String captcha = doc.select("input[name=captcha]").attr("value");
    // The session id travels as a query parameter of the captcha image URL.
    String sessionid = extractSessionId(doc.select("img[id=checkcodeImg]").attr("src"));
    return new Ali1688CheckCodeFormData(action, event_submit_do_query, smPolicy, smReturn, smApp, smCharset, smTag, smSign, identity, captcha, sessionid,url);
}

/**
 * Returns the value of the {@code sessionid} query parameter in {@code src}: the text after
 * the first {@code "sessionid="} up to the next {@code '&'} or the end of the string.
 * Returns an empty string if the parameter is absent.
 */
private static String extractSessionId(String src) {
    final String key = "sessionid=";
    int keyPos = src.indexOf(key);
    if (keyPos < 0) {
        return "";
    }
    int start = keyPos + key.length();
    // Search for '&' only after the key, so earlier parameters cannot cut the range short.
    int end = src.indexOf('&', start);
    return end < 0 ? src.substring(start) : src.substring(start, end);
}
提交验证码:
/**
 * Submits a captcha answer using the form data stored in {@code Constant.ali1688CheckCodeFormData}.
 *
 * <p>If the submission is not accepted, the response is parsed as a new captcha page and the
 * stored form data is replaced with the result.
 *
 * @param checkcode captcha text to submit
 * @return {@code "SUCCESS"} if the post reports success; an empty string otherwise, including
 *         when no form data is stored
 * @throws UnsupportedEncodingException if UTF-8 is not supported by the runtime
 * @throws IOException declared for compatibility with existing callers
 */
public static String submitCheckCode(String checkcode) throws UnsupportedEncodingException, IOException{
    Ali1688CheckCodeFormData formData = Constant.ali1688CheckCodeFormData;
    if (formData == null) {
        // Nothing to submit: no captcha page has been parsed, or the last one was unreadable.
        return "";
    }
    String smApp = formData.getSmApp();
    String smPolicy = formData.getSmPolicy();
    String smCharset = formData.getSmCharset();
    String smTag = formData.getSmTag();
    String smReturn = formData.getSmReturn();
    String smSign = formData.getSmSign();

    // Encode each value separately; encoding the assembled string would also escape the
    // '=' and '&' separators and collapse the query into one meaningless token.
    String query = "smApp=" + encodeQueryValue(smApp)
            + "&smPolicy=" + encodeQueryValue(smPolicy)
            + "&smCharset=" + encodeQueryValue(smCharset)
            + "&smTag=" + encodeQueryValue(smTag)
            + "&smReturn=" + encodeQueryValue(smReturn)
            + "&smSign=" + encodeQueryValue(smSign);
    String formAction = "https://sec.1688.com/query.htm?" + query;

    Map<String,String> parameterMap = new HashMap<String,String>();
    parameterMap.put("action", formData.getAction());
    parameterMap.put("event_submit_do_query", formData.getEvent_submit_do_query());
    parameterMap.put("smPolicy", smPolicy);
    parameterMap.put("smReturn", smReturn);
    parameterMap.put("smApp", smApp);
    parameterMap.put("smCharset", smCharset);
    parameterMap.put("smTag", smTag);
    parameterMap.put("smSign", smSign);
    parameterMap.put("identity", formData.getIdentity());
    parameterMap.put("captcha", formData.getCaptcha());
    parameterMap.put("checkcode", checkcode);

    String res = HttpKit.postRequest(formAction, parameterMap, "UTF-8");
    if ("SUCCESS".equals(res)) {
        return "SUCCESS";
    }
    // Not accepted: treat the response as the next captcha page and keep its form data.
    Constant.ali1688CheckCodeFormData = getCheckCodeFormData(smReturn, res);
    return "";
}

/** URL-encodes one query-string value as UTF-8; {@code null} becomes an empty string. */
private static String encodeQueryValue(String value) throws UnsupportedEncodingException {
    return value == null ? "" : java.net.URLEncoder.encode(value, "UTF-8");
}
4.exe4j操作:
5.部分界面: