年前在考虑搞个关键扩词工具,如果直接抓取google的相关搜索。用不了几次就别封ip了,设了抓取间隔时长也没用(也可能设的时候太短),没办法只能抓取代理ip了,便有了下面的小demo。可惜的是网络资源还是太少,不够抓的,真正能用的代理ip没几个。根本形不成可用的规模。代码如下给可能用到的人做个参考吧:
package com.emar.spider;
import java.util.HashMap;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.emar.core.httpClient.HttpclientUtil;
import com.emar.core.util.PropertiesUtil;
public class Proxy {
private static final String url = "http://www.proxycn.com/html_proxy/30fastproxy-1.html";
private static Map<String, String> ipMap = new HashMap<String, String>();
private static final String filePath = "D:/sts/workspace-sts-2.5.1_t1/sf3a/src/main/resources/proxy.properties";
public static Map<String, String> getProxyMap() {
String html = "";
boolean flag = true;
while (flag) {
try {
html = HttpclientUtil.get(url, null, "GB2312");
flag = false;
} catch (Exception e) {
e.printStackTrace();
}
}
Document doc = Jsoup.parse(html);
Elements trs = doc.select("tr[onDblClick]");
for (Element e : trs) {
String ip = e.attr("onDblClick").replaceAll("clip", "")
.replaceAll("已拷贝到剪贴板!", "").replaceAll("alert", "")
.replaceAll("'", "").replaceAll(";", "")
.replaceAll("\\(", "").replaceAll("\\)", "");
String[] ipArray = ip.split(":");
ipMap.put(ipArray[0], ipArray[1]);
}
return ipMap;
}
public static void writeValidProxy() {
Map<String, String> ipMap = getProxyMap();
System.out.println("本次共获取到的:" + ipMap.size() + "个代理");
for (String ip : ipMap.keySet()) {
String port = ipMap.get(ip);
System.out.println("获取新的待检验的:" + ip + "=" + port);
boolean flag = HttpclientUtil.checkProxy(ip, port);
if (flag) {
System.out.println("写入有效:" + ip + "=" + port);
PropertiesUtil.writeProperties(filePath, ip, port);
} else {
System.out.println("移除失效:" + ip + "=" + port);
PropertiesUtil.removeProperties(filePath, ip);
}
}
}
public static Map<String, String> getValidProxyMap() {
return PropertiesUtil.readProperties(filePath);
}
public static void removeInvalidProxy() {
Map<String, String> ipMap = getValidProxyMap();
for (String ip : ipMap.keySet()) {
String port = ipMap.get(ip);
System.out.println("校验原有:" + ip + "=" + port);
boolean flag = HttpclientUtil.checkProxy(ip, port);
if (!flag) {
System.out.println("移除失效:" + ip + "=" + port);
PropertiesUtil.removeProperties(filePath, ip);
}
}
}
/**
* @param args
* @throws InterruptedException
*/
public static void main(String[] args) {
while (true) {
try {
removeInvalidProxy();
writeValidProxy();
Thread.sleep(1000 * 60 * 30);
} catch (Exception e) {
e.printStackTrace();
}
}
}
}