// 思路: 获取网页上的代理IP -> 封装为实体/集合 -> 验证IP是否连通 -> 设置响应超时时间 -> 将响应码为200的代理放入队列
package com.yanshu.service;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.InetAddress;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.context.annotation.Primary;
import org.springframework.stereotype.Service;
import com.alibaba.fastjson.JSON;
import com.yanshu.pojo.ProxyPojo;
import com.yanshu.utils.AbstractSpider;
import com.yanshu.utils.ProxyUtil;
/**
 * Scrapes a public proxy-list page, probes every advertised {@code ip:port} and returns the
 * entries that are reachable and answer an HTTP request with status 200.
 *
 * <p>Probing is done on a short-lived, bounded thread pool and each public call waits for all
 * probes to finish before it returns, so the result reflects the proxies validated during that
 * call. The {@link #queueonejson} / {@link #queueonetxt} fields are refreshed with the same
 * result for any code that still reads them directly.
 */
@Service
@Primary
public class ProxyService {

    /** Page the proxy candidates are scraped from. */
    private static final String PROXY_LIST_URL = "http://www.ip181.com/";

    /** Timeout handed to {@link InetAddress#isReachable(int)}. */
    private static final int PING_TIMEOUT_MS = 1000;

    /** Connect and read timeout applied to every HTTP probe so a dead host cannot block forever. */
    private static final int HTTP_TIMEOUT_MS = 3000;

    /** Upper bound on the number of probes running at the same time. */
    private static final int MAX_PROBE_THREADS = 16;

    // Not used inside this class; kept (including its raw type) for code elsewhere that may read it.
    static List listIPCode = new ArrayList<>();

    /** Proxies validated by the most recent {@link #getProxys()} call. */
    Queue<ProxyPojo> queueonejson = new ConcurrentLinkedQueue<>();

    /** {@code ip:port} strings validated by the most recent {@link #getProxy()} call. */
    Queue<String> queueonetxt = new ConcurrentLinkedQueue<>();

    /**
     * Scrapes the proxy list and returns the proxies that passed validation as entities.
     *
     * @return the validated proxies; when none passed, the legacy placeholder produced by
     *         {@code JSON.toJSON} for {@code [{"code":"302","message":"No data!!!"}]} is returned
     *         instead (it is not a list of {@code ProxyPojo}, callers only serialise it)
     * @throws Exception if the list page cannot be fetched or the probing is interrupted
     */
    @SuppressWarnings("unchecked") // the placeholder is deliberately returned through the typed signature
    public List<ProxyPojo> getProxys() throws Exception {
        List<ProxyPojo> usable = filterUsable(scrapeCandidates());

        // Refresh instead of append: appending made the queue grow with duplicates on every call.
        queueonejson.clear();
        queueonejson.addAll(usable);

        if (usable.isEmpty()) {
            return (List<ProxyPojo>) noDataResponse();
        }
        return usable;
    }

    /**
     * Scrapes the proxy list and returns the validated proxies as de-duplicated
     * {@code ip:port} strings.
     *
     * @return the validated addresses in scrape order, or the same "No data" placeholder as
     *         {@link #getProxys()} when none passed
     * @throws Exception if the list page cannot be fetched or the probing is interrupted
     */
    public List<?> getProxy() throws Exception {
        Queue<String> addresses = new ArrayDeque<>();
        for (ProxyPojo proxy : filterUsable(scrapeCandidates())) {
            addresses.offer(hostAndPort(proxy));
        }

        queueonetxt.clear();
        queueonetxt.addAll(addresses);

        if (addresses.isEmpty()) {
            return (List<?>) noDataResponse();
        }
        return ProxyService.<String>removeDuplicate(addresses);
    }

    /**
     * Tests whether a host can be reached with {@link InetAddress#isReachable(int)}.
     *
     * @param ip host name or literal address; {@code null} or blank yields {@code false}
     * @return {@code true} when the host answered within the ping timeout
     */
    public static boolean isPing(String ip) {
        // An empty host name resolves to the loopback address, which would wrongly count as reachable.
        if (ip == null || ip.trim().isEmpty()) {
            return false;
        }
        try {
            return InetAddress.getByName(ip).isReachable(PING_TIMEOUT_MS);
        } catch (IOException ignored) {
            // Best-effort probe: an unknown host or a network error simply means "not reachable".
            return false;
        }
    }

    /**
     * Requests {@code http://<httpurl>} and reports the HTTP status, logging the outcome to stdout.
     *
     * @param httpurl {@code host:port} (or any authority/path) to append to {@code http://}
     * @return the HTTP status code, or {@code 0} when the request failed or timed out
     */
    public static int getCode(String httpurl) {
        int responsecode = 0;
        try {
            responsecode = fetchStatus(httpurl);
            if (responsecode == HttpURLConnection.HTTP_OK) {
                System.out.println("httpurl==>" + httpurl);
            } else {
                System.out.println("获取不到网页的源码,服务器响应代码为:" + responsecode);
            }
        } catch (Exception e) {
            System.out.println("获取不到网页的源码,出现异常:" + e);
        }
        return responsecode;
    }

    /**
     * Removes duplicate elements while keeping the order of first occurrence.
     *
     * @param queueAll the elements to process; may be {@code null}
     * @param <T> element type expected by the caller
     * @return a new list without duplicates; empty when the input is {@code null} or empty
     */
    @SuppressWarnings({"unchecked", "rawtypes"}) // raw parameter is part of the existing signature
    public static <T> List<T> removeDuplicate(Queue queueAll) {
        if (queueAll == null || queueAll.isEmpty()) {
            return new ArrayList<>();
        }
        // LinkedHashSet (not HashSet) so the result order is deterministic.
        return new ArrayList<>(new LinkedHashSet<T>(queueAll));
    }

    /** Downloads the list page and turns every table row carrying an ip and a port into a candidate. */
    private static List<ProxyPojo> scrapeCandidates() throws IOException {
        // NOTE(review): the Cookie below is a hard-coded session captured from a browser and has
        // presumably expired; confirm whether the site still requires it.
        Document page = Jsoup.connect(PROXY_LIST_URL)
                .header("Accept",
                        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8")
                .header("Accept-Encoding", "gzip, deflate")
                .header("Accept-Language", "zh-CN,zh;q=0.8")
                .header("Cookie",
                        "yd_cookie=e64da574-bcaf-4d0399793aa77242311f7594721786d92f16; _ydclearance=7cf765a4ef379341c2883e12-7e36-4a16-9700-cf015dcb3470-1506759060; channelid=0; sid=1506751100012222; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1506565718,1506751872; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1506751952; _ga=GA1.2.134580015.1506565719; _gid=GA1.2.1367043272.1506751872; _gat=1")
                .header("User-Agent",
                        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
                .get();

        List<ProxyPojo> candidates = new ArrayList<>();
        for (Element row : page.select(" tr")) {
            // Columns 1, 2 and 6 of each row are read as ip, port and city.
            String ip = row.select("td:nth-child(1)").text();
            String port = row.select("td:nth-child(2)").text();
            String city = row.select("td:nth-child(6)").text();

            // Skip the header row and rows without usable cells.
            if (ip.contains("端口号") || port.contains("端口") || "".equals(ip) || "".equals(port)) {
                continue;
            }

            ProxyPojo candidate = new ProxyPojo();
            candidate.setIp(ip);
            candidate.setProt(port);
            candidate.setCity(city);
            candidates.add(candidate);
        }
        return candidates;
    }

    /** Probes all candidates in parallel and returns, in input order, those that passed. */
    private static List<ProxyPojo> filterUsable(List<ProxyPojo> candidates)
            throws InterruptedException, ExecutionException {
        List<ProxyPojo> usable = new ArrayList<>();
        if (candidates.isEmpty()) {
            return usable;
        }

        ExecutorService pool =
                Executors.newFixedThreadPool(Math.min(MAX_PROBE_THREADS, candidates.size()));
        try {
            List<Callable<Boolean>> probes = new ArrayList<>(candidates.size());
            for (ProxyPojo candidate : candidates) {
                probes.add(() -> isUsable(candidate));
            }

            // invokeAll blocks until every probe has finished and keeps the task order.
            List<Future<Boolean>> outcomes = pool.invokeAll(probes);
            for (int i = 0; i < outcomes.size(); i++) {
                if (outcomes.get(i).get()) {
                    usable.add(candidates.get(i));
                }
            }
            return usable;
        } finally {
            // Always release the worker threads, even when the wait was interrupted.
            pool.shutdownNow();
        }
    }

    /** Returns whether the candidate is reachable and answers {@code http://ip:port} with 200. */
    private static boolean isUsable(ProxyPojo candidate) {
        if (!isPing(candidate.getIp())) {
            return false;
        }
        try {
            return fetchStatus(hostAndPort(candidate)) == HttpURLConnection.HTTP_OK;
        } catch (IOException | RuntimeException e) {
            // RuntimeException covers malformed scraped data such as an out-of-range port.
            System.out.println("获取不到网页的源码,服务器响应代码为:" + e);
            return false;
        }
    }

    /** Performs one bounded HTTP request against {@code http://<hostAndPort>} and returns its status. */
    private static int fetchStatus(String hostAndPort) throws IOException {
        HttpURLConnection connection =
                (HttpURLConnection) new URL("http://" + hostAndPort).openConnection();
        connection.setConnectTimeout(HTTP_TIMEOUT_MS);
        connection.setReadTimeout(HTTP_TIMEOUT_MS);
        try {
            return connection.getResponseCode();
        } finally {
            connection.disconnect();
        }
    }

    /** Formats a proxy as {@code ip:port}. */
    private static String hostAndPort(ProxyPojo proxy) {
        return proxy.getIp() + ":" + proxy.getProt();
    }

    /** Builds the legacy "No data" payload: a JSON array holding one code/message object. */
    private static Object noDataResponse() {
        Map<String, String> body = new HashMap<>();
        body.put("code", "302");
        body.put("message", "No data!!!");

        List<Map<String, String>> wrapper = new ArrayList<>();
        wrapper.add(body);
        return JSON.toJSON(wrapper);
    }
}