Java 爬取代理 IP(终极版)

思路:抓取网页上的代理 IP → 封装为实体并放入集合 → 验证 IP 是否可以连通 → 为探测请求设置响应超时时间 → 将返回状态码 200 的代理放入队列

package com.yanshu.service;


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.InetAddress;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.context.annotation.Primary;
import org.springframework.stereotype.Service;

import com.alibaba.fastjson.JSON;
import com.yanshu.pojo.ProxyPojo;
import com.yanshu.utils.AbstractSpider;
import com.yanshu.utils.ProxyUtil;
@Service
@Primary
public class ProxyService {
static List listIPCode=new ArrayList<>();
 Queue queueonejson = new LinkedList<>();
 Queue queueonetxt = new LinkedList<>();
//根据给定的网站,解析html。解析完成后把代理存到队列。
public  List<ProxyPojo> getProxys() throws Exception
{
//http://www.ip181.com/
//http://www.httpdaili.com/mfdl/
String str = "http://www.ip181.com/";
Document doc = Jsoup.connect(str)
.header("Accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8")
.header("Accept-Encoding", "gzip, deflate").header("Accept-Language", "zh-CN,zh;q=0.8")
.header("Cookie",
"yd_cookie=e64da574-bcaf-4d0399793aa77242311f7594721786d92f16; _ydclearance=7cf765a4ef379341c2883e12-7e36-4a16-9700-cf015dcb3470-1506759060; channelid=0; sid=1506751100012222; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1506565718,1506751872; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1506751952; _ga=GA1.2.134580015.1506565719; _gid=GA1.2.1367043272.1506751872; _gat=1")
.header("User-Agent",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
.get();

//System.out.println(doc.text());
//Document doc = Jsoup.connect(str).get();
//#kb-wall-truth-list > li:nth-child(1) > div > div.kb-item-wrap11 > table > tbody >
Elements tag = doc.select(" tr");
List<ProxyPojo> proxys = new ArrayList<ProxyPojo>();
for (Element element : tag)
{
//使用选择器获取到每一个tr的下面的第一个td标签
String ip = element.select("td:nth-child(1)").text();
String prot = element.select("td:nth-child(2)").text();
String city = element.select("td:nth-child(6)").text();

if (ip.contains("端口号") || prot.contains("端口") || "".equals(ip) || "".equals(prot))
{
continue;
}

String content = "IP"+ ip + "端口号:"+ prot + "城市:" +city;

ProxyPojo proxy = new ProxyPojo(); 

proxy.setIp(ip);
proxy.setProt(prot);
proxy.setCity(city);
proxys.add(proxy);

}
if (proxys.size() > 1) {
for (ProxyPojo entity : proxys) {
String enip = entity.getIp();
//System.out.println(isPing(enip));
// isReachable方法是测试主机是否可以联通
if (isPing(enip) == true) {


String enProt = entity.getProt();
String hurl = enip + ":" + enProt;
try {


Timer timer = new Timer();
timer.schedule(new TimerTask() {
public void run() {


URL url;
try {
// 生成一个URL对象,要获取源代码的网页地址为:http://www.sina.com.cn
url = new URL("http://" + hurl);
//System.out.println("url==>" + url);
// 打开URL
HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
int responsecode = 0;
// 获取服务器响应代码
responsecode = urlConnection.getResponseCode();
//System.out.println("code==>" + responsecode);
if (responsecode == 200) {
queueonejson.offer(entity);
//System.out.println("code200:==" + queueFourtxt);
}
} catch (Exception e) {
// TODO Auto-generated catch block
System.out.println("获取不到网页的源码,服务器响应代码为:" + e);
}
}
}, 1000);// 设定指定的时间time,此处为2000毫秒
} catch (Exception e) {
// TODO Auto-generated catch block
System.out.println("获取不到网页的源码,服务器响应代码为:" + e);
}
}
}
}
//return (List<ProxyPojo>) queueonejson;
int size = queueonejson.size();
List listproxy=new ArrayList();
Map mapproxy=new HashMap();
mapproxy.put("code", "302");
mapproxy.put("message", "No data!!!");
listproxy.add(mapproxy);
return size!=0?(List<ProxyPojo>) queueonejson:(List<ProxyPojo>) JSON.toJSON(listproxy);

}

//根据给定的网站,解析html。解析完成后把代理存到队列。
public  List<?> getProxy() throws Exception
{

String ipReg = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3} \\d{1,6}";
Pattern ipPtn = Pattern.compile(ipReg);
//http://www.ip181.com/
//http://www.httpdaili.com/mfdl/
String str = "http://www.ip181.com/";
Document doc = Jsoup.connect(str)
.header("Accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8")
.header("Accept-Encoding", "gzip, deflate").header("Accept-Language", "zh-CN,zh;q=0.8")
.header("Cookie",
"yd_cookie=e64da574-bcaf-4d0399793aa77242311f7594721786d92f16; _ydclearance=7cf765a4ef379341c2883e12-7e36-4a16-9700-cf015dcb3470-1506759060; channelid=0; sid=1506751100012222; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1506565718,1506751872; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1506751952; _ga=GA1.2.134580015.1506565719; _gid=GA1.2.1367043272.1506751872; _gat=1")
.header("User-Agent",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
.get();

//Document doc = Jsoup.connect(str).get();
//#kb-wall-truth-list > li:nth-child(1) > div > div.kb-item-wrap11 > table > tbody >
Elements tag = doc.select(" tr");
List<ProxyPojo> proxys = new ArrayList<ProxyPojo>();
for (Element element : tag)
{
//使用选择器获取到每一个tr的下面的第一个td标签
String ip = element.select("td:nth-child(1)").text();
String prot = element.select("td:nth-child(2)").text();
String city = element.select("td:nth-child(6)").text();

if (ip.contains("端口号") || prot.contains("端口") || "".equals(ip) || "".equals(prot))
{
continue;
}

String content = "IP"+ ip + "端口号:"+ prot + "城市:" +city;

ProxyPojo proxy = new ProxyPojo(); 

proxy.setIp(ip);
proxy.setProt(prot);
proxy.setCity(city);
proxys.add(proxy);

}
if (proxys.size() > 1) {
for (ProxyPojo entity : proxys) {
String enip = entity.getIp();
//System.out.println(isPing(enip));
// isReachable方法是测试主机是否可以联通
if (isPing(enip) == true) {


String enProt = entity.getProt();
String hurl = enip + ":" + enProt;
try {


Timer timer = new Timer();
timer.schedule(new TimerTask() {
public void run() {


URL url;
try {
// 生成一个URL对象,要获取源代码的网页地址为:http://www.sina.com.cn
url = new URL("http://" + hurl);
// System.out.println("url==>" + url);
// 打开URL
HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
int responsecode = 0;
// 获取服务器响应代码
responsecode = urlConnection.getResponseCode();

if (responsecode == 200) {
queueonetxt.offer(hurl);

}
} catch (Exception e) {
// TODO Auto-generated catch block
System.out.println("获取不到网页的源码,服务器响应代码为:" + e);
}
}
}, 1000);// 设定指定的时间time,此处为2000毫秒
} catch (Exception e) {
// TODO Auto-generated catch block
System.out.println("获取不到网页的源码,服务器响应代码为:" + e);
}
}
}
}
//return (List<?>) removeDuplicate(queueonetxt);
int size = queueonetxt.size();
List listproxy=new ArrayList();
Map mapproxy=new HashMap();
mapproxy.put("code", "302");
mapproxy.put("message", "No data!!!");
listproxy.add(mapproxy);
return size!=0?(List<?>) removeDuplicate(queueonetxt):(List<?>) JSON.toJSON(listproxy);

}

//使用isReachable方法探测主机是否可以连通
  public static boolean isPing(String ip) {
       boolean status = false;
       if(ip != null) {
           try {
               status = InetAddress.getByName(ip).isReachable(1000);
               //System.out.println("status==>"+status);
           } catch(UnknownHostException e) {
           }
           catch(IOException e) {
           }
       }
       return status;
   }
  


  public static int getCode(String httpurl)
  {
  URL url;
       int responsecode = 0;
       HttpURLConnection urlConnection;
     
       try{
           //生成一个URL对象,要获取源代码的网页地址为:http://www.sina.com.cn
           url=new URL("http://"+httpurl);
           //打开URL
           urlConnection = (HttpURLConnection)url.openConnection();
           //获取服务器响应代码
           responsecode=urlConnection.getResponseCode();
           //等待3秒--如果不行就下一个---不能超时
    /* int maxCount = 2;
           int currentCount = 0;
           while(currentCount < maxCount){
               int code = responsecode;
               if (code!= 200){ // 请求不成功
                System.out.println("---111---");
                        currentCount++;
                        System.out.println("---currentCount---"+currentCount);
                        Thread.sleep(3000);// 等待3s
               }
       }*/
          
           //System.out.println("responsecode==>"+responsecode);
           if(responsecode==200){
              System.out.println("httpurl==>"+httpurl);
           }
           else{
               System.out.println("获取不到网页的源码,服务器响应代码为:"+responsecode);
           }
       }
       catch(Exception e){
           System.out.println("获取不到网页的源码,出现异常:"+e);
       }
return responsecode;
  }
/**
* 去除重复元素
*
* @param queueAll
*            需要处理的list
* @param <T>
*            泛型方法
* @return 去重后的list
*/
public static <T> List<T> removeDuplicate(Queue queueAll) {
if (queueAll == null || queueAll.size() == 0) {
return new ArrayList<>();
}
return new ArrayList<>(new HashSet<>(queueAll));
}


 





}

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值