记录一次使用 Java 爬取「快代理」免费 IP 代理网页的过程
一: 使用 HttpClient + Jsoup 获取并解析网页源代码(多线程)
创建 IPEntity类(实体类)
package edu.liuzheng;
/**
* @author qingyun
* @version 1.0
* @date 2021/5/23 20:41
*/
/**
 * Plain data holder for one proxy entry scraped from the kuaidaili free-proxy table.
 * All values are kept as the raw text of the corresponding table cell.
 *
 * Fields are private (the original exposed them as public mutable fields) and
 * renamed to conventional lowerCamelCase; every getter/setter signature and the
 * {@link #toString()} output are unchanged, so existing callers are unaffected.
 */
public class IPEntity {
    /** Proxy IP address. */
    private String ip;
    /** Proxy port. */
    private String port;
    /** Anonymity level (e.g. transparent / anonymous / elite). */
    private String anonymity;
    /** Protocol type (e.g. HTTP / HTTPS). */
    private String type;
    /** Geographic location of the proxy. */
    private String position;
    /** Response time as reported by the site. */
    private String responseSpeed;
    /** Timestamp of the site's last verification of this proxy. */
    private String lastVerificationTime;

    public String getIP() {
        return ip;
    }

    public void setIP(String IP) {
        this.ip = IP;
    }

    public String getPORT() {
        return port;
    }

    public void setPORT(String PORT) {
        this.port = PORT;
    }

    public String getAnonymity() {
        return anonymity;
    }

    public void setAnonymity(String anonymity) {
        this.anonymity = anonymity;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getPosition() {
        return position;
    }

    public void setPosition(String position) {
        this.position = position;
    }

    public String getResponseSpeed() {
        return responseSpeed;
    }

    public void setResponseSpeed(String responseSpeed) {
        this.responseSpeed = responseSpeed;
    }

    public String getLastVerificationTime() {
        return lastVerificationTime;
    }

    public void setLastVerificationTime(String lastVerificationTime) {
        this.lastVerificationTime = lastVerificationTime;
    }

    @Override
    public String toString() {
        // Key labels intentionally preserved byte-for-byte from the original output format.
        return "IPEntity{" +
                "IP='" + ip + '\'' +
                ", PORT='" + port + '\'' +
                ", Anonymity='" + anonymity + '\'' +
                ", type='" + type + '\'' +
                ", position='" + position + '\'' +
                ", ResponseSpeed='" + responseSpeed + '\'' +
                ", LastVerificationTime='" + lastVerificationTime + '\'' +
                '}';
    }
}
创建 CrawlingTest (主要方法类)
package edu.liuzheng;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.HttpClientUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
/**
* 使用 HttpClient + Jsoup 获取并解析网页源代码
* 继承线程类
* @author qingyun
* @version 1.0
* @date 2021/5/23 21:36
*/
public class CrawlingTest implements Runnable {
    /**
     * Shared result list. Wrapped in a synchronized view because the class is
     * explicitly designed to be run from multiple threads, and the original bare
     * ArrayList is not safe for concurrent add().
     */
    static List<IPEntity> ipEntities = Collections.synchronizedList(new ArrayList<>());

    /**
     * Next page number to crawl. AtomicInteger so each concurrent run can claim
     * a distinct page with a single getAndIncrement().
     */
    public static AtomicInteger index = new AtomicInteger(1);

    public CrawlingTest() {
    }

    /**
     * Fetches one page of the kuaidaili free-proxy list with HttpClient, parses
     * the result table with Jsoup, and appends one {@link IPEntity} per row to
     * {@link #ipEntities}. Each call consumes exactly one page number from
     * {@link #index}, whether or not the fetch succeeds (same as the original,
     * which incremented in {@code finally}).
     */
    public static void getIPList() {
        // Claim this run's page number exactly once. The original read `index`
        // both when building the URL and when logging completion, and only
        // incremented in `finally` — under concurrent runs two threads could
        // fetch the same page and log a wrong page number.
        int page = index.getAndIncrement();

        // 1. Create the HTTP client — conceptually, open a browser.
        CloseableHttpClient httpClient = HttpClients.createDefault();
        CloseableHttpResponse response = null;

        // 2. Build the GET request — conceptually, type the URL in the address bar.
        HttpGet request = new HttpGet("https://www.kuaidaili.com/free/inha/" + page + "/");
        // Impersonate a browser; without a User-Agent the site may refuse to answer.
        request.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36");

        try {
            // 3. Execute the request — conceptually, press Enter.
            response = httpClient.execute(request);
            System.out.println(response.getStatusLine().getStatusCode());

            // 4. Only parse successful (200) responses.
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                // 5. Read the response body as UTF-8 HTML.
                HttpEntity httpEntity = response.getEntity();
                String html = EntityUtils.toString(httpEntity, "utf-8");

                // 6. Parse the HTML with Jsoup and locate the first table body.
                Document document = Jsoup.parse(html);
                Element postList = document.selectFirst("tbody");
                if (postList == null) {
                    // Guard: a 200 response without the expected table (layout
                    // change or anti-bot page) would otherwise NPE below.
                    System.out.println("未找到tbody,页面结构可能已变化");
                    return; // finally still closes the resources
                }

                // One <tr> per proxy entry; columns are fixed by the site layout.
                for (Element row : postList.select("tr")) {
                    Elements titleEle = row.select("td");
                    if (titleEle.size() < 7) {
                        // Guard: skip header/malformed rows instead of IndexOutOfBounds.
                        continue;
                    }
                    IPEntity ipEntity = new IPEntity();
                    System.out.println("IP:" + titleEle.get(0).text());
                    ipEntity.setIP(titleEle.get(0).text());
                    System.out.println("PORT:" + titleEle.get(1).text());
                    ipEntity.setPORT(titleEle.get(1).text());
                    System.out.println("匿名度:" + titleEle.get(2).text());
                    ipEntity.setAnonymity(titleEle.get(2).text());
                    System.out.println("类型:" + titleEle.get(3).text());
                    ipEntity.setType(titleEle.get(3).text());
                    System.out.println("位置:" + titleEle.get(4).text());
                    ipEntity.setPosition(titleEle.get(4).text());
                    System.out.println("响应速度:" + titleEle.get(5).text());
                    ipEntity.setResponseSpeed(titleEle.get(5).text());
                    System.out.println("最后验证时间:" + titleEle.get(6).text());
                    ipEntity.setLastVerificationTime(titleEle.get(6).text());
                    ipEntities.add(ipEntity);
                }
                System.out.println("第" + page + "页完成");
            } else {
                // Non-200 (404, 503, ...): just dump the body; nothing to parse.
                System.out.println("返回状态不是200");
                System.out.println(EntityUtils.toString(response.getEntity(), "utf-8"));
            }
        } catch (IOException e) {
            // ClientProtocolException is a subclass of IOException, so a single
            // catch replaces the original's two identical handlers.
            e.printStackTrace();
        } finally {
            // 7. Always release the response and client.
            HttpClientUtils.closeQuietly(response);
            HttpClientUtils.closeQuietly(httpClient);
        }
    }

    @Override
    public void run() {
        // Entry point used by the scheduled executor: crawl one page per run.
        getIPList();
    }
}
二: 使用 ScheduledExecutorService 线程池执行定时多线程任务
创建 HttpClientTest 类
package edu.liuzheng;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
/**
 * Entry point: drives {@link CrawlingTest} on a fixed-rate schedule so that one
 * proxy-list page is crawled every few seconds.
 */
public class HttpClientTest {
    public static void main(String[] args) {
        // First run fires after 1 second; subsequent runs every 3 seconds.
        final long initialDelaySeconds = 1;
        final long periodSeconds = 3;

        // Scheduled pool with 10 core threads to host the recurring crawl task.
        ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(10);
        scheduler.scheduleAtFixedRate(
                new CrawlingTest(), initialDelaySeconds, periodSeconds, TimeUnit.SECONDS);
    }
}
结果如下