程序实现原理非常简单,使用httpclient+jsoup来实现。使用httpclient来执行请求,jsoup用来解析页面(主要用来获取代理服务器信息的)。
(1)创建Maven程序,导入httpclient和jsoup依赖。
<dependencies>
<!-- httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.5</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.7.3</version>
</dependency>
<!--json工具类(注意:fastjson 1.2.x 存在多个已知的反序列化安全漏洞,且本文示例代码并未实际用到该依赖,建议移除或升级到官方修复后的安全版本)-->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.32</version>
</dependency>
</dependencies>
(2)httpclient工具类:HttpClientUtil
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.http.HttpHost;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpDelete;
import org.apache.http.client.methods.HttpEntityEnclosingRequestBase;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpPut;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
/**
* @author: dmf
* @date: 2019年3月19日
* @description: httpClient工具类
*/
/**
 * HttpClient helper: unified GET/POST/PUT/DELETE requests with optional
 * proxy support and shared timeout settings. Responses are fully consumed
 * and closed before the result is returned.
 *
 * @author dmf
 * @date 2019-03-19
 */
public class HttpClientUtil {

    // Charset used both for encoding form entities and decoding response bodies.
    private static final String ENCODING = "UTF-8";
    // Connection timeout in milliseconds.
    private static final int CONNECT_TIMEOUT = 3000;
    // Socket (data read / response) timeout in milliseconds.
    private static final int SOCKET_TIMEOUT = 4000;

    /**
     * Sends a GET request.
     *
     * @param url     target address
     * @param headers request headers, may be null
     * @param params  query-string parameters, may be null
     * @param proxy   proxy host, or null for a direct connection
     * @return status code and body wrapped in a {@link HttpClientResult}
     * @throws Exception on connection or protocol failure
     */
    public static HttpClientResult doGet(String url, Map<String, String> headers, Map<String, String> params, HttpHost proxy) throws Exception {
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Append the query parameters to the URL.
        URIBuilder uriBuilder = new URIBuilder(url);
        if (params != null) {
            for (Entry<String, String> entry : params.entrySet()) {
                uriBuilder.setParameter(entry.getKey(), entry.getValue());
            }
        }
        HttpGet httpGet = new HttpGet(uriBuilder.build());
        // Apply the shared timeouts and the optional proxy.
        httpGet.setConfig(buildRequestConfig(proxy));
        packageHeader(headers, httpGet);
        try {
            // The response itself is closed inside getHttpClientResult.
            return getHttpClientResult(null, httpClient, httpGet);
        } finally {
            release(null, httpClient);
        }
    }

    /**
     * Sends a POST request with form-encoded parameters.
     *
     * @param url     target address
     * @param headers request headers, may be null
     * @param params  form parameters, may be null
     * @param proxy   proxy host, or null for a direct connection
     * @return status code and body wrapped in a {@link HttpClientResult}
     * @throws Exception on connection or protocol failure
     */
    public static HttpClientResult doPost(String url, Map<String, String> headers, Map<String, String> params, HttpHost proxy) throws Exception {
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpPost httpPost = new HttpPost(url);
        httpPost.setConfig(buildRequestConfig(proxy));
        // Package the request headers, then the form body.
        packageHeader(headers, httpPost);
        packageParam(params, httpPost);
        try {
            return getHttpClientResult(null, httpClient, httpPost);
        } finally {
            release(null, httpClient);
        }
    }

    /**
     * Sends a PUT request with form-encoded parameters (no proxy support).
     *
     * @param url    target address
     * @param params form parameters, may be null
     * @return status code and body wrapped in a {@link HttpClientResult}
     * @throws Exception on connection or protocol failure
     */
    public static HttpClientResult doPut(String url, Map<String, String> params) throws Exception {
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpPut httpPut = new HttpPut(url);
        httpPut.setConfig(buildRequestConfig(null));
        packageParam(params, httpPut);
        try {
            return getHttpClientResult(null, httpClient, httpPut);
        } finally {
            release(null, httpClient);
        }
    }

    /**
     * Sends a DELETE request without parameters.
     *
     * @param url target address
     * @return status code and body wrapped in a {@link HttpClientResult}
     * @throws Exception on connection or protocol failure
     */
    public static HttpClientResult doDelete(String url) throws Exception {
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpDelete httpDelete = new HttpDelete(url);
        httpDelete.setConfig(buildRequestConfig(null));
        try {
            return getHttpClientResult(null, httpClient, httpDelete);
        } finally {
            release(null, httpClient);
        }
    }

    /**
     * DELETE emulated through a POST carrying a {@code _method=delete} form
     * field (for servers that route on that convention).
     *
     * <p>BUG FIX: the original passed {@code params} in the <em>headers</em>
     * position of {@code doPost}, sending the form fields as HTTP headers.
     *
     * @param url    target address
     * @param params form parameters, may be null
     * @return status code and body wrapped in a {@link HttpClientResult}
     * @throws Exception on connection or protocol failure
     */
    public static HttpClientResult doDelete(String url, Map<String, String> params) throws Exception {
        if (params == null) {
            params = new HashMap<String, String>();
        }
        params.put("_method", "delete");
        // params is the request body (third argument), not the headers.
        return doPost(url, null, params, null);
    }

    /**
     * Builds a RequestConfig with the shared timeouts and an optional proxy.
     *
     * @param proxy proxy host, or null for a direct connection
     * @return the assembled configuration
     */
    private static RequestConfig buildRequestConfig(HttpHost proxy) {
        RequestConfig.Builder builder = RequestConfig.custom()
                .setConnectTimeout(CONNECT_TIMEOUT)
                .setSocketTimeout(SOCKET_TIMEOUT);
        if (proxy != null) {
            builder.setProxy(proxy);
        }
        return builder.build();
    }

    /**
     * Copies the given header map onto the request.
     *
     * @param params     header name/value pairs, may be null
     * @param httpMethod request to receive the headers
     */
    public static void packageHeader(Map<String, String> params, HttpRequestBase httpMethod) {
        if (params != null) {
            for (Entry<String, String> entry : params.entrySet()) {
                httpMethod.setHeader(entry.getKey(), entry.getValue());
            }
        }
    }

    /**
     * Sets the given parameter map as a UTF-8 form-encoded request body.
     *
     * @param params     form name/value pairs, may be null
     * @param httpMethod request to receive the entity
     * @throws UnsupportedEncodingException if the charset is unsupported
     */
    public static void packageParam(Map<String, String> params, HttpEntityEnclosingRequestBase httpMethod)
            throws UnsupportedEncodingException {
        if (params != null) {
            List<NameValuePair> nvps = new ArrayList<NameValuePair>();
            for (Entry<String, String> entry : params.entrySet()) {
                nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
            httpMethod.setEntity(new UrlEncodedFormEntity(nvps, ENCODING));
        }
    }

    /**
     * Executes the request and converts the response to a HttpClientResult.
     *
     * <p>BUG FIX: the original reassigned the {@code httpResponse} parameter
     * and relied on the caller closing it — but Java passes references by
     * value, so the caller always saw {@code null} and the response was never
     * closed. The response is now closed here, after the body is consumed.
     *
     * @param httpResponse unused; retained for signature compatibility
     * @param httpClient   client used to execute the request
     * @param httpMethod   the request to execute
     * @return status code and body; 500 if no status line was present
     * @throws Exception on connection or protocol failure
     */
    public static HttpClientResult getHttpClientResult(CloseableHttpResponse httpResponse,
            CloseableHttpClient httpClient, HttpRequestBase httpMethod) throws Exception {
        CloseableHttpResponse response = httpClient.execute(httpMethod);
        try {
            if (response != null && response.getStatusLine() != null) {
                String content = "";
                if (response.getEntity() != null) {
                    content = EntityUtils.toString(response.getEntity(), ENCODING);
                }
                return new HttpClientResult(response.getStatusLine().getStatusCode(), content);
            }
            return new HttpClientResult(HttpStatus.SC_INTERNAL_SERVER_ERROR);
        } finally {
            if (response != null) {
                response.close();
            }
        }
    }

    /**
     * Releases the response and client, in that order.
     *
     * @param httpResponse response to close, may be null
     * @param httpClient   client to close, may be null
     * @throws IOException if closing fails
     */
    public static void release(CloseableHttpResponse httpResponse, CloseableHttpClient httpClient) throws IOException {
        if (httpResponse != null) {
            httpResponse.close();
        }
        if (httpClient != null) {
            httpClient.close();
        }
    }
}
主要就是get和post方法,分别对应get和post请求。
(3)jsoup工具类 JsoupUtil
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
* @author: dmf
* @date: 2019年3月19日
* @Description:jsoup工具类,主要用来爬取免费代理服务ip
*/
/**
 * Jsoup helper used to scrape free proxy-server entries (IP / port / type)
 * from kuaidaili's free-proxy listing (https://www.kuaidaili.com/free).
 *
 * @author dmf
 * @date 2019-03-19
 */
public class JsoupUtil {

    /**
     * Fetches a page over HTTP and parses it into a Document.
     *
     * <p>BUG FIX: the original issued a POST to retrieve the page; fetching
     * a plain web page is a GET.
     *
     * @param url page address
     * @return parsed document
     * @throws IOException if the request fails or times out
     */
    public static Document getDocByUrl(String url) throws IOException {
        return Jsoup.connect(url)
                .userAgent("Mozilla")
                .cookie("auth", "token")
                .timeout(3000)
                .get();
    }

    /**
     * Parses an HTML string into a Document.
     *
     * @param html full HTML source of a page
     * @return parsed document
     */
    public static Document getDocByHtml(String html) {
        return Jsoup.parse(html);
    }

    /**
     * Extracts proxy entries from a kuaidaili free-proxy page. The parsing is
     * site-specific: other proxy sites need their own implementation.
     *
     * <p>ROBUSTNESS FIX: returns an empty list when {@code doc} is null or the
     * expected {@code #list} table is missing (e.g. the fetch failed or the
     * site returned an error page) — the original threw a NullPointerException.
     *
     * @param doc parsed page, may be null
     * @return list of maps with keys "ip", "port" and "type" (http/https)
     */
    public static List<Map<String, String>> getData(Document doc) {
        List<Map<String, String>> list = new ArrayList<>();
        if (doc == null) {
            return list;
        }
        Element ele = doc.getElementById("list");
        if (ele == null) {
            // Layout changed or an error page was returned — nothing to parse.
            return list;
        }
        // Each table row holds one proxy entry.
        for (Element eletr : ele.getElementsByTag("tr")) {
            Map<String, String> map = new HashMap<>();
            for (Element eletd : eletr.getElementsByTag("td")) {
                // The site tags each cell with a data-title attribute.
                if ("IP".equals(eletd.attr("data-title"))) {
                    map.put("ip", eletd.text());
                }
                if ("PORT".equals(eletd.attr("data-title"))) {
                    map.put("port", eletd.text());
                }
                // Proxy type, http/https. "类型" is the site's column label.
                if ("类型".equals(eletd.attr("data-title"))) {
                    map.put("type", eletd.text());
                }
            }
            if (!map.isEmpty()) {
                list.add(map);
            }
        }
        return list;
    }
}
注意getData方法,此方法是用来获取代理服务器信息的。解析的是快代理(https://www.kuaidaili.com/free) 的免费代理服务器信息。如果需要使用其他的代理服务器需要自行编写方法。
(4)返回结果类,用来封装返回结果
import java.io.Serializable;
public class HttpClientResult implements Serializable {

    private static final long serialVersionUID = 1L;

    // HTTP response status code.
    private int code;

    // Response body content; may be null when only a code was recorded.
    private String content;

    /**
     * Creates a result carrying only a status code.
     *
     * @param code HTTP status code
     */
    public HttpClientResult(int code) {
        this.code = code;
    }

    /**
     * Creates a result carrying a status code and a response body.
     *
     * @param code    HTTP status code
     * @param content response body
     */
    public HttpClientResult(int code, String content) {
        this.code = code;
        this.content = content;
    }

    public int getCode() {
        return code;
    }

    public void setCode(int code) {
        this.code = code;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    @Override
    public String toString() {
        return "HttpClientResult [code=" + code + ", content=" + content + "]";
    }
}
(5)主类:
package com.dmf.reptile;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.http.HttpHost;
import org.jsoup.nodes.Document;
import com.dmf.reptile.utils.HttpClientResult;
import com.dmf.reptile.utils.HttpClientUtil;
import com.dmf.reptile.utils.JsoupUtil;
/**
 * Entry point: increases CSDN blog view counts by visiting each link through
 * free proxy servers scraped from kuaidaili. Three worker threads each claim
 * a private range of 200 listing pages.
 */
public class Test {

    // Last scraped proxy list. Kept public for backward compatibility; the
    // worker threads iterate their own local copy to avoid racing on it.
    public static List<Map<String, String>> proxydata = new ArrayList<>();

    // CSDN blog posts whose view count should be increased.
    public static String[] links = { "https://blog.csdn.net/qq_34609889/article/details/86714796",
            "https://blog.csdn.net/qq_34609889/article/details/86679463"
    };

    // Next page-range base; each worker reserves 200 pages via addNum().
    public static int num = -200;

    public static void main(String[] args) throws Exception {
        // Run the workers on a small fixed-size thread pool.
        ExecutorService pool = Executors.newFixedThreadPool(3);
        for (int i = 0; i < 3; i++) {
            pool.execute(new Runnable() {
                @Override
                public void run() {
                    System.out.println("线程" + Thread.currentThread().getName() + "启动!");
                    test();
                }
            });
        }
        pool.shutdown();
    }

    /**
     * Reserves the next block of 200 listing pages for the calling thread.
     *
     * <p>FIX: now returns the reserved base value. The original left callers
     * re-reading the shared {@code num} field outside the lock, so two
     * threads could end up crawling the same page range.
     *
     * @return the base index of the reserved range (pages base+1 .. base+200)
     */
    public synchronized static int addNum() {
        num += 200;
        try {
            // Stagger thread start-up: kuaidaili rate-limits requests per IP
            // over a short window, and too many parallel hits get the IP banned.
            Thread.sleep(3000);
        } catch (InterruptedException e) {
            // Restore the interrupt flag instead of swallowing it.
            Thread.currentThread().interrupt();
        }
        return num;
    }

    /**
     * Worker loop: scrape each reserved listing page for proxies, then visit
     * every target link once through each proxy.
     */
    public static void test() {
        // Private page range for this thread — no shared reads after this.
        int start = addNum();
        for (int i = start; i < start + 200; i++) {
            // URL pattern of kuaidaili's free high-anonymity proxy pages.
            String url = "https://www.kuaidaili.com/free/inha/" + (i + 1) + "/";
            // Ordinary proxies use: https://www.kuaidaili.com/free/intr/<n>/
            List<Map<String, String>> proxies = getProxy(url);
            proxydata = proxies;
            // Try every proxy found on this page.
            for (Map<String, String> map : proxies) {
                // Build the proxy host (values are already Strings).
                HttpHost proxy = new HttpHost(map.get("ip"), Integer.parseInt(map.get("port")),
                        map.get("type"));
                int code = 0;
                // Visit each target link once through this proxy.
                for (String link : links) {
                    try {
                        code = doget(link, proxy).getCode();
                        // Non-200 means this proxy cannot reach CSDN; move on
                        // to the next proxy instead of trying remaining links.
                        if (code != 200) {
                            break;
                        }
                        System.out.println("第" + (i + 1) + "页代理地址:" + map.get("ip") + ":" + map.get("port") + "------访问结果:" + code);
                    } catch (Exception e1) {
                        System.out.println("第" + (i + 1) + "页代理ip无效!" + map.get("ip") + ":" + map.get("port"));
                        // Dead proxy — no point visiting the other links.
                        break;
                    }
                }
            }
        }
    }

    /**
     * Scrapes proxy entries (ip/port/type) from one kuaidaili listing page.
     *
     * <p>BUG FIX: on a failed fetch the original fell through and passed a
     * null Document to {@code JsoupUtil.getData}, causing a
     * NullPointerException. An empty list is returned instead.
     *
     * <p>NOTE: scraping too aggressively (especially multi-threaded) gets the
     * local IP banned by kuaidaili; a proxy can be supplied to doget here,
     * e.g. {@code new HttpHost("163.125.232.238", 8118)}.
     *
     * @param url listing-page address
     * @return proxy entries, empty when the page could not be fetched/parsed
     */
    public static List<Map<String, String>> getProxy(String url) {
        try {
            HttpClientResult result = doget(url, null);
            // A non-200 status means we did not receive a proxy listing page.
            if (result.getCode() != 200) {
                System.out.println("获取代理服务器信息失败!");
                return new ArrayList<>();
            }
            Document doc = JsoupUtil.getDocByHtml(result.getContent());
            return JsoupUtil.getData(doc);
        } catch (Exception e) {
            System.out.println("获取代理服务器信息失败!");
            return new ArrayList<>();
        }
    }

    /**
     * POST helper with the browser-like headers used by this crawler.
     *
     * @param url   target address
     * @param proxy proxy host, or null for a direct connection
     * @return wrapped response
     * @throws Exception on connection failure
     */
    public static HttpClientResult dopost(String url, HttpHost proxy) throws Exception {
        Map<String, String> headers = new HashMap<String, String>();
        headers.put("Connection", "keep-alive");
        headers.put("Accept", "application/json");
        headers.put("Accept-Language", "zh-CN,zh;q=0.9");
        headers.put("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36");
        return HttpClientUtil.doPost(url, headers, null, proxy);
    }

    /**
     * GET helper with the browser-like headers used by this crawler.
     *
     * @param url   target address
     * @param proxy proxy host, or null for a direct connection
     * @return wrapped response
     * @throws Exception on connection failure
     */
    public static HttpClientResult doget(String url, HttpHost proxy) throws Exception {
        Map<String, String> headers = new HashMap<String, String>();
        headers.put("Accept", "application/json");
        headers.put("cache-control", "max-age=0");
        headers.put("authority", "blog.csdn.net");
        headers.put("accept-encoding", "gzip, deflate, br");
        headers.put("Accept-Language", "zh-CN,zh;q=0.9");
        headers.put("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36");
        return HttpClientUtil.doGet(url, headers, null, proxy);
    }
}
代码中使用了多线程来执行,由于本人对java多线程研究不深,有不对或者不全之处欢迎大佬指正。
注意事项:
1、由于使用的是免费代理,所以大部分的代理服务器都是不能用的,只有非常小的部分能用。导致能刷的数量非常有限。所以想要刷到几万条的话,只能去买代理ip,网上应该有一些代理网站有几块钱的套餐,提供几百上千个代理ip。
2、因为程序要先去快代理爬取免费代理服务器的信息,所以我们需要先访问快代理的网站,但是快代理设置了ip限制,短时间内同一个ip只能访问一次,使用多线程的话,后面的线程获取到的是错误页面,而且线程太多容易被封ip。所以需要让线程池里的线程间隔执行。程序里通过Thread.sleep(3000)来实现。
3、csdn网站每篇博客,在一分钟同一个ip只能增加一个访问量,所以需要设置代理服务器。
需要源码的话可以去我的github上下载 :