Using HttpClient + jsoup to Inflate CSDN Blog View Counts

The idea is very simple and relies on HttpClient plus jsoup: HttpClient executes the requests, and jsoup parses pages (mainly to extract the free proxy server information).
(1) Create a Maven project and add the following dependencies.

<dependencies>
		<!-- httpclient -->
		<dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>httpclient</artifactId>
			<version>4.5.5</version>
		</dependency>

		<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.7.3</version>
		</dependency>
        <!-- fastjson: JSON utility -->
		<dependency>
			<groupId>com.alibaba</groupId>
			<artifactId>fastjson</artifactId>
			<version>1.2.32</version>
		</dependency>
	</dependencies>

(2) The HttpClient utility class: HttpClientUtil

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.http.HttpHost;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpDelete;
import org.apache.http.client.methods.HttpEntityEnclosingRequestBase;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpPut;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

/**
 * @author: dmf
 * @date: 2019-03-19
 * @description: HttpClient utility class
 */
public class HttpClientUtil {


	// Character encoding; all requests are sent as UTF-8
    private static final String ENCODING = "UTF-8";
    
    // Connection timeout in milliseconds
    private static final int CONNECT_TIMEOUT = 3000;
    
    // Socket (response/read) timeout in milliseconds
    private static final int SOCKET_TIMEOUT = 4000;

   
    /**
     * @Description: GET request
     * @param url request URL
     * @param headers request headers
     * @param params query parameters
     * @param proxy proxy host, or null for a direct connection
     * @return
     * @throws Exception
     */
    public static HttpClientResult doGet(String url,Map<String,String> headers, Map<String,String> params,HttpHost proxy) throws Exception {
        // Create the HttpClient instance
    	CloseableHttpClient httpClient = HttpClients.createDefault();

        // Build the request URI
        URIBuilder uriBuilder = new URIBuilder(url);
        if (params != null) {
            Set<Entry<String, String>> entrySet = params.entrySet();
            for (Entry<String, String> entry : entrySet) {
                uriBuilder.setParameter(entry.getKey(), entry.getValue());
            }
        }

        // Create the GET request
        HttpGet httpGet = new HttpGet(uriBuilder.build());
        // Configure the proxy (if any) and the timeouts
        RequestConfig requestConfig;
        if(proxy!=null) {
        	requestConfig = RequestConfig.custom().setProxy(proxy).setConnectTimeout(CONNECT_TIMEOUT).setSocketTimeout(SOCKET_TIMEOUT).build();
        }else {
        	requestConfig = RequestConfig.custom().setConnectTimeout(CONNECT_TIMEOUT).setSocketTimeout(SOCKET_TIMEOUT).build();
        }
        
        httpGet.setConfig(requestConfig);
        
        // Set the request headers
        packageHeader(headers, httpGet);

        // The response object
        CloseableHttpResponse httpResponse = null;

        try {
            // Execute the request and wrap the result
            return getHttpClientResult(httpResponse, httpClient, httpGet);
        } finally {
            // Release resources
            release(httpResponse, httpClient);
        }
    }

  
    /**
     * @Description: POST request
     * @param url request URL
     * @param headers request headers
     * @param params form parameters
     * @param proxy proxy host, or null for a direct connection
     * @return
     * @throws Exception
     */
    public static HttpClientResult doPost(String url, Map<String, String> headers, Map<String, String> params,HttpHost proxy) throws Exception {
        // Create the HttpClient instance
        CloseableHttpClient httpClient = HttpClients.createDefault();

        // Create the POST request
        HttpPost httpPost = new HttpPost(url);
        
        // Configure the proxy (if any) and the timeouts
        RequestConfig requestConfig;
        if(proxy!=null) {
        	requestConfig = RequestConfig.custom().setProxy(proxy).setConnectTimeout(CONNECT_TIMEOUT).setSocketTimeout(SOCKET_TIMEOUT).build();
        }else {
        	requestConfig = RequestConfig.custom().setConnectTimeout(CONNECT_TIMEOUT).setSocketTimeout(SOCKET_TIMEOUT).build();
        }
        httpPost.setConfig(requestConfig);
       
        // Set the request headers
        packageHeader(headers, httpPost);
        // Set the form parameters
        packageParam(params, httpPost);
        // The response object
        CloseableHttpResponse httpResponse = null;

        try {
            // Execute the request and wrap the result
            return getHttpClientResult(httpResponse, httpClient, httpPost);
        } finally {
            // Release resources
            release(httpResponse, httpClient);
        }
    }

  

    /**
     * @Description: PUT request
     * @param url request URL
     * @param params form parameters
     * @return
     * @throws Exception
     */
    public static HttpClientResult doPut(String url, Map<String, String> params) throws Exception {
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpPut httpPut = new HttpPut(url);
        RequestConfig requestConfig = RequestConfig.custom().setConnectTimeout(CONNECT_TIMEOUT).setSocketTimeout(SOCKET_TIMEOUT).build();
        httpPut.setConfig(requestConfig);
        
        packageParam(params, httpPut);

        CloseableHttpResponse httpResponse = null;

        try {
            return getHttpClientResult(httpResponse, httpClient, httpPut);
        } finally {
            release(httpResponse, httpClient);
        }
    }

    /**
     * @Description: DELETE request without parameters
     * @param url request URL
     * @return
     * @throws Exception
     */
    public static HttpClientResult doDelete(String url) throws Exception {
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpDelete httpDelete = new HttpDelete(url);
        RequestConfig requestConfig = RequestConfig.custom().setConnectTimeout(CONNECT_TIMEOUT).setSocketTimeout(SOCKET_TIMEOUT).build();
        httpDelete.setConfig(requestConfig);

        CloseableHttpResponse httpResponse = null;
        try {
            return getHttpClientResult(httpResponse, httpClient, httpDelete);
        } finally {
            release(httpResponse, httpClient);
        }
    }

    /**
     * @Description: DELETE request with parameters (tunneled through a POST with _method=delete)
     * @param url request URL
     * @param params form parameters
     * @return
     * @throws Exception
     */
    public static HttpClientResult doDelete(String url, Map<String, String> params) throws Exception {
        if (params == null) {
            params = new HashMap<String, String>();
        }

        params.put("_method", "delete");
        // Pass params as the form parameters, not as headers
        return doPost(url, null, params, null);
    }
    
    /**
     * Description: set the request headers
     * @param params
     * @param httpMethod
     */
    public static void packageHeader(Map<String, String> params, HttpRequestBase httpMethod) {
        // Set the request headers
        if (params != null) {
            Set<Entry<String, String>> entrySet = params.entrySet();
            for (Entry<String, String> entry : entrySet) {
                // Copy each header onto the HttpRequestBase object
                httpMethod.setHeader(entry.getKey(), entry.getValue());
            }
        }
    }

    /**
     * @Description: set the request (form) parameters
     * @param params
     * @param httpMethod
     * @throws UnsupportedEncodingException
     */
    public static void packageParam(Map<String, String> params, HttpEntityEnclosingRequestBase httpMethod)
            throws UnsupportedEncodingException {
        // Build the form parameters
        if (params != null) {
            List<NameValuePair> nvps = new ArrayList<NameValuePair>();
            Set<Entry<String, String>> entrySet = params.entrySet();
            for (Entry<String, String> entry : entrySet) {
                nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }

            // Attach them to the request as a URL-encoded form entity
            httpMethod.setEntity(new UrlEncodedFormEntity(nvps, ENCODING));
        }
    }

    /**
     * @Description: execute the request and wrap the status code and body in a HttpClientResult
     * @param httpResponse
     * @param httpClient
     * @param httpMethod
     * @throws Exception
     */
    public static HttpClientResult getHttpClientResult(CloseableHttpResponse httpResponse,
            CloseableHttpClient httpClient, HttpRequestBase httpMethod) throws Exception {
        // Execute the request
        httpResponse = httpClient.execute(httpMethod);

        try {
            // Read the response. Note that the reassignment above is not visible to the
            // caller (Java passes references by value), so the response is closed here
            // rather than in the caller's release().
            if (httpResponse != null && httpResponse.getStatusLine() != null) {
                String content = "";
                if (httpResponse.getEntity() != null) {
                    content = EntityUtils.toString(httpResponse.getEntity(), ENCODING);
                }
                return new HttpClientResult(httpResponse.getStatusLine().getStatusCode(), content);
            }
            return new HttpClientResult(HttpStatus.SC_INTERNAL_SERVER_ERROR);
        } finally {
            httpResponse.close();
        }
    }

    /**
     * @Description: release resources
     * @param httpResponse
     * @param httpClient
     * @throws IOException
     */
    public static void release(CloseableHttpResponse httpResponse, CloseableHttpClient httpClient) throws IOException {
        // Release resources
        if (httpResponse != null) {
            httpResponse.close();
        }
        if (httpClient != null) {
            httpClient.close();
        }
    }

	
}

The methods used here are mainly doGet and doPost, which correspond to GET and POST requests respectively; a minimal usage sketch follows.
(3) The jsoup utility class: JsoupUtil
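
For orientation, here is a minimal usage sketch of the utility class. The demo class name, the example.com URL and the 127.0.0.1:8080 proxy are placeholders for illustration only, not part of the original program.

package com.dmf.reptile;

import java.util.HashMap;
import java.util.Map;

import org.apache.http.HttpHost;

import com.dmf.reptile.utils.HttpClientResult;
import com.dmf.reptile.utils.HttpClientUtil;

public class HttpClientUtilDemo {

	public static void main(String[] args) throws Exception {
		// Minimal request headers
		Map<String, String> headers = new HashMap<String, String>();
		headers.put("User-Agent", "Mozilla/5.0");

		// Placeholder proxy; use a real ip/port/type from the proxy list,
		// or pass null to connect directly
		HttpHost proxy = new HttpHost("127.0.0.1", 8080, "http");

		HttpClientResult result = HttpClientUtil.doGet("https://www.example.com", headers, null, proxy);
		System.out.println("status: " + result.getCode());
	}
}
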

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * @author: dmf
 * @date: 2019-03-19
 * @Description: jsoup utility class, mainly used to scrape free proxy server IPs
 */
public class JsoupUtil {

	/**
	 * @Description: fetch a Document from a URL
	 * @param url request URL
	 * @return
	 * @throws IOException
	 */
	public static Document getDocByUrl(String url) throws IOException{
		Document doc = Jsoup.connect(url)
				  .userAgent("Mozilla")
				  .cookie("auth", "token")
				  .timeout(3000)
				  .post();
		return doc;
	}
	
	/**
	 * @Description: build a Document from an HTML string
	 * @param html the page's HTML source
	 * @return
	 */
	public static Document getDocByHtml(String html){
		Document doc = Jsoup.parse(html);
		return doc;
	}
	/**
	 * @Description: parse the page and extract the free proxy IP/port entries. The parsing
	 *               logic differs from site to site, so this method has to be rewritten for
	 *               each proxy site. This version parses kuaidaili's free proxies
	 *               (https://www.kuaidaili.com/free).
	 * @param doc the page's Document object
	 * @return
	 */
	public static List<Map<String, String>> getData(Document doc) {
		List<Map<String, String>> list = new ArrayList<>();
		Element ele = doc.getElementById("list");
		Elements eletrs = ele.getElementsByTag("tr");
	    
		// Iterate over the <tr> rows
		for (Element eletr : eletrs) {
			Elements eletds = eletr.getElementsByTag("td");
			// Collect the ip and port values from the <td> cells
			Map<String, String> map = new HashMap<>();
			for (Element eletd : eletds) {
				
				if("IP".equals(eletd.attr("data-title"))){
					map.put("ip", eletd.text());
					//System.out.println(element.text());
				}
				if("PORT".equals(eletd.attr("data-title"))) {
					map.put("port", eletd.text());
				}
				// Proxy type (http/https); the site's column header is the Chinese word 类型
				if("类型".equals(eletd.attr("data-title"))) {
					map.put("type", eletd.text());
				}
			}
			if(!map.isEmpty()) {
				list.add(map);
			}
		}
		return list;
	}
}

Note the getData method: it is what extracts the proxy server information, and this version parses the free proxy list of kuaidaili (https://www.kuaidaili.com/free). To use a different proxy site you have to write your own parser, for example along the lines of the sketch below.
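
As an illustration only, here is a sketch of what such a parser might look like, written as an extra method inside JsoupUtil (it relies on the same imports as that class). The table id "proxy-table" and the ip/port/type column order are assumptions for a hypothetical site, not the layout of any real proxy page.

	/**
	 * Hypothetical parser sketch: assumes a site whose proxy table has id "proxy-table"
	 * and whose first three <td> cells per row are ip, port and type. Adjust the
	 * selectors to the real page you are scraping.
	 */
	public static List<Map<String, String>> getDataFromOtherSite(Document doc) {
		List<Map<String, String>> list = new ArrayList<>();
		for (Element row : doc.select("table#proxy-table tr")) {
			Elements cells = row.getElementsByTag("td");
			if (cells.size() < 3) {
				continue; // skip the header row and malformed rows
			}
			Map<String, String> map = new HashMap<>();
			map.put("ip", cells.get(0).text());
			map.put("port", cells.get(1).text());
			map.put("type", cells.get(2).text());
			list.add(map);
		}
		return list;
	}
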

(4) The result class HttpClientResult, used to wrap the response

import java.io.Serializable;

public class HttpClientResult implements Serializable{
	@Override
	public String toString() {
		return "HttpClientResult [code=" + code + ", content=" + content + "]";
	}

	public int getCode() {
		return code;
	}

	public void setCode(int code) {
		this.code = code;
	}

	public String getContent() {
		return content;
	}

	public void setContent(String content) {
		this.content = content;
	}

	private static final long serialVersionUID = 1L;

	/**
     * Response status code
     */
    private int code;

    /**
     * Response body
     */
    private String content;
    
    public HttpClientResult(int code) {
    	this.code = code;
    }
    
    public HttpClientResult(int code,String content) {
    	this.code = code;
    	this.content = content;
    }
}

(5) The main class:

package com.dmf.reptile;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.http.HttpHost;
import org.jsoup.nodes.Document;

import com.dmf.reptile.utils.HttpClientResult;
import com.dmf.reptile.utils.HttpClientUtil;
import com.dmf.reptile.utils.JsoupUtil;

public class Test {

	public static List<Map<String, String>> proxydata = new ArrayList<>();
	// CSDN blog post URLs whose view counts will be inflated
	public static String[] links = { "https://blog.csdn.net/qq_34609889/article/details/86714796",
			"https://blog.csdn.net/qq_34609889/article/details/86679463"
		};
	// Page-block counter: each worker thread claims the next block of 200 proxy-list pages
	public static int num = -200;
	

	public static void main(String[] args) throws Exception {

        // Run the workers on a thread pool
		ExecutorService pool = Executors.newFixedThreadPool(3);
		for (int i = 0; i < 3; i++) {
			pool.execute(new Runnable() {
				@Override
				public void run() {
					System.out.println("Thread " + Thread.currentThread().getName() + " started!");
					test();
				}
			});
		}
		pool.shutdown();
	
	}

	public synchronized static int addNum() {
		num += 200;
		try {
			// Stagger the threads by three seconds: kuaidaili only allows one request
			// from the same IP at a time, and too many requests in a short window get
			// the IP banned.
			Thread.sleep(3000);
		} catch (InterruptedException e) {
			e.printStackTrace();
		}
		// Return the start of this thread's page block, so callers do not re-read the
		// shared field while other threads keep incrementing it.
		return num;
	}
	public static void test() {
		// Claim this thread's block of 200 proxy-list pages (addNum also staggers start-up)
		int start = addNum();
		// Outer loop: fetch the proxy list page by page
		for (int i = start; i < start + 200; i++) {
			// URL pattern of kuaidaili's high-anonymity free proxies
			String url = "https://www.kuaidaili.com/free/inha/" + (i + 1) + "/";
			// URL pattern of the ordinary free proxies
			// String url = "https://www.kuaidaili.com/free/intr/" + (i + 1) + "/";

			// Fetch the proxy server list for this page
			proxydata = getProxy(url);

			// Visit the links through each proxy IP
			for (Map<String,String> map : proxydata) {
				// Build the proxy host object
				HttpHost proxy = new HttpHost(map.get("ip").toString(), Integer.parseInt(map.get("port").toString()),
						map.get("type").toString());
				int code = 0;
				// Visit every link once through this proxy
				for (String link : links) {
					try {
						code = doget(link,proxy).getCode();
						// Anything other than 200 means this IP cannot reach CSDN;
						// skip the remaining links and move on to the next proxy
						if (code!=200) {
							break;
						}
						System.out.println("第"+(i+1)+"页代理地址:" + map.get("ip") + ":" + map.get("port") + "------访问结果:" + code);
					} catch (Exception e1) {
						System.out.println("第"+(i+1)+"页代理ip无效!"+ map.get("ip") + ":" + map.get("port"));
						// 直接退出循环,无需每个link都去访问
						break;
					}
				}
			}
		}
	}

	/**
	 * @Description: fetch the proxy server entries (ip, port, type) from one proxy-list page
	 * @param url
	 * @return
	 */
	public static List<Map<String, String>> getProxy(String url){

		// The proxy-list site bans IPs that request too frequently. A single thread is
		// usually fine, but multiple threads get banned quickly (10 threads got my IP
		// banned almost instantly). If that happens, route this request through a proxy
		// as well; any working free proxy from another site will do.
		//HttpHost proxy = new HttpHost("163.125.232.238",8118);
		
		Document doc = null;
		// Request result
		HttpClientResult result = null;
		try {
			result = doget(url, null);
			//result = doget(url, proxy);

			// A non-200 status means the proxy list page was not retrieved
			if (result.getCode() != 200)
				throw new Exception();
			// Parse the HTML fetched via HttpClient into a jsoup Document
			doc = JsoupUtil.getDocByHtml(result.getContent());
		} catch (Exception e) {
			System.out.println("Failed to fetch proxy server information!");
		}

		// Alternatively, let jsoup fetch the URL itself
		// Document doc = JsoupUtil.getDocByUrl(url);

		// Guard against a failed fetch so getData does not throw a NullPointerException
		if (doc == null) {
			return new ArrayList<>();
		}
		List<Map<String, String>> list = JsoupUtil.getData(doc);
		return list;
	}

	// Send a POST request (optionally through a proxy)
	public static HttpClientResult dopost(String url, HttpHost proxy) throws Exception {
		// Build the request headers
		Map<String, String> headers = new HashMap<String, String>();
		// headers.put("Cookie", "123");
		headers.put("Connection", "keep-alive");
		headers.put("Accept", "application/json");
		headers.put("Accept-Language", "zh-CN,zh;q=0.9");
		headers.put("User-Agent",
				"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36");
		HttpClientResult result = HttpClientUtil.doPost(url, headers, null,proxy);
		return result;
	}

	// Send a GET request (optionally through a proxy)
	public static HttpClientResult doget(String url,HttpHost proxy) throws Exception {
		// Build the request headers
		Map<String, String> headers = new HashMap<String, String>();
		// headers.put("Cookie", "123");
		//headers.put("Connection", "keep-alive");
		headers.put("Accept", "application/json");
		headers.put("cache-control", "max-age=0");
		headers.put("authority", "blog.csdn.net");
		headers.put("accept-encoding", "gzip, deflate, br");
		headers.put("Accept-Language", "zh-CN,zh;q=0.9");
		headers.put("User-Agent",
				"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36");
		HttpClientResult result = HttpClientUtil.doGet(url, headers, null,proxy);
		return result;
	}
}

The program runs its workers on a thread pool. I have not studied Java multithreading in depth, so corrections from more experienced readers are welcome.

Notes:
1. Because the program relies on free proxies, most of the proxy servers do not work and only a very small fraction are usable, so the number of views you can add is quite limited. To add tens of thousands of views you would have to buy proxy IPs; some proxy sites sell packages of several hundred to a thousand IPs for a few yuan (a sketch of feeding such a purchased list into this program follows after these notes).
2. The program first scrapes the free proxy information from kuaidaili, so it has to visit the kuaidaili site itself. kuaidaili rate-limits by IP: within a short window the same IP may only visit once, so with multiple threads the later threads receive an error page, and too many threads quickly get the IP banned. The threads in the pool therefore need to start at intervals, which the program does with Thread.sleep(3000).
3. CSDN only counts one additional view per blog post per IP per minute, which is why the proxy servers are needed in the first place.
The full source code can be downloaded from my GitHub:
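
If you do end up with a purchased (or otherwise fixed) list of proxies, a minimal sketch of feeding it into the existing flow might look like the following. The file name proxies.txt and its ip:port:type line format are assumptions for illustration; they are not part of the original program.

package com.dmf.reptile;

import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;

import org.apache.http.HttpHost;

public class PurchasedProxyDemo {

	public static void main(String[] args) throws Exception {
		// Assumed format: one proxy per line as ip:port:type, e.g. 1.2.3.4:8080:http
		List<String> lines = Files.readAllLines(Paths.get("proxies.txt"));
		for (String line : lines) {
			String[] parts = line.trim().split(":");
			if (parts.length != 3) {
				continue; // skip malformed lines
			}
			HttpHost proxy = new HttpHost(parts[0], Integer.parseInt(parts[1]), parts[2]);
			for (String link : Test.links) {
				// Reuse the existing doget helper from the Test class
				int code = Test.doget(link, proxy).getCode();
				System.out.println(proxy + " -> " + link + " : " + code);
			}
		}
	}
}
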

https://github.com/dmfgithub1/reptile.git
