java 爬虫学习 httpclient使用

使用 httpclient 模拟浏览器发送请求

maven 依赖

        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpcore</artifactId>
            <version>4.4.10</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.6</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.5</version>
        </dependency>

测试代码

这是请求某个网站,获取数据

package com.httpclient.demo.httpClient;

import java.io.File;
import java.io.InputStream;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

/**
 * Minimal HttpClient example: sends a GET request to a web page while
 * identifying as a desktop browser, and prints the response body to stdout.
 */
public class App {
    /**
     * Browser-like User-Agent header value. It must not contain CR/LF:
     * line breaks inside a header value corrupt the request on the wire.
     */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36";

    /**
     * Fetches the page and prints its HTML.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Request method and target URL.
        HttpGet get = new HttpGet("http://cj.sdust.edu.cn/");
        // Add a request header so the request looks like it comes from a browser.
        get.setHeader("User-Agent", USER_AGENT);

        // try-with-resources closes the response first and then the client,
        // releasing the underlying connection even when an exception is thrown.
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(get)) {
            // The entity carries the body of the response; it may be absent.
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                System.err.println("Response has no body: " + response.getStatusLine());
                return;
            }
            // "utf-8" is the charset passed to EntityUtils for decoding the body.
            String html = EntityUtils.toString(entity, "utf-8");
            System.out.println(html);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

 

测试代码

这是获取网站图片,写入本地

package com.httpclient.demo.httpClient;

import java.io.File;
import java.io.InputStream;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

/**
 * Minimal HttpClient example: downloads an image with a GET request and
 * writes the raw bytes to a local file using commons-io {@code FileUtils}.
 */
public class App {
    /**
     * Browser-like User-Agent header value. It must not contain CR/LF:
     * line breaks inside a header value corrupt the request on the wire.
     */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36";

    /**
     * Downloads the image and saves it to the desktop.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // The URL must point directly at the image resource, not at an HTML page.
        HttpGet get = new HttpGet("http://cj.sdust.edu.cn/_mediafile/cj/_festival/_pic/9803f445-5965-42db-9695-28857a49b4b1.jpg");
        // Add a request header so the request looks like it comes from a browser.
        get.setHeader("User-Agent", USER_AGENT);

        // try-with-resources closes the response first and then the client,
        // releasing the underlying connection even when an exception is thrown.
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(get)) {
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // copyToFile does not close the source stream, so close it here.
                try (InputStream in = entity.getContent()) {
                    FileUtils.copyToFile(in, new File("C:\\Users\\雷霆世纪\\Desktop\\aa.jpg"));
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

ip代理

代理IP的话 也分几种 透明代理、匿名代理、混淆代理、高匿代理

1、透明代理(Transparent Proxy)

透明代理虽然可以直接“隐藏”你的IP地址,但是还是可以从HTTP_X_FORWARDED_FOR来查到你是谁。

2、匿名代理(Anonymous Proxy)

匿名代理比透明代理进步了一点:别人只能知道你用了代理,无法知道你是谁

3、混淆代理(Distorting Proxies)

如上,与匿名代理相同,如果使用了混淆代理,别人还是能知道你在用代理,但是会得到一个假的IP地址,伪装的更逼真:-)

4、高匿代理(Elite proxy或High Anonymity Proxy)

可以看出来,高匿代理让别人根本无法发现你是在用代理,所以是最好的选择。

一般我们搞爬虫 用的都是 高匿的代理IP

测试代码 

使用高匿代理

package com.httpclient.demo.httpClient;

import java.io.File;
import java.io.InputStream;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

/**
 * Minimal HttpClient example: sends a GET request through an HTTP proxy
 * and prints the response body to stdout.
 */
public class App {
    /**
     * Browser-like User-Agent header value. It must not contain CR/LF:
     * line breaks inside a header value corrupt the request on the wire.
     */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36";

    /**
     * Fetches the page via the configured proxy and prints its HTML.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        HttpGet get = new HttpGet("http://cj.sdust.edu.cn/");

        // Route this request through a proxy. Free proxy addresses go stale
        // quickly, so this host/port has to be replaced regularly.
        HttpHost proxy = new HttpHost("218.91.94.133", 9999);
        RequestConfig config = RequestConfig.custom().setProxy(proxy).build();
        get.setConfig(config);

        get.setHeader("User-Agent", USER_AGENT);

        // try-with-resources closes the response first and then the client,
        // releasing the underlying connection even when an exception is thrown.
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(get)) {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                System.err.println("Response has no body: " + response.getStatusLine());
                return;
            }
            // "utf-8" is the charset passed to EntityUtils for decoding the body.
            String html = EntityUtils.toString(entity, "utf-8");
            System.out.println(html);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

为 httpclient 设置连接服务器的超时时间和读取数据的超时时间

测试代码

package com.httpclient.demo.httpClient;

import java.io.File;
import java.io.InputStream;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

/**
 * Minimal HttpClient example: sends a GET request with explicit connect and
 * socket (read) timeouts, and prints the response body to stdout.
 */
public class App {
    /**
     * Browser-like User-Agent header value. It must not contain CR/LF:
     * line breaks inside a header value corrupt the request on the wire.
     */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36";

    /**
     * Fetches the page with timeouts applied and prints its HTML.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        HttpGet get = new HttpGet("https://www.cnblogs.com/zhou-test/p/10001270.html");

        // Connect timeout and socket (read) timeout, both in milliseconds.
        // The config must actually be declared here; setConfig below uses it.
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(10000)
                .setSocketTimeout(10000)
                .build();
        get.setConfig(config);

        get.setHeader("User-Agent", USER_AGENT);

        // try-with-resources closes the response first and then the client,
        // releasing the underlying connection even when an exception is thrown.
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(get)) {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                System.err.println("Response has no body: " + response.getStatusLine());
                return;
            }
            // "utf-8" is the charset passed to EntityUtils for decoding the body.
            String html = EntityUtils.toString(entity, "utf-8");
            System.out.println(html);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

 

 

 

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值