为什么数据采集(爬虫)需要用到代理IP

10 篇文章 0 订阅
8 篇文章 0 订阅

原因很简单:你要采集的目标网站会封禁你的请求IP,导致后续请求无法获取到正确的数据。

代理IP起到了中间层的作用:目标网站的服务器会认为请求来自代理IP,而不是你的真实IP,从而无法对你的真实IP实施封禁。

当然,并不是所有的代理IP都能起到这个作用。代理IP分为高匿名、透明(普通匿名也视为透明)两种:使用透明代理IP时,服务器端既能看到代理IP,也能看到你的真实IP;使用高匿名代理IP时,服务器端只能看到代理IP。所以,做数据采集必须使用高匿名代理IP。

下面介绍几个常用的代理IP网站:

无忧代理IP http://www.data5u.com

小猪代理IP http://www.xiaozhudaili.com

下面附上JAVA使用代理IP做爬虫的DEMO:

MAVEN引入如下依赖包:

<!-- HtmlUnit: the headless browser the crawler demo uses to fetch pages -->
<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.23</version>
</dependency>
<!-- xml-apis, declared explicitly alongside HtmlUnit -->
<dependency>
    <groupId>xml-apis</groupId>
    <artifactId>xml-apis</artifactId>
    <version>1.4.01</version>
</dependency>

复制下面代码,新建Java类com.xiaozhudaili.test.TestDynamicIp.java:

package com.xiaozhudaili.test;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.util.ArrayList;
import java.util.List;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.ProxyConfig;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Demo whose main purpose is to test the stability of proxy IPs.
 * It can also serve as a reference crawler project; to reuse it, adapt
 * {@link Crawler#webParseHtml(String)} to your own needs.
 *
 * <p>One {@link GetIP} thread periodically replaces the shared proxy list with
 * the latest result of the proxy API, while several {@link Crawler} threads
 * keep requesting the target URL through a randomly chosen proxy. The run ends
 * once the configured number of successful API fetches has been reached.
 *
 * @author http://www.xiaozhudaili.com/
 */
public class TestDynamicIp {
	/** Maximum number of attempts (each with a freshly picked proxy) for one page fetch. */
	private static final int MAX_FETCH_ATTEMPTS = 3;

	/** Connect and read timeout, in milliseconds, for calls to the proxy API. */
	private static final int API_TIMEOUT_MS = 3000;

	/** Browser profiles a crawler picks from at random for every request. */
	private static final BrowserVersion[] BROWSER_VERSIONS = {
		BrowserVersion.INTERNET_EXPLORER_11,
		BrowserVersion.CHROME,
		BrowserVersion.FIREFOX_38,
		BrowserVersion.INTERNET_EXPLORER_8
	};

	/**
	 * Current proxies as {@code ip:port} strings. The fetcher thread never mutates a
	 * published list; it swaps in a new one, so the reference is volatile to make the
	 * swap visible to the crawler threads.
	 */
	public static volatile List<String> ipList = new ArrayList<>();

	/** Set to {@code true} to ask every thread to stop; volatile because it is shared across threads. */
	public static volatile boolean gameOver = false;

	/**
	 * Starts one proxy-fetching thread and several crawler threads, waits until the
	 * fetcher signals completion through {@link #gameOver}, then exits the JVM.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		long fetchIpSeconds = 5;
		int threadNum = 10;
		int testTime = 3;
		// Fill in the proxy API endpoint; it must return plain text with one entry per line ("\n" separated).
		String apiUrl = "一定要把这里改为API接口哦~";
		// The target URL to crawl.
		String targetUrl = "http://pv.sohu.com/cityjson?ie=utf-8";
		// Whether to execute JavaScript; enabling it slows requests down.
		boolean useJS = false;
		// Request timeout in milliseconds (5 seconds by default).
		int timeOut = 5000;

		// Reject a missing endpoint and the untouched placeholder above: neither is a
		// usable URL, and the fetch loop would otherwise fail forever without ending the run.
		if (apiUrl == null || "".equals(apiUrl) || !apiUrl.startsWith("http")) {
			System.err.println("请输入API接口");
			return;
		}

		System.out.println(">>>>>>>>>>小猪代理IP测试开始<<<<<<<<<<");
		System.out.println("***************");
		System.out.println("提取IP间隔 " + fetchIpSeconds + " 秒 ");
		System.out.println("开启爬虫线程 " + threadNum);
		System.out.println("爬虫目标网址  " + targetUrl);
		System.out.println("API接口  " + apiUrl);
		// Print the configured value instead of a hard-coded "3".
		System.out.println("测试次数 " + testTime + " ");
		System.out.println("***************\n");

		TestDynamicIp tester = new TestDynamicIp();
		new Thread(tester.new GetIP(fetchIpSeconds * 1000, testTime, apiUrl)).start();
		for (int i = 0; i < threadNum; i++) {
			tester.new Crawler(100, targetUrl, useJS, timeOut).start();
		}

		while (!gameOver) {
			try {
				Thread.sleep(100);
			} catch (InterruptedException e) {
				// Treat interruption as a request to shut down; keep the flag set for callers.
				Thread.currentThread().interrupt();
				break;
			}
		}
		System.out.println(">>>>>>>>>>小猪代理IP测试结束<<<<<<<<<<");
		// Crawler threads are non-daemon and may be mid-request, so terminate the JVM explicitly.
		System.exit(0);
	}

	/**
	 * Parses the proxy API response into a list of {@code ip:port} strings.
	 *
	 * <p>A line is accepted when it has the form {@code <proxy>,<number>} and the number
	 * is a positive integer; any other line is skipped. What the number means is not
	 * defined in this file (presumably the proxy's remaining lifetime - confirm against
	 * the API documentation).
	 *
	 * @param body raw response text, one entry per line
	 * @return the accepted proxies, possibly empty, never {@code null}
	 */
	private static List<String> parseProxyLines(String body) {
		List<String> proxies = new ArrayList<>();
		for (String rawLine : body.split("\n")) {
			// trim() also removes the '\r' left behind by CRLF line endings, which
			// would otherwise make Integer.parseInt reject every line.
			String[] parts = rawLine.trim().split(",");
			if (parts.length < 2) {
				continue;
			}
			try {
				if (Integer.parseInt(parts[1].trim()) > 0) {
					proxies.add(parts[0].trim());
				}
			} catch (NumberFormatException ignored) {
				// Not a "<proxy>,<number>" line (e.g. an error message from the API): skip it.
			}
		}
		return proxies;
	}

	/** Crawls the target site through a random proxy to check that the proxy works. */
	public class Crawler extends Thread {
		long sleepMs = 200;
		boolean useJs = false;
		String targetUrl = "";
		int timeOut = 5000;

		/**
		 * @param sleepMs   pause between two requests, in milliseconds
		 * @param targetUrl URL requested on every iteration
		 * @param useJs     whether the client executes JavaScript
		 * @param timeOut   request timeout in milliseconds
		 */
		public Crawler(long sleepMs, String targetUrl, boolean useJs, int timeOut) {
			this.sleepMs = sleepMs;
			this.targetUrl = targetUrl;
			this.useJs = useJs;
			this.timeOut = timeOut;
		}

		/** Requests the target URL repeatedly until {@link TestDynamicIp#gameOver} is set or the thread is interrupted. */
		@Override
		public void run() {
			while (!gameOver) {
				webParseHtml(targetUrl);
				try {
					Thread.sleep(sleepMs);
				} catch (InterruptedException e) {
					Thread.currentThread().interrupt();
					return;
				}
			}
		}

		/**
		 * Fetches {@code url} through a randomly chosen proxy and returns the page as XML.
		 *
		 * <p>A failed attempt is retried with a newly picked proxy, at most
		 * {@code MAX_FETCH_ATTEMPTS} times in total. The retry is a bounded loop rather than
		 * unbounded recursion, so a persistently failing target can no longer overflow the
		 * stack or pile up unclosed clients.
		 *
		 * @param url the page to request
		 * @return the page XML, or an empty string when no proxy is available yet or every attempt failed
		 */
		public String webParseHtml(String url) {
			for (int attempt = 1; attempt <= MAX_FETCH_ATTEMPTS; attempt++) {
				String ipport = getAProxy();
				if (ipport == null) {
					// No proxies fetched yet: print a progress dot and let the caller try again later.
					System.out.print(".");
					return "";
				}

				String[] hostAndPort = ipport.split(":");
				if (hostAndPort.length != 2) {
					System.err.println(getName() + " 代理格式错误: " + ipport);
					continue;
				}

				WebClient client = new WebClient(BROWSER_VERSIONS[(int) (BROWSER_VERSIONS.length * Math.random())]);
				try {
					client.getOptions().setThrowExceptionOnFailingStatusCode(false);
					client.getOptions().setJavaScriptEnabled(useJs);
					client.getOptions().setCssEnabled(false);
					client.getOptions().setThrowExceptionOnScriptError(false);
					client.getOptions().setTimeout(timeOut);
					client.getOptions().setAppletEnabled(true);
					client.getOptions().setGeolocationEnabled(true);
					client.getOptions().setRedirectEnabled(true);

					// Allows HTTPS sites to be accessed without SSL certificate errors.
					client.getOptions().setUseInsecureSSL(true);

					client.getOptions().setProxyConfig(
							new ProxyConfig(hostAndPort[0], Integer.parseInt(hostAndPort[1].trim())));

					HtmlPage page = client.getPage(url);
					String html = page.asXml();

					System.out.println(getName() + " 使用代理 " + ipport + "请求目标网址返回HTML:" + html);
					return html;
				} catch (Exception e) {
					// Broad catch on purpose: a bad proxy can fail in many ways (I/O error, malformed
					// port, unexpected page type) and each of them simply counts as a failed attempt.
					System.err.println(getName() + " 使用代理 " + ipport + " 请求失败(第" + attempt + "次): " + e);
				} finally {
					client.close();
				}
			}
			return "";
		}

		/**
		 * Picks a random proxy from the shared list.
		 *
		 * @return an {@code ip:port} string, or {@code null} when the list is empty
		 */
		private String getAProxy() {
			// Read the shared reference once: the fetcher may swap the list between two reads,
			// and mixing the size of one list with the elements of another can go out of bounds.
			List<String> snapshot = ipList;
			if (snapshot == null || snapshot.isEmpty()) {
				return null;
			}
			return snapshot.get((int) (Math.random() * snapshot.size()));
		}
	}

	/** Periodically fetches fresh proxies from the API and publishes them in {@link TestDynamicIp#ipList}. */
	public class GetIP implements Runnable {
		long sleepMs = 1000;
		int maxTime = 3;
		String apiUrl = "";

		/**
		 * @param sleepMs pause between two API calls, in milliseconds
		 * @param maxTime number of successful fetches after which the whole test ends
		 * @param apiUrl  proxy API endpoint
		 */
		public GetIP(long sleepMs, int maxTime, String apiUrl) {
			this.sleepMs = sleepMs;
			this.maxTime = maxTime;
			this.apiUrl = apiUrl;
		}

		/**
		 * Calls the API every {@code sleepMs} milliseconds. A fetch counts as successful only
		 * when it yields at least one proxy; after {@code maxTime} successes, and one more
		 * pause so the crawlers can use the last batch, {@link TestDynamicIp#gameOver} is set.
		 * Failed fetches are reported and retried indefinitely.
		 */
		@Override
		public void run() {
			int successfulFetches = 0;
			while (!gameOver) {
				if (successfulFetches >= maxTime) {
					gameOver = true;
					break;
				}
				try {
					List<String> fresh = parseProxyLines(fetchApiResponse());
					if (!fresh.isEmpty()) {
						TestDynamicIp.ipList = fresh;
						successfulFetches++;
						System.out.println("第" + successfulFetches + "次获取动态IP " + fresh.size() + " 个");
					}
				} catch (Exception e) {
					// Thread boundary: report the failure and keep polling instead of dying.
					e.printStackTrace();
					System.err.println(">>>>>>>>>>>>>>获取IP出错");
				}
				try {
					Thread.sleep(sleepMs);
				} catch (InterruptedException e) {
					// Nothing else ends the run if this thread stops, so signal shutdown too.
					Thread.currentThread().interrupt();
					gameOver = true;
					return;
				}
			}
		}

		/**
		 * Downloads the API response and decodes it as UTF-8.
		 *
		 * @return the complete response body
		 * @throws IOException if the URL is malformed or the request fails
		 */
		private String fetchApiResponse() throws IOException {
			HttpURLConnection connection = (HttpURLConnection) new java.net.URL(apiUrl).openConnection();
			// Timeouts must be set on the connection that is actually used, before connecting.
			connection.setConnectTimeout(API_TIMEOUT_MS);
			connection.setReadTimeout(API_TIMEOUT_MS);
			try (InputStream in = connection.getInputStream()) {
				// Read until end of stream; available() only reports the bytes readable without
				// blocking, so sizing the buffer from it can truncate the response.
				ByteArrayOutputStream buffer = new ByteArrayOutputStream();
				byte[] chunk = new byte[4096];
				int read;
				while ((read = in.read(chunk)) != -1) {
					buffer.write(chunk, 0, read);
				}
				return buffer.toString("UTF-8");
			} finally {
				connection.disconnect();
			}
		}
	}

}

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

DATA5U

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值