java爬虫

一、原理

    先创建HttpClient对象,再根据请求类型创建请求对象并指定url:GET请求创建HttpGet对象,POST请求创建HttpPost对象。最后调用HttpClient的execute方法发送请求,并从返回的响应中读取页面内容。

二、小例子

package com.xiang;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

/**
 * Minimal crawler example: fetches a single page with Apache HttpClient and hands the
 * downloaded HTML to a {@link BaiduParser} that runs on its own thread.
 *
 * <p>To crawl a paginated site, wrap the request/response section of {@link #main} in a
 * loop over the page numbers and throttle it (for example with {@code Thread.sleep})
 * so the target site does not ban the client IP.
 */
public class Spider {

	/** Timeout, in milliseconds, used for pool lease, connect and socket reads. */
	private static final int TIMEOUT_MILLIS = 6000;

	/** Address of the page fetched by this example. */
	private static final String TARGET_URL = "http://www.baidu.com";

	/** Browser-like User-Agent sent with the request. */
	private static final String USER_AGENT =
			"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36";

	/**
	 * Downloads {@link #TARGET_URL} and starts a parser thread for the returned HTML.
	 * Failures are reported on {@code System.err} instead of being silently dropped.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String[] args) {
		RequestConfig requestConfig = RequestConfig.custom()
				.setCookieSpec(CookieSpecs.STANDARD)
				.setConnectionRequestTimeout(TIMEOUT_MILLIS)
				.setConnectTimeout(TIMEOUT_MILLIS)
				// Without a socket timeout a stalled server can block the read forever.
				.setSocketTimeout(TIMEOUT_MILLIS)
				.build();

		// try-with-resources releases the client's connection pool as well as the response.
		try (CloseableHttpClient httpClient = HttpClients.custom()
				.setDefaultRequestConfig(requestConfig).build()) {
			HttpGet httpGet = new HttpGet(TARGET_URL);
			httpGet.addHeader("User-Agent", USER_AGENT);

			try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
				if (response.getEntity() == null) {
					System.err.println("No response body returned for " + TARGET_URL);
					return;
				}
				String html = convertStreamToString(response.getEntity().getContent());
				new Thread(new BaiduParser(html)).start();
			}
		} catch (IOException | RuntimeException e) {
			// Best effort: report the failure rather than hiding it, but do not crash.
			System.err.println("Failed to fetch " + TARGET_URL + ": " + e);
		}
	}

	/**
	 * Reads the whole stream as UTF-8 text, normalising every line ending to {@code \n},
	 * and closes the stream when done.
	 *
	 * @param in stream holding the response body; closed by this method
	 * @return the decoded content, each line terminated by {@code \n}
	 * @throws IOException if reading fails, so a truncated page is never returned as complete
	 */
	private static String convertStreamToString(InputStream in) throws IOException {
		StringBuilder sb = new StringBuilder();
		// Closing the reader also closes the underlying stream.
		try (BufferedReader reader = new BufferedReader(
				new InputStreamReader(in, StandardCharsets.UTF_8))) {
			String line;
			while ((line = reader.readLine()) != null) {
				sb.append(line).append('\n');
			}
		}
		return sb.toString();
	}

}


package com.xiang;

/**
 * Task that processes one downloaded page. It currently just echoes the HTML to
 * standard output; extraction of the wanted content belongs in {@link #run()}.
 */
public class BaiduParser implements Runnable {

	/** Raw HTML of the page handed to this parser. */
	String html;

	/**
	 * Creates a parser for one page.
	 *
	 * @param html the page source to process
	 */
	public BaiduParser(String html) {
		this.html = html;
	}

	/** Prints the stored HTML followed by a line break. */
	@Override
	public void run() {
		final String page = this.html;
		// Extract the wanted content here, e.g. with regular expressions or substring cuts.
		System.out.println(page);
	}
}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值