JAVA爬取虎嗅网截图_httpclient3+jsoup多线程抓取虎嗅网全部文章

周六无事,一直觉得虎嗅网的文章质量很高,早上醒来突然有个念头想把它的文章都抓下来。花了半个钟头,写了个小程序:在 jdk6 下采用 httpclient3.1、线程池 ThreadPoolExecutor、jsoup1.7.1、commons-io 等工具多线程抓取虎嗅网全部文章,并以文本文件形式持久化入磁盘。速度还是可以的,半个小时把全站所有文章搞定。

代码(Java)

import java.io.File;

import java.io.IOException;

import java.util.ArrayList;

import java.util.List;

import java.util.concurrent.ArrayBlockingQueue;

import java.util.concurrent.ThreadPoolExecutor;

import java.util.concurrent.TimeUnit;

import org.apache.commons.httpclient.HttpClient;

import org.apache.commons.httpclient.HttpException;

import org.apache.commons.httpclient.HttpMethod;

import org.apache.commons.httpclient.HttpStatus;

import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;

import org.apache.commons.httpclient.URIException;

import org.apache.commons.httpclient.methods.GetMethod;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.select.Elements;

import org.apache.commons.io.FileUtils;

/**

* @ClassName: CrawlHuxiu

* @Description: jdk6下采用httpclient3.1、线程池ThreadPoolExecutor、jsoup1.7.1和commons.io

* 等工具包多线程抓取虎嗅网全部文章,并以文本文件形式持久化入磁盘

* @author Leon温陵

* @version V1.0

*/

/**
 * Multi-threaded crawler for huxiu.com articles (jdk6-era stack:
 * httpclient 3.1, jsoup 1.7.1, commons-io). Article ids are counted down
 * from a start id; each page is fetched on a pooled thread, parsed with
 * jsoup and persisted as a UTF-8 text file under c:\huxiu\.
 */
public class CrawlHuxiu {

    /** Article URL prefix shared by the fetch loop and the persistence step. */
    private static final String BASE_URL = "http://www.huxiu.com/article/";

    /**
     * Entry point: builds a thread-safe HttpClient and starts the crawl
     * from article id 8000 downwards.
     *
     * @param args unused
     * @throws IOException          never thrown here; kept for caller compatibility
     * @throws InterruptedException if the submitting thread is interrupted
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        CrawlHuxiu crawlHuxiu = new CrawlHuxiu();
        // Connection manager that is safe for concurrent use by pool threads.
        MultiThreadedHttpConnectionManager connectionManager =
                new MultiThreadedHttpConnectionManager();
        HttpClient httpClient = new HttpClient(connectionManager);
        crawlHuxiu.getHuxiu(httpClient, BASE_URL + "8000/1.html");
    }

    /**
     * Extracts title, URL, author, publish time and body text from a parsed
     * article page and writes them to c:\huxiu\&lt;title&gt;.txt via commons-io.
     *
     * @param doc parsed article page
     * @param i   numeric article id (used to rebuild the article URL)
     * @throws IOException if the output file cannot be written
     */
    static void crawlHuxiu(Document doc, int i) throws IOException {
        Elements title = doc.select("title");
        if (title.isEmpty()) {
            System.out.println("no <title> element, skipping article " + i);
            return;
        }
        // Strip the site-section suffixes from the page title.
        String str = title.first().text()
                .replace("-看点-@虎嗅网", "")
                .replace("-观点-@虎嗅网", "")
                .replace("-读点-@虎嗅网", "");
        System.out.println("title: " + str);
        // Deleted articles are served as a generic notice page.
        if (str.contains("提示信息 - 虎嗅网")) {
            System.out.println("文章被删除");
            return;
        }
        Elements userAndTime = doc.select(".author-name"); // author + publish time
        Elements content = doc.select("#article_content"); // article body
        // Guard against markup changes: the code below needs two
        // .author-name nodes and one body node, so bail out early
        // instead of throwing IndexOutOfBoundsException.
        if (userAndTime.size() < 2 || content.isEmpty()) {
            System.out.println("unexpected page layout, skipping article " + i);
            return;
        }
        List<String> yuliao = new ArrayList<String>();
        yuliao.add(str);
        yuliao.add(BASE_URL + i + "/1.html");
        yuliao.add(userAndTime.get(0).select(".fc1").text());
        yuliao.add(userAndTime.get(1).text());
        yuliao.add(content.get(0).text());
        // Titles may contain characters that are illegal in Windows file names.
        String fileName = str.replaceAll("[\\\\/:*?\"<>|]", "_");
        File file = new File("c:\\huxiu\\" + fileName + ".txt");
        // Write UTF-8 explicitly instead of relying on the platform charset.
        FileUtils.writeLines(file, "UTF-8", yuliao);
    }

    /**
     * Pool task: executes one GET request and hands successful article
     * pages to {@link #crawlHuxiu(Document, int)}.
     */
    static class ThreadPoolTask implements Runnable {

        HttpClient httpClient = null;
        HttpMethod getMethod = null;
        int i = 0;

        // Payload for the generic constructor; kept for source compatibility.
        private Object threadPoolTaskData;

        ThreadPoolTask(Object tasks) {
            this.threadPoolTaskData = tasks;
        }

        /** Kept for compatibility; prefer the three-argument constructor. */
        public ThreadPoolTask(HttpMethod getMethod, int i) {
            // Was an empty auto-generated stub that silently dropped both
            // arguments; now stores them so the task is actually runnable.
            this.getMethod = getMethod;
            this.i = i;
        }

        public ThreadPoolTask(HttpClient httpClient, HttpMethod getMethod, int i) {
            this.httpClient = httpClient;
            this.getMethod = getMethod;
            this.i = i;
        }

        public void run() {
            try {
                System.out.println("executing request " + getMethod.getURI());
                int status = httpClient.executeMethod(getMethod);
                System.out.println("status:" + status);
                if (HttpStatus.SC_OK == status) {
                    // Parse the response stream; the site serves UTF-8.
                    Document doc = Jsoup.parse(
                            getMethod.getResponseBodyAsStream(), "utf-8", "");
                    // Only article pages are persisted; redirects to other
                    // sections of the site are ignored.
                    if (getMethod.getURI().getURI().startsWith(BASE_URL)) {
                        crawlHuxiu(doc, i);
                    }
                }
            } catch (IOException e) {
                // Covers URIException and HttpException too (both subclasses).
                e.printStackTrace();
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                // Always return the connection to the shared manager.
                getMethod.releaseConnection();
            }
        }
    }

    /**
     * Crawls all articles, walking ids downwards from the id embedded in
     * {@code startUrl} to 101, submitting one fetch task per id.
     *
     * @param httpClient shared multi-threaded HTTP client
     * @param startUrl   article URL whose id is the highest id to fetch,
     *                   e.g. http://www.huxiu.com/article/8000/1.html
     * @throws InterruptedException if the submitting thread is interrupted
     */
    public void getHuxiu(HttpClient httpClient, String startUrl) throws InterruptedException {
        // 100 core / 500 max threads; oldest queued task is discarded on overflow.
        ThreadPoolExecutor threadPool = new ThreadPoolExecutor(100, 500, 3, TimeUnit.SECONDS,
                new ArrayBlockingQueue<Runnable>(10000),
                new ThreadPoolExecutor.DiscardOldestPolicy());
        // The id is the 5th path segment: http: / "" / host / article / <id> / 1.html
        String id = startUrl.split("/")[4];
        int count = Integer.parseInt(id);
        for (int i = count; i > 100; i--) {
            // Article URLs follow a fixed pattern, so ids can simply be counted down.
            HttpMethod getMethod = new GetMethod(BASE_URL + i + "/1.html");
            threadPool.execute(new ThreadPoolTask(httpClient, getMethod, i));
            // Small pause to avoid hammering the site while submitting tasks.
            Thread.sleep(10);
        }
        // Stop accepting new tasks and block until the queued ones finish,
        // replacing the original busy-wait on getActiveCount().
        threadPool.shutdown();
        threadPool.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
    }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值