java写的简单爬虫程序

为了测试文本聚类算法,老板让每人就某一话题找1000篇新闻,“你们没有爬虫??那就自己用手复制吧,3天复制完!”

好吧,那就写个爬虫吧~查资料+编码一下午搞定,写的非常简陋,只能爬新浪军事板块的文章,不怕大家笑话了,哈哈~

package com.slimspider;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.visitors.TextExtractingVisitor;

import com.slimspider.utils.StringURLQueue;

/**
 * Minimal crawler for the Sina military news section.
 *
 * <p>Starting from a seed article URL it extracts the article body found
 * between the {@code <!-- publish_helper -->} marker comments, writes the
 * plain text to {@code D:/resu/mN.txt}, collects related-article links from
 * the {@code otherContent} block, and keeps crawling via the shared
 * {@link StringURLQueue}.
 *
 * @author LiangTE
 */
public class Main {

    // Sequence number used to name the output files (m1.txt, m2.txt, ...).
    private static int num = 0;

    /**
     * Crawls article pages starting from {@code url}.
     *
     * <p>Iterative rewrite of the original recursive version, which had four
     * defects: it recursed once per page (stack overflow long before 1000
     * articles), rebuilt its visited-URL list on every call so nothing was
     * ever deduplicated, created and only eventually shut down one HttpClient
     * per page, and processed each dequeued URL twice (once in the nested
     * recursive call and once in the unconditional trailing call).
     *
     * @param url seed article URL (expected to start with
     *            http://mil.news.sina.com.cn)
     */
    public static void crawler(String url) {
        HttpClient httpClient = new DefaultHttpClient();
        StringURLQueue queue = StringURLQueue.getInstance();
        // URLs already fetched (or attempted); shared across the whole crawl
        // so each page is downloaded at most once.
        List<String> visitedURLs = new ArrayList<String>();

        try {
            String current = url;
            // NOTE(review): assumes StringURLQueue.deQueue() returns null when
            // the queue is empty — confirm against its implementation.
            while (current != null) {
                if (!visitedURLs.contains(current)) {
                    visitedURLs.add(current);
                    try {
                        crawlPage(httpClient, queue, current);
                    } catch (Exception e) {
                        // Keep crawling the remaining queued URLs even if one
                        // page fails to download or parse.
                        e.printStackTrace();
                    }
                }
                current = queue.deQueue();
            }
        } finally {
            // Single shared client, released exactly once when the crawl ends.
            httpClient.getConnectionManager().shutdown();
        }
    }

    /**
     * Downloads one article page, saves its text, and enqueues related links.
     *
     * @param httpClient shared client used for all requests
     * @param queue      crawl frontier to push newly discovered links onto
     * @param url        page to fetch
     * @throws Exception on any download or parse failure (handled by caller)
     */
    private static void crawlPage(HttpClient httpClient, StringURLQueue queue,
            String url) throws Exception {
        HttpGet httpget = new HttpGet(url);
        HttpResponse response = httpClient.execute(httpget);
        HttpEntity entity = response.getEntity();
        // Sina pages of that era are GBK-encoded.
        String body = EntityUtils.toString(entity, "gbk");

        // The article body sits between these two marker comments.
        String totalContent = body.substring(body.indexOf("<!-- publish_helper"),
                body.indexOf("<!-- publish_helper_end -->"));
        String text = totalContent.substring(totalContent.indexOf("<p>"),
                totalContent.indexOf("<style type"));

        // Save the article unconditionally; the original only saved it when
        // the related-links block happened to be present.
        saveText(text);

        // "otherContent" holds the related-article links that keep the crawl
        // going; it is optional on some pages.
        int linksStart = totalContent.indexOf("<div class=\"otherContent");
        if (linksStart != -1) {
            enqueueLinks(queue, totalContent.substring(linksStart));
        }
    }

    /**
     * Strips HTML tags from {@code html} and writes the plain text to
     * {@code D:/resu/m<num>.txt}, echoing it to stdout.
     */
    private static void saveText(String html) throws Exception {
        Parser parser = new Parser(html);
        TextExtractingVisitor visitor = new TextExtractingVisitor();
        parser.visitAllNodesWith(visitor);
        String resu = visitor.getExtractedText();

        // The original leaked the writer when write() threw; close in finally.
        BufferedWriter bw = new BufferedWriter(
                new FileWriter("D:/resu/m" + ++num + ".txt"));
        try {
            bw.write(resu);
        } finally {
            bw.close();
        }
        System.out.println(resu);
    }

    /**
     * Extracts every anchor tag from {@code html} and enqueues the links that
     * point back into the Sina military section.
     */
    private static void enqueueLinks(StringURLQueue queue, String html)
            throws Exception {
        NodeFilter filter = new TagNameFilter("a");
        Parser parser = new Parser(html);
        NodeList nodeList = parser.extractAllNodesThatMatch(filter);
        int len = nodeList.size();
        for (int i = 0; i < len; i++) {
            LinkTag tag = (LinkTag) nodeList.elementAt(i);
            String newUrl = tag.extractLink();
            // Stay inside the military section; de-duplication happens when
            // the URL is dequeued in crawler().
            if (newUrl.startsWith("http://mil.news.sina.com.cn")) {
                queue.enQueue(newUrl);
            }
        }
    }

    public static void main(String[] args) {
        crawler("http://mil.news.sina.com.cn/2012-10-23/0913704471.html");
    }
}



原文链接:[url]http://my.oschina.net/liangtee/blog/84869[/url]
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值