java爬虫代码_JAVA爬虫代码

/*** Created by wangzheng on 2017/2/19.*/

importjava.io.IOException;importjava.util.ArrayList;importjava.util.List;import org.jsoup.*;import org.jsoup.nodes.*;import org.jsoup.select.*;public classMain {private static final String URL = "http://blog.csdn.net/qq_33599520";public static void main(String[] args) throwsIOException {

Connection conn=Jsoup.connect(URL)

.userAgent("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0")

.timeout(5000)

.method(Connection.Method.GET);

Document doc=conn.get();

Element body=doc.body();//获取总页数

String totalPageStr = body.getElementById("papelist").select("span:eq(0)").text();

String regex= ".+共(\\d+)页";

totalPageStr= totalPageStr.replaceAll(regex, "$1");int totalPage =Integer.parseInt(totalPageStr);int pageNow = 1;

List articleList = new ArrayList();for(pageNow = 1; pageNow <= totalPage; pageNow++){

articleList.addAll(getArtitcleByPage(pageNow));

}//遍历输出博主所有的文章

for(Article article : articleList) {

System.out.println("文章标题:" +article.getTitle());

System.out.println("文章绝对路劲地址:http://blog.csdn.net" +article.getAddress());

System.out.println("文章简介:" +article.getDesption());

System.out.println("发表时间:" +article.getTime());

}

}public static List getArtitcleByPage(int pageNow) throwsIOException{

Connection conn= Jsoup.connect(URL + "/article/list/" +pageNow)

.userAgent("Mozilla/5.0 (Windows NT 6.1; rv:47.0) Gecko/20100101 Firefox/47.")

.timeout(5000)

.method(Connection.Method.GET);

Document doc=conn.get();

Element body=doc.body();

List resultList = new ArrayList();

Element articleListDiv= body.getElementById("article_list");

Elements articleList= articleListDiv.getElementsByClass("list_item");for(Element article : articleList){

Article articleEntity= newArticle();

Element linkNode= (article.select("div h1 a")).get(0);

Element desptionNode= (article.getElementsByClass("article_description")).get(0);

Element articleManageNode= (article.getElementsByClass("article_manage")).get(0);

articleEntity.setAddress(linkNode.attr("href"));

articleEntity.setTitle(linkNode.text());

articleEntity.setDesption(desptionNode.text());

articleEntity.setTime(articleManageNode.select("span:eq(0").text());

resultList.add(articleEntity);

}returnresultList;

}

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值