Web crawler: scraping a news list
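The code below uses the jsoup HTML parser, so the jsoup library has to be on the classpath; in a Maven project that is the org.jsoup:jsoup artifact.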

// First, define a model class that holds one news item

package spider;

// A plain data holder (POJO) for one news article.
public class News {

    private String title;
    private String content;
    private String publishDate;
    private String author;
    private String keywords;

    public News() {
    }

    public News(String title, String content, String publishDate, String author, String keywords) {
        this.title = title;
        this.content = content;
        this.publishDate = publishDate;
        this.author = author;
        this.keywords = keywords;
    }


    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getPublishDate() {
        return publishDate;
    }

    public void setPublishDate(String publishDate) {
        this.publishDate = publishDate;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getKeywords() {
        return keywords;
    }

    public void setKeywords(String keywords) {
        this.keywords = keywords;
    }


    @Override
    public String toString() {
        return "News [title=" + title + ", content=" + content + ", publishDate=" + publishDate
                + ", author=" + author + ", keywords=" + keywords + "]";
    }
}

// Next, the Spider class: it fetches pages, collects the headline links, and parses each article

package spider;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;


public class Spider {

    // Fetch the HTML document at the given URL and parse it so that we can
    // query it for the data we need. Returns null when the fetch fails.
    public Document loadDocumentData(String url) {
        Document doc = null;
        try {
            doc = Jsoup.connect(url).get();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return doc;
    }
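
    // Variant (my addition, not part of the original post): connect(url).get()
    // uses jsoup's default user agent and timeout, and some sites reject or
    // throttle such requests. The user-agent string and timeout are left to
    // the caller; the values suggested in the comments are assumptions.
    public Document loadDocumentData(String url, String userAgent, int timeoutMillis) {
        try {
            return Jsoup.connect(url)
                    .userAgent(userAgent)   // e.g. "Mozilla/5.0"
                    .timeout(timeoutMillis) // in milliseconds, e.g. 10_000
                    .get();
        } catch (IOException e) {
            e.printStackTrace();
            return null;                    // caller must handle a failed fetch
        }
    }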
    // Parse the list page: the headline links on the Sina front page sit
    // inside an element with class "news_top"; collect each link's href.
    public List<String> parseDoc(Document doc) {
        List<String> list = new ArrayList<>();
        // Assumes the page contains at least one "news_top" block.
        Elements elements = doc.getElementsByClass("news_top");
        Elements links = elements.get(0).getElementsByTag("a");
        for (int i = 0; i < links.size(); i++) {
            list.add(links.get(i).attr("href"));
        }
        return list;
    }
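
    // Variant (my addition, not part of the original post): attr("href")
    // returns the attribute verbatim, so relative or protocol-relative links
    // such as "//news.sina.com.cn/..." would not be fetchable as-is. Jsoup's
    // "abs:" prefix resolves an attribute against the document's base URI.
    public List<String> parseDocAbsolute(Document doc) {
        List<String> list = new ArrayList<>();
        for (Element link : doc.select(".news_top a")) {
            String absolute = link.attr("abs:href"); // "" when unresolvable
            if (!absolute.isEmpty()) {
                list.add(absolute);
            }
        }
        return list;
    }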


    // Parse one article page into a News object.
    public News parseDetail(Document doc) {
        // The headline carries the class "main-title".
        String title = doc.getElementsByClass("main-title").text();
        // Select the .date element nested directly under .date-source
        // (.date is a child of .date-source).
        String publishDate = doc.select(".date-source > .date").text();
        // The article body is looked up by class as well.
        String article = doc.getElementsByClass("article").text();
        // The element text has the form "label: value"; keep the part after
        // the separator. These indexed splits throw when the element is
        // missing or has no separator; a guarded helper follows below.
        String keywords = doc.getElementsByClass("keywords").text().split(": ")[1];
        // Same pattern, but with a full-width colon.
        String author = doc.getElementsByClass("show_author").text().split(":")[1];

        News news = new News();
        news.setAuthor(author);
        news.setContent(article);
        news.setKeywords(keywords);
        news.setPublishDate(publishDate);
        news.setTitle(title);
        return news;
    }
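
    // Helper (my addition, not part of the original post): a guarded version
    // of the split(...)[1] lookups above. Returns "" instead of throwing when
    // the element with the given class is missing or contains no separator.
    // Usage sketch: textAfter(doc, "keywords", ": ", ":")
    private static String textAfter(Document doc, String className, String... separators) {
        String text = doc.getElementsByClass(className).text();
        for (String sep : separators) {
            int i = text.indexOf(sep);
            if (i >= 0) {
                return text.substring(i + sep.length()).trim();
            }
        }
        return "";
    }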


    public static void main(String[] args) {
        Spider spider = new Spider();
        Document doc = spider.loadDocumentData("http://www.sina.com.cn/");

        // Collect the headline links from the front page ...
        List<String> list = spider.parseDoc(doc);
        /* List<News> list2 = new ArrayList<>(); */

        // ... then fetch and parse each linked article.
        for (String url : list) {
            Document detail = spider.loadDocumentData(url);
            News news = spider.parseDetail(detail);
            System.out.println(news);
            /* list2.add(news); */
        }
    }

}
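
The commented-out list2 lines in main hint at collecting the parsed articles instead of only printing them. A minimal sketch of that loop, assuming the Spider class above and skipping pages whose fetch returned null:

List<News> results = new ArrayList<>();
for (String url : spider.parseDoc(doc)) {
    Document detail = spider.loadDocumentData(url);
    if (detail == null) {
        continue; // fetch failed; skip this article
    }
    results.add(spider.parseDetail(detail));
}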










