爬虫爬列表

最新推荐文章于 2024-05-09 19:01:45 发布

卞小帅

最新推荐文章于 2024-05-09 19:01:45 发布

阅读量617

点赞数

本文链接：https://blog.csdn.net/I__Do__/article/details/80189633

版权

//先建一个类

package spider;

/**
 * Mutable value holder for one scraped news article.
 *
 * <p>All properties are plain strings and may be {@code null} until they are set.
 * The spelling of the {@code auther} property is kept as-is because it is part of
 * the public accessor names.
 */
public class News {

    private String title;
    private String content;
    private String publishdate;
    private String auther;
    private String keywords;

    /** Creates an article with every property left {@code null}. */
    public News() {
    }

    /**
     * Creates an article with every property supplied up front.
     *
     * @param title       headline of the article
     * @param content     body text of the article
     * @param publishdate publication date, stored exactly as given
     * @param auther      author / editor name
     * @param keywords    keyword text of the article
     */
    public News(String title, String content, String publishdate, String auther, String keywords) {
        this.title = title;
        this.content = content;
        this.publishdate = publishdate;
        this.auther = auther;
        this.keywords = keywords;
    }

    /** @return the headline */
    public String getTitle() {
        return this.title;
    }

    /** @param title the headline to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the body text */
    public String getContent() {
        return this.content;
    }

    /** @param content the body text to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the publication date string */
    public String getPublishdate() {
        return this.publishdate;
    }

    /** @param publishdate the publication date string to store */
    public void setPublishdate(String publishdate) {
        this.publishdate = publishdate;
    }

    /** @return the author / editor name */
    public String getAuther() {
        return this.auther;
    }

    /** @param auther the author / editor name to store */
    public void setAuther(String auther) {
        this.auther = auther;
    }

    /** @return the keyword text */
    public String getKeywords() {
        return this.keywords;
    }

    /** @param keywords the keyword text to store */
    public void setKeywords(String keywords) {
        this.keywords = keywords;
    }

    /**
     * Renders all five properties in the form
     * {@code News [title=..., content=..., publishdate=..., auther=..., keywords=...]};
     * unset properties appear as {@code null}.
     */
    @Override
    public String toString() {
        return String.format(
                "News [title=%s, content=%s, publishdate=%s, auther=%s, keywords=%s]",
                title, content, publishdate, auther, keywords);
    }

}

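// Spider.java - the crawler and its main() entry point. This is a second source file;
// it must be placed in the same package as News (package spider) to compile against it.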

import java.io.IOException;

import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * Minimal jsoup-based crawler: loads a list page, collects the links found inside its
 * first {@code news_top} element, then loads every linked page and extracts a
 * {@link News} from it.
 */
public class Spider {

    /**
     * Downloads {@code url} with an HTTP GET and parses the response as HTML.
     *
     * <p>This is a best-effort loader: a failure is reported on {@code System.err} and
     * signalled by a {@code null} return value instead of an exception, so a single
     * bad link cannot abort a whole crawl.
     *
     * @param url address of the page to download
     * @return the parsed document, or {@code null} when the page could not be loaded
     */
    public Document loadDocumentData(String url) {
        try {
            return Jsoup.connect(url).get();
        } catch (IOException | IllegalArgumentException e) {
            // IOException: the request or the parsing failed.
            // IllegalArgumentException: jsoup refused the URL itself (empty, malformed, ...),
            // which happens easily with hrefs scraped from arbitrary pages.
            System.err.println("Failed to load " + url + ": " + e);
            return null;
        }
    }

    /**
     * Collects the link targets of the list page.
     *
     * <p>Only the anchors inside the first element with class {@code news_top} are
     * considered. Each href is resolved against the document's base URI when possible
     * (so relative and protocol-relative links become fetchable); if it cannot be
     * resolved the raw attribute value is kept. Anchors without a usable href are skipped.
     *
     * @param doc the list page; may be {@code null}
     * @return the collected link targets in document order; empty (never {@code null})
     *         when {@code doc} is {@code null} or has no {@code news_top} element
     */
    public List<String> parseDoc(Document doc) {
        List<String> list = new ArrayList<>();
        if (doc == null) {
            return list;
        }
        Elements containers = doc.getElementsByClass("news_top");
        if (containers.isEmpty()) {
            // The page layout is not the expected one: nothing to collect.
            return list;
        }
        Elements links = containers.get(0).getElementsByTag("a");
        for (int i = 0; i < links.size(); i++) {
            String href = links.get(i).absUrl("href");
            if (href.isEmpty()) {
                // absUrl yields "" when it cannot build an absolute URL; fall back to the raw value.
                href = links.get(i).attr("href").trim();
            }
            if (!href.isEmpty()) {
                list.add(href);
            }
        }
        return list;
    }

    /**
     * Extracts the article fields from a detail page.
     *
     * <p>Fields are looked up by CSS class. A field whose element is missing ends up as
     * an empty string. Keywords and author are the text that follows the label separator
     * ({@code ": "} and the full-width colon respectively); when the separator or the
     * value after it is missing they are empty strings as well.
     *
     * @param doc the detail page; must not be {@code null}
     * @return a new {@link News} populated from {@code doc}
     */
    public News parseDatail(Document doc) {
        String title = doc.getElementsByClass("main-title").text();
        // Combined text of the .date elements that are direct children of .date-source.
        String publishdate = doc.select(".date-source >.date").text();
        String article = doc.getElementsByClass("article").text();
        String keywords = secondSegment(doc.getElementsByClass("keywords").text(), ": ");
        String auther = secondSegment(doc.getElementsByClass("show_author").text(), "：");

        News news = new News();
        news.setAuther(auther);
        news.setContent(article);
        news.setKeywords(keywords);
        news.setPublishdate(publishdate);
        news.setTitle(title);
        return news;
    }

    /**
     * Returns the second piece of {@code text} when it is split on {@code separator},
     * or an empty string when there is no second piece.
     *
     * <p>{@code separator} is handed to {@link String#split(String)} and is therefore
     * interpreted as a regular expression; the separators used in this class contain
     * no regex metacharacters.
     */
    private static String secondSegment(String text, String separator) {
        String[] parts = text.split(separator);
        return parts.length > 1 ? parts[1] : "";
    }

    /**
     * Crawls the Sina home page and prints every article that could be loaded.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider spider = new Spider();
        Document doc = spider.loadDocumentData("http://www.sina.com.cn/");
        if (doc == null) {
            System.err.println("Could not load the list page; nothing to crawl.");
            return;
        }

        for (String url : spider.parseDoc(doc)) {
            Document detail = spider.loadDocumentData(url);
            if (detail == null) {
                // The failure was already reported by loadDocumentData; move on to the next link.
                continue;
            }
            System.out.println(spider.parseDatail(detail));
        }
    }

}