// First, create the News data class
package spider;
/**
 * Mutable value holder describing a single news article.
 *
 * <p>Every property is a plain {@code String}; nothing is validated, so any of
 * them may be {@code null}.
 */
public class News {

    private String title;
    private String content;
    private String publishdate;
    private String auther;
    private String keywords;

    /** Creates an article whose properties are all {@code null}. */
    public News() {
        this(null, null, null, null, null);
    }

    /**
     * Creates a fully populated article.
     *
     * @param title       headline of the article
     * @param content     body text of the article
     * @param publishdate publication date, stored as given
     * @param auther      author of the article
     * @param keywords    keywords attached to the article
     */
    public News(String title, String content, String publishdate, String auther, String keywords) {
        this.title = title;
        this.content = content;
        this.publishdate = publishdate;
        this.auther = auther;
        this.keywords = keywords;
    }

    // ---- read accessors ----

    /** @return the headline */
    public String getTitle() {
        return this.title;
    }

    /** @return the body text */
    public String getContent() {
        return this.content;
    }

    /** @return the publication date as stored */
    public String getPublishdate() {
        return this.publishdate;
    }

    /** @return the author */
    public String getAuther() {
        return this.auther;
    }

    /** @return the keywords */
    public String getKeywords() {
        return this.keywords;
    }

    // ---- write accessors ----

    /** @param title new headline */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @param content new body text */
    public void setContent(String content) {
        this.content = content;
    }

    /** @param publishdate new publication date */
    public void setPublishdate(String publishdate) {
        this.publishdate = publishdate;
    }

    /** @param auther new author */
    public void setAuther(String auther) {
        this.auther = auther;
    }

    /** @param keywords new keywords */
    public void setKeywords(String keywords) {
        this.keywords = keywords;
    }

    /**
     * Renders all five properties in the form
     * {@code News [title=..., content=..., publishdate=..., auther=..., keywords=...]}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("News [");
        text.append("title=").append(title);
        text.append(", content=").append(content);
        text.append(", publishdate=").append(publishdate);
        text.append(", auther=").append(auther);
        text.append(", keywords=").append(keywords);
        text.append("]");
        return text.toString();
    }
}
// Test driver: the Spider crawler (a separate source file from News)
import java.io.IOException;
import java.util.ArrayList;import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
 * Minimal crawler built on jsoup: downloads a front page, collects the links
 * found inside its first {@code news_top} element, and parses every linked
 * article page into a {@link News} object.
 */
public class Spider {

    /**
     * Downloads and parses the HTML document at {@code url}.
     *
     * @param url absolute http/https URL of the page to fetch
     * @return the parsed document, or {@code null} when the URL is rejected or
     *         the download fails; callers must check for {@code null}
     */
    public Document loadDocumentData(String url) {
        try {
            return Jsoup.connect(url).get();
        } catch (IOException | IllegalArgumentException e) {
            // IOException covers network/HTTP failures; jsoup reports malformed
            // URLs as IllegalArgumentException. Either way keep the historical
            // "null on failure" contract, but say which URL failed so that one
            // bad link does not abort the whole crawl silently.
            System.err.println("Failed to load " + url + ": " + e);
            return null;
        }
    }

    /**
     * Extracts the link targets from the list page.
     *
     * <p>Only the first element with class {@code news_top} is inspected. Each
     * link is resolved to an absolute URL when the document has a base URI; if
     * it cannot be resolved, the raw {@code href} value is used instead. Links
     * without any {@code href} are skipped.
     *
     * @param doc the list page; may be {@code null} (e.g. after a failed load)
     * @return the collected link targets, never {@code null}; empty when
     *         {@code doc} is {@code null} or has no {@code news_top} element
     */
    public List<String> parseDoc(Document doc) {
        List<String> hrefs = new ArrayList<>();
        if (doc == null) {
            return hrefs;
        }
        Elements containers = doc.getElementsByClass("news_top");
        if (containers.isEmpty()) {
            // Page layout changed or wrong page: nothing to crawl.
            return hrefs;
        }
        Elements links = containers.get(0).getElementsByTag("a");
        for (int i = 0; i < links.size(); i++) {
            // Relative and protocol-relative links cannot be fetched as-is,
            // so prefer the absolute form.
            String href = links.get(i).absUrl("href");
            if (href.isEmpty()) {
                href = links.get(i).attr("href");
            }
            if (!href.isEmpty()) {
                hrefs.add(href);
            }
        }
        return hrefs;
    }

    /**
     * Parses an article detail page into a {@link News}.
     *
     * <p>Keywords and author are read from labelled text ("label: value").
     * When the label separator is missing, the field is set to the empty
     * string instead of failing.
     *
     * @param doc the article page; must not be {@code null}
     * @return a new {@link News} populated from the page
     */
    public News parseDatail(Document doc) {
        String title = doc.getElementsByClass("main-title").text();
        // Text of the .date elements that are direct children of .date-source.
        String publishdate = doc.select(".date-source >.date").text();
        // Look the remaining fields up by CSS class.
        String article = doc.getElementsByClass("article").text();
        String keywords = secondPart(doc.getElementsByClass("keywords").text(), ": ");
        String auther = secondPart(doc.getElementsByClass("show_author").text(), ":");

        News news = new News();
        news.setTitle(title);
        news.setContent(article);
        news.setPublishdate(publishdate);
        news.setAuther(auther);
        news.setKeywords(keywords);
        return news;
    }

    /**
     * Returns the second piece of {@code text} split on {@code separator}, or
     * the empty string when the separator does not occur (or nothing follows it).
     */
    private static String secondPart(String text, String separator) {
        String[] parts = text.split(separator);
        return parts.length > 1 ? parts[1] : "";
    }

    /**
     * Crawls the Sina front page and prints every article that could be
     * loaded and parsed.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Spider spider = new Spider();
        Document frontPage = spider.loadDocumentData("http://www.sina.com.cn/");
        // parseDoc tolerates a null front page and yields an empty list.
        for (String url : spider.parseDoc(frontPage)) {
            Document detail = spider.loadDocumentData(url);
            if (detail == null) {
                // The failure was already reported; move on to the next link.
                continue;
            }
            System.out.println(spider.parseDatail(detail));
        }
    }
}