/**
 * Crawls one novel: loads the table-of-contents page, then fetches every chapter page it
 * links to, printing the book title, chapter names, chapter URLs and chapter text.
 *
 * <p>A failure while fetching a single chapter is reported and that chapter is skipped, so
 * the remaining chapters are still processed. A failure while loading the table of contents
 * ends the crawl. The elapsed wall-clock time is printed at the end in either case.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    System.out.println("小爬虫开始了。。。。。。。。。。。");
    // Primitive long avoids needless boxing of the timestamps.
    long startTime = System.currentTimeMillis();
    try {
        // Table-of-contents page of the novel.
        // Alternative source tried previously: http://book.qidian.com/info/1006693964#Catalog
        Document catalog = Jsoup.connect("https://www.xsjtxt.com/book/84448/").get();
        System.out.println("document:" + catalog);

        String fictionName = catalog.select("h1").text();
        System.out.println("小说书名:" + fictionName);

        // Chapter links: <a> elements under div.pc_list > ul > li.
        Elements results = catalog.select("div.pc_list>ul>li>a");
        System.out.println("results:" + results);

        for (Element link : results) {
            String fictionChapter = link.text();
            System.err.println("小说章节名:" + fictionChapter);
            String fictionUrl = link.attr("abs:href");
            System.err.println("章节路径的绝对路径:" + fictionUrl);

            // jsoup yields an empty string when the href cannot be resolved to an absolute
            // URL; connecting to it would only throw, so skip the link instead.
            if (fictionUrl.isEmpty()) {
                System.err.println("Skipping chapter without a resolvable URL: " + fictionChapter);
                continue;
            }

            try {
                // Separate variable so the catalogue document is never overwritten.
                Document chapterPage = Jsoup.connect(fictionUrl).get();
                String text = chapterPage.select("#content1").text();
                System.out.println("章节的内容:" + text);

                // One record per chapter, so stored records do not alias each other once
                // persistence is enabled.
                Reptile reptile = new Reptile();
                reptile.setFictionName(fictionName);
                reptile.setFictionUrl(fictionUrl);
                reptile.setFictionChapter(fictionChapter);
                reptile.setText(text);
                // TODO: persist the record, e.g. reptileService.SaveAttribute(reptile);
            } catch (Exception chapterFailure) {
                // Keep crawling: one broken chapter must not abort the whole book.
                System.err.println("Failed to fetch chapter: " + fictionUrl);
                chapterFailure.printStackTrace();
            }
        }
    } catch (Exception e) {
        // Reached when the table-of-contents page itself cannot be loaded or parsed.
        e.printStackTrace();
    }
    long elapsedMillis = System.currentTimeMillis() - startTime;
    System.out.println("小爬虫结束了,用时" + elapsedMillis + "ms");
}
<!-- Dependencies needed for crawling data -->
<!-- NOTE(review): the crawler method in this snippet only calls jsoup; fastjson and dom4j are not referenced by the visible code. Confirm they are used elsewhere before keeping them. -->
<!-- NOTE(review): all three versions look like old releases. TODO: check them against current security advisories and upgrade if needed. -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.58</version>
</dependency>
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
<version>1.6</version>
</dependency>
<!-- jsoup: used by the crawler for Jsoup.connect(...).get() and CSS-selector queries -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.9.2</version>
</dependency>