目标:
爬取笔下文学小说网的《神墓》,并保存为本地 txt 文件
材料:
jsoup
以下:
项目结构:
pom:
<!-- jsoup: used by the crawler below to fetch each chapter page and query it with CSS selectors -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.9.2</version>
</dependency>
网页元素:
实现:
package com.lxl.txt.controller;
import com.lxl.txt.bean.RuYi;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Command-line crawler that follows the "next chapter" links of a novel,
 * starting from {@link #FIRST_CHAPTER_URL}, and writes every chapter's
 * title and text into a single local text file.
 */
public class MySelf {

    /** Site root; the site-relative "next chapter" href is appended to it. */
    private static final String BASE_URL = "https://www.bxwxorg.com";

    /** Address of the first chapter, where the crawl starts. */
    private static final String FIRST_CHAPTER_URL = "https://www.bxwxorg.com/read/36/677091.html";

    /** Href carried by the "next" link of the final chapter; seeing it ends the crawl. */
    private static final String BOOK_INDEX_PATH = "/read/36/";

    /** File that receives the collected chapters. */
    private static final String OUTPUT_PATH = "E:\\output.txt";

    /** Connect/read timeout for one page fetch, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 5000;

    /**
     * Downloads one chapter page and extracts its title, text and next-chapter link.
     *
     * <p>Elements are looked up through selectors so that a page lacking
     * {@code #content} or {@code #A3} yields empty strings instead of a
     * {@code NullPointerException}.
     *
     * @param url absolute address of the chapter page
     * @return the chapter data; {@code getNextUrl()} is {@code null} when the page
     *         carries no next-chapter href
     * @throws IOException if the page cannot be fetched
     */
    private static RuYi getDetail(String url) throws IOException {
        Document dc = Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get();

        // Title: the h1 directly inside div.bookname.
        String title = dc.select("div.bookname > h1").text();
        // Chapter text; empty when the page has no #content element.
        String content = dc.select("#content").text();
        // Site-relative href of the "next chapter" link; empty when #A3 is absent.
        String pagerNext = dc.select("#A3").attr("href");

        RuYi ruyi = new RuYi();
        ruyi.setTitle(title);
        ruyi.setContent(content);
        ruyi.setPager_next(pagerNext);
        // Without an href there is nowhere to go; BASE_URL alone is not a chapter page.
        ruyi.setNextUrl(pagerNext.isEmpty() ? null : BASE_URL + pagerNext);
        return ruyi;
    }

    /** Returns whether {@code ruyi} links to a further chapter rather than to the book index. */
    private static boolean hasNextChapter(RuYi ruyi) {
        return ruyi.getNextUrl() != null && !BOOK_INDEX_PATH.equals(ruyi.getPager_next());
    }

    /** Appends the chapter's title and text to {@code buffer}, each followed by CRLF. */
    private static void appendChapter(StringBuilder buffer, RuYi ruyi) {
        buffer.append(ruyi.getTitle()).append("\r\n");
        buffer.append(ruyi.getContent()).append("\r\n");
    }

    /**
     * Writes {@code buffer} followed by CRLF to {@link #OUTPUT_PATH}, creating the
     * file or truncating an existing one.
     *
     * @param buffer the text to write
     * @throws IOException if the file cannot be opened or written
     */
    private static void outPut(CharSequence buffer) throws IOException {
        // Explicit UTF-8 so the result does not depend on the platform default charset.
        try (BufferedWriter out =
                Files.newBufferedWriter(Paths.get(OUTPUT_PATH), StandardCharsets.UTF_8)) {
            out.append(buffer);
            out.write("\r\n");
        }
    }

    /**
     * Crawls from the first chapter until a chapter's next link is missing or points
     * at the book index, then writes everything collected to the output file.
     *
     * <p>If a page fetch fails, the crawl stops there and the chapters gathered so
     * far are still written.
     *
     * @param args unused
     * @throws IOException if the output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        StringBuilder buffer = new StringBuilder();
        String url = FIRST_CHAPTER_URL;
        try {
            while (url != null) {
                RuYi ruyi = getDetail(url);
                appendChapter(buffer, ruyi);
                url = hasNextChapter(ruyi) ? ruyi.getNextUrl() : null;
            }
        } catch (IOException e) {
            // Keep what has been downloaded; report where the crawl broke off.
            System.err.println("Crawl stopped at " + url + ": " + e);
        }
        outPut(buffer);
    }
}
实体类:
package com.lxl.txt.bean;
/**
 * Mutable holder for the data extracted from one chapter page:
 * title, text, and the link to the following chapter.
 */
public class RuYi {

    /** Chapter title. */
    private String title;

    /** Chapter text. */
    private String content;

    /** Address of the next chapter. */
    private String nextUrl;

    /** Raw next-chapter href as found on the page. */
    private String pager_next;

    /** @return the chapter title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the chapter title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the chapter text */
    public String getContent() {
        return this.content;
    }

    /** @param content the chapter text */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the address of the next chapter */
    public String getNextUrl() {
        return this.nextUrl;
    }

    /** @param nextUrl the address of the next chapter */
    public void setNextUrl(String nextUrl) {
        this.nextUrl = nextUrl;
    }

    /** @return the raw next-chapter href */
    public String getPager_next() {
        return this.pager_next;
    }

    /** @param pager_next the raw next-chapter href */
    public void setPager_next(String pager_next) {
        this.pager_next = pager_next;
    }
}
结果:
以上。