https://www.jianshu.com/p/fd5caaaa950d
1、从URL获取HTML来解析
Document doc = Jsoup.connect("http://www.baidu.com/").get();
2、从磁盘加载HTML文件，用Jsoup解析后提取正文并写入txt文件
import java.io.*;
import java.nio.charset.StandardCharsets;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Converts a folder of numbered chapter pages ({@code 1.html}, {@code 2.html}, ...) into one
 * plain-text file using Jsoup.
 *
 * <p>For every chapter the tool writes four blank lines, a heading of the form {@code 第N章},
 * and then one output line per {@code <p>} element found inside the elements whose
 * {@code data-id} attribute equals {@code 0} through {@code 14}.
 *
 * <p>Usage: {@code java TEST [sourceDir [targetFile [chapterCount]]]}. Every argument is
 * optional; omitted arguments fall back to the built-in defaults.
 */
public class TEST {

    /** Default folder that holds the chapter pages. */
    private static final String DEFAULT_SOURCE_DIR =
            "C:\\Users\\user\\Desktop\\新建文件夹\\novel\\";

    /** Default path of the generated text file. */
    private static final String DEFAULT_TARGET_FILE =
            "C:\\Users\\user\\Desktop\\新建文件夹\\novel\\xx.txt";

    /** Default number of chapter pages to convert (pages 1..62). */
    private static final int DEFAULT_CHAPTER_COUNT = 62;

    /** Number of {@code data-id} values (0..14) that are scanned for chapter text. */
    private static final int CONTENT_BLOCK_COUNT = 15;

    /** Number of blank lines written in front of every chapter heading. */
    private static final int BLANK_LINES_BEFORE_HEADING = 4;

    /**
     * Entry point: converts all chapter pages into the target text file.
     *
     * <p>I/O failures (including a missing chapter page) abort the conversion and are
     * reported on standard error; the output file is still closed, so everything written
     * up to that point is kept.
     *
     * @param args optional {@code sourceDir}, {@code targetFile} and {@code chapterCount},
     *             in that order; may be empty or {@code null}
     */
    public static void main(String[] args) {
        String sourceDir = argOrDefault(args, 0, DEFAULT_SOURCE_DIR);
        String targetFile = argOrDefault(args, 1, DEFAULT_TARGET_FILE);

        int chapterCount = DEFAULT_CHAPTER_COUNT;
        String chapterCountArg = argOrDefault(args, 2, null);
        if (chapterCountArg != null) {
            try {
                chapterCount = Integer.parseInt(chapterCountArg);
            } catch (NumberFormatException e) {
                System.err.println("chapterCount must be an integer, got: " + chapterCountArg);
                return;
            }
        }

        // try-with-resources closes (and therefore flushes) the writer on every exit path.
        // The output encoding is pinned to UTF-8 so it does not depend on the JVM default.
        try (BufferedWriter out = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(targetFile), StandardCharsets.UTF_8))) {
            for (int chapter = 1; chapter <= chapterCount; chapter++) {
                writeChapter(out, new File(sourceDir, chapter + ".html"), chapter);
            }
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so one handler covers both cases.
            System.err.println("Conversion aborted: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /** Returns {@code args[index]} when present, otherwise {@code fallback}. */
    private static String argOrDefault(String[] args, int index, String fallback) {
        if (args != null && args.length > index) {
            return args[index];
        }
        return fallback;
    }

    /** Parses one chapter page and appends its heading and paragraphs to {@code out}. */
    private static void writeChapter(BufferedWriter out, File page, int chapterNumber)
            throws IOException {
        // Parse first, so that a missing or unreadable page does not leave an orphan heading.
        // Passing null as the charset lets Jsoup take the encoding from the page's <meta> tag
        // and fall back to UTF-8; parsing the file directly also keeps the original line
        // breaks as whitespace instead of gluing adjacent lines together.
        Document document = Jsoup.parse(page, null);

        for (int k = 0; k < BLANK_LINES_BEFORE_HEADING; k++) {
            out.newLine();
        }
        out.write("第" + chapterNumber + "章");
        out.newLine();

        // Blocks are emitted in ascending data-id order, not in document order.
        for (int dataId = 0; dataId < CONTENT_BLOCK_COUNT; dataId++) {
            Elements blocks =
                    document.getElementsByAttributeValue("data-id", String.valueOf(dataId));
            for (Element block : blocks) {
                // Every <p> becomes one line of the novel, prefixed with a single space.
                for (Element paragraph : block.getElementsByTag("p")) {
                    String text = paragraph.text();
                    out.write(" ");
                    out.write(text);
                    out.newLine();
                    // Echo to the console so progress is visible while converting.
                    System.out.println(text);
                }
            }
        }
    }
}