1、首先需要引入 jsoup-1.7.1.jar 包。
2、jsoup 的工具类:
package com.wp.util;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Helper methods for downloading and parsing HTML with jsoup.
 */
public class JsoupUtil {
    /**
     * Downloads and parses the page at {@code url}, retrying after failures.
     *
     * <p>Attempts continue until one succeeds or the number of failed attempts
     * exceeds {@code Constants.url_error_count}. Each attempt uses
     * {@code Constants.url_ConnectTimeout} as its timeout, follows redirects and
     * sends browser-like request headers. Every failure is reported on
     * {@code System.err} so the reason for a {@code null} result can be diagnosed.
     *
     * @param url address of the page to fetch
     * @return the parsed document, or {@code null} if no attempt succeeded
     */
    public static Document getDocument(String url) {
        int errorCount = 0;
        while (errorCount <= Constants.url_error_count) {
            try {
                Document doc = Jsoup
                        .connect(url)
                        .timeout(Constants.url_ConnectTimeout)
                        .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
                        .header("Accept-Encoding", "gzip,deflate,sdch")
                        .header("Connection", "keep-alive")
                        .followRedirects(true)
                        .userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)")
                        .get();
                if (doc != null) {
                    return doc;
                }
                // Count a null result as a failed attempt so the loop always terminates.
                errorCount++;
            } catch (IllegalArgumentException e) {
                // Bad input (for example an unusable URL) is not transient:
                // repeating the same call cannot succeed, so give up at once.
                System.err.println("Not retrying " + url + ": " + e);
                return null;
            } catch (Exception e) {
                errorCount++;
                System.err.println("Fetch attempt " + errorCount + " failed for " + url + ": " + e);
            }
        }
        return null;
    }

    /**
     * Parses an HTML string into a document.
     *
     * @param html the HTML markup to parse
     * @return the document produced by {@code Jsoup.parse(html)}
     */
    public static Document parseHtml(String html) {
        return Jsoup.parse(html);
    }
}
3、主要采集的类:
package com.wp.test;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.wp.util.JsoupUtil;
import com.wp.util.Util;
/**
 * Scrapes a novel's chapter list and writes every chapter title and body
 * into a single text file.
 */
public class Caiji {
    /** Page that lists the links to all chapters. */
    private static final String CHAPTER_LIST_URL = "http://www.00kxs.com/html/0/596/";

    /** File that receives the collected chapters. */
    private static final String OUTPUT_PATH = "E:/abc.txt";

    /**
     * Program entry point; runs the scrape.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        start();
    }

    /**
     * Fetches the chapter list, then downloads each chapter in list order and
     * appends its title followed by its plain-text body to the output file.
     *
     * <p>A chapter whose page cannot be fetched is written as a title only.
     * Any exception raised while scraping or writing is printed and ends the run;
     * the writer is always closed afterwards.
     */
    public static void start() {
        BufferedWriter w = null;
        Document doc = JsoupUtil.getDocument(CHAPTER_LIST_URL); // chapter list page
        try {
            File file = new File(OUTPUT_PATH);
            // NOTE(review): FileWriter encodes with the platform default charset.
            w = new BufferedWriter(new FileWriter(file));
            if (doc != null) {
                // Every chapter link inside the list container.
                Elements chapterLinks = doc.select("div[id=list] dl dd a");
                for (Element link : chapterLinks) {
                    String url = link.attr("abs:href"); // absolute chapter URL
                    String name = link.text(); // chapter title
                    // Derived from the title via the project's Util helpers;
                    // only used in the progress line below.
                    int chpNum = Util.parseInt(Util.getMatch("第(\\d+)章", name, 1));
                    System.out.println(url + "=====" + name + "====" + chpNum);

                    Document chapterDoc = JsoupUtil.getDocument(url); // chapter page
                    w.append(name).append("\n\n");
                    if (chapterDoc != null) {
                        String bodyHtml = chapterDoc.select("div[id=content]").html();
                        w.append(toPlainText(bodyHtml)).append("\n\n");
                    }
                }
            } else {
                System.out.println("没有获取正文");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (w != null) {
                    w.close();
                }
            } catch (Exception e) {
                // A failed close can mean buffered text never reached the file,
                // so report it instead of hiding it.
                e.printStackTrace();
            }
        }
    }

    /**
     * Converts the inner HTML of a chapter body into plain text.
     *
     * <p>Non-breaking spaces (both the {@code &nbsp;} entity and the raw
     * U+00A0 character) become ordinary spaces, every spelling of the line-break
     * tag ({@code <br>}, {@code <br/>}, {@code <br />}) becomes a newline, and
     * runs of newlines are collapsed into one.
     *
     * @param html inner HTML of the chapter content element
     * @return the text with entities and break tags normalized
     */
    private static String toPlainText(String html) {
        String text = html.replace("&nbsp;", " ").replace("\u00A0", " ");
        text = text.replaceAll("<br\\s*/?>", "\n");
        return text.replaceAll("\n+", "\n");
    }
}