使用 jsoup 爬取了某个古诗文网站中的《本草纲目》信息,并将结果以 JSON 格式保存到文本文件中。
需要的jar包maven配置:
<!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.32</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
具体执行的代码如下:
/**
 * Program entry point: creates a {@code TestJsoup2} instance and runs its {@code test()} method.
 *
 * @param args command-line arguments (not read)
 * @throws IOException propagated from {@code test()}
 */
public static void main(String[] args) throws IOException {
    new TestJsoup2().test();
}
/**
 * Loads the hard-coded index page, converts every {@code div.bookcont} element into a JSON
 * object via {@code analysisBookCont}, and hands the serialized JSON array to
 * {@code FileUtils.writeLine} together with the hard-coded output path.
 *
 * @throws IOException if fetching the index page or any page visited by
 *                     {@code analysisBookCont} fails
 */
public void test() throws IOException {
    String url = "https://so.gushiwen.org/guwen/book_46653FD803893E4FA8121459333EEB43.aspx";
    String dpath = "D:\\study\\jsoup\\bcgmdetailjson.js";

    JSONArray arr = new JSONArray();
    // One JSON object per "div.bookcont" section of the index page.
    for (Element bookCont : JsoupUtils.getRoot(url).select("div.bookcont")) {
        arr.add(this.analysisBookCont(bookCont));
    }

    FileUtils.writeLine(dpath, arr.toJSONString());
}
/**
 * Builds the JSON object for one section element.
 *
 * <p>The result has two keys: {@code "moduleName"}, the trimmed text of the first
 * {@code strong} inside the first {@code div.bookMl}, and {@code "value"}, an array with
 * one entry per {@code a[href]} link in the section. Each entry is the object returned by
 * {@code doReptile} for the link's {@code href}, with the link text added under {@code "name"}.
 *
 * @param bookCont the section element to analyse
 * @return the JSON object describing the section
 * @throws IOException if {@code doReptile} fails for any link
 */
private JSONObject analysisBookCont(Element bookCont) throws IOException {
    // get(0) is kept on purpose: a section without div.bookMl / strong still fails fast.
    String moduleName = bookCont.select("div.bookMl").get(0)
            .select("strong").get(0)
            .text().trim();

    JSONArray moduleArr = new JSONArray();
    Elements links = bookCont.select("a[href]");
    for (int i = 0; i < links.size(); i++) {
        Element link = links.get(i);
        JSONObject entry = this.doReptile(link.attr("href"));
        entry.put("name", link.text());
        moduleArr.add(entry);
    }

    JSONObject moduleJson = new JSONObject();
    moduleJson.put("moduleName", moduleName);
    moduleJson.put("value", moduleArr);
    return moduleJson;
}
/**
 * Fetches one detail page and extracts its labelled paragraphs into a JSON object.
 *
 * <p>Every {@code p} inside the first {@code div.contson} is inspected. A paragraph is
 * recognised by its label, which is either the trimmed text of its first {@code strong}
 * child (exact match) or, when it has no {@code strong}, a double-quoted label at the start
 * of the paragraph text. Recognised labels map to the keys {@code "sm"} (释名),
 * {@code "qw"} (气味) and {@code "zz"} (主治); the value is {@code getText(p)}.
 *
 * <p>If the page has no {@code div.contson} element, an empty object is returned instead of
 * failing with {@code IndexOutOfBoundsException}, so one unexpected page does not abort the
 * whole crawl.
 *
 * @param url address of the detail page
 * @return the extracted fields; empty when nothing was recognised
 * @throws IOException if the page cannot be fetched
 */
private JSONObject doReptile(String url) throws IOException {
    // Label -> JSON key, in the same priority order as the original if/else chain.
    final String[][] sections = {
        {"释名", "sm"},
        {"气味", "qw"},
        {"主治", "zz"},
    };

    JSONObject json = new JSONObject();
    Document document = JsoupUtils.getRoot(url);
    Elements contson = document.select("div.contson");
    if (contson.size() == 0) {
        // No content container on this page: nothing to extract.
        return json;
    }

    for (Element p : contson.get(0).select("p")) {
        Elements strongs = p.select("strong");
        boolean hasStrong = strongs.size() > 0;
        // With a <strong> the label must equal its text; otherwise the paragraph text
        // must start with the label wrapped in double quotes.
        String probe = hasStrong ? strongs.get(0).text().trim() : p.text().trim();

        for (String[] section : sections) {
            String label = section[0];
            boolean matches = hasStrong
                    ? label.equals(probe)
                    : probe.startsWith("\"" + label + "\"");
            if (matches) {
                json.put(section[1], this.getText(p));
                break;
            }
        }
    }
    return json;
}
/**
 * Extracts the body text of a labelled paragraph (one possible way of getting the text).
 *
 * <p>The text after the first literal {@code "<br>"} in {@code p.text()} is returned, trimmed.
 * When the marker is absent, {@code indexOf} yields -1 and the text is cut after its first
 * {@code "<br>".length() - 1} = 3 characters.
 *
 * <p>NOTE(review): per the jsoup documentation {@code Element.text()} returns text with the
 * markup removed, so the marker is presumably never found and the 3-character cut is what
 * actually strips the label. Confirm against the real pages before changing it.
 *
 * <p>Paragraphs too short for the cut now return an empty string instead of throwing
 * {@code StringIndexOutOfBoundsException}.
 *
 * @param p the paragraph element
 * @return the trimmed text after the cut position, or {@code ""} if there is none
 */
private String getText(Element p) {
    String text = p.text();
    String splitText = "<br>";
    int start = text.indexOf(splitText) + splitText.length();
    if (start >= text.length()) {
        // Nothing after the cut position (e.g. a paragraph that holds only the label).
        return "";
    }
    return text.substring(start).trim();
}
另外还需要以下三个工具方法:
/**
 * Fetches the page at {@code url} with {@code Jsoup.connect(url).get()} and returns the
 * resulting document.
 *
 * @param url address of the page to fetch
 * @return the document returned by jsoup
 * @throws IOException if the request fails
 */
public static Document getRoot(String url) throws IOException {
    Document root = Jsoup.connect(url).get();
    return root;
}
/**
 * Appends {@code text} followed by a line separator to the file at {@code path}.
 *
 * <p>The file is opened in append mode and written as UTF-8, matching the encoding that
 * {@code readFile} uses when reading. The writer is closed on every path via
 * try-with-resources.
 *
 * <p>Failures to open the file are reported with {@code printStackTrace()} and the method
 * returns normally (best effort); previously such a failure ended in a
 * {@code NullPointerException} because the writer was still {@code null}.
 *
 * @param path path of the file to append to; created if it does not exist
 * @param text the line to append
 */
public static void writeLine(String path, String text) {
    File f = new File(path);
    // Fully qualified names are used so the method does not depend on extra imports.
    try (PrintWriter pw = new PrintWriter(
            new java.io.OutputStreamWriter(
                    new java.io.FileOutputStream(f, true),
                    java.nio.charset.StandardCharsets.UTF_8))) {
        pw.println(text);
        pw.flush();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Reads a UTF-8 text file line by line.
 *
 * <p>The reader is closed on every path via try-with-resources. I/O failures (including a
 * missing file) are reported with {@code printStackTrace()}; the lines read before the
 * failure are still returned.
 *
 * @param path path of the file to read
 * @return the lines of the file in order; empty if the file could not be opened
 */
public static List<String> readFile(String path) {
    File file = new File(path);
    List<String> list = new ArrayList<>();
    try (BufferedReader br = new BufferedReader(
            new InputStreamReader(new FileInputStream(file), "UTF-8"))) {
        String s;
        // readLine() returns null at end of file.
        while ((s = br.readLine()) != null) {
            list.add(s);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return list;
}
以上。