Jsoup多线程爬取小说
package com.product.downSX;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Command-line crawler: posts a search keyword to www.xbiquge.la, and for every search hit
 * downloads all chapters with a pool of worker threads and writes them, ordered by chapter
 * number, to {@code D:\小说\<author>\<title>.txt}.
 *
 * <p>Chapter download itself is delegated to {@code NovelThread}; each worker receives a slice
 * of the chapter list and appends its results to a shared, synchronized list.
 */
public class GetNovel {

    /** Number of worker threads, and of chapter-list slices, used per novel. */
    private static final int THREAD_COUNT = 50;

    /** Scheme and host that chapter links from the index page are appended to. */
    private static final String SITE_ROOT = "https://www.xbiquge.la";

    /** Root directory the novels are written under. */
    private static final String OUTPUT_ROOT = "D:\\小说\\";

    /**
     * Entry point: searches for a fixed keyword and downloads every novel in the result.
     *
     * <p>A failure while processing one novel is reported and the loop moves on to the next
     * one, so a single broken index page does not abort the whole run.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String searchKey = "斗破苍穹";
        List<Map<String, String>> novels;
        try {
            novels = selectByAuthor(searchKey);
        } catch (Exception e) {
            e.printStackTrace();
            return;
        }
        for (Map<String, String> novel : novels) {
            try {
                downloadNovel(novel);
            } catch (InterruptedException e) {
                // Restore the flag and stop: somebody asked this thread to terminate.
                Thread.currentThread().interrupt();
                e.printStackTrace();
                return;
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Downloads one search hit and writes it to disk, unless the target file already exists.
     *
     * @param novel search-result entry with the keys {@code "author"}, {@code "title"} and
     *              {@code "url"} as produced by {@link #selectByAuthor(String)}
     * @throws IOException          if the entry has no URL, the index page cannot be fetched
     *                              or parsed, or the output file cannot be written
     * @throws InterruptedException if the calling thread is interrupted while waiting for the
     *                              workers; the pool is cancelled before rethrowing
     */
    private static void downloadNovel(Map<String, String> novel) throws IOException, InterruptedException {
        String author = novel.get("author");
        String title = novel.get("title");
        String url = novel.get("url");

        File file = FileUtils.getFile(OUTPUT_ROOT + author + "\\" + title + ".txt");
        if (file.isFile()) {
            System.out.println("小说已存在");
            return;
        }
        if (url == null) {
            throw new IOException("No index URL found for novel: " + title);
        }

        Map<String, Object> allMulu = getAllMulu(url);
        @SuppressWarnings("unchecked") // getAllMulu always stores a List<String> under "mulus"
        List<String> mulus = (List<String>) allMulu.get("mulus");

        // Workers append concurrently, so the shared list must be thread-safe.
        List<Map<String, Object>> resultList =
                Collections.synchronizedList(new ArrayList<Map<String, Object>>());

        ExecutorService threadPool = Executors.newFixedThreadPool(THREAD_COUNT);
        try {
            int total = mulus.size();
            for (int i = 0; i < THREAD_COUNT; i++) {
                List<String> submulus =
                        mulus.subList(i * total / THREAD_COUNT, (i + 1) * total / THREAD_COUNT);
                if (!submulus.isEmpty()) {
                    threadPool.execute(new NovelThread(resultList, submulus));
                }
            }
        } finally {
            threadPool.shutdown();
        }

        // Block until all workers are done instead of spinning on isTerminated().
        try {
            threadPool.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
        } catch (InterruptedException e) {
            threadPool.shutdownNow();
            throw e;
        }
        System.out.println("=============线程全部执行完了=============");

        List<Map<String, Object>> chapters = new ArrayList<>(resultList);
        chapters.sort(Comparator.comparingInt(GetNovel::sortKey));
        System.out.println(chapters.size());
        wrtieTxt(author, title, chapters);
    }

    /**
     * Returns the numeric ordering key a worker stored under {@code "sort"}.
     *
     * @param chapter chapter entry produced by a worker
     * @return the parsed chapter number
     * @throws NumberFormatException if the stored key is not a decimal integer
     */
    private static int sortKey(Map<String, Object> chapter) {
        return Integer.parseInt((String) chapter.get("sort"));
    }

    /**
     * Appends all chapters to {@code D:\小说\<author>\<title>.txt}.
     *
     * <p>Each chapter contributes a line break, its heading ({@code "mulu"}), and then every
     * paragraph of {@code "text"} preceded by a line break. The whole novel is assembled in
     * memory and written with a single call, encoded as UTF-8 so the result does not depend on
     * the platform default charset. Nothing is written when the list is empty.
     *
     * @param author    author name, used as the directory name
     * @param title     novel title, used as the file name
     * @param resulList chapters in output order
     * @throws IOException if the file cannot be written
     */
    private static void wrtieTxt(String author, String title, List<Map<String, Object>> resulList) throws IOException {
        if (resulList.isEmpty()) {
            return;
        }
        File file = new File(OUTPUT_ROOT + author + "\\" + title + ".txt");
        StringBuilder builder = new StringBuilder();
        for (Map<String, Object> chapter : resulList) {
            builder.append("\r\n").append((String) chapter.get("mulu"));
            @SuppressWarnings("unchecked") // workers always store a List<String> under "text"
            List<String> texts = (List<String>) chapter.get("text");
            for (String text : texts) {
                builder.append("\r\n").append(text);
            }
        }
        FileUtils.writeStringToFile(file, builder.toString(), "UTF-8", true);
    }

    /**
     * Fetches a novel's index page and collects its chapter links.
     *
     * @param contenturl URL of the index page
     * @return a map with {@code "title"} (content of the {@code og:title} meta tag) and
     *         {@code "mulus"}, a {@code List<String>} whose entries have the form
     *         {@code <absolute chapter URL>&<link text>}
     * @throws IOException if the page cannot be fetched or contains no element with id
     *                     {@code list}
     */
    private static Map<String, Object> getAllMulu(String contenturl) throws IOException {
        Document doc = Jsoup.connect(contenturl)
                .userAgent("Mozilla")
                .cookie("auth", "token")
                .timeout(3000)
                .get();

        Map<String, Object> map = new HashMap<>();
        map.put("title", doc.select("meta[property=og:title]").attr("content"));

        Element list = doc.getElementById("list");
        if (list == null) {
            throw new IOException("Chapter list element #list not found at " + contenturl);
        }

        List<String> zjs = new ArrayList<>();
        for (Element href : list.select("a[href]")) {
            zjs.add(SITE_ROOT + href.attr("href") + "&" + href.text());
        }
        map.put("mulus", zjs);
        return map;
    }

    /**
     * Posts a search keyword to the site's search page and parses the result table.
     *
     * <p>For every table row with at least two cells of class {@code even}, the text of the
     * first such cell is stored as {@code "title"}, the second as {@code "author"}, and the
     * {@code href} of the last link found inside those cells as {@code "url"}. Rows with fewer
     * cells are skipped.
     *
     * @param author keyword sent as the {@code searchkey} form field
     * @return one map per matching row, possibly empty
     * @throws IOException if the search request fails
     */
    private static List<Map<String, String>> selectByAuthor(String author) throws IOException {
        Map<String, String> params = new HashMap<>();
        params.put("searchkey", author);
        Document doc = Jsoup.connect(SITE_ROOT + "/modules/article/waps.php")
                .data(params)
                .userAgent("Mozilla")
                .cookie("auth", "token")
                .timeout(3000)
                .post();

        List<Map<String, String>> novels = new ArrayList<>();
        for (Element tr : doc.body().getElementsByTag("tr")) {
            System.out.println("=============");
            Elements evens = tr.getElementsByClass("even");
            if (evens.size() < 2) {
                // Header rows and malformed rows do not carry both a title and an author cell.
                continue;
            }
            Map<String, String> map = new HashMap<>();
            map.put("title", evens.get(0).text());
            map.put("author", evens.get(1).text());
            for (Element even : evens) {
                System.out.println(even);
                for (Element href : even.select("a[href]")) {
                    map.put("url", href.attr("href"));
                }
            }
            novels.add(map);
        }
        return novels;
    }
}
线程类
package com.product.downSX;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.TextNode;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Worker that downloads a slice of chapters and appends them to a list shared with other
 * workers.
 *
 * <p>Each input entry has the form {@code <chapter URL>&<chapter title>}. For every chapter
 * that is fetched successfully a map with the keys {@code "sort"} (last path segment of the
 * URL without {@code .html}), {@code "mulu"} (chapter title) and {@code "text"} (list of
 * paragraphs) is added to the shared result list. Additions are made while holding the
 * result list's monitor, so several workers may share one list.
 */
public class NovelThread implements Runnable {

    /** How many times a single chapter is requested before it is given up. */
    private static final int MAX_ATTEMPTS = 3;

    private final List<Map<String,Object>> resultList;
    private final List<String> urls;

    /**
     * @param resultList shared list that receives one map per downloaded chapter
     * @param urls       entries of the form {@code <chapter URL>&<chapter title>}
     */
    public NovelThread(List<Map<String,Object>> resultList, List<String> urls) {
        this.resultList = resultList;
        this.urls = urls;
    }

    /**
     * Downloads every chapter of this worker's slice, retrying a failed chapter up to
     * {@link #MAX_ATTEMPTS} times with a one-second pause after each failure. Stops early if
     * the thread is interrupted.
     */
    @Override
    public void run() {
        int processed = 0;
        int fail = 0;
        for (String entry : urls) {
            processed++;
            System.out.println("还剩:" + (urls.size() - processed));

            // Split on the FIRST '&' only, so titles that contain '&' stay intact.
            int sep = entry.indexOf('&');
            String chapterUrl = sep < 0 ? entry : entry.substring(0, sep);
            String chapterTitle = sep < 0 ? entry : entry.substring(sep + 1);

            List<String> text = null;
            for (int attempt = 1; attempt <= MAX_ATTEMPTS && text == null; attempt++) {
                try {
                    text = getText(chapterUrl);
                } catch (IOException e) {
                    fail++;
                    System.out.println(Thread.currentThread().getName() + "线程失败:" + fail + "次");
                    e.printStackTrace();
                    try {
                        Thread.sleep(1000);
                    } catch (InterruptedException interrupted) {
                        // Preserve the interrupt for the executor and stop working.
                        Thread.currentThread().interrupt();
                        return;
                    }
                }
            }
            if (text == null) {
                // All attempts failed; skip this chapter and carry on with the rest.
                continue;
            }

            String[] segments = chapterUrl.split("/");
            String sorts = segments[segments.length - 1].replace(".html", "");

            Map<String, Object> map = new HashMap<>();
            map.put("sort", sorts);
            map.put("mulu", chapterTitle);
            map.put("text", text);
            synchronized (resultList) {
                resultList.add(map);
            }
        }
    }

    /**
     * Fetches a chapter page and returns the direct text nodes of its {@code #content}
     * element.
     *
     * @param url chapter page URL
     * @return the chapter's text nodes in document order
     * @throws IOException if the page cannot be fetched or has no element with id
     *                     {@code content}
     */
    private static List<String> getText(String url) throws IOException {
        Document doc = Jsoup.connect(url)
                .userAgent("Mozilla")
                .cookie("auth", "token")
                .timeout(3000)
                .get();
        Element content = doc.getElementById("content");
        if (content == null) {
            throw new IOException("Chapter body element #content not found at " + url);
        }
        List<String> texts = new ArrayList<>();
        for (TextNode textNode : content.textNodes()) {
            texts.add(textNode.text());
        }
        return texts;
    }
}