Jsoup多线程爬取小说

Jsoup多线程爬取小说

package com.product.downSX;

import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

/**
 * Command-line entry point that searches the novel site for a keyword, then
 * downloads every matching novel chapter by chapter using a pool of
 * {@link NovelThread} workers and writes each novel to a single text file.
 */
public class GetNovel {

    /** Directory prefix under which novels are stored as {@code <author>\<title>.txt}. */
    private static final String OUTPUT_ROOT = "D:\\小说\\";

    /** Site prefix prepended to the chapter hrefs found in a novel's chapter list. */
    private static final String SITE_ROOT = "https://www.xbiquge.la";

    /** Upper bound on the number of download threads used per novel. */
    private static final int MAX_THREADS = 50;

    /**
     * Searches for the hard-coded keyword and downloads every novel in the result list.
     * A failure while downloading one novel is reported and the next novel is attempted.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<Map<String, String>> novels;
        try {
            novels = selectByAuthor("斗破苍穹");
        } catch (Exception e) {
            e.printStackTrace();
            return;
        }

        for (Map<String, String> novel : novels) {
            try {
                downloadNovel(novel);
            } catch (InterruptedException e) {
                // Restore the flag and stop: somebody asked this thread to shut down.
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                // One broken novel must not prevent the remaining ones from being fetched.
                e.printStackTrace();
            }
        }
    }

    /**
     * Downloads all chapters of one search hit and writes them to disk, unless the
     * target file already exists.
     *
     * @param novel search hit with the keys {@code "author"}, {@code "title"} and {@code "url"}
     * @throws IOException          if the chapter list cannot be fetched or the file cannot be written
     * @throws InterruptedException if the calling thread is interrupted while waiting for the workers
     */
    private static void downloadNovel(Map<String, String> novel) throws IOException, InterruptedException {
        String author = novel.get("author");
        String title = novel.get("title");
        String url = novel.get("url");

        if (novelFile(author, title).isFile()) {
            System.out.println("小说已存在");
            return;
        }

        @SuppressWarnings("unchecked") // getAllMulu always stores a List<String> under "mulus"
        List<String> mulus = (List<String>) getAllMulu(url).get("mulus");
        if (mulus.isEmpty()) {
            // Nothing to download; a pool of size 0 would also be rejected by the executor.
            return;
        }

        // Workers append concurrently, so the shared list must be thread-safe.
        List<Map<String, Object>> resultList =
                Collections.synchronizedList(new ArrayList<Map<String, Object>>());

        int workers = Math.min(MAX_THREADS, mulus.size());
        ExecutorService threadPool = Executors.newFixedThreadPool(workers);
        try {
            for (int i = 0; i < workers; i++) {
                int from = i * mulus.size() / workers;
                int to = (i + 1) * mulus.size() / workers;
                threadPool.execute(new NovelThread(resultList, mulus.subList(from, to)));
            }
        } finally {
            threadPool.shutdown();
        }

        try {
            // Block until all workers are done instead of spinning on isTerminated().
            while (!threadPool.awaitTermination(1, TimeUnit.MINUTES)) {
                // keep waiting
            }
        } catch (InterruptedException e) {
            threadPool.shutdownNow();
            throw e;
        }
        System.out.println("=============线程全部执行完了=============");

        // Sort a private snapshot by the numeric chapter id taken from the chapter URL.
        List<Map<String, Object>> chapters = new ArrayList<>(resultList);
        chapters.sort(Comparator.comparingLong(
                (Map<String, Object> chapter) -> Long.parseLong((String) chapter.get("sort"))));
        System.out.println(chapters.size());

        writeTxt(author, title, chapters);
    }

    /** Builds the output file location for the given author and title. */
    private static File novelFile(String author, String title) {
        return new File(OUTPUT_ROOT + author + "\\" + title + ".txt");
    }

    /**
     * Appends all chapters, in the given order, to the novel's text file as UTF-8.
     * Each chapter heading and each text line is preceded by a CRLF.
     *
     * @param author    author name, used as the directory name
     * @param title     novel title, used as the file name
     * @param resulList chapters with the keys {@code "mulu"} (heading) and {@code "text"} (lines)
     * @throws IOException if the file cannot be written
     */
    private static void writeTxt(String author, String title, List<Map<String, Object>> resulList) throws IOException {
        if (resulList.isEmpty()) {
            // Do not create an empty file: it would make the next run report the novel as present.
            return;
        }
        StringBuilder content = new StringBuilder();
        for (Map<String, Object> chapter : resulList) {
            content.append("\r\n").append((String) chapter.get("mulu"));
            @SuppressWarnings("unchecked") // NovelThread stores a List<String> under "text"
            List<String> texts = (List<String>) chapter.get("text");
            for (String text : texts) {
                content.append("\r\n").append(text);
            }
        }
        // Single write with an explicit charset instead of two platform-charset appends per chapter.
        FileUtils.writeStringToFile(novelFile(author, title), content.toString(), StandardCharsets.UTF_8, true);
    }

    /**
     * Fetches a novel's index page and extracts its title and chapter list.
     *
     * @param contenturl URL of the novel's index page
     * @return map with {@code "title"} (content of the {@code og:title} meta tag) and
     *         {@code "mulus"} (a {@code List<String>} of {@code <chapter url>&<chapter title>} entries)
     * @throws IOException if the page cannot be fetched or contains no element with id {@code list}
     */
    private static Map<String, Object> getAllMulu(String contenturl) throws IOException {
        Document doc = Jsoup.connect(contenturl)
                .userAgent("Mozilla")
                .cookie("auth", "token")
                .timeout(3000)
                .get();

        Element list = doc.getElementById("list");
        if (list == null) {
            throw new IOException("No chapter list (#list) found at " + contenturl);
        }

        List<String> zjs = new ArrayList<>();
        for (Element href : list.select("a[href]")) {
            zjs.add(SITE_ROOT + href.attr("href") + "&" + href.text());
        }

        Map<String, Object> map = new HashMap<>();
        map.put("title", doc.select("meta[property=og:title]").attr("content"));
        map.put("mulus", zjs);
        return map;
    }

    /**
     * Posts a search request to the site and parses the result table.
     *
     * @param author search keyword sent as {@code searchkey}
     * @return one map per usable result row with the keys {@code "title"}, {@code "author"}
     *         and {@code "url"}; rows without two {@code even} cells or without a link are skipped
     * @throws IOException if the search request fails
     */
    private static List<Map<String, String>> selectByAuthor(String author) throws IOException {
        Map<String, String> params = new HashMap<>();
        params.put("searchkey", author);

        Document doc = Jsoup.connect("https://www.xbiquge.la/modules/article/waps.php")
                .data(params)
                .userAgent("Mozilla")
                .cookie("auth", "token")
                .timeout(3000)
                .post();

        List<Map<String, String>> novels = new ArrayList<>();
        for (Element tr : doc.body().getElementsByTag("tr")) {
            System.out.println("=============");
            Elements evens = tr.getElementsByClass("even");
            if (evens.size() < 2) {
                // Header rows and malformed rows do not carry both a title and an author cell.
                continue;
            }

            Map<String, String> map = new HashMap<>();
            map.put("title", evens.get(0).text());
            map.put("author", evens.get(1).text());
            for (Element even : evens) {
                System.out.println(even);
                for (Element href : even.select("a[href]")) {
                    // As before, the last link found in the row wins.
                    map.put("url", href.attr("href"));
                }
            }

            if (map.get("url") == null) {
                // Without a link there is nothing to download for this row.
                continue;
            }
            novels.add(map);
        }
        return novels;
    }

}

线程类 NovelThread(每个任务负责下载一段章节,并把结果加入共享的结果列表)

package com.product.downSX;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.TextNode;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Worker task that downloads a slice of a novel's chapters and appends one result map
 * per chapter to a list shared with the other workers.
 *
 * <p>Each input entry has the form {@code <chapter url>&<chapter title>}. Each result map
 * contains {@code "sort"} (last path segment of the URL without {@code .html}),
 * {@code "mulu"} (chapter title) and {@code "text"} (the chapter's text lines).
 */
public class NovelThread implements Runnable {

    /** Number of download attempts made for a chapter before it is given up. */
    private static final int MAX_ATTEMPTS = 3;

    private final List<Map<String,Object>> resultList;
    private final List<String> urls;

    /**
     * @param resultList list shared between workers that receives one map per downloaded chapter
     * @param urls       chapter entries ({@code <url>&<title>}) this worker is responsible for
     */
    public NovelThread(List<Map<String,Object>> resultList, List<String> urls) {
        this.resultList = resultList;
        this.urls = urls;
    }

    /**
     * Downloads every assigned chapter, retrying a failed chapter up to
     * {@link #MAX_ATTEMPTS} times with a one-second pause, and stops early if the
     * thread is interrupted.
     */
    @Override
    public void run() {
        int processed = 0;
        int fail = 0;
        for (String entry : urls) {
            processed++;
            System.out.println("还剩:"+(urls.size()-processed));

            // Split at the FIRST '&' only, so a chapter title containing '&' stays intact.
            int separator = entry.indexOf('&');
            String chapterUrl = separator >= 0 ? entry.substring(0, separator) : entry;
            String chapterTitle = separator >= 0 ? entry.substring(separator + 1) : entry;
            String sorts = chapterUrl.substring(chapterUrl.lastIndexOf('/') + 1).replace(".html", "");

            for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
                try {
                    List<String> text = getText(chapterUrl);
                    Map<String, Object> map = new HashMap<>();
                    map.put("sort",sorts);
                    map.put("mulu",chapterTitle);
                    map.put("text",text);
                    // The list is shared between workers; guard the append on its monitor.
                    synchronized (resultList) {
                        resultList.add(map);
                    }
                    break;
                } catch (IOException e) {
                    fail++;
                    System.out.println(Thread.currentThread().getName()+"线程失败:"+fail+"次");
                    e.printStackTrace();
                    try {
                        // Back off briefly before the next request.
                        Thread.sleep(1000);
                    } catch (InterruptedException interrupted) {
                        // Preserve the interrupt for the executor and stop working.
                        Thread.currentThread().interrupt();
                        return;
                    }
                }
            }
        }
    }


    /**
     * Fetches a chapter page and returns the direct text nodes of its {@code #content} element.
     *
     * @param url chapter page URL
     * @return the chapter's text fragments in document order
     * @throws IOException if the page cannot be fetched or has no element with id {@code content}
     */
    private static List<String> getText(String url) throws IOException {
        Document doc = Jsoup.connect(url)
                .userAgent("Mozilla")
                .cookie("auth", "token")
                .timeout(3000)
                .get();
        Element content = doc.getElementById("content");
        if (content == null) {
            // Report as IOException so the caller's per-chapter handling applies instead of an NPE.
            throw new IOException("No chapter content (#content) found at " + url);
        }

        List<String> texts = new ArrayList<>();
        for (TextNode textNode : content.textNodes()) {
            texts.add(textNode.text());
        }
        return texts;
    }
}

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值