java 小说TXT下载（小说爬虫）

最新推荐文章于 2024-03-22 11:01:19 发布

魑魅魍魉9527

最新推荐文章于 2024-03-22 11:01:19 发布

阅读量763

点赞数

分类专栏： java 文章标签： java 爬虫

本文链接：https://blog.csdn.net/qq_28934205/article/details/129144778

版权

java 专栏收录该内容

19 篇文章 0 订阅

订阅专栏

Gradle 依赖（添加到模块 build.gradle 的 dependencies 块中）：

 implementation 'org.jsoup:jsoup:1.13.1'

package cy.main.mytest;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;

import static org.junit.Assert.*;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.util.regex.Matcher;

/**
 * Novel scraper packaged as a local JUnit test.
 *
 * <p>Starting from {@link #url}, each pass downloads one chapter page with jsoup, appends the
 * chapter title and body text to a local TXT file, and then follows the page's "下一章"
 * (next chapter) link. The crawl stops when that link is missing or no longer points at a
 * {@code .htm} chapter page.
 */
public class ExampleUnitTest {
    // Not read anywhere in this class; kept in case other code in the package uses it.
    int m = 1;

    /** Crawl loop flag: the driver keeps calling {@link #start()} while this is {@code true}. */
    static boolean isRun = true;

    /** Site root used as a fallback when a next-chapter link cannot be resolved by jsoup. */
    private static final String BASE_URL = "http://www.ibiqu.org";

    /** URL of the chapter to fetch next; advanced by {@code parse} after each successful page. */
    static String url = BASE_URL + "/book/123189/188178476.htm";

    /** Output writer opened by {@link #createFile()}; {@code null} if opening failed. */
    static BufferedWriter writer;

    //  13000
    /**
     * Test entry point: opens the output file and crawls chapter after chapter until
     * {@link #isRun} is cleared. The writer is always closed, even when the crawl aborts.
     */
    @Test
    public void addition_isCorrect() {
        createFile();
        // Fail loudly instead of letting a NullPointerException be swallowed by the catch below.
        assertNotNull("output file could not be opened", writer);
        try {
            while (isRun) {
                start();
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Closing an already-closed BufferedWriter is a no-op, so this is safe after endTxt().
            closeWriter();
        }
    }

    /** Fetches and stores the chapter currently referenced by {@link #url}. */
    public static void start() {
        parse(url);
    }

    /**
     * Downloads one chapter page, writes its title and text, and advances {@link #url}.
     *
     * <p>If the page cannot be fetched, the method sleeps 10 seconds and returns without
     * changing any state, so the driver loop retries the same URL. If no usable next-chapter
     * link is found, the output is closed and {@link #isRun} is cleared.
     *
     * @param serverString absolute URL of the chapter page
     * @throws RuntimeException if the retry sleep is interrupted (interrupt flag is restored)
     */
    private static void parse(String serverString) {
        System.out.println(serverString);
        Document document = fetch(serverString);
        if (document == null) {
            System.err.println("链接错误  正在重试。。。");
            try {
                Thread.sleep(10000);
            } catch (InterruptedException e) {
                // Restore the flag so code further up the stack can still observe the interrupt.
                Thread.currentThread().interrupt();
                throw new RuntimeException(e);
            }
            return;
        }

        // Select once and share the list between the three passes below.
        Elements divs = document.select("div");
        writeTitle(divs);
        writeContent(divs);

        String next = findNextChapterUrl(divs);
        if (next == null) {
            // Without this stop, a page lacking the link would be re-fetched and re-appended forever.
            endTxt();
            isRun = false;
        } else {
            url = next;
        }
    }

    /** Requests the page with a browser User-Agent; returns {@code null} on any failure. */
    private static Document fetch(String pageUrl) {
        try {
            Connection conn = Jsoup.connect(pageUrl).timeout(10000);
            conn.header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0");
            return conn.get();
        } catch (Exception e) {
            System.err.println(e.getMessage());
            return null;
        }
    }

    /** Writes the {@code h1} text of every div whose class attribute is exactly "bookname". */
    private static void writeTitle(Elements divs) {
        for (Element div : divs) {
            if ("bookname".equals(div.attr("class"))) {
                // Leading blank line separates this chapter from the previous one in the file.
                String heading = "\r\n" + div.select("h1").text();
                getTxt(heading);
                System.out.println(heading);
            }
        }
    }

    /** Writes the text of every div with id "content", one space-separated chunk per line. */
    private static void writeContent(Elements divs) {
        for (Element div : divs) {
            if ("content".equals(div.attr("id"))) {
                String text = div.text();
                System.out.println(text.length());
                for (String paragraph : text.split(" ")) {
                    getTxt(paragraph);
                }
            }
        }
    }

    /**
     * Looks for the first "下一章" anchor inside a div whose class attribute is exactly "bottem2".
     *
     * @return the absolute URL of the next chapter, or {@code null} when no such anchor exists
     *         or its href does not contain ".htm"
     */
    private static String findNextChapterUrl(Elements divs) {
        for (Element div : divs) {
            if (!"bottem2".equals(div.attr("class"))) {
                continue;
            }
            for (Element anchor : div.select("a")) {
                if (!"下一章".equals(anchor.text())) {
                    continue;
                }
                String href = anchor.attr("href");
                System.out.println(href);
                if (!href.contains(".htm")) {
                    return null;
                }
                // Resolve against the page's base URI so relative and absolute hrefs both work.
                String absolute = anchor.absUrl("href");
                return absolute.isEmpty() ? BASE_URL + href : absolute;
            }
        }
        return null;
    }

    /**
     * Removes every match of {@code Patterns.WEB_URL} from the text and appends the result,
     * followed by CRLF, to the output file.
     *
     * <p>NOTE(review): {@code Patterns} is neither declared nor imported in the visible file;
     * presumably {@code android.util.Patterns} or a same-package class — confirm.
     *
     * @param msg one line of text to store
     */
    public static void getTxt(String msg) {
        Matcher matcher = Patterns.WEB_URL.matcher(msg);
        saveTxt(matcher.replaceAll("") + "\r\n");
    }

    /** Announces the end of the crawl and closes the output writer. */
    public static void endTxt() {
        System.out.println("任务结束");
        closeWriter();
    }

    /** Closes the writer if it was opened; close failures are reported, not propagated. */
    private static void closeWriter() {
        if (writer == null) {
            return;
        }
        try {
            writer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Opens the output file in append mode with UTF-8 encoding and stores the writer in
     * {@link #writer}. The parent directory is created when missing. On failure the stack
     * trace is printed and {@link #writer} is left unchanged.
     */
    public static void createFile() {
        File f = new File("D:\\txt\\2.txt");
        File parent = f.getParentFile();
        if (parent != null) {
            // A failed mkdirs() surfaces as FileNotFoundException when the stream is opened.
            parent.mkdirs();
        }
        try {
            FileOutputStream writerStream = new FileOutputStream(f, true);
            writer = new BufferedWriter(new OutputStreamWriter(writerStream, "UTF-8"));
        } catch (FileNotFoundException | UnsupportedEncodingException e) {
            e.printStackTrace();
        }
    }

    /**
     * Appends the text to the output file and flushes immediately.
     *
     * @param msg text to append, written as-is
     */
    public static void saveTxt(String msg) {
        try {
            writer.write(msg);
            writer.flush();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}