关于用htmlunit 爬取并写入小说的简单案例

最新推荐文章于 2023-06-23 21:39:11 发布

Jose_Yang

最新推荐文章于 2023-06-23 21:39:11 发布

阅读量239

点赞数

分类专栏：爬虫

本文链接：https://blog.csdn.net/qq_25041297/article/details/115748919

版权

爬虫专栏收录该内容

5 篇文章 0 订阅

订阅专栏

首先是 Maven 依赖：

<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.48.0</version>
</dependency>

然后是例子：这里爬取的是 http://www.aixiawx.com 这个网站上的小说。
代码比较简单：先抓取页面，再截取出正文内容，最后输出为 txt 文本文件。

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.apache.commons.text.StringEscapeUtils;

import javax.sound.midi.Soundbank;
import java.io.*;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Minimal HtmlUnit-based novel scraper.
 *
 * <p>Downloads the table of contents of one novel, fetches every chapter page
 * listed there, cuts the chapter text out of the page markup and appends it to
 * a single local text file.
 */
public class readTextBook {

    /** Scheme and host prepended to the site-relative chapter links. */
    private static final String SITE_ROOT = "http://www.aixiawx.com";

    /** Opening tag of the element that holds the chapter text. */
    private static final String CONTENT_OPEN = "<div id=\"content\">";

    /** Text that terminates the chapter body in the serialized page. */
    private static final String CONTENT_CLOSE = " </div>";

    /** Matches any run of whitespace; compiled once instead of per call. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Entry point: scrapes the hard-coded novel and appends it chapter by
     * chapter to the hard-coded output file.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String url = "http://www.aixiawx.com/28/28268/";
        // Fetch the table of contents first.
        List<String[]> chapters = processingDirectory(backHtml(url));
        for (String[] chapter : chapters) {
            String title = replaceBlank(chapter[0]) + "\n";
            String sectionUrl = SITE_ROOT + replaceBlank(chapter[1]);
            String sectionContext = section(backHtml(sectionUrl));
            if (sectionContext.isEmpty()) {
                // Fetch or parse failed; keep going with the remaining chapters.
                System.err.println("No chapter text found, skipping: " + sectionUrl);
                continue;
            }
            try {
                System.out.println("正在写入" + title);
                writeText("C:\\Users\\MSI-PC\\Desktop\\魔临.txt", title + "\n\r" + sectionContext);
            } catch (IOException e) {
                System.err.println("Failed to write chapter " + sectionUrl + ": " + e);
            }
        }
    }


    /**
     * Fetches a page and returns its serialized markup with common entities decoded.
     *
     * @param url absolute URL of the page to load
     * @return the page markup passed through {@link #strToHtml(String)}, or an
     *         empty string when the page could not be loaded
     */
    private static String backHtml(String url) {
        // println rather than printf: the URL must not be interpreted as a
        // format string, since a '%' in a percent-encoded URL would throw.
        System.out.println("读取" + url);
        // try-with-resources closes the client even when loading fails.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setJavaScriptEnabled(false);
            HtmlPage page = webClient.getPage(url);
            return strToHtml(page.asXml());
        } catch (IOException e) {
            System.err.println("Failed to fetch " + url + ": " + e);
            return "";
        }
    }

    /**
     * Decodes a handful of HTML entities and turns {@code <br/>} into newlines.
     *
     * @param s markup to decode; may be {@code null}
     * @return the decoded text, or an empty string for {@code null}/empty input
     */
    public static String strToHtml(String s) {
        if (s == null || s.isEmpty()) {
            return "";
        }
        // Plain replace(): none of these are regular expressions.
        // "&amp;" is decoded last so that e.g. "&amp;lt;" becomes "&lt;" and is
        // not unescaped a second time into "<".
        return s.replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&nbsp;", "   ")
                .replace("<br/>", "\n")
                .replace("&#39;", "'")
                .replace("&quot;", "\"")
                .replace("&amp;", "&");
    }


    /**
     * Extracts the chapter list from the table-of-contents markup.
     *
     * <p>The markup is split on {@code <dd>}; entries are collected starting
     * with the first one that contains "第一章". Entries that do not look like
     * {@code <a href="...">title</a>} are reported and skipped.
     *
     * @param html table-of-contents markup; may be {@code null} or empty
     * @return a list of {@code {title, link}} pairs in page order, never {@code null}
     */
    private static List<String[]> processingDirectory(String html) {
        List<String[]> chapters = new ArrayList<>();
        if (html == null || html.isEmpty()) {
            return chapters;
        }
        String[] entries = html.split("<dd>");
        boolean started = false;
        // Index 0 is whatever precedes the first <dd>, so start at 1.
        for (int i = 1; i < entries.length; i++) {
            String entry = entries[i];
            if (entry.contains("第一章")) {
                started = true;
            }
            if (!started) {
                continue;
            }
            int hrefStart = entry.indexOf("=\"");
            int tagEnd = entry.indexOf('>');
            int anchorEnd = entry.indexOf("</a>");
            // The link sits between =" and the quote right before '>'.
            if (hrefStart < 0 || tagEnd - 1 < hrefStart + 2 || anchorEnd < tagEnd) {
                System.err.println("Skipping malformed directory entry #" + i);
                continue;
            }
            String link = entry.substring(hrefStart + 2, tagEnd - 1);
            String name = entry.substring(tagEnd, anchorEnd).replace(">", "");
            chapters.add(new String[]{name, link});
        }
        return chapters;
    }


    /**
     * Cuts the chapter text out of a chapter page.
     *
     * @param html chapter page markup; may be {@code null}
     * @return the text between {@code <div id="content">} and the following
     *         {@code " </div>"} with line breaks and indentation stripped, or an
     *         empty string when either marker is missing
     */
    private static String section(String html) {
        if (html == null) {
            return "";
        }
        int open = html.indexOf(CONTENT_OPEN);
        if (open < 0) {
            return "";
        }
        // Skip the opening tag plus the single character that follows it.
        int bodyStart = Math.min(open + CONTENT_OPEN.length() + 1, html.length());
        String body = html.substring(bodyStart);
        int close = body.indexOf(CONTENT_CLOSE);
        if (close < 0) {
            return "";
        }
        return body.substring(0, close)
                .replace("<br/>", "")
                .replace("\r\n", "")
                .replace("————————", "\n")
                .replace("    ", "");
    }


    /**
     * Appends text to a file, creating the file and its parent directories as needed.
     *
     * @param path target file path
     * @param html text to append
     * @throws IOException if the file cannot be opened or written
     */
    private static void writeText(String path, String html) throws IOException {
        File file = new File(path);
        File parent = file.getParentFile();
        // A bare file name has no parent; nothing to create in that case.
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
        // Explicit UTF-8 keeps the output independent of the platform default
        // charset; try-with-resources closes the writer even if write() fails.
        try (BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(file, true), StandardCharsets.UTF_8))) {
            bw.write(html);
        }
    }


    /**
     * Removes all whitespace (spaces, tabs, carriage returns, line feeds) and
     * any pair of adjacent double quotes from a string.
     *
     * @param str input text; may be {@code null}
     * @return the stripped text, or an empty string for {@code null}
     */
    public static String replaceBlank(String str) {
        if (str == null) {
            return "";
        }
        return WHITESPACE.matcher(str).replaceAll("").replace("\"\"", "");
    }
}

Jose_Yang

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
关于用htmlunit 爬取并写入小说的简单案例

首先是maven <dependency> <groupId>net.sourceforge.htmlunit</groupId> <artifactId>htmlunit</artifactId> <version>2.48.0</version> </dependency>然后是例子，这里是用的这个网站的小说 http://www.aixiawx.com然后就是代码，比较简单，就
复制链接

扫一扫