用 HtmlUnit 爬取小说并写入文本文件的简单示例

首先是 Maven 依赖:

<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.48.0</version>
</dependency>

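示例使用的是这个网站上的小说:http://www.aixiawx.com
下面是代码,逻辑比较简单:先抓取页面,再截取出目录和正文,最后追加写入到 txt 文本文件。
注意:代码里导入了 org.apache.commons.text.StringEscapeUtils,但实际并没有用到;如果项目没有引入 commons-text 依赖,需要删掉这一行 import 才能编译通过。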

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.apache.commons.text.StringEscapeUtils;


import javax.sound.midi.Soundbank;
import java.io.*;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.List;

import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Minimal HtmlUnit-based scraper: downloads a novel's table of contents, follows every
 * chapter link and appends each chapter (title + text) to a single text file.
 *
 * <p>The page markup is handled with plain string searches rather than a DOM walk, so the
 * markers used below ({@code <dd>}, {@code <div id="content">}) are tied to the layout of
 * the site this example targets.
 */
public class readTextBook {

    /** Table-of-contents page used when no URL is passed on the command line. */
    private static final String DEFAULT_INDEX_URL = "http://www.aixiawx.com/28/28268/";

    /** Output file used when no path is passed on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "C:\\Users\\MSI-PC\\Desktop\\魔临.txt";

    /** Opening tag of the element that holds the chapter text. */
    private static final String CONTENT_OPEN = "<div id=\"content\">";

    /** Text that marks the first real chapter in the table of contents. */
    private static final String FIRST_CHAPTER_MARKER = "第一章";

    /** Compiled once; matches any run of whitespace (spaces, tabs, CR, LF). */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Entry point.
     *
     * @param args optional: {@code args[0]} is the table-of-contents URL and {@code args[1]}
     *             the output file path; the built-in defaults are used when omitted
     */
    public static void main(String[] args) {
        String indexUrl = args.length > 0 ? args[0] : DEFAULT_INDEX_URL;
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH;
        String siteRoot = siteRoot(indexUrl);

        // Fetch the table of contents first; without it there is nothing to download.
        List<String[]> chapters;
        try {
            chapters = processingDirectory(backHtml(indexUrl));
        } catch (UncheckedIOException e) {
            System.err.println("Failed to load table of contents " + indexUrl + ": " + e.getCause());
            return;
        }

        for (String[] chapter : chapters) {
            String title = replaceBlank(chapter[0]);
            String link = replaceBlank(chapter[1]);
            // Links in the table of contents are normally site-relative; keep absolute ones as-is.
            String sectionUrl = link.startsWith("http://") || link.startsWith("https://")
                    ? link
                    : siteRoot + link;
            try {
                String sectionContext = section(backHtml(sectionUrl));
                System.out.println("正在写入" + title);
                // Blank line after the title, newline after the text so chapters do not run together.
                writeText(outputPath, title + "\n\n" + sectionContext + "\n");
            } catch (IOException | UncheckedIOException e) {
                // One unreadable/unwritable chapter should not abort the whole download.
                System.err.println("Skipping chapter " + title + " (" + sectionUrl + "): " + e);
            }
        }
    }

    /**
     * Returns the scheme-and-host part of a URL, e.g. {@code http://host} for
     * {@code http://host/a/b/}. The input is returned unchanged when it has no path.
     */
    private static String siteRoot(String url) {
        int schemeEnd = url.indexOf("://");
        int pathStart = url.indexOf('/', schemeEnd < 0 ? 0 : schemeEnd + 3);
        return pathStart < 0 ? url : url.substring(0, pathStart);
    }

    /**
     * Downloads a page and returns its markup with the common entities decoded.
     *
     * <p>CSS and JavaScript processing are switched off; only the static markup is needed.
     *
     * @param url address of the page to fetch
     * @return the page serialized by {@code HtmlPage.asXml()} and passed through
     *         {@link #strToHtml(String)}
     * @throws UncheckedIOException if the page cannot be fetched
     */
    private static String backHtml(String url) {
        // println, not printf: a '%' inside the URL must not be treated as a format specifier.
        System.out.println("读取" + url);
        // try-with-resources closes the client even when the request fails.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setJavaScriptEnabled(false);
            HtmlPage page = webClient.getPage(url);
            return strToHtml(page.asXml());
        } catch (IOException e) {
            throw new UncheckedIOException("Failed to fetch " + url, e);
        }
    }

    /**
     * Decodes the handful of HTML entities this scraper cares about and turns
     * {@code <br/>} into a line break.
     *
     * @param s markup to decode; may be {@code null}
     * @return the decoded text, or an empty string for {@code null}/empty input
     */
    public static String strToHtml(String s) {
        if (s == null || s.isEmpty()) {
            return "";
        }
        // Literal replace() instead of regex replaceAll(). Real <br/> tags are handled before
        // "&lt;br/&gt;" is decoded, and "&amp;" is decoded last so that text such as
        // "&amp;lt;" becomes "&lt;" rather than being decoded twice into "<".
        return s.replace("<br/>", "\n")
                .replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&nbsp;", "   ")
                .replace("&#39;", "'")
                .replace("&quot;", "\"")
                .replace("&amp;", "&");
    }

    /**
     * Extracts the chapter list from the table-of-contents markup.
     *
     * <p>The markup is split on {@code <dd>}; entries are collected from the first one that
     * contains "第一章" onwards (presumably to skip a "latest chapters" preview that precedes
     * the full list; confirm against the site). Entries without a usable link are skipped.
     *
     * @param html markup of the table-of-contents page
     * @return one {@code {title, href}} pair per chapter, in page order; never {@code null}
     */
    private static List<String[]> processingDirectory(String html) {
        List<String[]> chapters = new ArrayList<>();
        if (html == null || html.isEmpty()) {
            return chapters;
        }
        String[] entries = html.split("<dd>");
        boolean started = false;
        // Index 0 is whatever precedes the first <dd>, never a chapter entry.
        for (int i = 1; i < entries.length; i++) {
            String entry = entries[i];
            if (!started && entry.contains(FIRST_CHAPTER_MARKER)) {
                started = true;
            }
            if (!started) {
                continue;
            }
            String[] chapter = parseChapterLink(entry);
            if (chapter == null) {
                System.err.println("Skipping table-of-contents entry without a chapter link");
            } else {
                chapters.add(chapter);
            }
        }
        return chapters;
    }

    /**
     * Pulls the link target and the link text out of one {@code <dd>} entry.
     *
     * @return {@code {title, href}}, or {@code null} when the entry has no complete anchor
     */
    private static String[] parseChapterLink(String entry) {
        String hrefKey = "href=\"";
        int hrefStart = entry.indexOf(hrefKey);
        if (hrefStart < 0) {
            return null;
        }
        hrefStart += hrefKey.length();
        // The closing quote ends the href even when further attributes follow it.
        int hrefEnd = entry.indexOf('"', hrefStart);
        int tagEnd = hrefEnd < 0 ? -1 : entry.indexOf('>', hrefEnd);
        int nameEnd = tagEnd < 0 ? -1 : entry.indexOf("</a>", tagEnd);
        if (nameEnd < 0) {
            return null;
        }
        String name = entry.substring(tagEnd + 1, nameEnd);
        String href = entry.substring(hrefStart, hrefEnd);
        return new String[]{name, href};
    }

    /**
     * Extracts the chapter text from a chapter page.
     *
     * @param html markup of the chapter page
     * @return the text between {@code <div id="content">} and the next closing {@code </div>},
     *         with line breaks from the serialized markup and 4-space indentation removed;
     *         an empty string when the content element is not present
     */
    private static String section(String html) {
        if (html == null) {
            return "";
        }
        int open = html.indexOf(CONTENT_OPEN);
        if (open < 0) {
            return "";
        }
        String body = html.substring(open + CONTENT_OPEN.length());
        // The closing tag is normally preceded by indentation; fall back to the bare tag,
        // and to the rest of the page if no closing tag is found at all.
        int close = body.indexOf(" </div>");
        if (close < 0) {
            close = body.indexOf("</div>");
        }
        if (close >= 0) {
            body = body.substring(0, close);
        }
        return body.replace("<br/>", "")
                .replace("\r\n", "")
                .replace("————————", "\n")
                .replace("    ", "");
    }

    /**
     * Appends text to a file, creating the file and its parent directories when needed.
     *
     * @param path target file
     * @param html text to append
     * @throws IOException if the directory cannot be created or the file cannot be written
     */
    private static void writeText(String path, String html) throws IOException {
        File file = new File(path);
        // getParentFile() is null for a bare file name (current directory).
        File parent = file.getParentFile();
        if (parent != null && !parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("Could not create directory " + parent);
        }
        // FileWriter in append mode creates the file itself; the writer is closed (and thereby
        // flushed) even if write() throws.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(file, true))) {
            writer.write(html);
        }
    }

    /**
     * Removes all whitespace (spaces, tabs, carriage returns, line feeds) and every pair of
     * adjacent double quotes from a string.
     *
     * @param str input text; may be {@code null}
     * @return the cleaned text, or an empty string for {@code null}
     */
    public static String replaceBlank(String str) {
        if (str == null) {
            return "";
        }
        return WHITESPACE.matcher(str).replaceAll("").replace("\"\"", "");
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值