首先是 Maven 依赖:
<!-- HtmlUnit: supplies the com.gargoylesoftware.htmlunit classes (WebClient, HtmlPage, ...) imported by the Java code in this post. -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.48.0</version>
</dependency>
然后是例子:这里爬取的是这个网站上的小说 http://www.aixiawx.com
最后是代码,比较简单:先抓取页面,再截取出正文,最后输出为 txt 文本文件。
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.apache.commons.text.StringEscapeUtils;
import javax.sound.midi.Soundbank;
import java.io.*;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class readTextBook {
public static void main(String[] args) {
String url = "http://www.aixiawx.com/28/28268/";
//首先获取目录
List<String[]> muluArray = processingDirectory(backHtml(url));
for (int i = 0; i < muluArray.size(); i++) {
String title = replaceBlank(muluArray.get(i)[0])+"\n";
String sectionUrl = "http://www.aixiawx.com" + replaceBlank(muluArray.get(i)[1]);
String sectionContext = section(backHtml(sectionUrl));
try {
System.out.println("正在写入" + title);
writeText("C:\\Users\\MSI-PC\\Desktop\\魔临.txt", title + "\n\r" + sectionContext);
} catch (IOException e) {
e.printStackTrace();
}
}
}
/**
* 返回html内容
*
* @param url
* @return
*/
private static String backHtml(String url) {
System.out.printf("读取" + url);
WebClient webClient = new WebClient(BrowserVersion.CHROME);
webClient.getOptions().setCssEnabled(false);
webClient.getOptions().setJavaScriptEnabled(false);
HtmlPage page = null;
try {
page = webClient.getPage(url);
} catch (IOException e) {
e.printStackTrace();
}
//获取当前页面的html
String string = page.asXml();
webClient.close();
return strToHtml(string);
}
public static String strToHtml(String s) {
if (s == null || s.equals(""))
return "";
s = s.replaceAll("&", "&");
s = s.replaceAll("<", "<");
s = s.replaceAll(">", ">");
s = s.replaceAll(" ", " ");
s = s.replaceAll("<br/>", "\n");
s = s.replaceAll("'", "'");
s = s.replaceAll(""", "'");
return s;
}
/**
* 处理目录
*
* @param html
* @return
*/
private static List<String[]> processingDirectory(String html) {
String[] ddArray = html.split("<dd>");
int pandua = 0;
List<String[]> url = new ArrayList<>();
for (int i = 1; i < ddArray.length; i++) {
if (ddArray[i].contains("第一章")) {
pandua = 1;
}
if (pandua == 1) {
try {
String ulr = ddArray[i].substring(ddArray[i].indexOf("=\"") + 2, ddArray[i].indexOf(">") - 1);
String name = ddArray[i].substring(ddArray[i].indexOf(">"), ddArray[i].indexOf("</a>")).replaceAll(">", "");
url.add(new String[]{name, ulr});
} catch (Exception e) {
e.printStackTrace();
}
}
}
return url;
}
/**
* 处理章节
*
* @param html
* @return
*/
private static String section(String html) {
String htmlArray = html.substring(html.indexOf("<div id=\"content\">") + 19);
String context = htmlArray.substring(0, htmlArray.indexOf(" </div>")).
replaceAll("<br/>", "").
replaceAll("\r\n", "").
replaceAll("————————", "\n").
replaceAll(" ", "");
return context;
}
/**
* 写入text
*
* @param path
* @param html
* @throws IOException
*/
private static void writeText(String path, String html) throws IOException {
File file = new File(path);
if (file.exists()) {
} else {
file.getParentFile().mkdirs();
}
file.createNewFile();
FileWriter fw = new FileWriter(file, true);
BufferedWriter bw = new BufferedWriter(fw);
bw.write(html);
bw.flush();
bw.close();
}
/**
* 去掉所有换行回车
*
* @param str
* @return
*/
public static String replaceBlank(String str) {
String dest = "";
if (str != null) {
Pattern p = Pattern.compile("\\s*|\t|\r|\n");
Matcher m = p.matcher(str);
dest = m.replaceAll("").replaceAll("\"\"", "");
}
return dest;
}
}