这个是根据
https://www.01xs.org/xiaoshuo/36628/
这个小说网站所写的爬虫,里面的很多为了简便直接写死在了代码里,后期有时间可以去完善它
里面的麻烦点
先是一开始不会用正则匹配,然后用的是String的split,但是这个split在切割“()”的时候是不能直接切割的,得“[()]”这样才行。而且这样切出来
会这样
public static void main(String[] args) throws Exception {
String s = "123()123";
String[] split = s.split("[()]");
for (int i = 0; i < split.length; i++) {
System.out.println(split[i]);
}
}
结果是 中间是有个空行的
123
123
然后就网上查了下正则的匹配方法。
但是正则匹配汉字的时候用的是这个
[^\x00-\xff](注意反斜杠不能省,在 Java 字符串字面量里要写成 "[^\\x00-\\xff]",否则字符类含义完全不同)
查找的文章:
https://www.cnblogs.com/alibai/p/3593168.html
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Crawler for the novel site https://www.01xs.org/: reads the table-of-contents
 * page, collects every chapter URL, then downloads each chapter's paragraphs
 * into a local text file.
 *
 * <p>Fixes over the original version:
 * <ul>
 *   <li>The paragraph regex was {@code "<p>[^x00-xff]{1,}</p>"} — without the
 *       backslashes the character class is the literal set {x, 0, 0-x, f},
 *       not "any non-ASCII character". Corrected to {@code [^\x00-\xff]+}.</li>
 *   <li>Readers/writers are now closed via try-with-resources (the original
 *       leaked the reader in writeAllUrls and the writer in writeText).</li>
 *   <li>Patterns are compiled once as static finals; the unescaped {@code .}
 *       in the chapter-link regex is escaped.</li>
 *   <li>The hard-coded output file and charset are now parameters of a new
 *       overload; the old one-argument writeText keeps its original behavior.</li>
 * </ul>
 */
public class FictionPa {

    /** Site root prepended to the relative chapter links found in the TOC page. */
    private static final String BASE_URL = "https://www.01xs.org";

    /** Matches relative chapter links such as {@code /xiaoshuo/36628/123.html}. */
    private static final Pattern CHAPTER_LINK =
            Pattern.compile("/xiaoshuo/[0-9]{1,9}/[0-9]{1,9}\\.html");

    /**
     * Matches a paragraph whose content is entirely non-ASCII (i.e. Chinese text).
     * Bug fix: the original pattern was missing the {@code \\} escapes.
     */
    private static final Pattern PARAGRAPH = Pattern.compile("<p>[^\\x00-\\xff]+</p>");

    /** Browser-like User-Agent header; some sites reject obvious crawlers. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 " +
            "(KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5";

    public static void main(String[] args) throws Exception {
        // Table-of-contents page of the novel to download.
        String allurl = "https://www.01xs.org/xiaoshuo/36628/";
        ArrayList<String> chapterUrls = writeAllUrls(allurl);
        for (String url : chapterUrls) {
            // Each URL is one chapter page.
            writeText(url);
        }
        System.out.println("OK");
    }

    /**
     * Downloads the table-of-contents page and returns the absolute URL of
     * every chapter it links to, in page order.
     *
     * @param surl URL of the TOC page (assumed GBK-encoded HTML)
     * @return absolute chapter URLs
     * @throws Exception on any network or decoding failure
     */
    public static ArrayList<String> writeAllUrls(String surl) throws Exception {
        HttpURLConnection connection = (HttpURLConnection) new URL(surl).openConnection();
        connection.setRequestProperty("User-Agent", USER_AGENT);
        ArrayList<String> arrayList = new ArrayList<String>();
        // try-with-resources: the original leaked the stream and reader.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), "gbk"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Chapter links live inside <li> elements on this site.
                if (line.contains("<li>")) {
                    arrayList.addAll(extractChapterUrls(line));
                }
            }
        }
        System.out.println(arrayList.toString());
        return arrayList;
    }

    /**
     * Pure helper: extracts every absolute chapter URL from one line of HTML.
     *
     * @param line one line of TOC HTML
     * @return absolute URLs (BASE_URL + relative link), possibly empty
     */
    public static ArrayList<String> extractChapterUrls(String line) {
        ArrayList<String> urls = new ArrayList<String>();
        Matcher matcher = CHAPTER_LINK.matcher(line);
        while (matcher.find()) {
            urls.add(BASE_URL + matcher.group(0));
        }
        return urls;
    }

    /**
     * Downloads one chapter page and appends its paragraphs to the default
     * output file. Kept for backward compatibility; delegates to the
     * parameterized overload with the original hard-coded defaults.
     *
     * @param surl chapter page URL
     * @throws Exception on any network, decoding, or file failure
     */
    public static void writeText(String surl) throws Exception {
        writeText(surl, "E:\\神墓.txt", "gbk");
    }

    /**
     * Downloads one chapter page and appends its paragraph text to a file.
     *
     * @param surl    chapter page URL
     * @param outFile path of the text file to append to
     * @param charset charset of the remote HTML (e.g. "gbk")
     * @throws Exception on any network, decoding, or file failure
     */
    public static void writeText(String surl, String outFile, String charset) throws Exception {
        HttpURLConnection connection = (HttpURLConnection) new URL(surl).openConnection();
        connection.setRequestProperty("User-Agent", USER_AGENT);
        // try-with-resources closes both streams even on failure; the original
        // never closed the writer and only closed the raw InputStream.
        try (BufferedReader reader = new BufferedReader(
                     new InputStreamReader(connection.getInputStream(), charset));
             BufferedWriter writer = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(outFile, true)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                writer.write(extractParagraphs(line));
            }
            writer.flush();
        }
        System.out.println("一章结束!!!!");
    }

    /**
     * Pure helper: converts every Chinese {@code <p>...</p>} paragraph in the
     * line to plain text — a leading space replaces {@code <p>}, and
     * {@code </p>} becomes {@code \r\n} — matching the original replacement
     * behavior exactly.
     *
     * @param line one line of chapter HTML
     * @return concatenated cleaned paragraphs, empty string when none match
     */
    public static String extractParagraphs(String line) {
        StringBuilder sb = new StringBuilder();
        Matcher matcher = PARAGRAPH.matcher(line);
        while (matcher.find()) {
            String inner = matcher.group(0).replaceAll("<p>", "").replaceAll("</p>", "");
            sb.append(' ').append(inner).append("\r\n");
        }
        return sb.toString();
    }
}