静 态 网 页 爬 虫
——小说网站为例——
环境:
windows 7
idea 2018.2
jdk 1.8
思路:
- 获取带有章节链接的目录网页
- 通过正则表达式,得到每一篇的a标签
- 通过a标签拼接成每一篇文章的url
- 调用Jw类的zj方法,逐一获取每篇文章的网页内容
- 通过正则表达式,得到标题和正文(正文由于正则表达式写的很差,后面用了字符串的拼接与替换)
总结:
最简单的方法,也是最笨的方法。
学习更多的方法很重要。同一个结果可以用很多办法得到,有简单的,也有繁琐的,而我要一点一点地从繁琐走向简单,不断进步。
主方法:BookTest.java
获取目录
package com.etc;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Entry point of the crawler.
 *
 * <p>Downloads the table-of-contents page of one novel, extracts the chapter links
 * from the line that holds the {@code <dd>} entries, and passes every resulting
 * chapter URL to {@link Jw#zj(String)}.
 */
public class BookTest {

    /** Site root that the chapter links taken from the page are appended to. */
    private static final String SITE_ROOT = "http://www.biquge.com.tw";

    /** Length of the prefix {@code <a href="} that precedes the link target. */
    private static final int HREF_PREFIX_LENGTH = 9;

    /**
     * Matches from the first {@code <dd} to the last {@code d>} on a line. The page is
     * read line by line, so this is applied to one line at a time.
     */
    private static final Pattern CHAPTER_LIST_PATTERN = Pattern.compile("<d([d]{1}[\\s\\S]*)d>");

    /**
     * Reads the table-of-contents page (decoded as GBK) and processes every chapter
     * link found on it. I/O problems are printed to standard error; the JVM is then
     * terminated with exit status 0.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Jw jw = new Jw();
        try {
            URL mulu = new URL("http://www.biquge.com.tw/0_757/");
            // try-with-resources closes the reader even when reading fails half-way.
            try (BufferedReader br =
                    new BufferedReader(new InputStreamReader(mulu.openStream(), "gbk"))) {
                String str;
                while ((str = br.readLine()) != null) {
                    Matcher urlMatcher = CHAPTER_LIST_PATTERN.matcher(str);
                    if (!urlMatcher.find()) {
                        continue;
                    }
                    System.out.println("输出a标签:");
                    // Each chapter entry is terminated by </dd>; split to get one anchor per piece.
                    String[] strings = urlMatcher.group().split("</dd>");
                    for (String fragment : strings) {
                        String href = extractHref(fragment);
                        if (href == null) {
                            // Not a well-formed anchor: skip it instead of crashing the whole run.
                            continue;
                        }
                        String url = SITE_ROOT + href;
                        System.out.println("BookTest:\t" + url);
                        // Download and save this chapter.
                        jw.zj(url);
                    }
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.exit(0);
    }

    /**
     * Extracts the text between the 9 characters starting at the first {@code <a}
     * (expected to be {@code <a href="}) and the last {@code ">} of the fragment.
     *
     * @param fragment one piece of the chapter list, expected to contain a single anchor
     * @return the link target, or {@code null} when the fragment holds no such anchor
     */
    private static String extractHref(String fragment) {
        int anchorStart = fragment.indexOf("<a");
        int hrefEnd = fragment.lastIndexOf("\">");
        if (anchorStart < 0 || hrefEnd < anchorStart + HREF_PREFIX_LENGTH) {
            return null;
        }
        return fragment.substring(anchorStart + HREF_PREFIX_LENGTH, hrefEnd);
    }
}
Jw.java
获取正文(在请求某些不受信任的https网站时,需要信任ssl证书)
package com.etc;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads a single chapter page and stores its title and body text in a text file.
 */
public class Jw {

    /** Directory (relative to the working directory) that receives the chapter files. */
    private static final String OUTPUT_DIR = "大主宰";

    /** Matches the chapter heading; applied to one line of the page at a time. */
    private static final Pattern TITLE_PATTERN = Pattern.compile("<h1>[\\s\\S]*</h1>");

    /**
     * Matches a body line: a run of at least two leading blanks, some text, and a
     * {@code <br>} tag in any of its spellings.
     * NOTE(review): the blank characters are reproduced exactly as they appear in the
     * original pattern; confirm they match the indentation characters the site emits.
     */
    private static final Pattern CONTEXT_PATTERN =
            Pattern.compile("(([ ]{2,})([\\s\\S]{1,})(<br\\s*/?>))");

    /**
     * Fetches the page at {@code url} (decoded as GBK), writes the text of its
     * {@code <h1>} heading followed by every matching body line to
     * {@code 大主宰/<heading>.txt}, and reports the finished file on standard output.
     *
     * <p>For an {@code https} URL, {@link SslUtils#ignoreSsl()} is called before the
     * connection is opened. Every exception is caught and printed; none is propagated.
     *
     * @param url absolute URL of the chapter page
     */
    public void zj(String url) {
        File file = null;
        BufferedWriter bw = null;
        try {
            URL url1 = new URL(url);
            // Must happen before the stream is opened to take effect for this request.
            if ("https".equalsIgnoreCase(url1.getProtocol())) {
                SslUtils.ignoreSsl();
            }
            try (BufferedReader br =
                    new BufferedReader(new InputStreamReader(url1.openStream(), "gbk"))) {
                String str;
                while ((str = br.readLine()) != null) {
                    Matcher titleMatcher = TITLE_PATTERN.matcher(str);
                    if (titleMatcher.find()) {
                        String title = titleMatcher.group();
                        // Strip the leading "<h1>" (4 chars) and the trailing "</h1>".
                        String chapterName = title.substring(4, title.lastIndexOf("<"));
                        if (bw != null) {
                            // A second heading starts a new file; release the previous one.
                            bw.close();
                        }
                        // Parent/child constructor keeps the path portable across platforms.
                        file = new File(OUTPUT_DIR, chapterName + ".txt");
                        File dir = file.getParentFile();
                        if (dir != null && !dir.exists() && !dir.mkdirs()) {
                            throw new IOException("Cannot create output directory: " + dir);
                        }
                        bw = new BufferedWriter(new FileWriter(file));
                        bw.write(chapterName + "\r\n");
                        bw.flush();
                    }
                    Matcher bodyMatcher = CONTEXT_PATTERN.matcher(str);
                    // Body lines seen before any heading have no file to go to; skip them.
                    if (bw != null && bodyMatcher.find()) {
                        // Drop the indentation blanks and the <br> tag in whichever form matched.
                        String contents = bodyMatcher.group()
                                .replaceAll(" ", "")
                                .replaceAll("<br\\s*/?>", "");
                        bw.write(contents + "\r\n");
                        bw.flush();
                    }
                }
            }
            if (file == null) {
                System.err.println("No <h1> title found, nothing saved for: " + url);
            } else {
                System.out.println(file.getName() + "\t 下载完成!");
                System.out.println("-------结束!--------");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (Exception e) {
            // SslUtils.ignoreSsl() declares the broad Exception type.
            e.printStackTrace();
        } finally {
            if (bw != null) {
                try {
                    bw.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
SslUtils.java
信任所有证书的方法(来源:http://javaweb.org/?p=1237)
package com.etc;
import javax.net.ssl.*;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
/**
 * Helper that switches off HTTPS certificate and host-name checks by replacing the
 * default socket factory and host-name verifier of {@link HttpsURLConnection}.
 *
 * <p>WARNING: after {@link #ignoreSsl()} has been called, connections made through the
 * default {@code HttpsURLConnection} settings accept any certificate and any host
 * name, so they are no longer protected against man-in-the-middle attacks.
 */
public class SslUtils {

    /**
     * Installs a default SSL socket factory whose only trust manager is {@link miTM}.
     *
     * @throws Exception if the SSL context cannot be obtained or initialised
     */
    private static void trustAllHttpsCertificates() throws Exception {
        TrustManager[] trustAllCerts = {new miTM()};
        SSLContext sc = SSLContext.getInstance("SSL");
        sc.init(null, trustAllCerts, null);
        HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
    }

    /** Trust manager whose check methods never reject a certificate chain. */
    static class miTM implements TrustManager, X509TrustManager {

        /**
         * Returns an empty array: the {@code X509TrustManager} contract requires a
         * non-null result, and no issuers are advertised.
         */
        @Override
        public X509Certificate[] getAcceptedIssuers() {
            return new X509Certificate[0];
        }

        /** Always reports the given server certificates as trusted. */
        public boolean isServerTrusted(X509Certificate[] certs) {
            return true;
        }

        /** Always reports the given client certificates as trusted. */
        public boolean isClientTrusted(X509Certificate[] certs) {
            return true;
        }

        /** Accepts every server certificate chain; never throws. */
        @Override
        public void checkServerTrusted(X509Certificate[] certs, String authType)
                throws CertificateException {
            // Intentionally empty: returning normally means "trusted".
        }

        /** Accepts every client certificate chain; never throws. */
        @Override
        public void checkClientTrusted(X509Certificate[] certs, String authType)
                throws CertificateException {
            // Intentionally empty: returning normally means "trusted".
        }
    }

    /**
     * Disables SSL certificate validation and host-name verification for HTTPS
     * requests. Must be called before {@code openConnection}.
     *
     * @throws Exception if the trust-all SSL context cannot be installed
     */
    public static void ignoreSsl() throws Exception {
        HostnameVerifier hv = new HostnameVerifier() {
            @Override
            public boolean verify(String urlHostName, SSLSession session) {
                System.out.println("Warning: URL Host: " + urlHostName + " vs. " + session.getPeerHost());
                return true;
            }
        };
        trustAllHttpsCertificates();
        HttpsURLConnection.setDefaultHostnameVerifier(hv);
    }
}
2018/11/11