maven依赖:
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
需要的工具类:
为什么使用工具类: 如果不调用这个工具类的方法, 请求链接时会报如下错误:
javax.net.ssl.SSLHandshakeException: sun.security.validator.ValidatorException: PKIX path building failed: sun.security.provider.certpath.SunCertPathBuilderException: unable to find valid certification path to requested target
如图:
package com.ghx.demo.util;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
/**
 * Helper that switches off HTTPS certificate and hostname verification for every
 * connection subsequently opened through {@link HttpsURLConnection}.
 *
 * <p><b>Security warning:</b> after {@link #ignoreSsl()} has been called, any certificate
 * and any host name is accepted, which makes connections vulnerable to man-in-the-middle
 * attacks. The change is installed as a JVM-wide default, so it affects all later
 * {@code HttpsURLConnection} users in the same process. Use it only for throw-away
 * tooling, never in production code.
 */
public class SslUtils {

    /**
     * Installs a default SSL socket factory whose only trust manager accepts every
     * certificate chain.
     *
     * @throws Exception if the TLS context cannot be created or initialised
     */
    private static void trustAllHttpsCertificates() throws Exception {
        TrustManager[] trustAllCerts = new TrustManager[] {new miTM()};
        // "TLS" is the current standard context name; "SSL" is the legacy one.
        SSLContext sc = SSLContext.getInstance("TLS");
        sc.init(null, trustAllCerts, null);
        HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
    }

    /** Trust manager that performs no validation at all. */
    static class miTM implements TrustManager, X509TrustManager {

        /**
         * Returns an empty array rather than {@code null}: the {@link X509TrustManager}
         * contract requires a non-null result, and callers may dereference it.
         */
        @Override
        public X509Certificate[] getAcceptedIssuers() {
            return new X509Certificate[0];
        }

        /** Always reports the server chain as trusted. */
        public boolean isServerTrusted(X509Certificate[] certs) {
            return true;
        }

        /** Always reports the client chain as trusted. */
        public boolean isClientTrusted(X509Certificate[] certs) {
            return true;
        }

        /** Accepts every server certificate chain without inspecting it. */
        @Override
        public void checkServerTrusted(X509Certificate[] certs, String authType)
                throws CertificateException {
            // Intentionally empty: no validation is performed.
        }

        /** Accepts every client certificate chain without inspecting it. */
        @Override
        public void checkClientTrusted(X509Certificate[] certs, String authType)
                throws CertificateException {
            // Intentionally empty: no validation is performed.
        }
    }

    /**
     * Makes HTTPS requests ignore SSL certificate and host name problems. Must be called
     * before {@code openConnection}, because it only changes the defaults picked up by
     * connections created afterwards.
     *
     * @throws Exception if the permissive SSL context cannot be installed
     */
    public static void ignoreSsl() throws Exception {
        HostnameVerifier hv = new HostnameVerifier() {
            @Override
            public boolean verify(String urlHostName, SSLSession session) {
                // Accept the host regardless of the certificate; just leave a trace.
                System.out.println("Warning: URL Host: " + urlHostName + " vs. " + session.getPeerHost());
                return true;
            }
        };
        trustAllHttpsCertificates();
        HttpsURLConnection.setDefaultHostnameVerifier(hv);
    }
}
Demo:
package com.ghx.demo.test;
import com.ghx.demo.util.SslUtils;
import com.google.common.collect.Lists;
import org.apache.commons.collections4.CollectionUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/**
 * Fetches the nationwide administrative-division table from the Ministry of Civil Affairs
 * web site and converts its rows into {@code insert} statements for the
 * {@code region_code} table, written to a local SQL file.
 *
 * @author GeHengXin
 */
public class DemoTest2 {

    /** A region code is accepted when it is exactly six digits and does not start with 0. */
    private static final Pattern REGION_CODE_PATTERN = Pattern.compile("^[1-9]\\d{5}$");

    /**
     * Downloads the page, extracts (code, name) pairs from its table rows, builds one
     * insert statement per valid code and writes them all to the output file. Counts of
     * accepted and rejected rows, and the rejected codes, are printed to standard output.
     *
     * @throws Exception if the page cannot be fetched or the file cannot be written
     */
    @Test
    public void generateInsertSql() throws Exception {
        try {
            // Without this the request fails with
            // javax.net.ssl.SSLHandshakeException: sun.security.validator.ValidatorException
            SslUtils.ignoreSsl();
        } catch (Exception e1) {
            // Best effort: report the cause instead of hiding it, then try the request anyway.
            System.err.println("SslUtils.ignoreSsl() failed; continuing with default SSL settings: " + e1);
        }
        // Administrative division codes: https://www.mca.gov.cn/article/sj/xzqh/1980/
        // (Ministry of Civil Affairs). Replace the link with the latest data set when needed.
        String url = "https://www.mca.gov.cn/article/sj/xzqh/2022/202201xzqh.html";
        Document doc = Jsoup.connect(url)
                // Use your own User-Agent: open any page in a browser, press F12 and copy it here.
                .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57")
                .header("Accept", "*/*")
                .maxBodySize(0)
                .timeout(100000)
                .get();
        Elements trs = doc.select("tr");
        // Output file path
        String filePath = "C:\\Users\\admin\\Desktop\\hhh.sql";
        List<String> list = new ArrayList<>();
        List<String> ignoreList = new ArrayList<>();
        // Parse the rows and collect the generated statements
        for (Element tr : trs) {
            Elements tds = tr.select("td");
            if (tds.size() <= 3) {
                continue;
            }
            // Cell 1 is read as the region code and cell 2 as the region name.
            String regionCode = tds.get(1).text();
            String regionArea = tds.get(2).text();
            if (REGION_CODE_PATTERN.matcher(regionCode).matches()) {
                list.add(buildInsertSql(regionCode, regionArea));
            } else {
                ignoreList.add(regionCode);
            }
        }
        System.out.println("正则通过的总数量为:" + list.size());
        System.out.println("正则未通过的总数量为:" + ignoreList.size());
        System.out.println(ignoreList);
        // Write the statements to the file, one per line
        this.writeFileByLine(filePath, list);
    }

    /**
     * Builds the insert statement (terminated by the platform line separator) for one
     * region.
     *
     * <p>Level and parent are derived from the code's trailing zeros: a code ending in
     * {@code 0000} is level 1 with parent {@code 000000}; a code ending in {@code 00} is
     * level 2 with parent {@code XX0000}; anything else is level 3 with parent
     * {@code XXXX00}.
     *
     * @param regionCode six-digit region code that already passed {@link #REGION_CODE_PATTERN}
     * @param regionArea region name as scraped from the page
     * @return the SQL statement followed by a line separator
     */
    private static String buildInsertSql(String regionCode, String regionArea) {
        int levelType;
        String parentCode;
        if (regionCode.endsWith("0000")) {
            levelType = 1;
            parentCode = "000000";
        } else if (regionCode.endsWith("00")) {
            levelType = 2;
            parentCode = regionCode.substring(0, 2) + "0000";
        } else {
            levelType = 3;
            parentCode = regionCode.substring(0, 4) + "00";
        }
        // The name comes from a web page: double any single quote so it cannot terminate
        // the SQL string literal.
        String escapedName = regionArea.replace("'", "''");
        // NOTE(review): dtime is hard-coded to '201903' although the URL above points at the
        // 2022-01 data set - confirm whether it should follow the data set.
        String sql = String.format("insert into region_code (code, name, level, parent_code, dtime, note, ctime)"
                + " values (%s, '%s', %s, %s, '201903', '系统生成', NOW());",
                regionCode, escapedName, levelType, parentCode);
        return sql + System.lineSeparator();
    }

    /**
     * Writes the given rows to the file as UTF-8 so that Chinese characters are not
     * garbled. Rows are written as-is, without any added separator. Nothing is written,
     * and no file is created, when {@code rowList} is null or empty.
     *
     * <p>The streams are opened in a try-with-resources statement, so they are closed in
     * reverse order of opening even when a write fails.
     *
     * @param filePath path of the file to create or overwrite
     * @param rowList  rows to write, each already carrying its own line terminator
     * @throws IOException if the file cannot be opened or written
     */
    public void writeFileByLine(String filePath, List<String> rowList) throws IOException {
        if (rowList == null || rowList.isEmpty()) {
            return;
        }
        try (FileOutputStream fos = new FileOutputStream(new File(filePath));
             OutputStreamWriter osw = new OutputStreamWriter(fos, StandardCharsets.UTF_8);
             BufferedWriter bw = new BufferedWriter(osw)) {
            for (String row : rowList) {
                bw.write(row);
            }
        }
    }
}
运行控制台打印:
国家民政部公布的数据(部分截图):
一共是:3213 条(我是用笨方法,从网页复制到Excel中的,没有设置表头,从第一行就是数据,其实如果单纯的只是想获取sql,也可以从Excel转化为sql, 不过咱们主题是讨论如何用Java爬虫工具,爬取数据)
导出的文件展示(只截取了文件的末尾):
对比发现,比官网的数据少了几条,不过还能接受,至少满足了我们项目的需求。至于少的这几条具体是什么原因,我没有去分析。开始以为可能是被正则校验过滤掉的那部分数据,不过打印出来看了看,正则没通过的是 11 条,其中 10 个都是空字符串,数量好像也对不上。有兴趣的可以自己分析数据比对一下,这里就不展开了。如图:
参考: https://www.cnblogs.com/fengpingfan/p/10875230.html