你知道的越多,你不知道的越多
点赞再看,养成习惯
如果您有疑问或者见解,欢迎指教:
企鹅:869192208
前言
书接上回,继续折腾爬取全国区划代码的需求。既然已经了解并使用 Jsoup 获取到了数据,那 webmagic 这个工具高低也得安排上:相比于 Jsoup,webmagic 能更方便地实现爬虫操作。
但是全国五级区划数据中,区县和镇街两个层级的数据量过于庞大,单靠一个 ip 去爬取,在爬取一定数据(8000 多条)后就会被限制访问。解决思路是使用代理,而基本上只有付费代理才比较稳定。所以虽然代码层面能够支持获取五级区划,但实际上并没有完整爬取成功,这里的代码只提供获取的思路,具体实现往下看。
引入jar包
首先,我们需要引入 webmagic-core 包和 webmagic-extension 包(下面同时列出了 fastjson 和 lombok 依赖),在 pom 文件中新增以下内容:
<!-- WebMagic core artifact, pinned to 0.7.3 -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<!-- WebMagic extension artifact; keep its version in step with webmagic-core -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
<!-- fastjson -->
<!-- NOTE(review): none of the classes shown in this article reference fastjson directly; confirm it is still needed -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.9</version>
</dependency>
<!-- lombok -->
<!-- No <version> is given here: presumably it is managed by a parent POM or BOM; confirm, otherwise add one -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<scope>provided</scope>
</dependency>
代码实现
- 新建一个 AreaSpider 类,实现 PageProcessor 接口
/**
 * WebMagic {@link PageProcessor} that walks the administrative-division pages under
 * {@link #SPIDER_URL} and publishes the rows found on each page as the {@code "area"}
 * result field (a {@code List<Map<String, Object>>}).
 *
 * <p>The level of a page is derived from the depth of its URL relative to
 * {@link #SPIDER_URL}: the index page lists provinces, one path segment lists cities,
 * two list counties, three list towns and four list villages. Links to the next level
 * are only queued when {@link #contain} mentions that level.
 *
 * <p>Malformed rows (missing anchor, missing cell, code shorter than expected) are
 * skipped instead of aborting the whole page, because a throttled or partially
 * delivered page would otherwise fail with an exception.
 */
public class AreaSpider implements PageProcessor {
    /**
     * Common URL prefix shared by every crawled page.
     */
    private static final String SPIDER_URL = "http://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2022/";
    /**
     * Entry page listing the provinces.
     */
    private static final String INDEX_URL = SPIDER_URL + "index.html";
    /**
     * Matches strings made only of ASCII digits (including the empty string).
     * Compiled once; fully qualified so no additional import is required.
     */
    private static final java.util.regex.Pattern DIGITS_ONLY = java.util.regex.Pattern.compile("[0-9]*");

    private final Site site = Site.me()
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36")
            .setTimeOut(1000 * 60)
            .setCharset("UTF-8")
            .setRetryTimes(10)
            .setSleepTime(100);
    /**
     * Year tag stored with every row, used to tell crawl batches apart.
     */
    private final String year;
    /**
     * Levels to crawl below the province level.
     * Example: {@code contain = "city,county"} crawls from provinces down to counties.
     * Matching is by substring, so separators and spaces do not matter.
     */
    private final String contain;

    /**
     * @param year    year tag written to the {@code C_YEAR} column of every row
     * @param contain level names to descend into ({@code city}, {@code county},
     *                {@code town}, {@code village}); {@code null} is treated as empty,
     *                i.e. only provinces are crawled
     */
    public AreaSpider(String year, String contain) {
        this.year = year;
        this.contain = contain == null ? "" : contain;
    }

    /**
     * Extracts the rows of the current page and stores them under the {@code "area"} field.
     * Pages whose depth does not correspond to a requested level produce no field.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        page.setCharset("UTF-8");
        String url = page.getUrl().toString();
        String[] segments = relativePath(url).split("/");
        // Plain equality instead of String.matches: the URL is not a regular expression.
        if (INDEX_URL.equals(url)) {
            page.putField("area", extractProvinces(page));
        } else if (segments.length == 1 && !"index".equals(segments[0]) && contain.contains("city")) {
            page.putField("area", extractCities(page));
        } else if (segments.length == 2 && contain.contains("county")) {
            page.putField("area", extractCodeNameRows(page, "countytr", 3, 6, "town"));
        } else if (segments.length == 3 && contain.contains("town")) {
            page.putField("area", extractCodeNameRows(page, "towntr", 4, 9, "village"));
        } else if (segments.length == 4 && contain.contains("village")) {
            page.putField("area", extractVillages(page));
        }
    }

    /** Reads the province cells (level 1) and queues each province page when cities are requested. */
    private List<Map<String, Object>> extractProvinces(Page page) {
        List<Map<String, Object>> provinces = new ArrayList<>();
        for (Selectable cell : page.getHtml().xpath("//tr[@class='provincetr']/td").nodes()) {
            List<String> links = cell.links().all();
            String name = cell.xpath("//a/text()").toString();
            if (links.isEmpty() || name == null) {
                // Cell without an anchor: nothing to record.
                continue;
            }
            String link = links.get(0);
            if (contain.contains("city")) {
                page.addTargetRequest(link);
            }
            String areaCode = relativePath(link) + "0000";
            provinces.add(buildRow(name, areaCode, 1, "/", 0L));
        }
        return provinces;
    }

    /** Reads the city cells (level 2) and queues each city page when counties are requested. */
    private List<Map<String, Object>> extractCities(Page page) {
        List<Map<String, Object>> cities = new ArrayList<>();
        for (Selectable cell : page.getHtml().xpath("//tr[@class='citytr']/td").nodes()) {
            String name = cell.xpath("//a/text()").toString();
            List<String> links = cell.links().all();
            // All-digit anchor text is the code cell; only the name cell yields a row.
            if (name == null || links.isEmpty() || DIGITS_ONLY.matcher(name).matches()) {
                continue;
            }
            String link = links.get(0);
            if (contain.contains("county")) {
                page.addTargetRequest(link);
            }
            String[] parts = relativePath(link).split("/");
            String parentId = parts[0] + "0000";
            String areaCode = parts[parts.length - 1] + "00";
            cities.add(buildRow(name, areaCode, 2, "/" + parentId + "/" + areaCode, Long.valueOf(parentId)));
        }
        return cities;
    }

    /**
     * Reads a table whose rows are (code cell, name cell) pairs; used for counties and towns.
     *
     * @param rowClass   CSS class of the table rows, e.g. {@code countytr}
     * @param level      value for {@code C_LEVEL} (3 or 4)
     * @param codeLength number of leading code characters kept as {@code C_CODE}
     * @param childLevel level name that must appear in {@link #contain} for links to be followed
     */
    private List<Map<String, Object>> extractCodeNameRows(Page page, String rowClass, int level,
                                                          int codeLength, String childLevel) {
        List<Map<String, Object>> rows = new ArrayList<>();
        List<Selectable> cells = page.getHtml().xpath("//tr[@class='" + rowClass + "']/td").nodes();
        // "i + 1 < size" guards against a trailing unpaired cell.
        for (int i = 0; i + 1 < cells.size(); i += 2) {
            Selectable codeCell = cells.get(i);
            List<String> codeTexts = codeCell.xpath("//*/text()").all();
            List<String> nameTexts = cells.get(i + 1).xpath("//*/text()").all();
            if (codeTexts.isEmpty() || nameTexts.isEmpty()) {
                continue;
            }
            String rawCode = codeTexts.get(0);
            String areaName = nameTexts.get(0);
            // More than one text node means the value is wrapped in a link to the next level.
            if (codeTexts.size() > 1) {
                rawCode = codeTexts.get(1);
                if (nameTexts.size() > 1) {
                    areaName = nameTexts.get(1);
                }
                if (contain.contains(childLevel)) {
                    List<String> links = codeCell.links().all();
                    if (!links.isEmpty()) {
                        page.addTargetRequest(links.get(0));
                    }
                }
            }
            if (rawCode == null || areaName == null || rawCode.length() < codeLength) {
                continue;
            }
            String areaCode = rawCode.substring(0, codeLength);
            rows.add(buildRow(areaName, areaCode, level, cascadeOf(areaCode, level),
                    Long.valueOf(parentOf(areaCode, level))));
        }
        return rows;
    }

    /** Reads the village table (level 5), whose rows consist of three cells: code, a middle cell, name. */
    private List<Map<String, Object>> extractVillages(Page page) {
        List<Map<String, Object>> villages = new ArrayList<>();
        List<Selectable> cells = page.getHtml().xpath("//tr[@class='villagetr']/td").nodes();
        // "i + 2 < size" guards against an incomplete trailing row.
        for (int i = 0; i + 2 < cells.size(); i += 3) {
            String areaCode = cells.get(i).xpath("//*/text()").get();
            String areaName = cells.get(i + 2).xpath("//*/text()").get();
            if (areaCode == null || areaName == null || areaCode.length() < 9) {
                continue;
            }
            villages.add(buildRow(areaName, areaCode, 5, cascadeOf(areaCode, 5),
                    Long.valueOf(parentOf(areaCode, 5))));
        }
        return villages;
    }

    /** Strips the common prefix and the {@code .html} suffix from a page URL. */
    private static String relativePath(String url) {
        return url.replace(SPIDER_URL, "").replace(".html", "");
    }

    /** Returns the parent code of a level 3, 4 or 5 code. */
    private static String parentOf(String code, int level) {
        if (level == 3) {
            return code.substring(0, 4) + "00";
        }
        return level == 4 ? code.substring(0, 6) : code.substring(0, 9);
    }

    /** Builds the {@code /province/city/.../code} path for a level 3, 4 or 5 code. */
    private static String cascadeOf(String code, int level) {
        StringBuilder path = new StringBuilder("/")
                .append(code, 0, 2).append("0000/")
                .append(code, 0, 4).append("00/");
        if (level >= 4) {
            path.append(code, 0, 6).append('/');
        }
        if (level >= 5) {
            path.append(code, 0, 9).append('/');
        }
        return path.append(code).toString();
    }

    /** Assembles one result row keyed by the target column names. */
    private Map<String, Object> buildRow(String name, String code, int level, String cascade, Long parentCode) {
        Map<String, Object> row = new HashMap<>();
        row.put("C_NAME", name);
        row.put("C_CODE", code);
        row.put("C_LEVEL", level);
        row.put("C_CASCADE", cascade);
        row.put("C_PARENT_CODE", parentCode);
        row.put("C_YEAR", year);
        return row;
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Crawls provinces, cities and counties for 2022 and stores them through {@code SqlPipeline}. */
    public static void main(String[] args) {
        Spider.create(new AreaSpider("2022", "city, county"))
                .addUrl(INDEX_URL)
                .addPipeline(new SqlPipeline())
                .thread(16)
                .run();
    }
}
- 新建 SysCity 类,用来存储获取到的每个区划数据
/**
 * Data holder for a single administrative division.
 *
 * <p>Accessors, {@code equals}/{@code hashCode}/{@code toString} and both constructors are
 * generated by Lombok. The fields {@code urls} and {@code childs} were renamed to
 * lowerCamelCase; Lombok capitalises the first letter when deriving accessor names, so
 * {@code getUrls()}/{@code setUrls(..)} and {@code getChilds()}/{@code setChilds(..)} are unchanged.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class SysCity {
    /** Identifier of this entry. */
    private Integer id;
    /** Presumably the identifier of the parent entry; confirm against the caller. */
    private Integer pId;
    /** Presumably the division (address) code; confirm against the caller. */
    private String addrCode;
    /** Display name of the division. */
    private String name;
    /** Presumably the page URL(s) this entry was read from; confirm against the caller. */
    private String urls;
    /** Presumably the code of the parent division; confirm against the caller. */
    private String fatherCode;
    /** Type marker; its allowed values are not visible here. */
    private String type;
    /** Child divisions nested under this one. */
    private List<SysCity> childs;
}
- 新建 SqlPipeline 类,实现 Pipeline 接口,将爬取到的数据固化到数据库(当然也可以输出到 excel 文件等)
@Component
@Slf4j
public class SqlPipeline implements Pipeline {
static String driver = "com.mysql.jdbc.Driver";
static String url = "jdbc:mysql://localhost:3306/wsdc?characterEncoding=utf8&useSSL=false&serverTimezone=Asia/Shanghai";
static String username = "root";
static String password = "123456";
static Connection conn = null;
static{
try {
Class.forName(driver); //classLoader,加载对应驱动
conn = DriverManager.getConnection(url, username, password);
} catch (ClassNotFoundException e) {
e.printStackTrace();
} catch (SQLException e) {
e.printStackTrace();
}
}
@Override
public void process(ResultItems resultItems, Task task) {
List<Map<String, Object>> area = resultItems.get("area");
log.info("地区总数:{}", area.size());
if (area.size() == 0) {
System.out.println(resultItems.getRequest().getUrl() + " 此页面未爬取数据,请稍后重试!");
} else {
area.stream().forEach(stringObjectMap -> {
log.info("{}", stringObjectMap);
//String sql = "insert into bus_region ('C_NAME','C_CODE','C_LEVEL', 'C_CASCADE', 'C_PARENT_CODE', 'C_YEAR') VALUES (?, ?, ?, ?, ?, ?)";
String sql = "insert into bus_region VALUES (?, ?, ?, ?, ?, ?)";
PreparedStatement preparedStatement;
try {
preparedStatement = conn.prepareStatement(sql);
preparedStatement.setString(1, stringObjectMap.get("C_NAME").toString());
preparedStatement.setString(2, stringObjectMap.get("C_CODE").toString());
preparedStatement.setString(3, stringObjectMap.get("C_LEVEL").toString());
preparedStatement.setString(4, stringObjectMap.get("C_CASCADE").toString());
preparedStatement.setString(5, stringObjectMap.get("C_PARENT_CODE").toString());
preparedStatement.setString(6, stringObjectMap.get("C_YEAR").toString());
preparedStatement.executeUpdate();
}catch (SQLException e) {
log.error("插入数据库错误:{}", e.getMessage(), e);
}
});
}
}
}
虽然没能获取到完整的五级区划,但是省市区三级区划还是可以获取到的,毕竟数据只有三千多条,最后这五级区划数据,通过 github 找到了。