最近发现国家统计局网站上发布了最新的省市区信息。以前在网上找过很多份数据,但都不是最新的,于是自己用 WebMagic 爬虫框架写了一个功能,直接从国家统计局网站抓取。项目框架使用 Spring Boot,抓取到的数据保存在 MongoDB 数据库中。
下面附上代码:
实体类:
import com.test.elasticsearch.anno.AutoValue;
import lombok.*;
import org.springframework.data.mongodb.core.mapping.Document;
import org.springframework.data.mongodb.core.mapping.Field;
import javax.persistence.Entity;
import javax.persistence.Id;
import java.io.Serializable;
/**
 * MongoDB document describing one administrative division (province, city,
 * county or town), stored in the {@code t_address} collection.
 *
 * <p>Accessors, constructors and the builder are generated by Lombok.
 */
@AllArgsConstructor
@NoArgsConstructor
@Setter
@Getter
@Builder
@Entity
@Document(collection = "t_address")
public class AddressDb implements Serializable {
    // Explicit version id so the serialized form does not depend on a compiler-generated value.
    private static final long serialVersionUID = 1L;

    // Primary key; defaults to 0 when the builder is not given a value.
    // NOTE(review): @AutoValue is a project annotation, presumably used to assign a generated id - confirm.
    @Id
    @AutoValue
    @Field("id")
    @Builder.Default
    private Long id = 0L;

    // Numeric code of this division (used for every level, not only provinces).
    @Field("province_id")
    private Integer provinceId;

    // Numeric code of the parent division.
    @Field("parent_id")
    private Integer parentId;

    // Name of this division.
    @Field("name")
    private String name;

    // Comma-separated name path from the top level down to this division
    // (parent merge name + "," + own name).
    @Field("merge_name")
    private String mergeName;

    // Level of the division: 1 = province, 2 = city, 3 = county, 4 = town.
    @Field("level_type")
    private Short levelType;
}
业务代码:
import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.core.util.CharsetUtil;
import com.test.elasticsearch.entity.mongodb.AddressDb;
import com.test.elasticsearch.service.sprider.BaseProcessor;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.mongodb.core.MongoTemplate;
import org.springframework.data.mongodb.core.query.Criteria;
import org.springframework.data.mongodb.core.query.Query;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import java.util.List;
/**
 * WebMagic page processor that walks the 2018 administrative-division pages of
 * www.stats.gov.cn and stores every province, city, county and town it finds
 * as an {@link AddressDb} document through {@link MongoTemplate}.
 *
 * <p>Each page type is recognised by its URL. A child page looks up its parent
 * record by code, so the parent page must have been processed (and its rows
 * inserted) before the child page is handled; this holds because child links
 * are only queued after the parent rows have been inserted.
 */
@Component
public class CityProcessor implements BaseProcessor {
    @Autowired
    private MongoTemplate mongoTemplate;

    private static final String TJSJ_CITY_BASE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";
    // Index page (letters only in the file name, e.g. index.html): lists the provinces.
    private static final String TJSJ_CITY_WEB_URL = "^http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/[A-Za-z]+.html$";
    // Province page: lists the cities of one province.
    private static final String PROVINCE_URL = "^http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/\\d{1,2}.html$";
    // City page: lists the counties/districts of one city.
    private static final String CITY_URL = "^http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/\\d{1,2}/\\d{1,4}.html$";
    // County page: lists the towns of one county.
    private static final String AREA_URL = "^http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/\\d{1,2}/\\d{1,2}/\\d{5,8}.html$";

    // Captures the text of every linked table cell; on child pages the matches alternate code, name, code, name, ...
    private static final String LINK_CELL_REGEX = "<td><a href=\\\".*?.html\\\">(.{1,30})</a></td>";

    // Top-level parent code used for all provinces.
    private static final int COUNTRY_CODE = 100000;
    // Cities and counties are identified by the first 6 digits of the listed code.
    private static final int COUNTY_CODE_LENGTH = 6;
    // Towns need 9 digits (6-digit county code + 3-digit town suffix); 8 digits made sibling towns share one id.
    private static final int TOWN_CODE_LENGTH = 9;

    private Site site = Site.me()
            .setDomain("http://www.stats.gov.cn")
            .setSleepTime(5000)
            .setRetryTimes(3)
            .setCharset(CharsetUtil.GBK)
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36");

    /**
     * Dispatches the downloaded page to the handler matching its URL.
     * Pages whose URL matches none of the known patterns are ignored.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        if (page.getUrl().regex(TJSJ_CITY_WEB_URL).match()) {
            processIndexPage(page);
        } else if (page.getUrl().regex(PROVINCE_URL).match()) {
            processProvincePage(page);
        } else if (page.getUrl().regex(CITY_URL).match()) {
            processCityPage(page);
        } else if (page.getUrl().regex(AREA_URL).match()) {
            processCountyPage(page);
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Stores every province listed on the index page and queues the province pages. */
    private void processIndexPage(Page page) {
        // Province codes come from the link target, province names from the link text.
        List<String> provinceCodes = page.getHtml().regex("<td><a href=\\\"(.{1,30}).html\\\">.*?<br></a></td>").all();
        List<String> provinceNames = page.getHtml().regex("<td><a href=\\\".*?.html\\\">(.{1,30})<br></a></td>").all();
        String parentName = "中国";
        for (int i = 0, n = provinceCodes.size(); i < n; i++) {
            String provinceName = provinceNames.get(i);
            AddressDb province = AddressDb.builder()
                    // Two-digit province code padded to the six-digit form, e.g. "13" -> 130000.
                    .provinceId(Integer.valueOf(provinceCodes.get(i) + "0000"))
                    .parentId(COUNTRY_CODE)
                    .name(provinceName)
                    .mergeName(parentName + "," + provinceName)
                    .levelType((short) 1)
                    .build();
            mongoTemplate.insert(province);
        }
        queueLinks(page, "//*[@class=\"provincetr\"]/td/a/@href", TJSJ_CITY_BASE_URL);
    }

    /** Stores the cities listed on a province page and queues the city pages. */
    private void processProvincePage(Page page) {
        int provinceCode = Integer.valueOf(page.getUrl().regex("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/(.{2,}).html").toString() + "0000");
        AddressDb province = findByCode(provinceCode);
        if (province != null) {
            List<String> cityCells = page.getHtml().regex(LINK_CELL_REGEX).all();
            saveChildren(cityCells, province, COUNTY_CODE_LENGTH, (short) 2);
        }
        queueLinks(page, "//*[@class=\"citytr\"]/td/a/@href", TJSJ_CITY_BASE_URL);
    }

    /** Stores the counties listed on a city page and queues the county pages. */
    private void processCityPage(Page page) {
        int cityCode = Integer.valueOf(page.getUrl().regex("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/\\d{1,}/(.{2,}).html").toString() + "00");
        AddressDb city = findByCode(cityCode);
        if (city != null) {
            List<String> countyCells = page.getHtml().regex(LINK_CELL_REGEX).all();
            saveChildren(countyCells, city, COUNTY_CODE_LENGTH, (short) 3);
        }
        // County links are relative to the province directory of the current URL.
        String baseUrl = page.getUrl().regex("^http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/\\d{1,2}/").toString();
        queueLinks(page, "//*[@class=\"countytr\"]/td/a/@href", baseUrl);
    }

    /** Stores the towns listed on a county page; no further pages are queued. */
    private void processCountyPage(Page page) {
        int countyCode = Integer.valueOf(page.getUrl().regex("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/\\d{1,}/\\d{1,}/(.{2,}).html").toString());
        AddressDb county = findByCode(countyCode);
        if (county != null) {
            List<String> townCells = page.getHtml().regex(LINK_CELL_REGEX).all();
            saveChildren(townCells, county, TOWN_CODE_LENGTH, (short) 4);
        }
    }

    /** Returns the stored division with the given code, or {@code null} when none is found. */
    private AddressDb findByCode(int code) {
        Query query = new Query(new Criteria().and("provinceId").is(code));
        return mongoTemplate.findOne(query, AddressDb.class);
    }

    /**
     * Inserts one child record per (code, name) pair.
     *
     * @param cells      alternating code and name strings as extracted from the page
     * @param parent     the already stored parent division
     * @param codeLength number of leading digits of the listed code kept as the child's id
     * @param levelType  level written to every child record
     */
    private void saveChildren(List<String> cells, AddressDb parent, int codeLength, short levelType) {
        String parentName = parent.getMergeName();
        // Step in pairs; a trailing cell without a partner is ignored instead of failing the whole page.
        for (int i = 0; i + 1 < cells.size(); i += 2) {
            String code = cells.get(i);
            String name = cells.get(i + 1);
            AddressDb child = AddressDb.builder()
                    .provinceId(Integer.valueOf(code.substring(0, codeLength)))
                    .parentId(parent.getProvinceId())
                    .name(name)
                    .mergeName(parentName + "," + name)
                    .levelType(levelType)
                    .build();
            mongoTemplate.insert(child);
        }
    }

    /** Queues every distinct link selected by {@code xpath}, resolved against {@code prefix}. */
    private void queueLinks(Page page, String xpath, String prefix) {
        List<String> hrefs = page.getHtml().xpath(xpath).all();
        if (CollectionUtil.isNotEmpty(hrefs)) {
            // Each row links to the same page twice (code cell and name cell), hence distinct().
            hrefs.stream().distinct().forEach(href -> page.addTargetRequest(prefix + href));
        }
    }
}
controller层代码:
import com.test.elasticsearch.service.sprider.processor.CityProcessor;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RestController;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
/**
 * REST controller exposing an endpoint that starts the administrative-division crawl.
 */
@RestController
public class NovelController {
    // Requests go through a proxy so the crawler's own IP is less likely to be blocked
    // by the site (the crawl sends a large number of requests).
    // NOTE(review): "ip" and port are placeholders; port is not defined in this file and must be supplied.
    private Proxy[] proxies = {
        new Proxy("ip", port)
    };

    @Autowired
    private CityProcessor cityProcessor;

    /**
     * Runs a crawl starting from the given URL using five threads; the call
     * returns when {@code Spider.run()} returns.
     *
     * @param url start URL handed to the spider
     */
    @GetMapping("/crawel/city/get")
    public void crawelCity(String url) {
        SimpleProxyProvider proxyProvider = SimpleProxyProvider.from(proxies);
        HttpClientDownloader downloader = new HttpClientDownloader();
        downloader.setProxyProvider(proxyProvider);

        Spider spider = Spider.create(cityProcessor);
        spider.addUrl(url);
        spider.setDownloader(downloader);
        spider.thread(5);
        spider.run();
    }
}
模拟请求:http://localhost:8080/crawel/city/get?url=http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html
抓取信息存储到数据库后显示效果如下:
目前只抓取了省、市两级信息;区县和乡镇信息由于请求数量很大,暂时没有抓取,以后需要用到时再补充。