## 网络爬虫 README_Crawler.md
网络爬虫技术 (java[Jsoup]、python[Beautiful Soup])
2:查看
JAVA爬虫实例 java
一、爬取省市区数据
1. 国家统计局数据~抓取省市县
统计用区划和城乡划分代码在国家统计局网站上面查找
- 国家统计局:统计用区划代码和城乡划分代码,省后面是4个0,市是两个0;
- 国家民政部:中华人民共和国行政区划代码,去掉末尾6个零;
参考:网络爬虫技术
2.获取省市区代码
2.1 java版本~~~java爬取国家统计局统计用区划代码和城乡划分代码
```java
/**
* @author: yabo
* @date: 2019/4/7 16:38
* @description: Java 爬取国家统计局统计用区划代码和城乡划分代码 https://blog.csdn.net/duanluan/article/details/83378013
*/
import com.alibaba.fastjson.JSON;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.io.IOUtils;
import org.dozer.DozerBeanMapper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.*;
import java.util.concurrent.TimeUnit;
/**
 * Crawls the statistical division-code pages under {@link #INDEX_URL} (province, then one level
 * per entry of {@link #CLASS_NAMES}), writes the resulting tree as JSON to a file and flattens it
 * into {@link AdministrativeDivision} records.
 *
 * <p>Usage: {@code java JavaCrawlerTest [outputJsonPath]}. When no path is given the original
 * hard-coded location is used.
 */
public class JavaCrawlerTest {

    /** Maps each crawled {@code Map} node onto an {@link AdministrativeDivision} bean. */
    private static final DozerBeanMapper dozerBeanMapper = new DozerBeanMapper();

    /** In-memory sink that {@link #save} appends every mapped division to. */
    private static final List<AdministrativeDivision> administrativeDivisionService = new ArrayList<>();

    private static final String INDEX_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";
    private static final String CHARSET_NAME_GB2312 = "GB2312";
    private static final String CHARSET_NAME_GBK = "GBK";

    /** Row selectors, one per depth below the province index page. */
    private static final String[] CLASS_NAMES = {".citytr", ".countytr"};
    // To descend further, use: {".citytr", ".countytr", ".towntr", ".villagetr"}

    /** Output file used when no path is passed on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "/Users/duandazhi/Downloads/aa/a.json";

    /** Upper bound for every retry loop, so a permanently failing page cannot hang the crawl. */
    private static final int MAX_ATTEMPTS = 10;

    /** Base delay between attempts; multiplied by the attempt number as a simple back-off. */
    private static final long RETRY_DELAY_MILLIS = 500L;

    /** Exception message on which {@link #persist} waits and retries. */
    private static final String CONNECTION_HOLDER_NULL = "connection holder is null";

    /** One shared client so that connections and threads are reused across requests. */
    private static final OkHttpClient HTTP_CLIENT = new OkHttpClient();

    /**
     * Entry point: crawls every province, dumps the tree as JSON, then flattens it via
     * {@link #save}.
     *
     * @param args optional; {@code args[0]} overrides the JSON output path
     * @throws Exception if the index page cannot be read, a page keeps failing after
     *                   {@link #MAX_ATTEMPTS} attempts, or the output file cannot be written
     */
    public static void main(String[] args) throws Exception {
        String outputPath = args.length > 0 ? args[0] : DEFAULT_OUTPUT_PATH;

        Document indexDoc;
        // try-with-resources: the stream is closed even when parsing throws.
        try (InputStream indexStream = new URL(INDEX_URL).openStream()) {
            indexDoc = Jsoup.parse(indexStream, CHARSET_NAME_GB2312, INDEX_URL);
        }

        List<Map<String, Object>> provinceList = new LinkedList<>();
        for (Element provinceRow : indexDoc.select(".provincetr")) {
            for (Element provinceLink : provinceRow.select("a")) {
                Map<String, Object> province = new LinkedHashMap<>();
                province.put("name", provinceLink.text());
                List<Map<String, Object>> children =
                        crawlProvince(province, INDEX_URL + provinceLink.attr("href"));
                // The index page carries no province code, so it is derived from the first
                // child: its two leading digits padded with ten zeros.
                String firstChildCode = children.get(0).get("code").toString();
                province.put("code", firstChildCode.substring(0, 2) + "0000000000");
                provinceList.add(province);
            }
        }

        String json = JSON.toJSONString(provinceList);
        try (OutputStream out = new FileOutputStream(new File(outputPath))) {
            IOUtils.copy(new StringReader(json), out, Charset.forName("UTF-8"));
        }

        save(provinceList, "0", 0);
        System.out.println("............");
        System.out.println(json);
        System.out.println("............");
    }

    /**
     * Crawls one province subtree, retrying while the page yields no children.
     *
     * @param province node to fill; receives the {@code level} and {@code child} entries
     * @param url      absolute URL of the province page
     * @return the non-empty child list stored under {@code child}
     * @throws IOException           if a page cannot be fetched
     * @throws InterruptedException  if interrupted while backing off
     * @throws IllegalStateException if the child list is still empty after {@link #MAX_ATTEMPTS}
     */
    @SuppressWarnings("unchecked") // "child" is only ever written by getChild as List<Map<String, Object>>
    private static List<Map<String, Object>> crawlProvince(Map<String, Object> province, String url)
            throws IOException, InterruptedException {
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            getChild(province, url, 0);
            List<Map<String, Object>> childList = (List<Map<String, Object>>) province.get("child");
            if (CollectionUtils.isNotEmpty(childList)) {
                return childList;
            }
            System.out.println("childList 为空");
            TimeUnit.MILLISECONDS.sleep(RETRY_DELAY_MILLIS * attempt);
        }
        throw new IllegalStateException(
                "No children found for " + url + " after " + MAX_ATTEMPTS + " attempts");
    }

    /**
     * Recursively converts the crawled tree into {@link AdministrativeDivision} records and
     * appends them to {@link #administrativeDivisionService}.
     *
     * @param list       nodes of the current depth; {@code null} is treated as empty
     * @param parentCode code of the parent division ({@code "0"} for provinces)
     * @param level      zero-based depth of {@code list}; stored one-based on each record
     * @throws InterruptedException if interrupted while waiting to retry a failed insert
     */
    @SuppressWarnings("unchecked") // "child" is only ever written by getChild as List<Map<String, Object>>
    private static void save(List<Map<String, Object>> list, String parentCode, int level)
            throws InterruptedException {
        if (level == CLASS_NAMES.length + 1 || list == null) {
            return;
        }
        int storedLevel = level + 1;
        for (Map<String, Object> node : list) {
            AdministrativeDivision division = dozerBeanMapper.map(node, AdministrativeDivision.class);
            division.setParentCode(parentCode);
            division.setLevel(storedLevel);
            persist(division);
            save((List<Map<String, Object>>) node.get("child"), division.getCode(), storedLevel);
        }
    }

    /**
     * Stores one division, waiting a minute and retrying when the sink reports
     * {@value #CONNECTION_HOLDER_NULL}. Any other failure is rethrown instead of being dropped.
     *
     * @param division record to store
     * @throws InterruptedException if interrupted while waiting to retry
     */
    private static void persist(AdministrativeDivision division) throws InterruptedException {
        while (true) {
            try {
                administrativeDivisionService.add(division);
                return;
            } catch (RuntimeException e) {
                if (!CONNECTION_HOLDER_NULL.equals(e.getMessage())) {
                    throw e;
                }
                TimeUnit.MINUTES.sleep(1);
            }
        }
    }

    /**
     * Fetches {@code url}, reads the rows selected by {@code CLASS_NAMES[level]} and stores them
     * (recursively expanded) under {@code child}, together with {@code level}, in {@code map}.
     * Does nothing once {@code level} reaches {@code CLASS_NAMES.length}.
     *
     * @param map   node to fill
     * @param url   absolute URL of the page listing this node's children
     * @param level zero-based index into {@link #CLASS_NAMES}
     * @throws IOException          if the page still fails after {@link #MAX_ATTEMPTS} attempts
     * @throws InterruptedException if interrupted while backing off
     */
    private static void getChild(Map<String, Object> map, String url, int level)
            throws IOException, InterruptedException {
        if (level == CLASS_NAMES.length) {
            return;
        }
        System.out.println(url);
        Document doc = fetchDocument(url);
        // Child links are relative to the directory of the current page.
        String baseUrl = url.substring(0, url.lastIndexOf("/") + 1);

        List<Map<String, Object>> childList = new LinkedList<>();
        for (Element row : doc.select(CLASS_NAMES[level])) {
            Elements cells = row.select("td a");
            // Rows without links (e.g. "市辖区") are leaves: read the plain cells, don't descend.
            boolean hasLinks = !cells.isEmpty();
            if (!hasLinks) {
                cells = row.select("td");
            }
            if (cells.isEmpty()) {
                continue; // malformed row with no cells at all
            }
            Element codeCell = cells.first();
            Map<String, Object> child = new LinkedHashMap<>();
            child.put("code", codeCell.text());
            child.put("name", cells.last().text());
            if (hasLinks) {
                getChild(child, baseUrl + codeCell.attr("href"), level + 1);
            }
            childList.add(child);
        }
        map.put("level", level);
        map.put("child", childList);
    }

    /**
     * Downloads and parses one page, retrying with a growing delay on I/O errors and non-2xx
     * responses. The body is decoded as GB2312, falling back to GBK when that produces
     * replacement characters.
     *
     * @param url absolute page URL
     * @return the parsed document
     * @throws IOException          if every attempt failed; the last failure is the cause
     * @throws InterruptedException if interrupted while backing off
     */
    private static Document fetchDocument(String url) throws IOException, InterruptedException {
        Charset gb2312 = Charset.forName(CHARSET_NAME_GB2312);
        Charset gbk = Charset.forName(CHARSET_NAME_GBK);
        Request request = new Request.Builder().url(url).build();
        IOException lastFailure = null;
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            // try-with-resources releases the connection on every path, including non-2xx.
            try (Response response = HTTP_CLIENT.newCall(request).execute()) {
                if (response.isSuccessful()) {
                    byte[] bodyBytes = response.body().bytes();
                    String bodyText = new String(bodyBytes, gb2312);
                    if (bodyText.contains("\uFFFD")) {
                        bodyText = new String(bodyBytes, gbk);
                    }
                    return Jsoup.parse(bodyText);
                }
                lastFailure = new IOException("HTTP " + response.code() + " for " + url);
            } catch (IOException e) {
                lastFailure = e;
            }
            System.out.println(lastFailure.getMessage());
            if (attempt < MAX_ATTEMPTS) {
                TimeUnit.MILLISECONDS.sleep(RETRY_DELAY_MILLIS * attempt);
            }
        }
        throw new IOException("Giving up on " + url + " after " + MAX_ATTEMPTS + " attempts", lastFailure);
    }

    /**
     * One administrative division record.
     *
     * <p>Declared {@code static}: a non-static inner class can only be instantiated with an
     * enclosing {@code JavaCrawlerTest} instance, which the static crawl code never has.
     *
     * @author yabo
     */
    @lombok.Getter
    @lombok.Setter
    @lombok.AllArgsConstructor
    @lombok.NoArgsConstructor
    public static class AdministrativeDivision {
        private String name;
        private String code;
        private String parentCode;
        private int level;
    }
}
```
2.2 Github上面行政区数据
共同点:都不兼容低版本IE
对比:GitHub上vue的stars数量大约是angular的两倍
二、抓取快递公司
1. 到快递100网站抓取
* 快递100提供快递物流信息的查看
- 快递100所有的快递公司页面
- 一个快递公司详情页面
- java抓包神器-jsoup elements对象支持类似于CSS (或jquery)的选择器语法,来实现非常强大和灵活的查找功能。
- 线程池显著提高效率~Java多线程(ExecutorService), 等待所有线程执行完毕
2. 代码实现
2.1 核心代码
java:
/**
 * Synchronizes the express-company catalogue from kuaidi100 into the local table.
 *
 * <p>Company list: https://www.kuaidi100.com/all/ <br>
 * Company detail (example): https://www.kuaidi100.com/all/yt.shtml
 *
 * <p>Steps: (1) read every company name and detail-page link from the list page,
 * (2) download the detail pages, concurrently on {@code executorService} with a two-minute
 * budget, (3) parse phone, website, icon, code and introduction from each page,
 * (4) de-duplicate, sort, then replace the table content.
 *
 * <p>The table is left untouched when the list page cannot be fetched or yields no company.
 *
 * <p>jsoup selector syntax: https://www.open-open.com/jsoup/selector-syntax.htm <br>
 * Waiting for pool tasks to finish: https://blog.csdn.net/q258523454/article/details/81978855
 *
 * @return number of companies saved
 * @see InfAreaService#getLatestAreasFromHttp()
 */
public synchronized int getLatestExpressFromHttp() {
    // Toggle: download detail pages concurrently (true) or one by one while parsing (false).
    boolean isStartThread = true;
    // NOTE(review): threadIdSet is declared outside this block (the original hint shows a plain
    // HashSet); worker threads add to it, so every access here is guarded by its monitor.
    synchronized (threadIdSet) {
        threadIdSet.clear();
    }

    String httpResult = HttpClientSimpleUtil.getInstance().doGetRequest("https://www.kuaidi100.com/all/");
    if (httpResult == null) {
        logger.error("kuaidi100 company list could not be fetched; nothing synchronized");
        return 0;
    }
    Document doc = Jsoup.parse(httpResult);
    Elements columnList = doc.select(".column-list");

    // 1: collect every company name together with the link to its detail page.
    List<InfExpressEntity> expressEntityList = new ArrayList<>();
    for (int i = 0; i < columnList.size(); ++i) {
        Elements elementsDdA = columnList.get(i).select("a");
        for (int j = 0; j < elementsDdA.size(); ++j) {
            String expressName = elementsDdA.get(j).text().trim();
            String expressHref = String.valueOf(elementsDdA.get(j).attr("href")).replace(" ", "");
            // officialWebsite temporarily holds the detail-page URL until the page is parsed.
            expressEntityList.add(new InfExpressEntity(expressName, expressHref));
        }
    }
    if (expressEntityList.isEmpty()) {
        // Never wipe the table on the strength of an empty scrape.
        logger.error("kuaidi100 company list contained no entries; nothing synchronized");
        return 0;
    }

    // 1-2: download the detail pages. Each worker writes only its own slot of detailPages, so
    // the raw HTML no longer has to be parked in the entity's brief field.
    final String[] detailPages = new String[expressEntityList.size()];
    if (isStartThread) {
        final CountDownLatch countDownLatch = new CountDownLatch(expressEntityList.size());
        for (int i = 0; i < expressEntityList.size(); ++i) {
            // Lambdas may only capture effectively final locals, hence the per-iteration copies.
            final int index = i;
            final String detailUrl = expressEntityList.get(i).getOfficialWebsite();
            executorService.execute(() -> {
                try {
                    logger.debug("线程"
                            + Thread.currentThread().getId()
                            + "开始出发" + "发送请求:" + detailUrl);
                    synchronized (threadIdSet) {
                        threadIdSet.add(Thread.currentThread().getId());
                    }
                    detailPages[index] = HttpClientSimpleUtil.getInstance().doGetRequest(detailUrl);
                    logger.debug("收到结果......");
                } finally {
                    // Must run on every path, otherwise await() can only end by timeout.
                    countDownLatch.countDown();
                }
            });
        }
        boolean allFinished = false;
        try {
            // Hard limit: two minutes for all downloads.
            allFinished = countDownLatch.await(60 * 2, TimeUnit.SECONDS);
        } catch (InterruptedException e) {
            // Restore the flag for callers and continue with whatever has been downloaded.
            Thread.currentThread().interrupt();
        }
        synchronized (threadIdSet) {
            if (allFinished) {
                logger.info("所有线程都已经执行完毕...., 一共开启线程数量:" + threadIdSet.size() + "idValues:" + threadIdSet);
            } else {
                logger.error("Detail-page downloads did not finish in time; pending downloads: "
                        + countDownLatch.getCount() + ", threads used: " + threadIdSet.size());
            }
        }
    }

    // 2: parse the detail pages and fill in the bookkeeping fields.
    List<?> commonNames = Arrays.asList(EXPRESS_COMMON);
    for (int i = 0; i < expressEntityList.size(); ++i) {
        InfExpressEntity expressEntity = expressEntityList.get(i);
        String detailHtml = isStartThread
                ? detailPages[i]
                : HttpClientSimpleUtil.getInstance().doGetRequest(expressEntity.getOfficialWebsite());
        if (detailHtml != null) {
            applyExpressDetailPage(expressEntity, detailHtml);
        } else {
            logger.error("No detail page for " + expressEntity.getExpressName() + " ("
                    + expressEntity.getOfficialWebsite() + "); detail fields left empty");
            // Do not persist the kuaidi100 detail-page URL as the company's official website.
            expressEntity.setOfficialWebsite("");
        }

        logger.debug("同步快递信息:" + (i + 1) + "--" + expressEntity.getExpressName());
        LocalDateTime now = LocalDateTime.now();
        expressEntity.setSort(i + 1);
        expressEntity.setCreateTime(now);
        expressEntity.setCreateUser(1L);
        expressEntity.setUpdateTime(now);
        expressEntity.setUpdateUser(1L);
        // Only the commonly used companies start out enabled.
        expressEntity.setIsEnable(commonNames.contains(expressEntity.getExpressName()));
        expressEntity.setNamePinyin(PinyinUtil.getPinYin(expressEntity.getExpressName()));
    }
    logger.info("快递100的快递公司信息同步完成..." + "快递公司条数: " + expressEntityList.size());

    // 3: de-duplicate on (name, code), keeping the first occurrence and the scrape order.
    // A HashSet of the entities themselves compares by identity and removes nothing.
    HashSet<String> seenKeys = new HashSet<>();
    List<InfExpressEntity> uniqueEntities = new ArrayList<>();
    for (InfExpressEntity candidate : expressEntityList) {
        if (seenKeys.add(candidate.getExpressName() + "|" + candidate.getExpressCode())) {
            uniqueEntities.add(candidate);
        }
    }
    logger.info("去重之后的快递公司条数: " + uniqueEntities.size());
    uniqueEntities.sort(Comparator.comparing(InfExpressEntity::getIsEnable).thenComparing(en -> XaUtils.isNotBlank(en.getOfficialWebsite()) && XaUtils.isNotBlank(en.getTel())));

    // 4: replace the table content; truncate as late as possible to keep the empty window short.
    infExpressDao.truncateAll();
    infExpressDao.saveBatch(uniqueEntities);
    return uniqueEntities.size();
}

/**
 * Copies phone, official website, icon, company code and (when present) the first introduction
 * paragraph from a kuaidi100 detail page into the entity.
 *
 * @param expressEntity entity to update
 * @param detailHtml    HTML of the company's detail page; must not be {@code null}
 */
private void applyExpressDetailPage(InfExpressEntity expressEntity, String detailHtml) {
    Document detailDoc = Jsoup.parse(detailHtml);
    expressEntity.setTel(detailDoc.select("#allcompanytel").text());
    expressEntity.setOfficialWebsite(detailDoc.select("#allcompanyurl").attr("href"));
    // select() never returns null; an empty match simply yields "".
    expressEntity.setIcon(detailDoc.select(".com-logo").select("img").attr("src"));
    expressEntity.setExpressCode(detailDoc.select("#companyCode").attr("value").trim());
    Elements introParagraphs = detailDoc.select(".ex-txt").select("p");
    if (introParagraphs.size() > 0) {
        expressEntity.setBrief(introParagraphs.get(0).text());
    }
}
2.2 实体类
java:
/**
 * InfExpressEntity.java
 * <p>
 * Express (courier) company record; the company code is the same one kuaidi100 uses.
 * <p>
 * Note from the original author: fields used for ordering (e.g. {@code sort}, timestamps) must
 * always be saved with a value, never {@code null}; otherwise the index stops being used and
 * the ordering is wrong.
 *
 * @author ourslookAdmin
 * @email ab601026460@qq.com
 * @date 2019-04-13 17:08:41
 */
@Getter
@Setter
@NoArgsConstructor
@ApiModel(description = "快递公司 快速编码同快递100")
public class InfExpressEntity implements Serializable {

    private static final long serialVersionUID = 1L;

    /** Primary key (auto-increment id). */
    @ApiModelProperty("主键 自增id")
    private Long id;

    /** Company code, identical to the kuaidi100 code; at most 128 characters. */
    @ApiModelProperty("快递公司编码, 同快递100")
    @Length(message = "快递公司编码最长128个字符", max = 128)
    @JsonDeserialize(using = CustomStringTrimDeserializer.class)
    private String expressCode;

    /** Company name; at most 32 characters. */
    @ApiModelProperty("快递公司名称")
    @Length(message = "快递公司名称最长32个字符", max = 32)
    @JsonDeserialize(using = CustomStringTrimDeserializer.class)
    private String expressName;

    /** Icon; at most 65535 characters. */
    @ApiModelProperty("icon图标")
    @Length(message = "icon图标最长65535个字符", max = 65535)
    @JsonDeserialize(using = CustomStringTrimDeserializer.class)
    private String icon;

    /** Official customer-service phone number; at most 64 characters. */
    @ApiModelProperty("官方客服电话 注意电话格式,如:11185、4001000001、010-52310990、800-810-8000")
    @Length(message = "官方客服电话最长64个字符", max = 64)
    @JsonDeserialize(using = CustomStringTrimDeserializer.class)
    private String tel;

    /** Official website. */
    @ApiModelProperty("官方网站")
    private String officialWebsite;

    /** Company introduction. */
    @ApiModelProperty("快递公司简介")
    private String brief;

    /** Sort position. */
    @ApiModelProperty("排序 默认0,倒序")
    private Integer sort;

    /** Whether the company is enabled. */
    @ApiModelProperty("是否启用 1:启用 0:禁用;删除就物理删除,默认0")
    private Boolean isEnable;

    /** Id of the creating user. */
    @ApiModelProperty("创建人")
    private Long createUser;

    /** Creation time. */
    @ApiModelProperty("创建时间")
    @JsonFormat(pattern = "yyyy-MM-dd HH:mm:ss", timezone = "Asia/Shanghai")
    private LocalDateTime createTime;

    /** Id of the last updating user. */
    @ApiModelProperty("更新人")
    private Long updateUser;

    /** Last update time. */
    @ApiModelProperty("更新时间")
    @JsonFormat(pattern = "yyyy-MM-dd HH:mm:ss", timezone = "Asia/Shanghai")
    private LocalDateTime updateTime;

    /**
     * Creates a record carrying only a name and a website; every other field stays unset.
     *
     * @param expressName     company name
     * @param officialWebsite value stored in {@code officialWebsite}
     */
    public InfExpressEntity(String expressName, String officialWebsite) {
        this.officialWebsite = officialWebsite;
        this.expressName = expressName;
    }
}