网络爬虫 Crawler:Python 爬虫、省市区数据抓取、快递公司抓取、多线程

## 网络爬虫 README_Crawler.md

网络爬虫技术 (java[Jsoup]、python[Beautiful Soup])

2:查看

JAVA爬虫实例 java

一、爬取省市区数据

1. 国家统计局数据~抓取省市县

统计用区划代码和城乡划分代码可以在国家统计局网站上查到。

参考:网络爬虫技术

2.获取省市区代码

2.1 Java 版本:Java 爬取国家统计局统计用区划代码和城乡划分代码
```java
   /**
    * @author: yabo
    * @date: 2019/4/7 16:38
    * @description: Java 爬取国家统计局统计用区划代码和城乡划分代码 https://blog.csdn.net/duanluan/article/details/83378013
    */
   import com.alibaba.fastjson.JSON;
   import okhttp3.OkHttpClient;
   import okhttp3.Request;
   import okhttp3.Response;
   import org.apache.commons.collections4.CollectionUtils;
   import org.apache.commons.io.IOUtils;
   import org.dozer.DozerBeanMapper;
   import org.jsoup.Jsoup;
   import org.jsoup.nodes.Document;
   import org.jsoup.nodes.Element;
   import org.jsoup.select.Elements;
   
   import java.io.*;
   import java.net.URL;
   import java.nio.charset.Charset;
   import java.util.*;
   import java.util.concurrent.TimeUnit;
   
   /**
    * Crawls the province / city / county division codes published by the National Bureau of
    * Statistics of China, dumps the resulting tree as JSON and flattens it into
    * {@link AdministrativeDivision} records.
    *
    * <p>Usage: {@code java JavaCrawlerTest [outputJsonPath]}. When no path is given the JSON is
    * written to {@link #DEFAULT_OUTPUT_PATH}.
    */
   public class JavaCrawlerTest {

       private static DozerBeanMapper dozerBeanMapper = new DozerBeanMapper();

       /** In-memory stand-in for a persistence service; {@link #save} appends every record here. */
       private static List<AdministrativeDivision> administrativeDivisionService = new ArrayList<>();

       private static final String INDEX_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";
       private static final String CHARSET_NAME_GB2312 = "GB2312";
       private static final String CHARSET_NAME_GBK = "GBK";
       /** CSS class of the table rows on each level below the province index page. */
       private static final String[] CLASS_NAMES = {".citytr", ".countytr"};
       // To also crawl towns and villages, use this list instead:
       //private static final String[] CLASS_NAMES = {".citytr", ".countytr", ".towntr", ".villagetr"};

       /** Output file used when no path is passed on the command line. */
       private static final String DEFAULT_OUTPUT_PATH = "/Users/duandazhi/Downloads/aa/a.json";

       /** One shared client so connections and threads are reused across requests. */
       private static final OkHttpClient HTTP_CLIENT = new OkHttpClient();

       /** First retry delay; doubled on every further attempt up to {@link #RETRY_MAX_DELAY_MILLIS}. */
       private static final long RETRY_BASE_DELAY_MILLIS = 500L;
       private static final long RETRY_MAX_DELAY_MILLIS = 30_000L;

       /**
        * Entry point: crawls every province, writes the tree as JSON and flattens it via
        * {@link #save}.
        *
        * @param args optional; {@code args[0]} overrides the output JSON path
        * @throws Exception if the index page cannot be read, the output cannot be written, or the
        *                   thread is interrupted while waiting between retries
        */
       public static void main(String[] args) throws Exception {
           String outputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_OUTPUT_PATH;

           Document doc;
           // try-with-resources: the stream is closed even when parsing fails.
           try (InputStream inputStream = new URL(INDEX_URL).openStream()) {
               doc = Jsoup.parse(inputStream, CHARSET_NAME_GB2312, INDEX_URL);
           }
           Elements provinceElements = doc.select(".provincetr");

           List<Map<String, Object>> provinceList = new LinkedList<>();
           for (Element provinceElement : provinceElements) {
               for (Element provinceLink : provinceElement.select("a")) {
                   Map<String, Object> provinceMap = new LinkedHashMap<>();
                   provinceMap.put("name", provinceLink.text());

                   // The site occasionally answers with a page that contains no rows, so the
                   // province is re-crawled until children show up. A growing pause between
                   // attempts avoids hammering the server in a tight loop.
                   List<Map<String, Object>> childList;
                   int attempt = 0;
                   while (true) {
                       getChild(provinceMap, INDEX_URL + provinceLink.attr("href"), 0);
                       childList = childrenOf(provinceMap);
                       if (CollectionUtils.isNotEmpty(childList)) {
                           break;
                       }
                       System.out.println("childList 为空");
                       sleepBeforeRetry(attempt++);
                   }

                   // The index page carries no province code; derive it from the first child's
                   // code (first two digits, zero-padded to twelve digits).
                   String code = childList.get(0).get("code").toString();
                   provinceMap.put("code", code.substring(0, 2) + "0000000000");
                   provinceList.add(provinceMap);
               }
           }

           String json = JSON.toJSONString(provinceList);
           File outputFile = new File(outputPath);
           File parentDir = outputFile.getParentFile();
           if (parentDir != null && !parentDir.exists() && !parentDir.mkdirs()) {
               throw new IOException("Cannot create output directory: " + parentDir);
           }
           // try-with-resources: the file handle is flushed and closed in every case.
           try (OutputStream out = new FileOutputStream(outputFile)) {
               IOUtils.copy(new StringReader(json), out, Charset.forName("UTF-8"));
           }

           save(provinceList, "0", 0);
           System.out.println("............");
           System.out.println(json);
           System.out.println("............");
       }

       /**
        * Recursively flattens the crawled tree into {@link AdministrativeDivision} records.
        *
        * @param list       nodes of the current level; {@code null} is treated as "no nodes"
        * @param parentCode code of the parent node ({@code "0"} for provinces)
        * @param level      depth of the parent; the nodes in {@code list} get {@code level + 1}
        * @throws InterruptedException if interrupted while waiting to retry a store operation
        */
       private static void save(List<Map<String, Object>> list, String parentCode, int level)
               throws InterruptedException {
           if (level == CLASS_NAMES.length + 1) {
               return;
           }
           level += 1;
           if (list == null) {
               return;
           }

           for (Map<String, Object> map : list) {
               AdministrativeDivision administrativeDivision = dozerBeanMapper.map(map, AdministrativeDivision.class);
               administrativeDivision.setParentCode(parentCode);
               administrativeDivision.setLevel(level);
               persist(administrativeDivision);

               save(childrenOf(map), administrativeDivision.getCode(), level);
           }
       }

       /**
        * Stores one record, retrying after a minute while the store reports
        * {@code "connection holder is null"}. Any other failure is reported and the record skipped.
        */
       private static void persist(AdministrativeDivision administrativeDivision) throws InterruptedException {
           while (true) {
               try {
                   administrativeDivisionService.add(administrativeDivision);
                   return;
               } catch (Exception e) {
                   if ("connection holder is null".equals(e.getMessage())) {
                       TimeUnit.MINUTES.sleep(1);
                       continue;
                   }
                   // Previously swallowed silently; keep going but make the loss visible.
                   System.err.println("Failed to save " + administrativeDivision.getCode() + ": " + e);
                   return;
               }
           }
       }

       /**
        * Crawls the page at {@code url} and stores its rows under the {@code "child"} key of
        * {@code map} (plus the page depth under {@code "level"}), recursing into linked sub-pages.
        *
        * @param map   node that receives the {@code "level"} and {@code "child"} entries
        * @param url   absolute URL of the page listing the children of {@code map}
        * @param level index into {@link #CLASS_NAMES}; nothing happens once it reaches the end
        */
       private static void getChild(Map<String, Object> map, String url, int level) {
           if (level == CLASS_NAMES.length) {
               return;
           }
           System.out.println(url);

           Document doc = fetchDocument(url);

           List<Map<String, Object>> childList = new LinkedList<>();
           Elements rows = doc.select(CLASS_NAMES[level]);
           // Links on the page are relative to the page's own directory.
           String baseUrl = url.substring(0, url.lastIndexOf("/") + 1);
           for (Element row : rows) {
               Elements cells = row.select("td a");
               // Rows without links (e.g. "市辖区") have no sub-page; read the plain cells instead.
               boolean hasChildPage = !cells.isEmpty();
               if (!hasChildPage) {
                   cells = row.select("td");
               }
               if (cells.isEmpty()) {
                   // Malformed row without any cell: nothing to read.
                   continue;
               }

               Element codeCell = cells.first();
               Map<String, Object> childMap = new LinkedHashMap<>();
               childMap.put("code", codeCell.text());
               childMap.put("name", cells.last().text());

               if (hasChildPage) {
                   getChild(childMap, baseUrl + codeCell.attr("href"), level + 1);
               }

               childList.add(childMap);
           }
           map.put("level", level);
           map.put("child", childList);
       }

       /**
        * Downloads and parses one page, retrying with a growing pause until a successful response
        * arrives. Every response is closed, including unsuccessful ones.
        *
        * @throws IllegalStateException if the thread is interrupted while waiting to retry
        */
       private static Document fetchDocument(String url) {
           Request request = new Request.Builder().url(url).build();
           int attempt = 0;
           while (true) {
               try (Response response = HTTP_CLIENT.newCall(request).execute()) {
                   if (response.isSuccessful()) {
                       byte[] bodyBytes = response.body().bytes();
                       String bodyText = new String(bodyBytes, CHARSET_NAME_GB2312);
                       // U+FFFD marks bytes GB2312 could not decode; decode again as GBK.
                       if (bodyText.contains("\uFFFD")) {
                           bodyText = new String(bodyBytes, CHARSET_NAME_GBK);
                       }
                       return Jsoup.parse(bodyText);
                   }
                   System.out.println("HTTP " + response.code() + ": " + url);
               } catch (IOException e) {
                   System.out.println(e.getMessage());
               }

               try {
                   sleepBeforeRetry(attempt++);
               } catch (InterruptedException e) {
                   Thread.currentThread().interrupt();
                   throw new IllegalStateException("Interrupted while retrying " + url, e);
               }
           }
       }

       /** Sleeps 500 ms, 1 s, 2 s, ... capped at {@link #RETRY_MAX_DELAY_MILLIS}. */
       private static void sleepBeforeRetry(int attempt) throws InterruptedException {
           long delay = Math.min(RETRY_MAX_DELAY_MILLIS, RETRY_BASE_DELAY_MILLIS << Math.min(attempt, 6));
           TimeUnit.MILLISECONDS.sleep(delay);
       }

       /** Returns the list stored under {@code "child"}, or {@code null} when the node has none. */
       @SuppressWarnings("unchecked") // "child" is only ever written by getChild with this type
       private static List<Map<String, Object>> childrenOf(Map<String, Object> node) {
           return (List<Map<String, Object>>) node.get("child");
       }

       /**
        * One administrative division (province, city, county, ...).
        *
        * <p>Declared {@code static} so that it can be instantiated reflectively through its no-arg
        * constructor; a non-static inner class would need an enclosing instance. It may equally be
        * moved to its own file.
        *
        * @author: yabo
        * @date: 2019/4/7 18:13
        */
       @lombok.Getter
       @lombok.Setter
       @lombok.AllArgsConstructor
       @lombok.NoArgsConstructor
       public static class AdministrativeDivision {
           private String name;
           private String code;
           private String parentCode;
           private int level;
       }
   }

```
2.2 Github上面行政区数据

中国??

共同点:都不兼容低版本IE
对比:GitHub上vue的stars数量大约是angular的两倍

二、抓取快递公司

1. 到快递100网站抓取

快递100 提供快递物流信息的查询。

2. 代码实现

2.1 核心代码
java:
 /**
  * Synchronizes the courier-company catalogue from kuaidi100.
  * <ul>
  *   <li>Company list: https://www.kuaidi100.com/all/</li>
  *   <li>Company detail page, e.g.: https://www.kuaidi100.com/all/yt.shtml</li>
  * </ul>
  * The list page is parsed for company names and detail-page links, the detail pages are
  * downloaded (in parallel on {@code executorService}), each page is parsed into the entity,
  * and finally the table is truncated and the fresh entities are saved in one batch.
  * <p>
  * jsoup {@code Elements} supports a CSS/jQuery-like selector syntax:
  * https://www.open-open.com/jsoup/selector-syntax.htm
  * <p>
  * Waiting for all pool tasks to finish (ExecutorService + CountDownLatch):
  * https://blog.csdn.net/q258523454/article/details/81978855
  *
  * @return number of companies saved
  * @see InfAreaService#getLatestAreasFromHttp()
  */
 public synchronized int getLatestExpressFromHttp() {
    // The enclosing class is expected to declare the pool and the thread-id set, e.g.:
    //private static ExecutorService executorService = Executors.newCachedThreadPool();
    //private static Set<Long> threadIdSet = new HashSet<>();
    // A cached pool starts as many threads as needed; Executors.newFixedThreadPool(10)
    // would cap the number of threads at 10.

    // Download the detail pages in parallel to cut the total time.
    boolean isStartThread = true;
    synchronized (threadIdSet) {
        threadIdSet.clear();
    }

    // Fetch the list page.
    //String httpResult = new RestTemplate().getForEntity("https://www.kuaidi100.com/all/", String.class).getBody();
    String httpResult = HttpClientSimpleUtil.getInstance().doGetRequest("https://www.kuaidi100.com/all/");

    Document doc = Jsoup.parse(httpResult);
    Elements columnList = doc.select(".column-list");

    // 1: collect every company name together with its detail-page URL.
    // Until the detail page is parsed, officialWebsite temporarily holds the detail-page URL.
    List<InfExpressEntity> expressEntityList = new ArrayList<>();
    for (int i = 0; i < columnList.size(); ++i) {
        Elements elementsDdA = columnList.get(i).select("a");
        for (int j = 0; j < elementsDdA.size(); ++j) {
            String expressName = elementsDdA.get(j).text().trim();
            String expressHref = String.valueOf(elementsDdA.get(j).attr("href")).replace(" ", "");
            expressEntityList.add(new InfExpressEntity(expressName, expressHref));
        }
    }

    // 1-2: prefetch the detail pages (roughly 600 of them) on the pool.
    // The HTML is kept in a side array so that it never ends up in an entity field.
    final String[] detailPages = new String[expressEntityList.size()];

    if (isStartThread) {
        final CountDownLatch countDownLatch = new CountDownLatch(expressEntityList.size());

        for (int i = 0; i < expressEntityList.size(); ++i) {
            // Lambdas may only capture effectively-final locals, so copy the loop state first.
            // Capturing the URL here also shields the worker from the later overwrite of
            // officialWebsite in step 2.
            final int index = i;
            final String detailUrl = expressEntityList.get(i).getOfficialWebsite();
            executorService.execute(() -> {
                try {
                    logger.debug("线程"
                            + Thread.currentThread().getId()
                            + "开始出发" + "发送请求:" + detailUrl);
                    // threadIdSet is shared between pool threads; guard every access.
                    synchronized (threadIdSet) {
                        threadIdSet.add(Thread.currentThread().getId());
                    }
                    detailPages[index] = HttpClientSimpleUtil.getInstance().doGetRequest(detailUrl);
                    logger.debug("收到结果......");
                } catch (RuntimeException e) {
                    // Leave the slot empty; step 2 fetches the page again on the calling thread.
                    logger.error("Failed to prefetch " + detailUrl, e);
                } finally {
                    // Must always count down, otherwise await() below only ends by timeout.
                    countDownLatch.countDown();
                }
            });
        }

        try {
            // Wait at most two minutes for the prefetch to finish.
            if (!countDownLatch.await(60 * 2, TimeUnit.SECONDS)) {
                logger.warn("Prefetch timed out with " + countDownLatch.getCount()
                        + " detail pages outstanding; they will be fetched sequentially");
            }
        } catch (InterruptedException e) {
            // Restore the flag so that callers can still observe the interruption.
            Thread.currentThread().interrupt();
            logger.error("Interrupted while waiting for the detail pages", e);
        } finally {
            synchronized (threadIdSet) {
                logger.info("所有线程都已经执行完毕...., 一共开启线程数量:" + threadIdSet.size() + "idValues:" + threadIdSet);
            }
        }
    }

    // 2: parse the detail pages.
    for (int i = 0; i < expressEntityList.size(); ++i) {
        InfExpressEntity expressEntity = expressEntityList.get(i);

        String httpResult2 = detailPages[i];
        if (httpResult2 == null) {
            // Not prefetched (threading disabled, timeout or failed worker): fetch it now.
            httpResult2 = HttpClientSimpleUtil.getInstance().doGetRequest(expressEntity.getOfficialWebsite());
        }

        Document doc2 = Jsoup.parse(httpResult2);
        // Customer-service phone number.
        expressEntity.setTel(doc2.select("#allcompanytel").text());
        // Official web site; replaces the detail-page URL stored in step 1.
        expressEntity.setOfficialWebsite(doc2.select("#allcompanyurl").attr("href"));
        // Logo; select() never returns null, and attr() yields "" when nothing matches.
        expressEntity.setIcon(doc2.select(".com-logo").select("img").attr("src"));
        // Company code.
        expressEntity.setExpressCode(doc2.select("#companyCode").attr("value").trim());
        // Introduction: first paragraph of the description, when there is one.
        Elements briefParagraphs = doc2.select(".ex-txt").select("p");
        if (briefParagraphs.size() > 0) {
            expressEntity.setBrief(briefParagraphs.get(0).text());
        }

        logger.debug("同步快递信息:" + (i + 1) + "--" + expressEntity.getExpressName());

        LocalDateTime now = LocalDateTime.now();
        expressEntity.setSort(i + 1);
        expressEntity.setCreateTime(now);
        expressEntity.setCreateUser(1L);
        expressEntity.setUpdateTime(now);
        expressEntity.setUpdateUser(1L);
        // Only the commonly used companies are enabled.
        expressEntity.setIsEnable(Arrays.asList(EXPRESS_COMMON).contains(expressEntity.getExpressName()));
        // Pinyin of the company name.
        expressEntity.setNamePinyin(PinyinUtil.getPinYin(expressEntity.getExpressName()));
    }

    logger.info("快递100的快递公司信息同步完成..." + "快递公司条数: " + expressEntityList.size());

    // 3: remove all existing companies.
    infExpressDao.truncateAll();

    // 4: de-duplicate, sort and save everything again.
    // NOTE(review): a HashSet only removes duplicates if InfExpressEntity defines
    // equals/hashCode; with identity equality this step merely reshuffles the list — confirm.
    expressEntityList = Lists.newArrayList(new HashSet<>(expressEntityList));
    logger.info("去重之后的快递公司条数: " + expressEntityList.size());
    expressEntityList.sort(Comparator.comparing(InfExpressEntity::getIsEnable).thenComparing(en -> XaUtils.isNotBlank(en.getOfficialWebsite()) && XaUtils.isNotBlank(en.getTel())));
    infExpressDao.saveBatch(expressEntityList);

    return expressEntityList.size();
 }
2.2 实体类
java:
/**
 * InfExpressEntity.java
 * <p>
 * Courier company. The company code is the same one kuaidi100 uses.
 * <p>
 * Note: fields used for ordering (e.g. {@code sort}, timestamps) must be given a value when
 * the record is saved. A null sort key 1) defeats the index and 2) yields wrong ordering.
 *
 * @author ourslookAdmin
 * @email ab601026460@qq.com
 * @date 2019-04-13 17:08:41
 */
@Getter
@Setter
@NoArgsConstructor
@ApiModel(description = "快递公司 快速编码同快递100")
public class InfExpressEntity implements Serializable {
    private static final long serialVersionUID = 1L;

    /** Primary key (auto-increment id). */
    @ApiModelProperty("主键 自增id")
    private Long id;

    /** Company code, identical to the kuaidi100 code; at most 128 characters. */
    @ApiModelProperty("快递公司编码, 同快递100")
    @Length(max = 128, message = "快递公司编码最长128个字符")
    @JsonDeserialize(using = CustomStringTrimDeserializer.class)
    private String expressCode;

    /** Company name; at most 32 characters. */
    @ApiModelProperty("快递公司名称")
    @Length(max = 32, message = "快递公司名称最长32个字符")
    @JsonDeserialize(using = CustomStringTrimDeserializer.class)
    private String expressName;

    /** Icon; at most 65535 characters. */
    @ApiModelProperty("icon图标")
    @Length(max = 65535, message = "icon图标最长65535个字符")
    @JsonDeserialize(using = CustomStringTrimDeserializer.class)
    private String icon;

    /** Official customer-service phone number; at most 64 characters. */
    @ApiModelProperty("官方客服电话 注意电话格式,如:11185、4001000001、010-52310990、800-810-8000")
    @Length(max = 64, message = "官方客服电话最长64个字符")
    @JsonDeserialize(using = CustomStringTrimDeserializer.class)
    private String tel;

    /** Official web site. */
    @ApiModelProperty("官方网站")
    private String officialWebsite;

    /** Short introduction of the company. */
    @ApiModelProperty("快递公司简介")
    private String brief;

    /** Sort key. */
    @ApiModelProperty("排序 默认0,倒序")
    private Integer sort;

    /** Whether the company is enabled. */
    @ApiModelProperty("是否启用 1:启用 0:禁用;删除就物理删除,默认0")
    private Boolean isEnable;

    /** Id of the creating user. */
    @ApiModelProperty("创建人")
    private Long createUser;

    /** Creation time. */
    @ApiModelProperty("创建时间")
    @JsonFormat(timezone = "Asia/Shanghai", pattern = "yyyy-MM-dd HH:mm:ss")
    private LocalDateTime createTime;

    /** Id of the last updating user. */
    @ApiModelProperty("更新人")
    private Long updateUser;

    /** Last update time. */
    @ApiModelProperty("更新时间")
    @JsonFormat(timezone = "Asia/Shanghai", pattern = "yyyy-MM-dd HH:mm:ss")
    private LocalDateTime updateTime;

    /**
     * Creates an entity with only its name and web site set; every other field stays null.
     *
     * @param expressName     company name
     * @param officialWebsite value stored in {@code officialWebsite}
     */
    public InfExpressEntity(String expressName, String officialWebsite) {
        setExpressName(expressName);
        setOfficialWebsite(officialWebsite);
    }
}
  • 2
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 3
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Dazer007

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值