spring boot+jsoup解析器拉取网页数据

1.概述

     因为项目是从无到有的开发所以很多数据都需要自己想办法、如项目需要全国地址信息而数据没有就需要自己去网站拉去了

2.拉取中华人民共和国民政部发布的最新全国各地区域信息

     网站地址:http://www.mca.gov.cn/article/sj/xzqh/2020/

 找到对应的数据标签

3.创建数据库表存储地区信息

CREATE TABLE [dbo].[S_SYSAREA] (
  [id] int  IDENTITY(1,1) NOT NULL,
  [area_code] nvarchar(50) COLLATE Chinese_PRC_CI_AS  NULL,
  [area_name] nvarchar(50) COLLATE Chinese_PRC_CI_AS  NULL,
  [area_level] int  NULL,
  [province_code] nvarchar(50) COLLATE Chinese_PRC_CI_AS  NULL,
  [city_code] nvarchar(50) COLLATE Chinese_PRC_CI_AS  NULL,
  [creator_name] nvarchar(50) COLLATE Chinese_PRC_CI_AS  NULL,
  [modifier_name] nvarchar(50) COLLATE Chinese_PRC_CI_AS  NULL,
  [gmt_created] datetime2(7)  NULL,
  [gmt_modified] datetime2(7)  NULL
)
GO

ALTER TABLE [dbo].[S_SYSAREA] SET (LOCK_ESCALATION = TABLE)
GO

EXEC sp_addextendedproperty
'MS_Description', N'主键ID',
'SCHEMA', N'dbo',
'TABLE', N'S_SYSAREA',
'COLUMN', N'id'
GO

EXEC sp_addextendedproperty
'MS_Description', N'地区编码',
'SCHEMA', N'dbo',
'TABLE', N'S_SYSAREA',
'COLUMN', N'area_code'
GO

EXEC sp_addextendedproperty
'MS_Description', N'地区名称',
'SCHEMA', N'dbo',
'TABLE', N'S_SYSAREA',
'COLUMN', N'area_name'
GO

EXEC sp_addextendedproperty
'MS_Description', N'地区级别(1、省,2、市,3、区/县)',
'SCHEMA', N'dbo',
'TABLE', N'S_SYSAREA',
'COLUMN', N'area_level'
GO

EXEC sp_addextendedproperty
'MS_Description', N'省编码',
'SCHEMA', N'dbo',
'TABLE', N'S_SYSAREA',
'COLUMN', N'province_code'
GO

EXEC sp_addextendedproperty
'MS_Description', N'市编码',
'SCHEMA', N'dbo',
'TABLE', N'S_SYSAREA',
'COLUMN', N'city_code'
GO

EXEC sp_addextendedproperty
'MS_Description', N'创建人',
'SCHEMA', N'dbo',
'TABLE', N'S_SYSAREA',
'COLUMN', N'creator_name'
GO

EXEC sp_addextendedproperty
'MS_Description', N'修改人',
'SCHEMA', N'dbo',
'TABLE', N'S_SYSAREA',
'COLUMN', N'modifier_name'
GO

EXEC sp_addextendedproperty
'MS_Description', N'创建日期',
'SCHEMA', N'dbo',
'TABLE', N'S_SYSAREA',
'COLUMN', N'gmt_created'
GO

EXEC sp_addextendedproperty
'MS_Description', N'主键ID',
'SCHEMA', N'dbo',
'TABLE', N'S_SYSAREA',
'COLUMN', N'gmt_modified'
GO

4.项目pom.xml拉取maven依赖

        <!--jsoup-->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.2</version>
        </dependency>

5.准备实体接收来取到的数据存入数据库

 (一)项目中创建实体Area.java类


/**
 * 实体
 * 区,县
 * */
public class Area {
    private String code;
    private String name;

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

}

 (二)项目中创建实体City .java类

/**
 * 实体
 * 市
 * */
import java.util.List;

public class City {
    private String code;
    private String name;
    private List<Area> areaList;

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public List<Area> getAreaList() {
        return areaList;
    }

    public void setAreaList(List<Area> areaList) {
        this.areaList = areaList;
    }
}

 (三)项目中创建实体Province .java类

/**
 * 实体
 * 省份
 * */
import java.util.List;

public class Province {

    private String code;
    private String name;
    private List<City> cityList;

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public List<City> getCityList() {
        return cityList;
    }

    public void setCityList(List<City> cityList) {
        this.cityList = cityList;
    }
}

6.创建数据库表实体

/**
 * <p>
 * 数据库地区信息实体
 * </p>
 *
 * @author zhangpeng
 * @since 2021-01-06
 */
import com.baomidou.mybatisplus.annotation.IdType;
import java.io.Serializable;
import java.util.Date;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import com.baomidou.mybatisplus.annotation.TableField;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.Accessors;

@Data
@EqualsAndHashCode(callSuper = false)
@Accessors(chain = true)
@TableName("S_SYSAREA")
public class SSysarea implements Serializable {

    private static final long serialVersionUID = 1L;

    /**
     * 主键ID
     */
    @TableId(value = "id", type = IdType.AUTO)
    private Integer id;

    /**
     * 地区编码
     */
    @TableField("area_code")
    private String areaCode;

    /**
     * 地区名称
     */
    @TableField("area_name")
    private String areaName;

    /**
     * 地区级别(1、省,2、市,3、区/县)
     */
    @TableField("area_level")
    private Integer areaLevel;

    /**
     * 省编码
     */
    @TableField("province_code")
    private String provinceCode;

    /**
     * 市是编码
     */
    @TableField("city_code")
    private String cityCode;

    /**
     * 创建人
     */
    private String creatorName = "zhangpeng";

    /**
     * 修改人
     */
    private String modifierName;

    /**
     * 创建日期
     */
    private Date gmtCreated=new Date();

    /**
     * 修改日期
     */
    private Date gmtModified=new Date();



}

7.创建mapper数据批量插入接口

/**
 * <p>
 *  Mapper 接口
 *  批量新增
 * </p>
 *
 * @author zhangpeng
 * @since 2021-01-06
 */
import org.apache.ibatis.annotations.InsertProvider;
import org.springframework.data.repository.query.Param;
import org.springframework.stereotype.Repository;
import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import java.util.List;

@Repository
public interface SSysareaMapper extends BaseMapper<SSysarea> {

    @InsertProvider(type = addAddress.class,method = "addProvince")
    int addProvince(@Param("list") List<SSysarea> list);

}

8.编写sql语句存入数据

/**
 * 动态sql
 * */
import org.springframework.data.repository.query.Param;
import java.util.List;

public class addAddress {

    public String  addProvince(@Param("list") List<SSysarea> list){
        StringBuffer sb = new StringBuffer(128);
        sb.append("insert into S_SYSAREA(area_code,area_name,area_level,province_code,city_code,creator_name,modifier_name,gmt_created,gmt_modified) values");

        for (int i = 0; i < list.size(); i++) {
            sb.append("(#{list[").append(i).append("].areaCode}")
                    .append(",#{list[").append(i).append("].areaName}")
                    .append(",#{list[").append(i).append("].areaLevel}")
                    .append(",#{list[").append(i).append("].provinceCode}")
                    .append(",#{list[").append(i).append("].cityCode}")
                    .append(",#{list[").append(i).append("].creatorName}")
                    .append(",#{list[").append(i).append("].modifierName}")
                    .append(",#{list[").append(i).append("].gmtCreated}")
                    .append(",#{list[").append(i).append("].gmtModified})");
            if (i < list.size() - 1) {
                sb.append(",");
            }
        }

        return sb.toString();
    }

}

9.在测试类建测试拉取数据

	
import com.szylt.projects.project.address.Area;
import com.szylt.projects.project.address.City;
import com.szylt.projects.project.address.Province;
import com.szylt.projects.project.entity.SSysarea;
import com.szylt.projects.project.mapper.SSysareaMapper;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringRunner;



    /**
     * 测试
     */
    @Test
    public void test() throws IOException {
        String url = "http://preview.www.mca.gov.cn/article/sj/xzqh/2020/2020/202101041104.html";
        Document document = Jsoup.connect(url).timeout(100000).maxBodySize(0).get();
        Elements element = document.getElementsByClass("xl7032423");
        Elements element2 = document.getElementsByClass("xl7132423");
        //System.out.println(element.html());
        List<String> stringList = new ArrayList<>();
        List<String> stringName = new ArrayList<>();
        List<String> stringCode = new ArrayList<>();
        for (Element el :element){
            String sa =el.getElementsByClass("xl7032423").eq(0).text();
            //System.out.println(sa);
            stringList.add(sa);
        }
        for (Element el :element2){
            String sa =el.getElementsByClass("xl7132423").eq(0).text();
            //System.out.println(sa);
            stringList.add(sa);
        }
        for (int i = 0; i < stringList.size(); i++) {
            if (i % 2 == 0) {
                //地区代码
                stringCode.add(stringList.get(i));
            } else {
                //地区名字
                stringName.add(stringList.get(i));
            }
        }

        System.out.println("stringName  size= " + stringName.size() + "   stringCode   size= " + stringCode.size());
        if (stringName.size() != stringCode.size()) {
            throw new RuntimeException("数据错误");
        }
        
        List<SSysarea> list = new ArrayList<>();
        SSysarea sysarea = null;
        List<Province> provinceList = processData(stringName, stringCode);
        
        for (Province p:provinceList) {
            //System.out.println(p.getCode() + "\t" + p.getName());
            sysarea = new SSysarea();
            sysarea.setAreaCode(p.getCode());
            sysarea.setAreaName(p.getName());
            sysarea.setAreaLevel(1);
            list.add(sysarea);
            for (City c:p.getCityList()) {
                //System.out.println(c.getCode() + "\t" + c.getName());
                sysarea = new SSysarea();
                sysarea.setAreaCode(c.getCode());
                sysarea.setAreaName(c.getName());
                sysarea.setProvinceCode(p.getCode());
                sysarea.setAreaLevel(2);
                list.add(sysarea);
                for (Area a:c.getAreaList()) {
                    //System.out.println(a.getCode() + "\t" + a.getName());
                    sysarea = new SSysarea();
                    sysarea.setAreaCode(a.getCode());
                    sysarea.setAreaName(a.getName());
                    sysarea.setProvinceCode(p.getCode());
                    sysarea.setCityCode(c.getCode());
                    sysarea.setAreaLevel(3);
                    list.add(sysarea);
                }

            }

        }

        System.out.println("===================放入数据库的数据=================");
        for (SSysarea s:list) {
            System.out.println(s.getId() + "\t" + s.getAreaCode() + "\t" + s.getAreaName() + "\t" + s.getAreaLevel() + "\t" + s.getProvinceCode() +
                    "\t" + s.getCityCode() + "\t" + s.getCreatorName() + "\t" + s.getGmtCreated() );
        }

        System.out.println(list.size());
        //限制条数
        int pointsDataLimit = 1000;
        Integer size = list.size();
        //判断是否有必要分批
        if(pointsDataLimit<size){
            //分批数
            int part = size/pointsDataLimit;
            for (int i = 0; i < part; i++) {
                //1500条插入一次
                List<SSysarea> listPage = list.subList(0, pointsDataLimit);
                sSysareaMapper.addProvince(listPage);
                //剔除已经插入的
                list.subList(0, pointsDataLimit).clear();
            }
            if (list.size()>0){
                //新增最后剩下的
                sSysareaMapper.addProvince(list);
            }
        }else{
            sSysareaMapper.addProvince(list);
        }

    }


    /**
     * 生成省份列表数据
     * 
     * @param stringName 名称
     * @param stringCode 编号
     * @return
     */
    private static List<Province> processData(List<String> stringName, List<String> stringCode) {
        List<Province> provinceList = new ArrayList<>();
        for (int i = 0; i < stringCode.size(); i++) {
            String provinceName = stringName.get(i);
            String provinceCode = stringCode.get(i);
            if (provinceCode.endsWith("0000")) {
                Province province = new Province();
                province.setCode(provinceCode);
                province.setName(provinceName);
                List<City> cities = new ArrayList<>();
                province.setCityList(cities);
                provinceList.add(province);
                //香港,澳门,台湾,没有市级行政单位划分,城市 地区 和省份保持一致
                if (provinceName.contains("香港") || provinceName.contains("澳门") || provinceName.contains("台湾")) {
                    City city = new City();
                    List<Area> areas = new ArrayList<>();
                    city.setName(provinceName);
                    city.setCode(provinceCode);
                    city.setAreaList(areas);
                    cities.add(city);
                    Area area = new Area();
                    area.setName(provinceName);
                    area.setCode(provinceCode);
                    areas.add(area);
                }
                //直辖市 城市和省份名称一样
                if (provinceName.contains("北京") || provinceName.contains("上海") || provinceName.contains("天津") || provinceName.contains("重庆")) {
                    City city = new City();
                    List<Area> areas = new ArrayList<>();
                    city.setName(provinceName);
                    city.setCode(provinceCode);
                    city.setAreaList(areas);
                    cities.add(city);
                    //县区
                    for (int k = 0; k < stringCode.size(); k++) {
                        String areaName = stringName.get(k);
                        String areaCode = stringCode.get(k);
                        if (!provinceCode.equals(areaCode) && areaCode.startsWith(provinceCode.substring(0, 2))) {
                            Area area = new Area();
                            area.setName(areaName);
                            area.setCode(areaCode);
                            areas.add(area);
                        }
                    }
                }
                for (int j = 0; j < stringCode.size(); j++) {
                    String cityName = stringName.get(j);
                    String cityCode = stringCode.get(j);
                    //遍历获取地级市
                    if (!cityCode.equals(provinceCode) && cityCode.startsWith(provinceCode.substring(0, 2)) && cityCode.endsWith("00")) {
                        City city = new City();
                        List<Area> areas = new ArrayList<Area>();
                        city.setName(cityName);
                        city.setCode(cityCode);
                        city.setAreaList(areas);
                        cities.add(city);
                        //遍历获取县区
                        for (int k = 0; k < stringCode.size(); k++) {
                            String areaName = stringName.get(k);
                            String areaCode = stringCode.get(k);
                            if (!areaCode.equals(cityCode) && areaCode.startsWith(cityCode.substring(0,4))) {
                                Area area = new Area();
                                area.setName(areaName);
                                area.setCode(areaCode);
                                areas.add(area);
                            }
                        }
                    }
                }
            }
        }
        return provinceList;
    }

这些步骤走完全国的地区信息便以被存入到数据库拉-同理也可拉取其他信息

  • 3
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值