项目总结--2(Jsoup的基本使用)

爬取网页中的省市区三级的数据

1.在做省市区的三级联动的过程中,需要爬取对应的省级行政的所有的数据,供前台进行AJAX调用。

由于业务需求,需要让用户选择地址信息,所以想在网上找一份最新的省市区及乡镇的数据,数据来源是下面这个地址:

2021年统计用区划代码和城乡划分代码

1.介绍org.jsoup
jsoup是一个Java的html解析器
2.Maven依赖

<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
      <dependency>
          <groupId>org.jsoup</groupId>
          <artifactId>jsoup</artifactId>
          <version>1.9.2</version>
      </dependency>

3.编写实体类,用于储存数据
RegionEntry.java

package cn.jiangdoc.utils;

import java.util.ArrayList;
import java.util.List;

public class RegionEntry {
    private String code;
    private String name;
    private List<RegionEntry> sub = new ArrayList<>();

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public List<RegionEntry> getSub() {
        return sub;
    }

    public void setSub(List<RegionEntry> sub) {
        this.sub = sub;
    }

    public RegionEntry(String code, String name, List<RegionEntry> sub) {
        this.code = code;
        this.name = name;
        this.sub = sub;
    }

    public RegionEntry() {
    }
}

4.正式开始我们的爬虫数据
AddressData.java

package cn.jiangdoc.utils;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawls the division-code pages below {@link #SITE_URL} with Jsoup (province, city,
 * county and town level) and writes the collected tree to a file as SQL insert statements.
 *
 * <p>Usage: {@code java AddressData [outputFile]}. When no output file is given the
 * historical default path is used.
 *
 * @author jiangdoc
 * @date 2019-3-16
 */
public class AddressData {
    public static String SITE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";

    /** Output file used when no path is passed on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "G:\\log\\city.txt";
    /** Timeout applied to every page request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 30000;
    /** Pause between two provinces, in milliseconds, to go easy on the server. */
    private static final long PROVINCE_PAUSE_MILLIS = 1000L;
    /** Province codes are right-padded with zeros to this length. */
    private static final int PROVINCE_CODE_LENGTH = 6;

    private static List<RegionEntry> regions = new ArrayList<RegionEntry>();

    /**
     * Runs the crawl and writes the generated SQL script.
     *
     * @param args optional; {@code args[0]} overrides the output file path
     */
    public static void main(String[] args) {
        System.out.println("抓取开始:" + new Date());
        getProvince();

        String outputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_OUTPUT_PATH;
        // Encode explicitly as UTF-8: the platform default charset may not be able to
        // represent the Chinese region names and would differ from machine to machine.
        byte[] bytes = buildSql(regions).getBytes(java.nio.charset.StandardCharsets.UTF_8);
        try (FileOutputStream out = new FileOutputStream(new File(outputPath))) {
            out.write(bytes);
            out.flush();
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println("抓取完毕:" + new Date());
    }

    /**
     * Renders the region tree as one insert statement per region, four levels deep
     * (sys_province, sys_city, sys_county, sys_town). Each statement carries the codes of
     * all ancestors, then the region's own code and name, then its level number.
     */
    private static String buildSql(List<RegionEntry> provinces) {
        StringBuilder sql = new StringBuilder();
        for (RegionEntry one : provinces) {
            String oneCode = quote(one.getCode());
            sql.append("insert into sys_province values(null,'").append(oneCode)
                    .append("', '").append(quote(one.getName())).append("', 1 );\r\n");
            for (RegionEntry two : one.getSub()) {
                String twoCode = quote(two.getCode());
                sql.append("insert into sys_city values(null,'").append(oneCode)
                        .append("', '").append(twoCode)
                        .append("','").append(quote(two.getName())).append("', 2);\r\n");
                for (RegionEntry three : two.getSub()) {
                    String threeCode = quote(three.getCode());
                    sql.append("insert into sys_county values(null,'").append(oneCode)
                            .append("', '").append(twoCode)
                            .append("', '").append(threeCode)
                            .append("', '").append(quote(three.getName())).append("', 3 );\r\n");
                    for (RegionEntry four : three.getSub()) {
                        sql.append("insert into sys_town values(null,'").append(oneCode)
                                .append("', '").append(twoCode)
                                .append("', '").append(threeCode)
                                .append("', '").append(quote(four.getCode()))
                                .append("','").append(quote(four.getName())).append("', 4 );\r\n");
                    }
                }
            }
        }
        return sql.toString();
    }

    /**
     * Doubles single quotes so a scraped value cannot terminate the SQL string literal it
     * is embedded in. {@code null} is passed through (and is appended as the text "null").
     */
    private static String quote(String value) {
        return value == null ? null : value.replace("'", "''");
    }

    /** Downloads and parses one page, using the shared request timeout. */
    private static Document fetch(String url) throws IOException {
        return Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get();
    }

    /** Creates an entry with the given code and name and an empty child list. */
    private static RegionEntry newEntry(String code, String name) {
        RegionEntry entry = new RegionEntry();
        entry.setCode(code);
        entry.setName(name);
        return entry;
    }

    /**
     * Picks the code cell and the name cell out of a table row: the two links when the
     * row has them, otherwise the two plain {@code td} cells.
     *
     * @return {@code {codeCell, nameCell}}, or {@code null} when the row has fewer than two cells
     */
    private static Element[] codeAndNameCells(Element row) {
        Elements cells = row.select("a");
        if (cells.size() < 2) {
            cells = row.select("td");
        }
        if (cells.size() < 2) {
            return null;
        }
        return new Element[] {cells.get(0), cells.get(1)};
    }

    /**
     * Derives the province code from a link target: the part of the href before the
     * first dot, right-padded with zeros to {@link #PROVINCE_CODE_LENGTH} characters.
     */
    private static String provinceCode(String href) {
        StringBuilder code = new StringBuilder(href.split("\\.")[0]);
        while (code.length() < PROVINCE_CODE_LENGTH) {
            code.append('0');
        }
        return code.toString();
    }

    /**
     * Reads the province list from {@link #SITE_URL}, crawls every province's subtree and
     * appends the result to {@link #regions}. Stops early if the thread is interrupted.
     */
    private static void getProvince() {
        try {
            Elements links = fetch(SITE_URL).select("tr.provincetr").select("a");
            for (Element link : links) {
                RegionEntry province = newEntry(provinceCode(link.attr("href")), link.text());
                // Absolute form of the link target.
                String absHref = link.attr("abs:href");
                System.out.println(absHref);
                getCity(absHref, province);
                regions.add(province);
                try {
                    Thread.sleep(PROVINCE_PAUSE_MILLIS);
                } catch (InterruptedException interrupted) {
                    // Keep the interrupt visible to the caller and stop crawling.
                    Thread.currentThread().interrupt();
                    return;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Collects the cities listed on a province page.
     *
     * @param url    absolute URL of the province page; nothing is done when empty
     * @param region province entry that receives the cities as children
     */
    private static void getCity(String url, RegionEntry region) {
        if (url == null || url.isEmpty()) {
            return;
        }
        try {
            // Row shape: <tr class='citytr'><td><a href='65/6501.html'>650100000000</a></td>
            //            <td><a href='65/6501.html'>乌鲁木齐市</a></td></tr>
            for (Element row : fetch(url).select("tr.citytr")) {
                Element[] cells = codeAndNameCells(row);
                if (cells == null) {
                    continue;
                }
                String name = cells[1].text();
                // A row named "市辖区" takes the name of its province instead.
                if ("市辖区".equals(name)) {
                    name = region.getName();
                }
                RegionEntry city = newEntry(cells[0].text(), name);
                getArea(cells[0].attr("abs:href"), city);
                region.getSub().add(city);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Collects the counties/districts listed on a city page.
     *
     * @param url    absolute URL of the city page; nothing is done when empty
     * @param region city entry that receives the counties as children
     */
    private static void getArea(String url, RegionEntry region) {
        if (url == null || url.isEmpty()) {
            return;
        }
        try {
            // Row shape: <tr class='countytr'><td><a href='01/130102.html'>130102000000</a></td>
            //            <td><a href='01/130102.html'>长安区</a></td></tr>
            for (Element row : fetch(url).select("tr.countytr")) {
                Element[] cells = codeAndNameCells(row);
                if (cells == null) {
                    continue;
                }
                RegionEntry area = newEntry(cells[0].text(), cells[1].text());
                // Rows without links yield an empty URL and therefore stay leaves.
                getTown(cells[0].attr("abs:href"), area);
                region.getSub().add(area);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Collects the towns/sub-districts listed on a county page.
     *
     * @param url    absolute URL of the county page; nothing is done when empty
     * @param region county entry that receives the towns as children
     */
    private static void getTown(String url, RegionEntry region) {
        if (url == null || url.isEmpty()) {
            return;
        }
        try {
            // Row shape: <tr class='towntr'><td><a href='07/110107001.html'>110107001000</a></td>
            //            <td><a href='07/110107001.html'>八宝山街道办事处</a></td></tr>
            for (Element row : fetch(url).select("tr.towntr")) {
                Element[] cells = codeAndNameCells(row);
                if (cells == null) {
                    continue;
                }
                region.getSub().add(newEntry(cells[0].text(), cells[1].text()));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

需要注意的是,这个程序可能会出现爬取超时的情况,建议把Thread.sleep的时间设置得长一点。

或者在请求时加上下面的请求头设置。另外,爬取完成后可能还需要对数据进行校验;爬下来的数据生成的SQL大约在5~6MB左右,基本数据都是全的。

// Open the connection and attach extra request headers before fetching the page.
Connection connect = Jsoup.connect("url");
// Header name -> header value; Map needs both type arguments to compile.
Map<String, String> header = new HashMap<String, String>();
header.put("Content-type", "application/json");
Connection data = connect.headers(header);
Document document = data.get();

省市区后台接口设置

这种爬取方式需要根据对应数据源的信息进行相关的判断。数据入库之后,还需要提供对应的查询接口。其实接口中ID的传值主要还是在前台进行处理:前台的JS在处理的时候需要把对应的父级ID传进来,后台接到ID后直接进行查询就可以了,整体的业务是非常简单的。

1.Entity实体类设计 

@Data
@Entity
@Table(name = "tb_area")
@NoArgsConstructor
@EntityListeners(AuditingEntityListener.class)
public class Area {

    @Id
    @Column(columnDefinition = "varchar(32) comment '主键 id' ")
    private String id;


    @Column(columnDefinition = "varchar(32) comment '父级id' ")
    private String parentId;

    @Column(columnDefinition = "varchar(32) comment '编码' ")
    private String code;

    @Column(columnDefinition = "varchar(32) comment '名称' ")
    private String name;

    @Column(columnDefinition = "varchar(32) comment '等级' ")
    private String level;

    @Column(columnDefinition = "datetime comment '创建时间' ")
    @CreatedDate
    private LocalDateTime createTime;

    @Column(columnDefinition = "datetime comment '更新时间' ")
    @LastModifiedDate
    private LocalDateTime updateTime;

    @Column(columnDefinition = "tinyint(1) comment '是否删除' ")
    private Boolean delFlag = false;

2.Dao(AreaRepository)数据库持久层设置

/**
 * Repository for {@link Area} rows. All finders only return rows whose delete flag is
 * false, as expressed by the {@code DelFlagIsFalse} suffix of each method name.
 */
public interface AreaRepository extends JpaRepository<Area, String>, JpaSpecificationExecutor<Area> {

    /**
     * @param code area code to match
     * @return the non-deleted areas with the given code
     */
    List<AreaDTO> findAllByCodeAndDelFlagIsFalse(String code);

    /**
     * @param parentId id of the parent area
     * @return the non-deleted areas whose parent is the given area
     */
    List<AreaDTO> findAllByParentIdAndDelFlagIsFalse(String parentId);

    /**
     * @param level administrative level to match
     * @return the non-deleted areas on the given level
     */
    List<AreaDTO> findAllByLevelAndDelFlagIsFalse(String level);
}

3.Service(业务层处理)都是根据对应的DAO进行直接的查询的没有过多的复杂的逻辑

@Service
public class AreaService {

    @Autowired
    private AreaRepository areaRepository;

    @Cacheable(cacheNames = "level:info")
    public List<AreaDTO> findProvinceList(String level) {
        return areaRepository.findAllByLevelAndDelFlagIsFalse(level);
    }


    public List<AreaDTO> findByCodeList(String code) {

        return areaRepository.findAllByCodeAndDelFlagIsFalse(code);
    }


    @Cacheable(cacheNames = "provinceId:info")
    public List<AreaDTO> findCityList(String provinceId) {
        return areaRepository.findAllByParentIdAndDelFlagIsFalse(provinceId);
    }

    @Cacheable(cacheNames = "cityId:info")
    public List<AreaDTO> findOrganyList(String cityId) {
        return areaRepository.findAllByParentIdAndDelFlagIsFalse(cityId);
    }

    @Cacheable(cacheNames = "organId:info")
    public List<AreaDTO> findStreetList(String organId) {
        return areaRepository.findAllByParentIdAndDelFlagIsFalse(organId);
    }

4.Controller (控制层)直接通过不同的API调用进行显示处理就可以了

@Api(tags = "地址相关")
@RestController
@RequestMapping("/area")
public class AreaController {
    @Autowired
    private AreaService areaService;

    @ApiOperation("获取省信息")
    @GetMapping(value = "/province")
    public ObjectResponse<List<AreaDTO>> getProvince(@ApiParam(value = "行政等级") @RequestParam(value = "level", defaultValue = "1", required = false) String level) {
        return ObjectResponse.newResponse(areaService.findProvinceList(level));
    }


    @ApiOperation("获取市信息")
    @GetMapping("/city")
    public ObjectResponse<List<AreaDTO>> getCity(@ApiParam(value = "省级ID")@RequestParam(value = "province_id") String provinceId) {

        return ObjectResponse.newResponse(areaService.findCityList(provinceId));
    }

    @ApiOperation("获取区信息")
    @GetMapping("/organ")
    public ObjectResponse<List<AreaDTO>> getOrgan(@ApiParam(value = "市级ID") @RequestParam(value = "city_id") String cityId) {
        return ObjectResponse.newResponse(areaService.findOrganyList(cityId));
    }

    @ApiOperation("获取街道信息")
    @GetMapping("/street")
    public ObjectResponse<List<AreaDTO>> getStreet(@ApiParam(value = "区级ID") @RequestParam(value = "organ_id") String organId) {
        return ObjectResponse.newResponse(areaService.findStreetList(organId));
    }

如上所示,省市区街道的信息就可以进行联动操作了,在进行操作的时候,其实最核心的地方还是需要将对应的地址抓取下来,只有地址有了才可以确定唯一的数据可以通过程序去拉取到。

这里需要注意一个地方:统计局的数据中会有一个“市辖区”的三级地址的概念,其实这个地址不是真实存在的,仅仅是为了统计方便而使用的。比如图中黑色的这个市辖区,它下面没有下属街道,但是为了编码方便,还是需要对它进行统计操作。

常见模块--分页查;查详情;保存;接口

1.管理系统中,通常常见的模块,就是分页查询,和查询该条记录的明细,修改信息,保存信息;下面分别从上至底进行分析;

Controller层(链路请求分析)

保存/修改可以共用一个接口。在进行操作时,可以先通过findById进行查询:如果当前记录不存在,则直接进行保存;如果存在,则进行修改操作。具体如下:

---->Controller层的保存/修改都走这条URI

/**
 * Creates or updates a hospital from the validated request body; both operations share
 * this POST endpoint and are handed to {@code hospitalService.save}.
 *
 * @param hospitalInputDTO hospital data sent by the client
 * @return a message-only response
 */
@ApiOperation("医院保存/修改")
    @PostMapping()
    public ObjectResponse<Void> save(@Valid @RequestBody HospitalInputDTO hospitalInputDTO) {
        hospitalService.save(hospitalInputDTO);

        ObjectResponse<Void> saved = ObjectResponse.messageResponse("保存成功");
        return saved;
    }


---->Service层的保存/修改请求的目标路径
 /**
  * Saves a hospital. Without an id in the input a new hospital with a generated UUID is
  * created; with an id the existing hospital is loaded and overwritten.
  *
  * <p>The registration fee is set to "0" and the delete flag to false on every call,
  * for updates as well as for new hospitals.
  *
  * @param inputDTO values to store
  * @throws BaseException if an id is given but no hospital with that id exists
  */
 public void save(HospitalInputDTO inputDTO) {
        final Hospital target;
        if (!StringUtils.isNotEmpty(inputDTO.getId())) {
            // No id supplied: this is a new record, so generate its key here.
            target = new Hospital();
            target.setId(RandomGenerator.buildUUID());
        } else {
            target = hospitalRepository
                    .findById(inputDTO.getId())
                    .orElseThrow(() -> new BaseException("医院不存在!"));
        }

        // Copy the editable fields from the request.
        target.setName(inputDTO.getName());
        target.setBigLogo(inputDTO.getBigLogo());
        target.setSmallLogo(inputDTO.getSmallLogo());
        target.setLevel(inputDTO.getLevel());
        target.setSampleFee(inputDTO.getSampleFee());

        // Fixed values: registration fee defaults to 0, record is marked as not deleted.
        target.setRegFee("0");
        target.setDelFlag(Boolean.FALSE);

        hospitalRepository.save(target);
    }
执行修改操作时,前端会在DTO中带上对应记录的ID以及需要修改的字段,后台先根据ID查出原记录,再用传入的字段回填后保存。执行新增操作时,由于前端不会传入ID,所以需要在后台新生成一个ID;也就是说,表中的ID不是自增的,而是由程序生成的。

分页查询/查询某条明细记录:分页查询需要结合对应的查询条件进行查询,查询某条明细记录则直接根据ID进行查询,所以整体的操作都是非常简单的。

---->Controller 层的分页查询
/**
 * Returns one page of hospitals, optionally filtered by name.
 *
 * @param session current session, passed through to the service
 * @param name    optional hospital name filter
 * @param page    page number as sent by the client, defaults to 1
 * @param size    page size, defaults to 20
 * @return the page content together with the total count (narrowed to int), page and size
 */
@ApiOperation("医院列表")
    @GetMapping("/page")
    public PageResponse<HospitalOutputDTO> page(@ApiIgnore Session session,
                                                @ApiParam(value = "医院名称") @RequestParam(value = "name", required = false) String name,
                                                @RequestParam(value = "page", defaultValue = "1", required = false) int page,
                                                @RequestParam(value = "size", defaultValue = "20", required = false) int size) {
        Page<HospitalOutputDTO> result = hospitalService.page(session, name, page, size);
        int total = (int) result.getTotalElements();
        return PageResponse.newResponse(result.getContent(), total, page, size);
    }

--->Controller 层的查询某条记录明细

 /**
  * Returns the details of one hospital.
  *
  * @param id primary key of the hospital
  * @return the hospital wrapped in an object response
  */
 @ApiOperation("医院详情")
    @GetMapping("/detail")
    public ObjectResponse<HospitalDTO> detail(@ApiParam("医院主键id") @RequestParam(value = "id") String id) {
        return ObjectResponse.newResponse(hospitalService.detail(id));
    }

---->Service 层分页查询,有涉及到动态条件查询,直接的复制就可以了
/**
 * Loads one page of non-deleted hospitals, newest update first, optionally filtered
 * by name, and converts each row to its output DTO.
 *
 * @param session current session (not read by this method)
 * @param name    optional name filter
 * @param page    1-based page number
 * @param size    page size
 * @return a page of output DTOs carrying the total element count of the query
 */
public Page<HospitalOutputDTO> page(Session session, String name, int page, int size) {
        HospitalDTO criteria = new HospitalDTO();
        criteria.setName(name);
        Specification<Hospital> filter = querySpecification(criteria);

        // The repository expects a 0-based page index.
        Sort newestFirst = Sort.by(Sort.Direction.DESC, "updateTime");
        Pageable pageable = PageRequest.of(page - 1, size, newestFirst);

        Page<Hospital> hospitals = hospitalRepository.findAll(filter, pageable);
        List<HospitalOutputDTO> rows = new ArrayList<>();
        for (Hospital hospital : hospitals) {
            rows.add(new HospitalOutputDTO(hospital));
        }
        return new PageImpl<>(rows, pageable, hospitals.getTotalElements());
    }

    /**
     * Builds the filter for the hospital list: an optional "name contains" condition
     * combined with the mandatory "not deleted" condition.
     *
     * @param query carries the optional name to search for
     * @return a specification that ANDs all applicable conditions
     */
    private Specification<Hospital> querySpecification(HospitalDTO query) {
        return (root, criteriaQuery, criteriaBuilder) -> {
            List<Predicate> conditions = new ArrayList<>();

            boolean hasName = StringUtils.isNotEmpty(query.getName());
            if (hasName) {
                String pattern = "%" + query.getName() + "%";
                Predicate nameContains = criteriaBuilder.like(root.get("name").as(String.class), pattern);
                conditions.add(nameContains);
            }

            Predicate notDeleted = criteriaBuilder.equal(root.get("delFlag").as(Boolean.class), Boolean.FALSE);
            conditions.add(notDeleted);

            Predicate[] all = conditions.toArray(new Predicate[0]);
            return criteriaBuilder.and(all);
        };
    }

---->Service层的查询某条记录明细,直接进行某条记录的findById操作就可以了

/**
 * Loads one hospital by primary key and converts it to its DTO.
 *
 * @param id primary key of the hospital
 * @return the hospital as a DTO
 * @throws BaseException if no hospital with the given id exists
 */
public HospitalDTO detail(String id) {
        Hospital found = hospitalRepository
                .findById(id)
                .orElseThrow(() -> new BaseException("医院不存在!"));
        HospitalDTO detail = new HospitalDTO(found);
        return detail;
    }

资源地址如下图所示:https://download.csdn.net/download/zgz102928/85236645

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值