使用jsoup写java爬虫,爬取全国地区及编码数据

本程序参考了这篇博客 http://www.cnblogs.com/sanmubird/p/7857474.html 编写,实测可以运行;只需要在其基础上修改一下 MyCrawler 的代码即可。

package com.lenovo.crawl.main;

import com.lenovo.crawl.entity.Region;
import com.lenovo.crawl.link.LinkFilter;
import com.lenovo.crawl.link.Links;
import com.lenovo.crawl.page.Page;
import com.lenovo.crawl.page.PageParserTool;
import com.lenovo.crawl.page.RequestAndResponseTool;
import com.lenovo.crawl.util.DBCPUtils;
import com.lenovo.crawl.util.FileTool;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.BeanHandler;
import org.apache.commons.dbutils.handlers.BeanListHandler;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.sql.Connection;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawls the 2018 administrative-division listing on stats.gov.cn and appends every region
 * code and name it finds to a local text file. Also contains helpers that load such a file
 * into the {@code region} table and back-fill the parent columns of the inserted rows.
 *
 * <p>All text is written and read as UTF-8 so the result does not depend on the platform
 * default charset. A file produced by an earlier run under a different default charset has
 * to be converted before it is passed to {@link #readFileByLines(String)}.
 */
public class Crawler {
    /** Matches an href that begins with at least one digit. */
    static final Pattern pattern = Pattern.compile("^[0-9]+.*");
    /** Matches text made up only of digits (the empty string included). */
    static final Pattern pattern_ = Pattern.compile("^[0-9]*");

    /** Directory that receives the crawl output. */
    private static final String OUTPUT_DIR = "D:\\ja\\tempIn";
    /** Name of the text file, inside {@link #OUTPUT_DIR}, that the crawl appends to. */
    private static final String OUTPUT_FILE = "result.txt";
    /** Only discovered links that start with this prefix are queued for crawling. */
    private static final String URL_PREFIX = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018";
    /** Charset shared by the writer in the crawl and the reader in the import. */
    private static final Charset FILE_CHARSET = Charset.forName("UTF-8");
    private static final byte[] LINE_BREAK = "\r\n".getBytes(FILE_CHARSET);
    /** Indentation written in front of village-level codes and names. */
    private static final String VILLAGE_INDENT = "                ";

    /** Accepts only URLs under {@link #URL_PREFIX}. */
    private static final LinkFilter SITE_FILTER = new LinkFilter() {
        public boolean accept(String url) {
            return url != null && url.startsWith(URL_PREFIX);
        }
    };

    /**
     * Puts every seed URL on the shared queue of unvisited URLs.
     *
     * @param seeds seed URLs; {@code null} is treated as "no seeds"
     */
    private void initCrawlerWithSeeds(String[] seeds) {
        if (seeds == null) {
            return;
        }
        for (String seed : seeds) {
            Links.addUnvisitedUrlQueue(seed);
        }
    }

    /**
     * Crawls the given seed pages and every child page reachable from them, appending the
     * codes and names found to {@code result.txt} in the output directory.
     *
     * <p>The output file is opened once, in append mode, and shared by the whole crawl.
     * A failure while handling one page is reported and the crawl moves on to the next
     * queued URL.
     *
     * @param seeds seed URLs to start from
     */
    public void crawling(String[] seeds) {
        File outputDir = new File(OUTPUT_DIR);
        if (!outputDir.exists()) {
            outputDir.mkdirs();
        }
        // FileOutputStream creates the file itself when it does not exist yet.
        File outputFile = new File(outputDir, OUTPUT_FILE);

        FileOutputStream fos = null;
        try {
            fos = new FileOutputStream(outputFile, true);
            crawlQueue(seeds, fos);
        } catch (IOException e) {
            System.out.println("--------IO异常----------");
            e.printStackTrace();
        } finally {
            if (fos != null) {
                try {
                    fos.close();
                } catch (IOException e) {
                    System.out.println("------被要关闭的文件不存在-------");
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Queues the seeds and then visits URLs until the shared unvisited queue is empty.
     * Called recursively for every child page discovered while a page is being handled.
     *
     * @param seeds URLs to queue before the loop starts
     * @param out   open stream of the output file
     */
    private void crawlQueue(String[] seeds, FileOutputStream out) {
        initCrawlerWithSeeds(seeds);
        while (!Links.unVisitedUrlQueueIsEmpty()) {
            String visitUrl = (String) Links.removeHeadOfUnVisitedUrlQueue();
            if (visitUrl == null) {
                continue;
            }
            try {
                visitPage(visitUrl, out);
            } catch (Exception e) {
                // One broken page must not end the crawl; report it and keep going.
                System.out.println("--------IO异常---------- " + visitUrl);
                e.printStackTrace();
            }
        }
    }

    /**
     * Fetches one page, writes the region rows and child-link labels it contains, recurses
     * into the child pages, then saves the page and records it as visited.
     *
     * @param visitUrl URL of the page to handle
     * @param out      open stream of the output file
     * @throws Exception if fetching, parsing or writing fails
     */
    private void visitPage(String visitUrl, FileOutputStream out) throws Exception {
        Page page = RequestAndResponseTool.sendRequstAndGetResponse(visitUrl);
        if (page == null) {
            // NOTE(review): assumes the fetch helper may return null on failure - confirm.
            throw new IllegalStateException("No page returned for " + visitUrl);
        }
        Elements anchors = PageParserTool.select(page, "a");
        Elements rows = PageParserTool.select(page, "tr");

        for (Element row : rows) {
            writeRow(row, out);
        }

        // Child hrefs are relative, so they are resolved against the current page's folder.
        String baseUrl = visitUrl.substring(0, visitUrl.lastIndexOf("/") + 1);
        for (Element anchor : anchors) {
            String href = anchor.attr("href");
            if (!pattern.matcher(href).matches()) {
                continue;
            }
            String html = anchor.html();
            String label = html.replaceAll("<br>", "");
            System.out.println(label);
            writeLine(out, label + " ");
            // An all-digit label is the code link of a row; only the other link is followed
            // so that each child page is requested once per row.
            if (!pattern_.matcher(html).matches()) {
                crawlQueue(new String[]{baseUrl + href}, out);
            }
        }

        FileTool.saveToLocal(page);
        Links.addVisitedUrlSet(visitUrl);

        Set<String> links = PageParserTool.getLinks(page, "img");
        for (String link : links) {
            if (SITE_FILTER.accept(link)) {
                Links.addUnvisitedUrlQueue(link);
            }
        }
    }

    /**
     * Writes the code and name held by one table row, indented according to the row's
     * class. Rows with an unknown class or too few cells are ignored.
     *
     * @param row a {@code tr} element of the page
     * @param out open stream of the output file
     * @throws IOException if writing fails
     */
    private static void writeRow(Element row, FileOutputStream out) throws IOException {
        String rowClass = row.attr("class");
        Elements cells = row.children();

        if ("provincetr".equals(rowClass)) {
            writeProvinceCells(cells, out);
        } else if ("villagetr".equals(rowClass)) {
            // Village rows carry three cells; the first is the code and the third the name.
            if (cells.size() == 3) {
                String code = cells.get(0).text();
                String name = cells.get(2).text();
                System.out.println(code);
                System.out.println(name);
                writeLine(out, VILLAGE_INDENT + code + " ");
                writeLine(out, VILLAGE_INDENT + name + " ");
            }
        } else {
            String indent = indentFor(rowClass);
            if (indent != null && cells.size() >= 2) {
                // text() works whether or not the cell wraps its content in a link.
                String code = cells.get(0).text();
                String name = cells.get(1).text();
                System.out.println(name);
                System.out.println(code);
                writeLine(out, indent + code);
                writeLine(out, indent + name);
            }
        }
    }

    /**
     * Writes one code/name pair per linked cell of a province row. The code is the first
     * two characters of the link's href padded with ten zeros.
     *
     * @param cells cells of a {@code provincetr} row
     * @param out   open stream of the output file
     * @throws IOException if writing fails
     */
    private static void writeProvinceCells(Elements cells, FileOutputStream out) throws IOException {
        for (Element cell : cells) {
            Elements cellLinks = cell.getElementsByTag("a");
            if (cellLinks.isEmpty()) {
                continue;
            }
            Element link = cellLinks.first();
            String href = link.attr("href");
            if (href.length() < 2) {
                continue;
            }
            String code = href.substring(0, 2) + "0000000000";
            String name = link.text();
            System.out.println(name);
            System.out.println(code);
            writeLine(out, code);
            writeLine(out, name);
        }
    }

    /**
     * Returns the indentation used for city, county and town rows.
     *
     * @param rowClass value of the row's {@code class} attribute
     * @return the indentation, or {@code null} when the class is not one of those three
     */
    private static String indentFor(String rowClass) {
        if ("citytr".equals(rowClass)) {
            return "    ";
        }
        if ("countytr".equals(rowClass)) {
            return "        ";
        }
        if ("towntr".equals(rowClass)) {
            return "            ";
        }
        return null;
    }

    /** Writes {@code text} followed by a CRLF line break, encoded as UTF-8. */
    private static void writeLine(FileOutputStream out, String text) throws IOException {
        out.write(text.getBytes(FILE_CHARSET));
        out.write(LINE_BREAK);
    }

    /**
     * Reads a UTF-8 text file whose lines alternate between a region code and a region
     * name and inserts the province, city and county entries into the {@code region} table.
     *
     * <p>Surrounding whitespace is trimmed from both lines. Pairs whose code has fewer than
     * twelve characters are skipped, as is a trailing code line without a name. Codes that
     * are neither province, city nor county level are not inserted. Any failure is printed
     * and ends the import.
     *
     * @param fileName path of the file to read
     */
    public static void readFileByLines(String fileName) {
        BufferedReader reader = null;
        try {
            // A QueryRunner built on a DataSource obtains and releases its own connections.
            QueryRunner qr = new QueryRunner(DBCPUtils.getDataSource());
            String sqlInsert = "INSERT INTO region VALUES (?,?,?,?,?,?)";
            reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(new File(fileName)), FILE_CHARSET));

            String codeLine;
            while ((codeLine = reader.readLine()) != null) {
                String nameLine = reader.readLine();
                if (nameLine == null) {
                    // The last code has no name line to pair with.
                    break;
                }
                String code = codeLine.trim();
                String name = nameLine.trim();
                if (code.length() < 12) {
                    continue;
                }

                String provinceCode = code.substring(0, 2);
                String cityCode = code.substring(2, 4);
                String countyCode = code.substring(4, 6);
                String otherCode = code.substring(6, 12);
                boolean noLowerLevel = otherCode.equals("000000");

                if (noLowerLevel && !countyCode.equals("00")) {
                    // County or district: the parent is the same code with the county part zeroed.
                    String parentCode = provinceCode + cityCode + "00" + otherCode;
                    Object[] params = {code, "", "", name, parentCode, "3"};
                    qr.update(sqlInsert, params);
                } else if (noLowerLevel && countyCode.equals("00") && !cityCode.equals("00")) {
                    // City: the parent is the same code with the city part zeroed.
                    String parentCode = provinceCode + "00" + countyCode + otherCode;
                    Object[] params = {code, "", name, "", parentCode, "2"};
                    qr.update(sqlInsert, params);
                } else if (!provinceCode.equals("00")
                        && code.substring(2, 12).equals("0000000000")) {
                    // Province: no parent.
                    Object[] params = {code, name, "", "", "", "1"};
                    qr.update(sqlInsert, params);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Back-fills the province/city name and code columns of every row in the
     * {@code region} table from the row's parent.
     *
     * <p>Rows are handled in the order the query returns them and each parent is read back
     * from the table, so a level-3 row only picks up province data that its parent city
     * already holds at that moment. Rows whose parent cannot be found are reported and
     * skipped.
     *
     * @throws Exception if a query or update fails
     */
    public static void updateRegion() throws Exception {
        // A QueryRunner built on a DataSource obtains and releases its own connections.
        QueryRunner qr = new QueryRunner(DBCPUtils.getDataSource());
        String parentSql = "select * from region where code=?";
        List<Region> regions = qr.query("select * from region", new BeanListHandler<Region>(Region.class));

        for (Region region : regions) {
            String level = region.getLevel();
            if ("1".equals(level)) {
                // NOTE(review): the single space looks like an "empty" placeholder - confirm.
                Object[] paramUpdate = {" ", region.getCode()};
                String updateSql = "update region set provinceCode=? where code=?";
                qr.update(updateSql, paramUpdate);
            } else if ("2".equals(level) || "3".equals(level)) {
                Object[] param = {region.getParentCode()};
                Region parent = qr.query(parentSql, new BeanHandler<Region>(Region.class), param);
                if (parent == null) {
                    System.out.println("No parent row " + region.getParentCode()
                            + " found for region " + region.getCode() + "; skipped");
                    continue;
                }
                if ("2".equals(level)) {
                    Object[] paramUpdate = {parent.getProvince(), parent.getCode(), " ", region.getCode()};
                    String updateSql = "update region set province=?,provinceCode=?,cityCode=? where code=?";
                    qr.update(updateSql, paramUpdate);
                } else {
                    Object[] paramUpdate = {parent.getProvince(), parent.getCity(),
                            parent.getProvinceCode(), parent.getCode(), region.getCode()};
                    String updateSql = "update region set province=?,city=?,provinceCode=?,cityCode=? where code=?";
                    qr.update(updateSql, paramUpdate);
                }
            }
        }
    }

    /**
     * Program entry point: crawls the 2018 listing starting from its index page.
     *
     * @param args ignored
     * @throws Exception declared for compatibility; the crawl reports its own failures
     */
    public static void main(String[] args) throws Exception {
        Crawler crawler = new Crawler();
        crawler.crawling(new String[]{"http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/"});
    }
}

Region 实体类如下:

package com.lenovo.crawl.entity;

public class Region {
    private String code;
    private String provinceCode;
    private String province;
    private String cityCode;
    private String city;
    private String countyCode;
    private String county;
    private String parentCode;
    private String level;

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    public String getProvinceCode() {
        return provinceCode;
    }

    public void setProvinceCode(String provinceCode) {
        this.provinceCode = provinceCode;
    }

    public String getProvince() {
        return province;
    }

    public void setProvince(String province) {
        this.province = province;
    }

    public String getCityCode() {
        return cityCode;
    }

    public void setCityCode(String cityCode) {
        this.cityCode = cityCode;
    }

    public String getCity() {
        return city;
    }

    public void setCity(String city) {
        this.city = city;
    }

    public String getCountyCode() {
        return countyCode;
    }

    public void setCountyCode(String countyCode) {
        this.countyCode = countyCode;
    }

    public String getCounty() {
        return county;
    }

    public void setCounty(String county) {
        this.county = county;
    }

    public String getParentCode() {
        return parentCode;
    }

    public void setParentCode(String parentCode) {
        this.parentCode = parentCode;
    }

    public String getLevel() {
        return level;
    }

    public void setLevel(String level) {
        this.level = level;
    }

    public Region() {
    }

    public Region(String code, String provinceCode, String province, String cityCode, String city, String countyCode, String county, String parentCode, String level) {
        this.code = code;
        this.provinceCode = provinceCode;
        this.province = province;
        this.cityCode = cityCode;
        this.city = city;
        this.countyCode = countyCode;
        this.county = county;
        this.parentCode = parentCode;
        this.level = level;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;

        Region region = (Region) o;

        if (code != null ? !code.equals(region.code) : region.code != null) return false;
        if (provinceCode != null ? !provinceCode.equals(region.provinceCode) : region.provinceCode != null)
            return false;
        if (province != null ? !province.equals(region.province) : region.province != null) return false;
        if (cityCode != null ? !cityCode.equals(region.cityCode) : region.cityCode != null) return false;
        if (city != null ? !city.equals(region.city) : region.city != null) return false;
        if (countyCode != null ? !countyCode.equals(region.countyCode) : region.countyCode != null) return false;
        if (county != null ? !county.equals(region.county) : region.county != null) return false;
        if (parentCode != null ? !parentCode.equals(region.parentCode) : region.parentCode != null) return false;
        return level != null ? level.equals(region.level) : region.level == null;
    }

    @Override
    public int hashCode() {
        int result = code != null ? code.hashCode() : 0;
        result = 31 * result + (provinceCode != null ? provinceCode.hashCode() : 0);
        result = 31 * result + (province != null ? province.hashCode() : 0);
        result = 31 * result + (cityCode != null ? cityCode.hashCode() : 0);
        result = 31 * result + (city != null ? city.hashCode() : 0);
        result = 31 * result + (countyCode != null ? countyCode.hashCode() : 0);
        result = 31 * result + (county != null ? county.hashCode() : 0);
        result = 31 * result + (parentCode != null ? parentCode.hashCode() : 0);
        result = 31 * result + (level != null ? level.hashCode() : 0);
        return result;
    }
}

我的做法是:先把数据全部爬取下来写入 txt 文件,再解析该 txt 文件,把数据整理成结构化的形式(为了实现三级联动),并给每条记录各自加上一个 level 属性,最后写入数据库。(如果不想按上面代码的方式输出,可以改写成递归,按层级输出。)

我的 txt 文件下载链接在这里:https://download.csdn.net/download/qq_29281307/11191484

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值