2021年行政地址信息 java 获取demo

网上想找个 4级行政地址,全都要付款,我想了下自己拿吧。

该代码不依赖任何爬虫框架,就是自己做字符串判断。

目前爬取了省/市/区县/街镇四级数据;第五级没有爬取,感觉用不上,需要的话可以按同样的方式自己再接着加一层。注意:第四级的数据量已经达到约 4 万条,如果继续往下爬,需要分批从数据库读取第四级数据,不能用一条 SQL 一次性全部读出来。

如果不想自己执行代码,我已经把建表 SQL 和数据上传了,下载地址:

https://download.csdn.net/download/wangzhi291/83673474

建表语句如下:

-- Target table for the crawler: one row per administrative area.
CREATE TABLE `mall_areabb` (
  -- Area identifier. The crawler always supplies this value explicitly on insert.
  `area_id` bigint NOT NULL AUTO_INCREMENT,
  -- Display name of the area as extracted from the crawled page.
  `area_name` varchar(50) DEFAULT NULL,
  -- area_id of the parent row; the crawler writes 0 for level-1 rows.
  `parent_id` bigint DEFAULT NULL,
  -- Depth in the hierarchy; the crawler writes 1 through 4.
  `level` int DEFAULT NULL,
  -- Page path relative to the crawl base URL; used to fetch this area's children.
  `path` varchar(200) DEFAULT NULL,
  PRIMARY KEY (`area_id`),
  -- Secondary index on parent_id (the index comment text means "parent id").
  KEY `parent_id` (`parent_id`) COMMENT '上级id'
) ENGINE=InnoDB AUTO_INCREMENT=659011502001 DEFAULT CHARSET=utf8mb3;

代码如下:



import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.mysql.cj.jdbc.MysqlDataSource;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.springframework.jdbc.core.JdbcTemplate;

/**
 * Crawls the 2021 administrative-division listing published under
 * {@code www.stats.gov.cn} and stores it in the {@code mall_areabb} table.
 *
 * <p>Four levels are stored. Level 1 comes from {@code index.html}; every
 * further level is obtained by fetching the page stored in the {@code path}
 * column of each row of the previous level.
 *
 * <p>No crawler framework is used: pages are fetched with Apache HttpClient
 * and the anchors are extracted with plain string searches.
 *
 * <p>Date: 2022-03-17
 *
 * @author andy.wang
 */
public class Plzl {

    /** Base URL that every stored {@code path} is relative to. */
    private static final String DOMAIN_NAME = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/";

    /** Opening and closing markers of the elements to extract, separated by {@code ##}. */
    private static final String ANCHOR_KEY = "<a##</a>";

    /** JDBC URL used when the {@code plzl.jdbc.url} system property is not set. */
    private static final String DEFAULT_JDBC_URL =
            "jdbc:mysql://localhost:3306/dashu?useUnicode=true&characterEncoding=utf8&serverTimezone=Asia/Shanghai&nullCatalogMeansCurrent=true";

    private static final String HREF_ATTR = "href=\"";
    private static final String TABLE_OPEN = "<table";
    private static final String TABLE_CLOSE = "</table>";

    // Compiled once; Pattern instances are immutable and safe to share.
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script[^>]*?>[\\s\\S]*?<\\/script>", Pattern.CASE_INSENSITIVE);
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style[^>]*?>[\\s\\S]*?<\\/style>", Pattern.CASE_INSENSITIVE);
    private static final Pattern TAG_PATTERN =
            Pattern.compile("<[^>]+>", Pattern.CASE_INSENSITIVE);

    /**
     * Runs the full crawl: level 1 from the index page, then levels 2 to 4.
     *
     * <p>The database connection defaults to the original hard-coded values and
     * can be overridden with the system properties {@code plzl.jdbc.url},
     * {@code plzl.jdbc.user} and {@code plzl.jdbc.password}.
     *
     * @param args unused
     * @throws Exception if a database operation fails
     */
    public static void main(String[] args) throws Exception {
        Plzl p = new Plzl();

        MysqlDataSource dataSource = new MysqlDataSource();
        dataSource.setUrl(System.getProperty("plzl.jdbc.url", DEFAULT_JDBC_URL));
        dataSource.setUser(System.getProperty("plzl.jdbc.user", "root"));
        dataSource.setPassword(System.getProperty("plzl.jdbc.password", "123456"));
        JdbcTemplate jdbcTemplate = new JdbcTemplate(dataSource);

        // Level 1.
        Map<String, String> topLevel = p.requestByGetMethod(DOMAIN_NAME + "index.html", ANCHOR_KEY);
        insertDataOne(topLevel, jdbcTemplate);

        // Levels 2..4: each one is derived from the rows stored for the level above.
        for (int parentLevel = 1; parentLevel <= 3; parentLevel++) {
            crawlChildren(p, jdbcTemplate, parentLevel);
        }
    }

    /**
     * Fetches the page of every stored row at {@code parentLevel} and inserts
     * the anchors found there as rows of level {@code parentLevel + 1}.
     */
    private static void crawlChildren(Plzl crawler, JdbcTemplate jdbcTemplate, int parentLevel) {
        List<Map<String, Object>> parents = jdbcTemplate.queryForList(
                "select area_id, path from mall_areabb where level = ?", parentLevel);
        for (Map<String, Object> parent : parents) {
            Object pathValue = parent.get("path");
            Object idValue = parent.get("area_id");
            if (pathValue == null || idValue == null) {
                System.err.println("Skipping level-" + parentLevel + " row without path or id: " + parent);
                continue;
            }
            String path = pathValue.toString();
            Long parentId = Long.valueOf(idValue.toString());
            // Child hrefs are relative to the directory of the parent page.
            String directory = path.substring(0, path.lastIndexOf('/') + 1);
            Map<String, String> children = crawler.requestByGetMethod(DOMAIN_NAME + path, ANCHOR_KEY);
            insertDataThree(children, jdbcTemplate, parentLevel + 1, parentId, directory);
        }
    }

    /**
     * Inserts level-1 rows.
     *
     * <p>The id is the part of the href before the first {@code '.'} followed by
     * ten zeros. Entries whose href does not produce a numeric id are skipped
     * with a message on {@code System.err} instead of aborting the whole run.
     *
     * @param m href to area name, as returned by {@link #requestByGetMethod}
     * @param t template used to run the inserts
     */
    public static void insertDataOne(Map<String, String> m, JdbcTemplate t) {
        if (m == null || m.isEmpty()) {
            return;
        }
        String sql = " insert into mall_areabb(area_id,area_name,parent_id,level,path) values(?,?,0,1,?);";
        for (Map.Entry<String, String> entry : m.entrySet()) {
            String href = entry.getKey();
            int dot = href.indexOf('.');
            if (dot <= 0) {
                System.err.println("Skipping level-1 link without a code prefix: " + href);
                continue;
            }
            long areaId;
            try {
                areaId = Long.parseLong(href.substring(0, dot) + "0000000000");
            } catch (NumberFormatException e) {
                System.err.println("Skipping level-1 link with a non-numeric code: " + href);
                continue;
            }
            t.update(sql, areaId, entry.getValue(), href);
        }
    }

    /**
     * Inserts rows whose stored path is the href itself.
     *
     * <p>Equivalent to {@link #insertDataThree} with an empty path prefix.
     *
     * @param m     href to {@code "code:name"}
     * @param t     template used to run the batch insert
     * @param level level written to every row
     * @param pid   parent id written to every row
     */
    public static void insertDataTwo(Map<String, String> m, JdbcTemplate t, int level, Long pid) {
        insertDataThree(m, t, level, pid, "");
    }

    /**
     * Inserts rows for one parent in a single batch.
     *
     * <p>Each map value must have the form {@code "code:name"}; the code becomes
     * {@code area_id}. Malformed values are skipped with a message on
     * {@code System.err} so that one bad anchor does not abort the crawl.
     *
     * @param m     href to {@code "code:name"}
     * @param t     template used to run the batch insert
     * @param level level written to every row
     * @param pid   parent id written to every row
     * @param path  prefix prepended to each href to form the stored path;
     *              {@code null} is treated as empty
     */
    public static void insertDataThree(Map<String, String> m, JdbcTemplate t, int level, Long pid, String path) {
        if (m == null || m.isEmpty()) {
            return;
        }
        String prefix = path == null ? "" : path;
        String sql = " insert into mall_areabb(area_id,area_name,parent_id,level,path) values(?,?,?,?,?)";
        List<Object[]> rows = new ArrayList<>(m.size());
        for (Map.Entry<String, String> entry : m.entrySet()) {
            // Limit 2 keeps a name intact even if it contains ':' itself.
            String[] parts = entry.getValue().split(":", 2);
            if (parts.length < 2) {
                System.err.println("Skipping entry without code:name pair: " + entry);
                continue;
            }
            Long areaId;
            try {
                areaId = Long.valueOf(parts[0].trim());
            } catch (NumberFormatException e) {
                System.err.println("Skipping entry with a non-numeric code: " + entry);
                continue;
            }
            rows.add(new Object[] {areaId, parts[1], pid, level, prefix + entry.getKey()});
        }
        if (!rows.isEmpty()) {
            t.batchUpdate(sql, rows);
        }
    }

    /**
     * Fetches {@code url} with HTTP GET and extracts the anchors of its last table.
     *
     * <p>This is best-effort: any failure is reported on {@code System.err} and
     * an empty map is returned. The result is never {@code null}.
     *
     * @param url absolute URL to fetch
     * @param key opening and closing markers separated by {@code ##}, e.g. {@code "<a##</a>"}
     * @return href to text; texts of several anchors sharing one href are joined with {@code ':'}
     */
    public Map<String, String> requestByGetMethod(String url, String key) {
        Map<String, String> result = new HashMap<>();
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse httpResponse = httpClient.execute(new HttpGet(url))) {
            HttpEntity entity = httpResponse.getEntity();
            if (entity != null) {
                result = read(EntityUtils.toString(entity, "utf-8"), key);
            }
        } catch (IOException | RuntimeException e) {
            System.err.println("Failed to fetch or parse " + url);
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Extracts href/text pairs from the last {@code <table>} of an HTML document.
     *
     * @param s   page source; {@code null} or very short input yields an empty map
     * @param key opening and closing markers separated by {@code ##}
     * @return href to text, never {@code null}
     * @throws IllegalArgumentException if {@code key} does not contain two non-empty markers
     */
    private Map<String, String> read(String s, String key) {
        Map<String, String> links = new HashMap<>();
        if (s == null || s.length() < 10) {
            return links;
        }
        // NOTE(review): the original only parses pages containing the word
        // "content" past index 0; the intent is not visible here - kept as is.
        if (s.lastIndexOf("content") <= 0) {
            return links;
        }
        String[] keys = key == null ? new String[0] : key.split("##");
        if (keys.length < 2 || keys[0].isEmpty() || keys[1].isEmpty()) {
            throw new IllegalArgumentException("key must look like \"<open>##<close>\": " + key);
        }

        int tableStart = s.lastIndexOf(TABLE_OPEN);
        if (tableStart < 0) {
            return links;
        }
        // Search for the closing tag after the opening one so the range is never inverted.
        int tableEnd = s.indexOf(TABLE_CLOSE, tableStart);
        if (tableEnd < 0) {
            return links;
        }
        String table = s.substring(tableStart, tableEnd);

        int cursor = 0;
        while (true) {
            int open = table.indexOf(keys[0], cursor);
            if (open < 0) {
                break;
            }
            int close = table.indexOf(keys[1], open);
            if (close < 0) {
                break; // unterminated element: nothing more can be extracted safely
            }
            String element = table.substring(open, close);
            // Advance by the real marker length so an adjacent element is not skipped.
            cursor = close + keys[1].length();

            int hrefStart = element.indexOf(HREF_ATTR);
            if (hrefStart < 0) {
                continue;
            }
            hrefStart += HREF_ATTR.length();
            int hrefEnd = element.indexOf('"', hrefStart);
            if (hrefEnd < 0) {
                continue;
            }
            String href = element.substring(hrefStart, hrefEnd);
            String text = delHTMLTag(element);
            // Several anchors with the same href are merged into "first:second".
            links.merge(href, text, (previous, next) -> previous + ":" + next);
        }
        return links;
    }

    /**
     * Removes script blocks, style blocks and all remaining tags from an HTML fragment.
     *
     * @param htmlStr HTML fragment; {@code null} is treated as empty
     * @return the remaining text with leading and trailing whitespace trimmed
     */
    public String delHTMLTag(String htmlStr) {
        if (htmlStr == null) {
            return "";
        }
        String text = SCRIPT_PATTERN.matcher(htmlStr).replaceAll("");
        text = STYLE_PATTERN.matcher(text).replaceAll("");
        text = TAG_PATTERN.matcher(text).replaceAll("");
        return text.trim();
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值