大数据环境下 实现每天千万级地址关联 10万/秒

地名作为最常用的社会公共信息,不仅与人们的日常生活息息相关,而且是政府行政行为、经济建设不可缺少的基础信息资源。在政务系统中有许多需要将业务地址关联到标准地址的场景,addresstool致力于解决地址关联匹配算法中的速度和准确性问题。经实测,单核addresstool的地址关联速度在5000/秒-20000/秒之间(取决于业务地址质量),关联匹配正确率达到98%。hadoop分布式环境下地址关联匹配速度能达到10万+/秒(具体取决于大数据节点数和地址质量;节点足够时,每秒可实现百万级地址关联)。

本文大数据环境为hive数据库,通过udf将addresstool封装,最后实现分布式计算。
直接上代码

/**
 * Hive generic UDF that links a free-text business address to a standard address.
 *
 * <p>{@link #initialize} loads every row of the {@code st_address} table over JDBC into an
 * {@code AddressTool}. {@link #evaluate} cleans the incoming address string, asks the tool for
 * the matching standard address and returns it as a {@code map<string,string>} that additionally
 * carries the {@code addressLevel} and {@code linkLevel} keys.
 */
public class AddressLink extends GenericUDF {
    /** Columns copied into the dictionary row unchanged (when non-empty). */
    private static final String[] PLAIN_COLUMNS = {
        "id", "province", "city", "county", "town", "community",
        "road", "road_no", "aoi", "sub_aoi", "address"
    };

    private PrimitiveObjectInspector addressIO;
    private static AddressTool addressTool;

    /** Strips a trailing building suffix ("栋", "幢" or "号楼") from {@code building}. */
    private String bld(String building) {
        if (building == null || building.isEmpty()) {
            return building;
        }
        if (building.endsWith("栋") || building.endsWith("幢")) {
            return building.substring(0, building.length() - 1);
        }
        if (building.endsWith("号楼")) {
            return building.substring(0, building.length() - 2);
        }
        return building;
    }

    /** Strips a trailing "单元" suffix from {@code unit}. */
    private String unit(String unit) {
        if (unit != null && unit.endsWith("单元")) {
            return unit.substring(0, unit.length() - 2);
        }
        return unit;
    }

    /** Strips a trailing room suffix ("室" or "户") from {@code room}. */
    private String room(String room) {
        if (room != null && (room.endsWith("室") || room.endsWith("户"))) {
            return room.substring(0, room.length() - 1);
        }
        return room;
    }

    /** Returns true when {@code value} is neither null nor the empty string. */
    private static boolean hasText(String value) {
        return value != null && !value.isEmpty();
    }

    /**
     * Reads the whole {@code st_address} table and builds a populated {@code AddressTool}.
     *
     * <p>All JDBC resources are closed even when reading fails part-way.
     *
     * @return a tool initialised with every dictionary row
     * @throws Exception if the driver cannot be loaded or any JDBC call fails
     */
    private AddressTool loadAddressTool() throws Exception {
        // NOTE(review): connection settings are hard-coded; consider reading them from
        // configuration instead of embedding credentials in the jar.
        String driver = "org.postgresql.Driver";
        String url = "jdbc:postgresql://*****:5432/postgres";
        String username = "******";
        String password = "******";
        Class.forName(driver);

        DataTable data = new DataTable();
        try (Connection connection = DriverManager.getConnection(url, username, password);
             Statement statement = connection.createStatement();
             ResultSet res = statement.executeQuery("select id,province,city,county,town,community,road,road_no,aoi,sub_aoi,building,unit,room,address from st_address order by aoi,road,road_no")) {
            while (res.next()) {
                HashMap<String,String> mp = new HashMap<>();
                // Each column is fetched once; null/empty values are left out of the row.
                for (String column : PLAIN_COLUMNS) {
                    String value = res.getString(column);
                    if (hasText(value)) {
                        mp.put(column, value);
                    }
                }
                // building / unit / room are stored without their trailing suffix.
                String building = res.getString("building");
                if (hasText(building)) {
                    mp.put("building", bld(building));
                }
                String unitValue = res.getString("unit");
                if (hasText(unitValue)) {
                    mp.put("unit", unit(unitValue));
                }
                String roomValue = res.getString("room");
                if (hasText(roomValue)) {
                    mp.put("room", room(roomValue));
                }
                data.addAddressDic(mp);
            }
        }

        // Load the standard address data into the tool.
        AddressTool tool = new AddressTool();
        data.initData(tool);
        return tool;
    }

    /**
     * Removes spaces and collapses dash runs in a raw address string.
     *
     * <p>Every space is removed, every "——" pair becomes "-", and runs of "-" are then
     * collapsed to a single "-". The pair replacement runs before the collapse so that the
     * hyphens it produces are collapsed as well.
     */
    private static String normalize(String address) {
        String cleaned = address.replace(" ", "").replace("——", "-");
        while (cleaned.contains("--")) {
            cleaned = cleaned.replace("--", "-");
        }
        return cleaned;
    }

    /**
     * Validates the single argument and loads the standard address dictionary.
     *
     * @param arguments inspectors of the call arguments; exactly one primitive is expected
     * @return a map inspector with string keys and string values
     * @throws UDFArgumentException if the argument list is invalid or the dictionary cannot
     *     be loaded (the underlying failure is attached as the cause)
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        if (arguments == null || arguments.length != 1) {
            throw new UDFArgumentLengthException("The function AddressLink accepts exactly 1 argument, e.g. AddressLink(address)");
        }
        if (!(arguments[0] instanceof PrimitiveObjectInspector)) {
            throw new UDFArgumentException("The argument of AddressLink must be a primitive (string) value");
        }
        addressIO = (PrimitiveObjectInspector) arguments[0];

        try {
            // Publish the tool only after it is fully populated.
            addressTool = loadAddressTool();
        } catch (Exception e) {
            // Fail loudly: continuing with an empty dictionary would mark every address unmatched.
            UDFArgumentException failure = new UDFArgumentException("AddressLink could not load the standard address dictionary from st_address");
            failure.initCause(e);
            throw failure;
        }

        return ObjectInspectorFactory.getStandardMapObjectInspector(
                PrimitiveObjectInspectorFactory.javaStringObjectInspector,
                PrimitiveObjectInspectorFactory.javaStringObjectInspector);
    }

    /**
     * Links one address to the standard address dictionary.
     *
     * @param arguments the single address argument
     * @return {@code null} for a null input; otherwise the map returned by the matched
     *     standard address, extended with {@code addressLevel} ("未知" when absent) and
     *     {@code linkLevel} ("未关联" when absent)
     * @throws HiveException if the deferred argument cannot be evaluated
     */
    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        Object raw = arguments[0].get();
        if (raw == null) {
            return null;
        }

        // Pre-process irregular characters in the Chinese address.
        String address = normalize(PrimitiveObjectInspectorUtils.getString(raw, this.addressIO));

        // Address linking.
        StandardAddress stdAddress = addressTool.getStdAddress(address);
        Map<String,String> result = stdAddress.getStdAddress();

        // Address level.
        result.put("addressLevel", hasText(stdAddress.addressLevel) ? stdAddress.addressLevel : "未知");

        // Link level.
        result.put("linkLevel", hasText(stdAddress.linkLevel) ? stdAddress.linkLevel : "未关联");

        return result;
    }

    /** Returns the textual form of the call, {@code Address(<first child>)}. */
    @Override
    public String getDisplayString(String[] children) {
        return "Address(" + children[0] + ")";
    }

}

addresstool在分布式节点下计算速度超级快。经实测,以sparksql调用udf的方式处理1千万条地址数据耗时3.5分钟,处理5千万条数据耗时8分钟。

详细代码见git上AddressLink类

通过addresstool与大数据结合,成功实现每日千万级业务地址全量关联更新。

java资源下载
https://download.csdn.net/download/u011024436/89035851
源码学习
https://gitee.com/addresstool/address

使用中有问题或者建议,欢迎联系邮箱addresstool@163.com

  • 27
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

addresstool

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值