中文地址结构化要素解析

最近在做标准地址匹配:例如输入"金丽花园1#2-102",
接口会先把它解析成如下的结构化要素,再匹配到对应的标准地址:

{
    "status": 20000,
    "message": "查询成功",
    "data": {
        "output": [
            {
                "type": "poi",
                "start": 0,
                "end": 4,
                "prob": 4,
                "span": "金丽花园"
            },
            {
                "type": "houseno",
                "start": 4,
                "end": 7,
                "prob": 7,
                "span": "1#2"
            }
        ],
        "full_bzdz_name": "山东省淄博市张店区科苑街道丽景苑社区北西五路21号金丽花园1号楼2单元102室",
        "houseid": "",
        "table_name": "bzdz_household",
        "id": "1726"
    }
}

API 采用了 ModelScope 上的 MGeo"门址地址结构化要素解析-中文-地址领域-base"模型,并使用 FastAPI 封装成接口。


#!/usr/bin/env python3
from typing import List, Optional
from datetime import timedelta
from fastapi import APIRouter, Depends, HTTPException, status
from fastapi.security import OAuth2PasswordRequestForm
from fastapi.responses import JSONResponse
import requests,re
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
import pandas as pd
from clickhouse_driver import Client
from elasticsearch import Elasticsearch
from elasticsearch import helpers



# Token-classification task id and the model id handed to the ModelScope pipeline.
task = Tasks.token_classification
model = 'damo/mgeo_geographic_elements_tagging_chinese_base'
# Built once at import time; addressdeal() calls this instance on every request.
pipeline_ins = pipeline(task=task, model=model)


# Router that the /addressdeal/ endpoint is registered on.
router = APIRouter()
# Module-level ClickHouse client.
# NOTE(review): host/port/database/password look like redacted placeholders —
# confirm real values are injected from configuration. addressdeal() builds its
# own Client, so this instance is not the one it queries with.
client = Client(host='0.0.0.0', port='000', database='xxx',password='xxx')
def read_ck(sql, client):
    """Run *sql* on *client* and return the result as a list of row dicts.

    The query is executed in columnar mode with column types, so the driver
    hands back one sequence per column plus ``(name, type)`` pairs. Every
    non-word character in a column name is replaced by ``_`` before the
    name is used as a dict key.

    Args:
        sql: Query text passed verbatim to ``client.execute``.
        client: Object exposing ``execute(sql, columnar=..., with_column_types=...)``.

    Returns:
        One ``{column_name: value}`` dict per result row.
    """
    column_values, column_meta = client.execute(
        sql, columnar=True, with_column_types=True
    )
    by_name = {}
    for values, meta in zip(column_values, column_meta):
        safe_name = re.sub(r'\W', '_', meta[0])
        by_name[safe_name] = values
    frame = pd.DataFrame(by_name)
    return frame.to_dict("records")


# Elasticsearch connection settings.
# NOTE(review): these look like redacted placeholders — confirm the real
# values come from configuration rather than source.
host='0.0.0.0'
port=0000
user="xxxxx"
pwd='xxxx'
# Elasticsearch client built at import time from "host:port" with basic auth.
# NOTE(review): `es` is not referenced anywhere in the visible part of this file.
es = Elasticsearch(host+':'+str(port), http_auth=(user, pwd), maxsize=15) 


# Chinese digit characters and their values, used to read numerals such as
# "十一" (11) or "二十五" (25) inside building / unit spans.
_CN_DIGITS = {
    "一": 1,
    "二": 2,
    "三": 3,
    "四": 4,
    "五": 5,
    "六": 6,
    "七": 7,
    "八": 8,
    "九": 9,
}
# Matches either a tens-form numeral ([digit]十[digit], i.e. 10-99) or a lone
# digit. The tens form is listed first so "十一" is consumed as one token.
_CN_NUMERAL_RE = re.compile(
    r'([一二三四五六七八九])?十([一二三四五六七八九])?|[一二三四五六七八九]'
)
# Characters that have a special meaning inside a regular expression.
_REGEX_META_RE = re.compile(r'([\\.^$*+?()\[\]{}|])')


def _cn_numerals_to_digits(text):
    """Replace Chinese numerals (1-99) in *text* with Arabic digits.

    "十一号楼" -> "11号楼", "二十" -> "20", "三单元" -> "3单元". Replacing
    the characters one by one (as a plain lookup-table loop does) turns
    "十一" into "101", which is why whole numerals are matched here.
    """
    def _convert(match):
        token = match.group(0)
        if '十' not in token:
            return str(_CN_DIGITS[token])
        # A missing tens digit means "十x" == 1x; a missing ones digit means x0.
        tens = _CN_DIGITS.get(match.group(1), 1)
        ones = _CN_DIGITS.get(match.group(2), 0)
        return str(tens * 10 + ones)

    return _CN_NUMERAL_RE.sub(_convert, text)


def _escape_regex(text):
    """Backslash-escape regex metacharacters so *text* matches literally."""
    return _REGEX_META_RE.sub(r'\\\1', text)


def _ck_quote(text):
    """Escape *text* for use inside a single-quoted ClickHouse string literal."""
    return text.replace('\\', '\\\\').replace("'", "\\'")


@router.get("/addressdeal/", summary='解析地址')
def addressdeal(address: str):
    """Tag the elements of a free-text address and look up a standard address.

    The address is run through ``pipeline_ins``. The type of the last tagged
    element selects the table to search:

    * ``cellno``                      -> ``bzdz_household``
    * ``houseno`` with one number     -> ``bzdz_build``
    * ``houseno`` with 2+ numbers     -> ``bzdz_household``
    * ``poi`` / ``subpoi``            -> ``bzdz_village_info``

    Numbers found in ``houseno`` / ``cellno`` spans (Chinese numerals are
    converted first), plus the first number after the last element (suffixed
    with "室") for household lookups, are joined with ``\\D+`` into a regular
    expression. It is prefixed with the ``poi`` / ``subpoi`` names when any
    are longer than two characters, and matched against ``full_bzdz_name``.

    Args:
        address: Raw address text supplied by the caller (untrusted input).

    Returns:
        A 200 ``JSONResponse`` whose body has ``output`` (the tagged
        elements), ``full_bzdz_name``, ``houseid`` and ``table_name``; ``id``
        is added when a row matched. Unmatched requests keep the empty-string
        defaults.
    """
    data = pipeline_ins(input=address)

    resp = {
        'output': data['output'],
        'full_bzdz_name': '',
        'houseid': '',
        'table_name': '',
    }

    if not data['output']:
        return JSONResponse(content=resp, status_code=status.HTTP_200_OK)

    # Coerce the numeric fields to plain Python numbers before they are
    # returned in the JSON body.
    for ou in data['output']:
        ou['start'] = int(ou['start'])
        ou['end'] = int(ou['end'])
        # The confidence must come from 'prob' itself; it used to be
        # overwritten with the end offset. The key is left untouched when
        # the pipeline output does not provide it.
        if 'prob' in ou:
            ou['prob'] = float(ou['prob'])

    end_item = data['output'][-1]
    end_type = end_item['type']
    table_name = ''
    house_str = ''

    if end_type == 'cellno':
        table_name = 'bzdz_household'
        house_str = address[end_item['end']:]
    elif end_type == 'houseno':
        # Two or more numbers in the span (e.g. "1#2") mean building + unit,
        # so the text after it is searched as a room number.
        if len(re.findall(r'\d+', end_item['span'])) > 1:
            table_name = 'bzdz_household'
            house_str = address[end_item['end']:]
        else:
            table_name = 'bzdz_build'
    elif end_type in ('poi', 'subpoi'):
        table_name = 'bzdz_village_info'

    if not table_name:
        return JSONResponse(content=resp, status_code=status.HTTP_200_OK)

    search_list = []
    village_names = []
    for item in data['output']:
        kw = item['span']
        if item['type'] in ('houseno', 'cellno'):
            search_list.extend(re.findall(r'\d+', _cn_numerals_to_digits(kw)))
        elif item['type'] in ('poi', 'subpoi') and len(kw) > 2:
            # NOTE(review): the previous code also listed 'community' here,
            # but an outer type filter made that case unreachable; confirm
            # whether community names should take part in the match.
            village_names.append(_escape_regex(kw))

    if house_str:
        house_numbers = re.findall(r'\d+', house_str)
        if house_numbers:
            search_list.append(house_numbers[0] + "室")

    # NOTE(review): no query is issued when no number was extracted, so an
    # address that only contains a poi never matches — confirm this is intended.
    if search_list:
        # Target shape: .*(莲池生活)\D*13\D+1\D+101室
        pattern = ''
        if village_names:
            pattern = '.*(' + '|'.join(village_names) + r')\D*'
        pattern += r'\D+'.join(search_list)

        # table_name only ever holds one of the literals assigned above; the
        # pattern carries user text, so it is escaped for the string literal.
        # Only the first row is used, hence the limit.
        sql = (
            f"select * from {table_name} where 1 "
            f" and   (extractAll(full_bzdz_name, '{_ck_quote(pattern)}')[1]) != ''"
            " limit 1"
        )
        print(sql)

        # One client per request, disconnected afterwards so that requests
        # do not accumulate open ClickHouse connections.
        client = Client(host='0.0.0.0', port='000', database='xxxx', password='xxxxxx')
        try:
            address_lt = read_ck(sql, client)
        finally:
            client.disconnect()

        if address_lt:
            resp['full_bzdz_name'] = address_lt[0]['full_bzdz_name']
            resp['id'] = address_lt[0]['id']
            resp['table_name'] = table_name

    return JSONResponse(content=resp, status_code=status.HTTP_200_OK)


  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值