1. 字符串提取信息(姓名, 手机号码, 地址)
import re
def extract_info(list_value):
    """Pick the recipient name, mobile number and address out of split tokens.

    The classification is purely heuristic:

    * a token made of exactly 11 numeric characters is the mobile number
      (when several qualify, the last one wins);
    * among the remaining tokens the longest is taken as the address and
      the shortest as the name.

    Empty or whitespace-only tokens are ignored, and the result does not
    depend on the order in which name, mobile and address appear.

    Args:
        list_value: Iterable of string tokens.

    Returns:
        A ``(name, mobile, address)`` tuple of strings. Parts that cannot be
        found are empty strings. When only one non-mobile token is present it
        is returned as both name and address.
    """
    mobile = ""
    candidates = []
    for token in list_value:
        # Separators at the edges of the raw text yield empty tokens; they
        # carry no information and must not overwrite the name.
        if not token or not token.strip():
            continue
        if token.isnumeric() and len(token) == 11:
            mobile = token
        else:
            candidates.append(token)

    if not candidates:
        return "", mobile, ""

    # First longest token -> address; last shortest token -> name. Scanning
    # from opposite ends guarantees two different tokens are chosen whenever
    # at least two candidates exist, even if they have equal length.
    address = max(candidates, key=len)
    name = min(reversed(candidates), key=len)
    return name, mobile, address
# Demo: split a raw delivery string into tokens and classify them.
address = "测试 18119990001 , 北京市北京市东城区静宁路昌运大厦4楼401号"
# Raw string so that "\s" is a regex class rather than an invalid string escape.
# Separators are runs of whitespace, ASCII commas and full-width commas; empty
# pieces produced by a leading/trailing separator are dropped.
delivery_address = [part for part in re.split(r'[\s,,]+', address) if part]
print(extract_info(delivery_address))
2. 地址分词
分词库: jieba(“结巴”中文分词:做最好的 Python 中文分词组件)
安装使用参考: github: https://github.com/fxsjy/jieba
Python代码如下:
query()是我封装的数据库查询方法(自行实现)
import jieba
# Bare administrative suffixes. As stand-alone segments they are substrings of
# a large number of area names and would make the lookups match arbitrary rows.
_GENERIC_AREA_SEGMENTS = frozenset(['区', '县', '镇', '乡'])


def _match_area(areas, segments):
    """Return ``(area, segment)`` for the last area whose name contains a segment.

    For each area the first segment found inside its name is recorded; when
    several areas match, the last one in ``areas`` is kept. Returns
    ``({}, None)`` when nothing matches.
    """
    matched_area, matched_segment = {}, None
    for area in areas or ():
        area_name = area.get("name") or ""
        for segment in segments:
            if segment in area_name:
                matched_area, matched_segment = area, segment
                break
    return matched_area, matched_segment


def _single_row(result):
    """Normalise a one-row lookup result to a dict (``{}`` when nothing was found)."""
    # NOTE(review): query() is implemented outside this file; it is unclear
    # whether a lookup by id returns a single row or a one-element list, so
    # both shapes are accepted here.
    if isinstance(result, (list, tuple)):
        return result[0] if result else {}
    return result or {}


def address_match(delivery_addresses):
    """Parse a free-form delivery string into structured address fields.

    The text is split on whitespace and commas, classified into name / mobile /
    address by ``extract_info``, the address is segmented with jieba, and the
    segments are matched against the ``area`` table (levels 1, 2 and 3) through
    the externally provided ``query`` function. Rows returned by ``query`` are
    read through ``.get("name")``, ``.get("id")`` and ``.get("pid")``.

    Args:
        delivery_addresses: Raw text such as ``"<name> <mobile>, <address>"``.

    Returns:
        A dict with the keys ``province``, ``province_id``, ``city``,
        ``city_id``, ``county``, ``county_id``, ``address`` (the part of the
        address after the county), ``mobile`` and ``name``; or ``{}`` when the
        input is empty or no county could be matched.
    """
    if not delivery_addresses:
        return {}

    # Split on runs of whitespace / ASCII commas / full-width commas and drop
    # the empty pieces a leading or trailing separator would produce.
    tokens = [token for token in re.split(r'[\s,,]+', delivery_addresses) if token]
    if not tokens:
        return {}

    try:
        name, mobile, address = extract_info(tokens)
    except (IndexError, ValueError):
        # Nothing usable could be extracted, so there is nothing to match.
        return {}
    if not address:
        return {}

    # Drop stand-alone suffix segments (they distort the lookups). A new list
    # is built instead of removing items from the list being iterated.
    segments = [
        segment
        for segment in jieba.lcut(address, cut_all=False)
        if segment and segment not in _GENERIC_AREA_SEGMENTS
    ]

    # province
    provinces = query('SELECT * FROM area WHERE display=1 AND level=1 AND pid=0;')
    province_dict, _ = _match_area(provinces, segments)

    # cities (restricted to the matched province when there is one)
    if not province_dict:
        cities = query('SELECT * FROM area WHERE display=1 AND level=2;')
    else:
        cities = query(f'SELECT * FROM area WHERE display=1 AND level=2 AND pid={province_dict.get("id")};')
    city_dict, _ = _match_area(cities, segments)

    # counties (restricted to the matched city when there is one)
    if not city_dict:
        counties = query('SELECT * FROM area WHERE display=1 AND level=3;')
    else:
        counties = query(f'SELECT * FROM area WHERE display=1 AND level=3 AND pid={city_dict.get("id")};')
    dis_dict, county_segment = _match_area(counties, segments)
    if not dis_dict:
        return {}

    # Back-fill the parents through the pid chain when they were not matched
    # directly from the text.
    if not city_dict and dis_dict.get("pid") is not None:
        city_dict = _single_row(query(f'SELECT * FROM area WHERE id={dis_dict.get("pid")};'))
    if not province_dict and city_dict.get("pid") is not None:
        province_dict = _single_row(query(f'SELECT * FROM area WHERE id={city_dict.get("pid")};'))

    # The detail address is whatever follows the county in the original text.
    # Prefer the full county name; fall back to the segment that matched it.
    county_name = dis_dict.get('name') or ''
    marker = county_name if county_name and county_name in address else county_segment
    marker_pos = address.find(marker) if marker else -1
    detail = address[marker_pos + len(marker):] if marker_pos >= 0 else address

    address_dict = {
        'province': province_dict.get('name'),
        'province_id': province_dict.get('id'),
        'city': city_dict.get('name'),
        'city_id': city_dict.get('id'),
        'county': dis_dict.get('name'),
        'county_id': dis_dict.get('id'),
        'address': detail,
        'mobile': mobile,
        'name': name,
    }
    return address_dict
请求示例:
# address_match() is defined with a plain ``def``, so it is called directly;
# ``await`` is not valid at module level and cannot be applied to the dict
# that the function returns.
address = "测试 18119990001 , 北京市北京市东城区静宁路昌运大厦4楼401号"
address_dict = address_match(address)
返回结果:
{
"province": "北京",
"province_id": 1,
"city": "北京市",
"city_id": 2,
"county": "东城区",
"county_id": 3,
"address": "静宁路昌运大厦4楼401号",
"mobile": "19919990001",
"name": "测试"
}