前言:
如何从文章内提取地址?方法有很多种,我这边提一个我的思路。
1、整理一份标准地址库,至少包括以下几列:省、市、镇、街道、路。
有人会说整理很难,没错,很难找到成熟的地址库(这种数据都是能卖钱的),通过政府官网可以获得的数据也仅仅是省、市、镇、街道/居委之类的,如果要整理成一份详尽的地址库,难度很高。
以上海为例,我主要是通过百度文库获得了一份相对来说比较全的地址库,但也仅仅是路和区的对应表,无法整理到镇或街道。所以我又在上海统计局官网中找到了居委会和区的对应表,经过多次整理获得一份相对比较准确的清单。
2、获得地址库后,只需要从最大的级别(省)开始,逐级向下(市、区、镇、街道)依次遍历,获取可能的地址字符串。
比如以下文本
可以很清楚看到就是办理地点后面的内容,是一个标准化的格式。
我们也可以先找南安市,找到后再找后面是否有“号”之类的字,根据我手上的数据统计,90%以上的地址最后是以“号”、“室”之类结尾的,加上正则表达式即可完成提取。
3、提取后的内容可能会有杂音,比如上例中的“著名的”,当然一般来说都是比较干净的。
代码:
代码比较烂,没有特别整理,仅供参考
# Keywords whose presence marks a candidate as a detailed street address.
_ADDRESS_END_KEYWORDS = (
    '室', '栋', '楼', '幢', '座', '号', '巷', '对面', '米', '侧', '旁', '出口',
    '路东', '路西', '路南', '路北', '楼上', '铺', '路', '大道',
)

# Greedy match: the longest prefix that ends in an address-like character.
_ADDRESS_TAIL_RE = re.compile('.*[0-9a-zA-Z/之号室栋楼幢座号巷对面米侧旁出口路东西南北上铺站]')

# A candidate ends at the first ASCII comma, full-width comma (U+FF0C) or
# whitespace. The full-width comma is written as an escape so that it
# survives punctuation normalisation of this source file.
_ADDRESS_SPLIT_RE = re.compile(r'[,\uff0c\s]')


def possible_address(possible_position, html_text):
    """Try to extract a detailed address starting at ``possible_position``.

    The candidate is the text from ``possible_position`` up to the first
    comma (ASCII or full-width) or whitespace character. It is accepted
    only if the first occurrence of one of the address keywords lies at
    offset 4..30 (inclusive) inside the candidate.

    Args:
        possible_position: Index in ``html_text`` where a place name starts.
        html_text: The text being searched.

    Returns:
        The longest prefix of the candidate that ends in an address-like
        character (digit, ASCII letter, ``/`` or one of the terminal
        characters in the pattern). If the candidate has no such character,
        the candidate up to and including the matched keyword is returned.
        ``None`` when no keyword qualifies.
    """
    possible_text = _ADDRESS_SPLIT_RE.split(html_text[possible_position:], 1)[0]
    for keyword in _ADDRESS_END_KEYWORDS:
        keyword_position = possible_text.find(keyword)
        # find() returns -1 when absent, which the range check also rejects.
        if not 4 <= keyword_position <= 30:
            continue
        match = _ADDRESS_TAIL_RE.search(possible_text)
        if match:
            return match.group(0)
        # Reached for keywords such as '大道' whose characters are not in
        # the tail pattern: fall back to cutting right after the keyword
        # instead of failing on a missing match.
        return possible_text[:keyword_position + len(keyword)]
    return None
def _scan_level(names, html_text, prefix):
    """Scan one administrative level for a detailed address.

    For every name in ``names`` that occurs in ``html_text``, try to extract
    a detailed address starting at the name's first occurrence.

    Returns:
        A pair ``(address, prefix)``. ``address`` is the extracted address,
        with the incoming ``prefix`` prepended unless the address already
        contains it, or ``None`` if no name at this level produced one.
        ``prefix`` is the incoming prefix extended with every matched name
        that did not yield a detailed address.
    """
    for name in names:
        position = html_text.find(name)
        if position == -1:
            continue
        address = possible_address(position, html_text)
        if address:
            # An empty prefix is contained in every string, so nothing is
            # prepended in that case.
            if prefix not in address:
                address = prefix + address
            return address, prefix
        prefix += name
    return None, prefix


def get_right_address(html_text):
    """Extract an address from ``html_text`` using the place-name lists.

    Levels are scanned strictly in order: province, city, district,
    township, street. A level is reached only when no earlier level
    produced a detailed address. The name lists (``new_province_list``,
    ``new_city_list``, ``district_list``, ``township_list``,
    ``street_list``) are module-level names defined outside this function;
    presumably lists of place-name strings -- confirm against the loader.

    Args:
        html_text: The text to search.

    Returns:
        The first detailed address found by ``possible_address``. From the
        city level downwards it is prefixed with the place names collected
        so far, unless it already contains them. If no detailed address is
        found, the collected place names are returned (possibly ``''``).
    """
    temp_address = ''

    # Province level: a detailed address found here is returned as is.
    province_count = 0
    for province in new_province_list:
        province_position = html_text.find(province)
        if province_position == -1:
            continue
        right_address = possible_address(province_position, html_text)
        if right_address:
            return right_address
        province_count += 1
        # Keep the province as a prefix only while exactly one province has
        # matched; a second match clears it.
        temp_address = province if province_count == 1 else ''

    # Lower levels share the same logic and carry the prefix forward.
    for names in (new_city_list, district_list, township_list, street_list):
        right_address, temp_address = _scan_level(names, html_text, temp_address)
        if right_address:
            return right_address

    return temp_address