在做 Selenium 自动化的过程中,我发现在需要逐级点击填写省、市、区的环节,原始地址数据不完整、存在缺失值。
例如:原地址为:黑龙江省 讷河市 某某某
但需要匹配的地址其实是:黑龙江省 齐齐哈尔市 讷河市 某某某
这样就出现了无法填写的问题
为了解决这个问题,思考了很久之后,最终决定先爬取全国省、市、区的表单,然后通过匹配运算把补全后的完整地址输出。
例如:我仅仅知道【讷河市】,但是可以自动匹配出 【黑龙江省】【齐齐哈尔市】
过程不赘述了,简单来说就是用原地址的字符串循环匹配查找,需要的可以直接用
下面有几个测试例子,分别对应缺少省、缺少市的情况,都可以补全。目前还存在一些小问题,后续会继续完善,完善后再发布。
需要表单的也可以在主页下载
from openpyxl import load_workbook
def province_2(workbook, address):
    """Complete an address that starts with a city name instead of a province.

    The first two characters of ``address`` are used as a city-name fragment.
    Every worksheet is scanned for rows whose column-2 cell contains that
    fragment. The matching rows are then narrowed down by sliding a
    two-character window over the rest of the address and looking for it in
    the column-3 value of those rows.

    Args:
        workbook: Object exposing ``worksheets``; each sheet must provide
            ``max_row`` and ``cell(row, column).value`` (an openpyxl workbook
            in this file). Columns 1, 2 and 3 are read as province, city and
            district.
        address: Raw address string.

    Returns:
        A new 4-item list ``[province, city, district, remainder]``.
        When a city matches but no district does, the first matching row is
        returned. When nothing matches at all (or ``address`` is empty) the
        result is ``["无", "无", "无", address]`` instead of an exception.
    """
    unknown = "无"
    city_fragment = address[:2]
    candidates = []
    remainder = address

    # An empty fragment is contained in every string, so skip the scan then.
    if city_fragment:
        for sheet in workbook.worksheets:
            # Worksheet rows are 1-based.
            for row in range(1, sheet.max_row + 1):
                city = sheet.cell(row, 2).value
                # Blank cells come back as None; they can never match.
                if not isinstance(city, str) or city_fragment not in city:
                    continue
                # The remainder of the most recent match drives the district
                # search below; each row also records its own remainder.
                remainder = address.replace(city, '')
                candidates.append([
                    sheet.cell(row, 1).value,
                    city,
                    sheet.cell(row, 3).value,
                    remainder,
                ])

    if not candidates:
        return [unknown, unknown, unknown, address]

    # Slide a two-character window over what is left of the address and take
    # the first candidate row whose district contains it.
    for start in range(len(remainder) - 1):
        window = remainder[start:start + 2]
        for province, city, district, _ in candidates:
            if isinstance(district, str) and window in district:
                return [province, city, district, remainder.replace(district, '')]

    # No district recognised: fall back to the first row of the matched city.
    return list(candidates[0])
def _complete_within_province(sheet, remainder):
    """Resolve city and district for ``remainder`` using one province sheet.

    ``remainder`` is the address without its leading province fragment.
    Returns a new list ``[province, city, district, remainder]``.
    """
    unknown = "无"
    last_row = sheet.max_row
    candidates = []

    # Slide a two-character window over the address. For each window try the
    # city column first, then the district column.
    for start in range(len(remainder) - 1):
        window = remainder[start:start + 2]

        for row in range(1, last_row + 1):  # worksheet rows are 1-based
            city = sheet.cell(row, 2).value
            if isinstance(city, str) and window in city:
                remainder = remainder.replace(city, '')
                candidates.append([
                    sheet.cell(row, 1).value,
                    city,
                    sheet.cell(row, 3).value,
                    remainder,
                ])
        if candidates:
            # City found; its rows are narrowed down to a district below.
            break

        for row in range(1, last_row + 1):
            district = sheet.cell(row, 3).value
            # Rows without a district are skipped rather than ending the
            # scan, so districts listed after such a row are still found.
            if isinstance(district, str) and window in district:
                # The address names the district directly (city missing).
                return [
                    sheet.cell(row, 1).value,
                    sheet.cell(row, 2).value,
                    district,
                    remainder.replace(district, ''),
                ]

    if not candidates:
        # Neither a city nor a district was recognised in this province.
        return [sheet.title, unknown, unknown, unknown]

    if len(candidates) == 1:
        # NOTE(review): the original drops the first character of the
        # remainder in this branch; kept unchanged. Presumably it strips a
        # leftover province character - confirm against the data.
        return [candidates[0][0], candidates[0][1], unknown, remainder[1:]]

    for start in range(len(remainder) - 1):
        window = remainder[start:start + 2]
        for province, city, district, _ in candidates:
            if isinstance(district, str) and window in district:
                return [province, city, district, remainder.replace(district, '')]

    # No district recognised (or nothing left to search): default to the
    # first row of the matched city.
    return list(candidates[0])


def main(workbook, address):
    """Split ``address`` into province, city, district and the remainder.

    The first two characters of ``address`` are looked up in the worksheet
    titles. The first sheet whose title contains them is used to resolve the
    city (column 2) and district (column 3). When no title matches, the
    lookup is delegated to ``province_2``, which starts from the city.

    Args:
        workbook: Object exposing ``worksheets``; each sheet must provide
            ``title``, ``max_row`` and ``cell(row, column).value``.
        address: Raw address string.

    Returns:
        A 4-item list ``[province, city, district, remainder]``. Parts that
        cannot be determined are reported as ``"无"``. An empty ``address``
        yields ``["无", "无", "无", address]``.
    """
    unknown = "无"
    province_fragment = address[:2]
    if not province_fragment:
        # An empty fragment would "match" the first sheet title.
        return [unknown, unknown, unknown, address]

    for sheet in workbook.worksheets:
        if province_fragment not in sheet.title:
            continue
        # Drop only the first 省 (the province suffix); later occurrences
        # belong to the street part of the address.
        remainder = address[2:].replace('省', '', 1)
        return _complete_within_province(sheet, remainder)

    # No sheet title matched: the address probably starts with a city.
    return province_2(workbook, address)
def matching(address, workbook_path='C:\\Users\\Admin\\Desktop\\省市地址\\省市地址.xlsx'):
    """Load the province/city/district workbook and complete ``address``.

    Args:
        address: Raw address string to complete.
        workbook_path: Location of the lookup workbook. Defaults to the path
            the script originally hard-coded, so existing callers are
            unaffected.

    Returns:
        The 4-item list ``[province, city, district, remainder]`` produced by
        ``main``. The four parts are also printed to stdout.
    """
    workbook = load_workbook(workbook_path)  # lookup table
    consequence = main(workbook, address)
    province = consequence[0]
    city = consequence[1]
    area = consequence[2]
    other = consequence[3]
    print(province, city, area, other)
    return consequence
# Sample inputs; the trailing notes are the author's own remarks on each case.
# Uncomment one line to try it.
# address="湖南省邵东县灵官殿镇双中村金东二组31号附2号"
# address = "新疆阿图什市上阿图什镇兰干村上阿图什路007号" # two-character province  # done
# address = "广州市天河区下元岗东街一巷10号" # province missing  # done
#address = "内蒙古二连浩特市肯特街北七街坊明达小区西区13号楼5单元0202号" # three-character province, city missing  # done
#address = "广西北流市民乐镇会众村犁头铺组12号" # city level missing  # done
# address = "山东省文登市西楼街19-1号3单元305室" # done
#address ="四川省泸县嘉明镇大同村一组8号" # done
address = "四川省德阳市旌阳区九顶山路69号7栋7楼1号" # done
# Runs on import/execution: loads the workbook and prints the completed address.
matching(address)