省市区地址匹配——解决身份证地址缺失值问题

在做 Selenium 自动化时,发现在需要逐级点击填写省、市、区的环节,原始地址数据不完整、存在缺失值。

例如:原地址为:黑龙江省 讷河市 某某某

但需要匹配的地址其实是:黑龙江省 齐齐哈尔市 讷河市 某某某

这样就出现了无法填写的问题

为了解决这个问题,思考了很久之后,最终决定先爬取全国省、市、区的对照表,然后通过匹配运算输出补全后的完整地址。

例如:我仅仅知道【讷河市】,但是可以自动匹配出 【黑龙江省】【齐齐哈尔市】

过程不赘述了,简单来说就是用原地址的字符串循环匹配查找,需要的可以直接用

下面有几个测试例子,分别对应缺省、缺市的情况,都可以补全;但目前还存在一些小问题,后续会继续完善,完善后再发布。

需要表单的也可以在主页下载


from openpyxl import load_workbook 


def province_2(workbook,address):
    """Complete an address whose province is missing by searching every sheet.

    The first two characters of *address* are looked up in column 2 of every
    row of every worksheet. Each matching row is kept as a candidate
    ``[column 1, column 2, column 3, address with the column-2 text removed]``.
    The remainder is then scanned with a sliding two-character window to pick
    the candidate whose column-3 value contains that window.

    Args:
        workbook: Object exposing ``worksheets``; every sheet must provide
            ``max_row`` and ``cell(row, column).value`` (1-based indices).
        address: Raw address string that does not start with a province name.

    Returns:
        A 4-item list ``[province, city, area, remainder]``. If no column-3
        value matches, the first candidate is returned. If no row matches at
        all, ``["无", "无", "无", address]`` is returned.
    """
    address_province = address[0:2]
    data_list = []
    for sheet in workbook.worksheets:
        # Worksheet cells are addressed from 1, not 0.
        for row in range(1, sheet.max_row + 1):
            city = sheet.cell(row, 2).value
            # Blank rows carry no city; skip them instead of failing on
            # a membership test against None.
            if city is None or address_province not in city:
                continue
            data_list.append([
                sheet.cell(row, 1).value,
                city,
                sheet.cell(row, 3).value,
                address.replace(city, ''),
            ])

    if not data_list:
        # Nothing matched: report placeholders and hand the address back
        # untouched rather than crashing on an unbound/empty candidate list.
        return ["无", "无", "无", address]

    # The remainder computed for the most recent match drives the area scan.
    address_other = data_list[-1][3]
    for js in range(len(address_other) - 1):
        address_part = address_other[js:js + 2]
        for candidate in data_list:
            area = candidate[2]
            if area is not None and address_part in area:
                return [
                    candidate[0],
                    candidate[1],
                    area,
                    address_other.replace(area, ''),
                ]

    # No area matched: default to the first candidate.
    return list(data_list[0])
def main(workbook,address):
    """Split *address* into ``[province, city, area, remainder]``.

    The first two characters of *address* select the worksheet whose title
    contains them. Inside that sheet the rest of the address is scanned with
    a sliding two-character window, first against column 2 and, when that
    fails for the current window, against column 3. If no sheet title
    matches, the lookup is delegated to ``province_2``.

    Args:
        workbook: Object exposing ``worksheets``; every sheet must provide
            ``title``, ``max_row`` and ``cell(row, column).value`` (1-based).
        address: Raw address string.

    Returns:
        A 4-item list. Possible shapes, in order of evaluation:
        * column-3 hit without a column-2 hit: the matching row plus remainder;
        * nothing found in the sheet: ``[sheet.title, "无", "无", "无"]``;
        * exactly one column-2 row matched: ``[col1, col2, "无", remainder[1:]]``;
        * several column-2 rows matched: the row whose column 3 matches the
          remainder, or the first matched row when none does.
    """
    address_province = address[0:2]
    address_other = address[2:]

    for sheet in workbook.worksheets:
        if address_province not in sheet.title:
            continue

        # Province sheet found; drop the province suffix character before
        # looking for the city / area.
        address_other = address_other.replace('省', '')
        nrows = sheet.max_row
        data_list = []          # candidate rows: [col1, col2, col3, remainder]
        city_found = False

        js = 0
        while js + 2 <= len(address_other):
            address_part = address_other[js:js + 2]

            # City pass: collect every row whose column 2 contains the window.
            # Worksheet cells are addressed from 1, not 0.
            for row in range(1, nrows + 1):
                city = sheet.cell(row, 2).value
                # Blank cells cannot match; skip instead of raising.
                if city is not None and address_part in city:
                    city_found = True
                    address_other = address_other.replace(city, '')
                    data_list.append([
                        sheet.cell(row, 1).value,
                        city,
                        sheet.cell(row, 3).value,
                        address_other,
                    ])
            if city_found:
                break

            # Area pass: the city is missing from the address, so try the
            # same window against column 3. Scanning stops at the first
            # blank column-3 cell.
            for row in range(1, nrows + 1):
                area = sheet.cell(row, 3).value
                if area is None:
                    break
                if address_part in area:
                    return [
                        sheet.cell(row, 1).value,
                        sheet.cell(row, 2).value,
                        area,
                        address_other.replace(area, ''),
                    ]
            js += 1

        if not city_found:
            # Neither a city nor an area could be located in this sheet.
            return [sheet.title, "无", "无", "无"]

        if len(data_list) == 1:
            # Single city row: no area to choose from.
            # NOTE(review): the first character of the remainder is dropped
            # here, presumably to strip the leftover third character of a
            # three-character province name - confirm for two-character ones.
            return [data_list[0][0], data_list[0][1], "无", address_other[1:]]

        # Several rows share the city: pick the one whose area appears in
        # the remainder.
        for js in range(len(address_other) - 1):
            address_part2 = address_other[js:js + 2]
            for candidate in data_list:
                area = candidate[2]
                if area is not None and address_part2 in area:
                    return [
                        candidate[0],
                        candidate[1],
                        area,
                        address_other.replace(area, ''),
                    ]

        # No area matched (or the remainder is too short to scan): default
        # to the first row of the city.
        return list(data_list[0])

    # No sheet title matched the leading characters: the province is missing.
    return province_2(workbook,address)

def matching(address, workbook_path='C:\\Users\\Admin\\Desktop\\省市地址\\省市地址.xlsx'):
    """Load the province/city/area workbook and resolve *address* against it.

    Args:
        address: Raw address string to complete.
        workbook_path: Location of the lookup workbook. Defaults to the path
            this script was originally written for; pass another path to run
            it on a different machine.

    Returns:
        The 4-item list produced by ``main``; its items are printed in order
        as province, city, area and remaining address text.
    """
    workbook = load_workbook(workbook_path)  # lookup table
    consequence = main(workbook,address)
    print(consequence[0], consequence[1], consequence[2], consequence[3])
    return consequence


# Sample inputs used while testing; uncomment one to try it.
# address="湖南省邵东县灵官殿镇双中村金东二组31号附2号"

# address = "新疆阿图什市上阿图什镇兰干村上阿图什路007号"  # two-character province name            # done
# address = "广州市天河区下元岗东街一巷10号"              # province missing                       # done
#address = "内蒙古二连浩特市肯特街北七街坊明达小区西区13号楼5单元0202号"   # three-character province name, city missing  # done
#address = "广西北流市民乐镇会众村犁头铺组12号"                  # city level missing       # done
# address = "山东省文登市西楼街19-1号3单元305室"                                            # done
#address ="四川省泸县嘉明镇大同村一组8号"                                                  # done
address = "四川省德阳市旌阳区九顶山路69号7栋7楼1号"                      # done
matching(address)                             

                

  • 8
    点赞
  • 10
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值