xlsx和xls表格解析之跨列情况

冰吸生椰拿铁.

已于 2024-01-13 17:29:23 修改

阅读量440

点赞数 8

文章标签： python 爬虫

于 2024-01-13 17:28:45 首次发布

本文链接：https://blog.csdn.net/weixin_66451233/article/details/135573038

版权

采集过程中碰到了难处理的跨行跨列问题，先把比较简单的代码放在这里，后面再思考通用框架该怎么做。

xls情况

# Parse an .xls sheet whose header spans two rows, build one dict per data
# row, then split each row into one dict per tax column it contains.
# NOTE(review): assumes `xlrd` has already been imported before this point.
workbook = xlrd.open_workbook('anqing.xls')
sheet = workbook.sheet_by_name('Sheet1')

# Find the first header row: the first row that contains the keyword cell.
key_row_index = None
for row_index in range(sheet.nrows):
    if '纳税人名称' in sheet.row_values(row_index):
        key_row_index = row_index
        break
if key_row_index is None:
    # Fail loudly instead of passing a non-integer index to row_values().
    raise ValueError("header row containing '纳税人名称' not found")

header = sheet.row_values(key_row_index)
row = sheet.row_values(key_row_index + 1)  # second row of the two-row header
# Merge the two header rows column by column: take the first-row title when
# it is non-empty, otherwise fall back to the second-row title.
combined_header = [h1 or h2 for h1, h2 in zip(header, row)]

row_lst = []  # not used below; kept so existing module-level names are unchanged
example_dicts = []
# Placeholder column names; replace with the real tax column titles.
tax_fields = ['xx税', 'xx税', 'xx税']

# Data starts right after the two header rows. Empty cells are dropped.
for index in range(key_row_index + 2, sheet.nrows):
    row = sheet.row_values(index)
    # zip() pairs titles with cells and cannot index past a short row.
    row_dict = {key: value for key, value in zip(combined_header, row) if value != ''}
    example_dicts.append(row_dict)

# For every tax column present in a row, emit a copy of the row that keeps
# all non-tax columns plus that single tax column.
split_dicts = []
for d in example_dicts:
    for tax in tax_fields:
        if tax in d:
            new_dict = {k: d[k] for k in d if k not in tax_fields or k == tax}
            split_dicts.append(new_dict)

for split_dict in split_dicts:
    print(split_dict)

xlsx情况

# Parse an .xlsx sheet whose header spans two rows, build one dict per data
# row, then split each row into one dict per tax column it contains.
# NOTE(review): assumes `openpyxl` has already been imported before this point.
workbook = openpyxl.load_workbook('hainan.xlsx')
sheet = workbook['排序版']

# Find the first header row: the first row that contains the keyword cell.
# openpyxl row numbers are 1-based, taken here from the row's first cell.
key_row_index = None
for row in sheet.iter_rows():
    if '纳税人名称' in [cell.value for cell in row]:
        key_row_index = row[0].row
        break
if key_row_index is None:
    # Fail loudly instead of indexing the sheet with a non-integer key.
    raise ValueError("header row containing '纳税人名称' not found")

header1 = [cell.value for cell in sheet[key_row_index]]
header2 = [cell.value for cell in sheet[key_row_index + 1]]
# Merge the two header rows column by column: take the first-row title when
# it is set, otherwise fall back to the second-row title.
combined_header = [h1 or h2 for h1, h2 in zip(header1, header2)]

# Placeholder column names; replace with the real tax column titles.
tax_fields = ['xx税', 'xx税', 'xx税']

# Start from a fresh list so rows from any earlier run are not mixed in
# (the original never initialised this name in the xlsx script).
example_dicts = []
# Data starts right after the two header rows. Empty cells are dropped.
for row in sheet.iter_rows(min_row=key_row_index + 2, values_only=True):
    row_dict = {k: v for k, v in zip(combined_header, row) if v is not None and v != ''}
    example_dicts.append(row_dict)

# For every tax column present in a row, emit a copy of the row that keeps
# all non-tax columns plus that single tax column.
split_dicts = []
for d in example_dicts:
    for tax in tax_fields:
        if tax in d:
            new_dict = {k: d[k] for k in d if k not in tax_fields or k == tax}
            split_dicts.append(new_dict)

for split_dict in split_dicts:
    print(split_dict)  # was `spilt_dict`, which raised NameError