# 采集过程中遇到比较棘手的表头跨行、跨列问题,先把较简单的处理代码放在这里,后续再考虑框架该怎么设计。
# xls 情况(使用 xlrd 读取)
def xls_find_header_row(rows, keyword):
    """Return the 0-based index of the first row that contains *keyword*.

    Args:
        rows: Iterable of row value sequences (e.g. ``sheet.row_values(i)``).
        keyword: Cell value that identifies the header row.

    Returns:
        The index of the first matching row, or ``None`` when no row matches.
    """
    for index, values in enumerate(rows):
        if keyword in values:
            return index
    return None


def xls_merge_header_rows(top, bottom):
    """Merge a two-row header into one list of column names.

    For each column the top cell is used when it is truthy, otherwise the
    cell from the second header row is used.
    """
    return [h1 or h2 for h1, h2 in zip(top, bottom)]


def xls_row_to_dict(header, row):
    """Map column names to cell values, skipping cells equal to ``''``."""
    return {name: value for name, value in zip(header, row) if value != ''}


def xls_split_by_tax(records, tax_fields):
    """Emit one dict per tax column present in each record.

    Every emitted dict keeps all non-tax keys of the source record plus
    exactly one of the tax keys.

    Args:
        records: Iterable of row dicts.
        tax_fields: Column names treated as tax columns.

    Returns:
        A list of dicts, ordered by record and then by ``tax_fields`` order.
    """
    tax_names = set(tax_fields)
    result = []
    for record in records:
        for tax in tax_fields:
            if tax in record:
                result.append(
                    {k: v for k, v in record.items()
                     if k not in tax_names or k == tax}
                )
    return result


if __name__ == "__main__":
    # Imported locally so this snippet is self-contained when run as a script.
    import xlrd

    workbook = xlrd.open_workbook('anqing.xls')
    sheet = workbook.sheet_by_name('Sheet1')

    # Locate the title row by its keyword cell.
    key_row_index = xls_find_header_row(
        (sheet.row_values(i) for i in range(sheet.nrows)), '纳税人名称'
    )
    if key_row_index is None:
        raise ValueError("header keyword '纳税人名称' not found in sheet")
    if key_row_index + 1 >= sheet.nrows:
        raise ValueError("second header row is missing below the title row")

    header = sheet.row_values(key_row_index)
    row = sheet.row_values(key_row_index + 1)  # second row of the two-row header
    # Combine both header rows into the complete list of column names.
    combined_header = xls_merge_header_rows(header, row)

    tax_fields = ['xx税', 'xx税', 'xx税']

    # One dict per data row, keyed by the combined header.
    example_dicts = [
        xls_row_to_dict(combined_header, sheet.row_values(index))
        for index in range(key_row_index + 2, sheet.nrows)
    ]

    split_dicts = xls_split_by_tax(example_dicts, tax_fields)
    for split_dict in split_dicts:
        print(split_dict)
# xlsx 情况(使用 openpyxl 读取)
def xlsx_find_header_row(rows, keyword, first_row=1):
    """Return the row number of the first row that contains *keyword*.

    Args:
        rows: Iterable of row value sequences.
        keyword: Cell value that identifies the header row.
        first_row: Row number assigned to the first item of *rows*
            (worksheet rows are 1-based).

    Returns:
        The row number of the first matching row, or ``None`` if none match.
    """
    for row_number, values in enumerate(rows, start=first_row):
        if keyword in values:
            return row_number
    return None


def xlsx_merge_header_rows(top, bottom):
    """Merge a two-row header into one list of column names.

    For each column the top cell is used when it is truthy, otherwise the
    cell from the second header row is used.
    """
    return [h1 or h2 for h1, h2 in zip(top, bottom)]


def xlsx_row_to_dict(header, row):
    """Map column names to cell values, skipping ``None`` and ``''`` cells."""
    return {k: v for k, v in zip(header, row) if v is not None and v != ''}


def xlsx_split_by_tax(records, tax_fields):
    """Emit one dict per tax column present in each record.

    Every emitted dict keeps all non-tax keys of the source record plus
    exactly one of the tax keys.

    Args:
        records: Iterable of row dicts.
        tax_fields: Column names treated as tax columns.

    Returns:
        A list of dicts, ordered by record and then by ``tax_fields`` order.
    """
    tax_names = set(tax_fields)
    result = []
    for record in records:
        for tax in tax_fields:
            if tax in record:
                result.append(
                    {k: v for k, v in record.items()
                     if k not in tax_names or k == tax}
                )
    return result


if __name__ == "__main__":
    # Imported locally so this snippet is self-contained when run as a script.
    import openpyxl

    workbook = openpyxl.load_workbook('hainan.xlsx')
    sheet = workbook['排序版']

    # Locate the title row by its keyword cell (row numbers are 1-based).
    key_row_index = xlsx_find_header_row(
        sheet.iter_rows(values_only=True), '纳税人名称'
    )
    if key_row_index is None:
        raise ValueError("header keyword '纳税人名称' not found in sheet")

    header1 = [cell.value for cell in sheet[key_row_index]]
    header2 = [cell.value for cell in sheet[key_row_index + 1]]
    combined_header = xlsx_merge_header_rows(header1, header2)

    tax_fields = ['xx税', 'xx税', 'xx税']

    # Start from a fresh list so rows from any earlier section are not mixed in.
    example_dicts = [
        xlsx_row_to_dict(combined_header, row)
        for row in sheet.iter_rows(min_row=key_row_index + 2, values_only=True)
    ]

    split_dicts = xlsx_split_by_tax(example_dicts, tax_fields)
    for split_dict in split_dicts:
        print(split_dict)