"""Extract specified data from a specified table in a Word document.

Provided for learning and exchange purposes.
"""
import pandas as pd
import docx
import os
def get_data_from_path(save_path, table_index=1, key_col=0, value_col=2,
                       header_rows=1):
    """Read two columns from one table of a Word document.

    Opens ``save_path`` with python-docx, selects the table at
    ``table_index`` (by default the second table of the document), skips the
    first ``header_rows`` rows, and for every remaining row collects the text
    of the ``key_col`` cell (with newlines removed) and the text of the
    ``value_col`` cell (unchanged).

    Args:
        save_path: Path of the .docx file to read.
        table_index: Zero-based index of the table to read. Defaults to 1.
        key_col: Zero-based index of the column holding the keys.
        value_col: Zero-based index of the column holding the values.
        header_rows: Number of leading rows to skip. Defaults to 1.

    Returns:
        A ``(col_keys, col_values)`` tuple of two equally long lists of
        strings. Both lists are empty when the document has no table at
        ``table_index``.
    """
    document = docx.Document(save_path)
    tables = document.tables
    col_keys = []    # key-column text, one entry per data row
    col_values = []  # value-column text, aligned with col_keys

    # A document without the requested table yields empty results rather
    # than an error.
    if not 0 <= table_index < len(tables):
        return col_keys, col_values

    for row_index, row in enumerate(tables[table_index].rows):
        if row_index < header_rows:
            continue
        cells = row.cells
        if key_col >= len(cells):
            # No key cell in this row: nothing to record.
            continue
        col_keys.append(cells[key_col].text.replace('\n', ''))
        # Keep both lists the same length even when a row is too short to
        # contain the value column, so callers can pair them by index.
        if value_col < len(cells):
            col_values.append(cells[value_col].text)
        else:
            col_values.append('')
    return col_keys, col_values
def docx2csv(wordlist_path, csv_path):
    """Convert the table data of one Word document into a CSV file.

    The key and value lists come from ``get_data_from_path``. Each key is
    joined with its value (newlines in the value become commas), the joined
    text is split on every comma, and the resulting fields form one CSV row.
    The rows are written to ``csv_path`` without index or header, encoded as
    ``utf_8_sig``. Prints ``3Q`` when done.

    Args:
        wordlist_path: Path of the source .docx file.
        csv_path: Path of the CSV file to write.
    """
    labels, contents = get_data_from_path(wordlist_path)
    table_rows = []  # one list of fields per output row
    for idx, label in enumerate(labels):
        joined = ','.join((label, contents[idx].replace('\n', ',')))
        table_rows.append(joined.split(','))
    frame = pd.DataFrame(table_rows)
    frame.to_csv(csv_path, encoding='utf_8_sig', index=False, header=None)
    print('3Q')
# Single-file run: convert one hard-coded document into one CSV file.
wordlist_path = 'E:\\数字电子技术.docx'
csv_path = r'E:\1.csv'
docx2csv(wordlist_path, csv_path)

# Batch run: convert every .docx file directly inside the folder, writing
# each CSV next to its source document.
docxs_path = r'E:\电子系课程'  # folder path
file_list = os.listdir(docxs_path)
print(file_list)
for fl in file_list:
    stem, ext = os.path.splitext(fl)
    # Keep only Word documents; compare the extension (including the dot)
    # case-insensitively so names such as "X.DOCX" are not skipped.
    if ext.lower() != ".docx":
        continue
    # Word creates "~$name.docx" owner files while a document is open; they
    # are not real documents and cannot be parsed, so skip them.
    if fl.startswith("~$"):
        continue
    docx_path = os.path.join(docxs_path, fl)  # source file path
    # Use the splitext stem so file names containing extra dots
    # (e.g. "a.b.docx") keep their full name instead of being cut at the
    # first dot.
    csv_name = stem + '.csv'  # CSV file name
    csv_path = os.path.join(docxs_path, csv_name)  # output file path
    docx2csv(docx_path, csv_path)