import pdfplumber , os , re ,time , sys , io , copy
from docx import Document
from openpyxl import Workbook
from openpyxl import load_workbook
# Root directory to scan. The string below is a placeholder: replace it with a
# real path before running. The __main__ block passes it to get_file_name().
work_path = r'填入你的路径'
def get_file_name(work_path):
    """Collect every file found under ``work_path``, including subdirectories.

    Args:
        work_path: Directory that is traversed with ``os.walk``.

    Returns:
        A ``(file_name, file_name_path)`` tuple of two parallel lists: the
        bare file names, and the same files with their containing directory
        joined in front. Both lists follow ``os.walk`` traversal order.
    """
    # Gather (directory, name) pairs in a single walk so both result lists
    # are guaranteed to stay index-aligned.
    located = [
        (directory, entry)
        for directory, _subdirs, entries in os.walk(work_path)
        for entry in entries
    ]
    file_name = [entry for _directory, entry in located]
    file_name_path = [
        os.path.join(directory, entry) for directory, entry in located
    ]
    return file_name, file_name_path
# Encoding problems can make a file fail to load. The triple-quoted string
# below is a disabled helper (a bare string literal, so it is never executed)
# that prints chardet's guess for the first 100 bytes of each file.
# NOTE(review): as written it never appends to file_encoding, so it would
# return an empty list if it were enabled.
'''import chardet
# define a function to get the files encoding
def get_file_encoding(file_name_path):
file_encoding = []
for file in file_name_path:
# print(file)
with open(file, 'rb') as f:
# data = f.read()
# print(data)
print(chardet.detect(f.read(100)))
# print(file_encoding)
return file_encoding
print(get_file_encoding(file_name_path))'''
def loaddocx(file):
    """Load a .docx file and return the text of its paragraphs.

    Args:
        file: Path of the .docx file; it is passed straight to ``Document``.

    Returns:
        One string holding the text of every entry in ``doc.paragraphs``,
        each followed by a newline. The string is empty when the document
        has no paragraphs.
    """
    doc = Document(file)
    # Join once instead of growing a string with += inside the loop.
    return ''.join(f'{paragraph.text}\n' for paragraph in doc.paragraphs)
if __name__ == '__main__':
    # Demo entry point: list every file under work_path, then read the text
    # of each .docx file and combine it into a single string.
    file_name, file_name_path = get_file_name(work_path)
    print(file_name)
    print(file_name_path)
    # The accumulator lives outside the loop; resetting it per file would
    # throw away the text of every document except the last one.
    bodies = []
    for file in file_name_path:
        print(file)
        if not file.endswith('.docx'):
            continue
        if os.path.basename(file).startswith('~$'):
            # Word's temporary owner/lock files carry the .docx suffix but
            # are not real documents, so they are skipped.
            continue
        bodies.append(loaddocx(file))
    # Combined text of every .docx file that was read, in traversal order.
    all_body = ''.join(bodies)
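# A demo I wrote earlier that walks a directory and reads the contents of Word
# documents in Python. It is not organized as a class and uses Document from
# the docx library. Byte-encoding problems are possible; when they occur, the
# commented-out method in the code can be used to detect the encoding type.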