将某个目录下所有 Word 文档(.docx)中的内容提取出来,并存入一个 txt 文件中。
文档中内容如下:
直接上代码:
#!/usr/bin/env python
# encoding: utf-8
'''
@file: 1_word.py
@time: 2020/11/27 21:15
'''
import docx
import os
# Directory that is scanned (recursively) for Word documents.
file_dir = './doc'

# One entry per document: the text of all its paragraphs joined by spaces.
contents_list = []
# Process files inside the walk so that each name is paired with the
# directory it actually lives in. A missing directory simply yields nothing
# instead of leaving the file list undefined.
for root, dirs, files in os.walk(file_dir, topdown=True):
    dirs.sort()  # with topdown=True, sorting in place fixes the visit order
    for name in sorted(files):
        # python-docx can only open .docx packages; names starting with
        # "~$" are the temporary lock files Word creates next to an open
        # document and are not readable documents themselves.
        if name.startswith('~$') or not name.lower().endswith('.docx'):
            continue
        document = docx.Document(os.path.join(root, name))
        contents_list.append(
            ' '.join(para.text for para in document.paragraphs)
        )
print(contents_list)

# Write the corpus: one document per line, UTF-8 encoded.
with open('corpus.txt', 'w', encoding='utf8') as f:
    f.writelines(text + '\n' for text in contents_list)
效果如下: