# 本期属于NLP-文本分析 数据预处理系列
# 本期主要处理的问题是原始数据中每一个文本是用.doc或.docx文档保存的情况,涉及如何读取.docx文档内容,然后预处理、分词后进行保存,并将所有的预处理后的文本保存为一个文档,即语料库。
# python3.6
# encoding : utf-8 -*-
# @author : YingqiuXiong
# @e-mail : 1916728303@qq.com
# @file : report_process.py
# @Time : 2022/12/12 22:03
import os
import re
import jieba
from win32com.client import Dispatch
import docx
def preprocess_text(text, stopwords):
    """
    Preprocess a Chinese document for topic modeling.

    Strips every non-Chinese character (punctuation, digits, Latin letters,
    whitespace), segments the remaining text with jieba, then drops
    single-character tokens and stopwords.

    :param text: raw document text
    :param stopwords: collection of stopwords to filter out
    :return: preprocessed document as a single space-joined token string
    """
    # Remove everything outside the CJK Unified Ideographs range
    # (punctuation, digits, Latin letters, whitespace, ...).
    text = re.sub(r"[^\u4e00-\u9fa5]", "", text)
    # NOTE: the original removed elements from word_list while iterating it,
    # which silently skips the token following each removal; build a filtered
    # list instead so every token is actually checked.
    word_list = [
        word for word in jieba.cut(text.strip())
        if len(word) >= 2 and word not in stopwords
    ]
    return " ".join(word_list)
# Build the stopword list: the file holds one stopword per line.
stopwords_path = "./data/stopwords/stopwords_cn"
with open(stopwords_path, "r", encoding="utf-8") as f:
    stopwords = [line.strip() for line in f]
# Read every Word document, tokenize/preprocess it, and append the result to
# the corpus file (one document per line).
docs_dir = "D:/pythonProject/topicModelProject/data/report"  # folder with all Word documents
assert os.path.exists(docs_dir)
store_process_report = docs_dir + "/report_processed.txt"  # output corpus file
word = Dispatch('Word.Application')  # launch the Word COM application
word.Visible = 0  # run in the background, no window
word.DisplayAlerts = 0  # suppress dialog boxes/alerts
try:
    with open(store_process_report, "a", encoding="gbk") as f:
        for doc_filename in os.listdir(docs_dir):
            doc_path = os.path.join(docs_dir, doc_filename)
            paragraphs = []
            if doc_path.endswith(".docx"):
                document = docx.Document(doc_path)
                paragraphs = [para.text for para in document.paragraphs if para.text]
            elif doc_path.endswith(".doc"):
                # Convert .doc -> .docx via Word, read it, then delete the temp file.
                # splitext (not str.replace) so a name containing ".doc" elsewhere
                # (e.g. "a.doc.report.doc") is not corrupted.
                new_path = os.path.splitext(doc_path)[0] + ".docx"
                com_doc = word.Documents.Open(FileName=doc_path, Encoding='gbk')
                try:
                    com_doc.SaveAs(new_path, 12)  # 12 == wdFormatXMLDocument (.docx)
                except Exception as e:
                    # Do NOT call word.Quit() here: the outer finally quits Word
                    # exactly once; a second Quit on a dead COM object would raise.
                    print(e)
                    break
                finally:
                    com_doc.Close()  # close the document in both success and failure paths
                document = docx.Document(new_path)
                paragraphs = [para.text for para in document.paragraphs if para.text]
                os.remove(new_path)  # clean up the temporary converted file
            else:
                print("--->无法处理:", doc_filename)
                continue
            # Skip the first two paragraphs — presumably title/metadata; TODO confirm.
            text = " ".join(paragraphs[2:])
            doc_processed = preprocess_text(text=text, stopwords=stopwords)
            print(doc_processed)
            f.write(doc_processed + "\n")
finally:
    word.Quit()  # always release the Word application, even on errors
# 感谢:
# https://www.cnblogs.com/coreLeo/p/15102481.html