# 本期属于NLP-文本分析 数据预处理系列
# 本期主要处理的问题是原始数据中每一个文本是用.doc或.docx文档保存的情况,涉及如何读取.docx文档内容,然后预处理、分词后进行保存,并将所有的预处理后的文本保存为一个文档,即语料库。
# python3.6
# encoding : utf-8 -*-
# @author : YingqiuXiong
# @e-mail : 1916728303@qq.com
# @file : report_process.py
# @Time : 2022/12/12 22:03
import os
import re
import jieba
from win32com.client import Dispatch
import docx
def preprocess_text(text, stopwords):
    """
    Preprocess a Chinese document for topic modeling.

    Strips every non-Chinese character (punctuation, digits, Latin letters,
    whitespace), segments the remaining text with jieba, then drops
    single-character tokens and stopwords.

    :param text: raw document text
    :param stopwords: collection of stopwords to filter out
    :return: preprocessed document as a single space-joined token string
    """
    # Remove everything outside the CJK Unified Ideographs range
    # (punctuation, digits, Latin letters, whitespace, ...).
    text = re.sub(r"[^\u4e00-\u9fa5]", "", text)
    # NOTE: the original removed elements from word_list while iterating it,
    # which silently skips the token following each removal; build a filtered
    # list instead so every token is actually checked.
    word_list = [
        word for word in jieba.cut(text.strip())
        if len(word) >= 2 and word not in stopwords
    ]
    return " ".join(word_list)
# Build the stopword list: the file holds one stopword per line.
stopwords_path = "./data/stopwords/stopwords_cn"
with open(stopwords_path, "r", encoding="utf-8") as f:
    stopwords = [line.strip() for line in f]
# Read every Word document, tokenize/preprocess it, and append the result to
# the corpus file (one document per line).
docs_dir = "D:/pythonProject/topicModelProject/data/report"  # folder with all Word documents
assert os.path.exists(docs_dir)
store_process_report = docs_dir + "/report_processed.txt"  # output corpus file
word = Dispatch('Word.Application')  # launch the Word COM application
word.Visible = 0  # run in the background, no window
word.DisplayAlerts = 0  # suppress dialog boxes/alerts
try:
    with open(store_process_report, "a", encoding="gbk") as f:
        for doc_filename in os.listdir(docs_dir):
            doc_path = os.path.join(docs_dir, doc_filename)
            paragraphs = []
            if doc_path.endswith(".docx"):
                document = docx.Document(doc_path)
                paragraphs = [para.text for para in document.paragraphs if para.text]
            elif doc_path.endswith(".doc"):
                # Convert .doc -> .docx via Word, read it, then delete the temp file.
                # splitext (not str.replace) so a name containing ".doc" elsewhere
                # (e.g. "a.doc.report.doc") is not corrupted.
                new_path = os.path.splitext(doc_path)[0] + ".docx"
                com_doc = word.Documents.Open(FileName=doc_path, Encoding='gbk')
                try:
                    com_doc.SaveAs(new_path, 12)  # 12 == wdFormatXMLDocument (.docx)
                except Exception as e:
                    # Do NOT call word.Quit() here: the outer finally quits Word
                    # exactly once; a second Quit on a dead COM object would raise.
                    print(e)
                    break
                finally:
                    com_doc.Close()  # close the document in both success and failure paths
                document = docx.Document(new_path)
                paragraphs = [para.text for para in document.paragraphs if para.text]
                os.remove(new_path)  # clean up the temporary converted file
            else:
                print("--->无法处理:", doc_filename)
                continue
            # Skip the first two paragraphs — presumably title/metadata; TODO confirm.
            text = " ".join(paragraphs[2:])
            doc_processed = preprocess_text(text=text, stopwords=stopwords)
            print(doc_processed)
            f.write(doc_processed + "\n")
finally:
    word.Quit()  # always release the Word application, even on errors
# 感谢:
# https://www.cnblogs.com/coreLeo/p/15102481.html