#coding: UTF-8
import jieba
import os
from docx import Document
import tkinter as tk
from tkinter import filedialog
# 创建关键词列表
from lxml.doctestcompare import strip
# Hidden Tk root window so that only the file-selection dialogs appear.
root = tk.Tk()
root.withdraw()

# Ask the user for the keyword file: a UTF-8 text file, one keyword per line.
file_path = filedialog.askopenfilename(title=u"选择要查找关键词所在txt",
                                       filetypes=(("Txt files", "*.txt"), ("all files", "*.*")))

# Register the keywords as custom dictionary entries so jieba segments them as units.
jieba.load_userdict(file_path)

# 'with' guarantees the file is closed even if reading raises.
with open(file_path, 'r', encoding='utf-8') as key_open:
    keywords = [words.strip() for words in key_open]
print(keywords)

word_dict2 = {}  # aggregate word -> count over all documents
word_lst = []    # every token collected from every document
def word_list(word_dict, word_lst):
    """Tally each word of *word_lst* into *word_dict* (mutated in place).

    :param word_dict: dict mapping word -> count; updated in place.
    :param word_lst: iterable of words to count.
    :return: the same *word_dict* object, for convenience.
    """
    for item in word_lst:
        # dict.get with a default replaces the original membership-test branch.
        word_dict[item] = word_dict.get(item, 0) + 1
    return word_dict
def get_paragraphs_text(path):
    """Collect the text of every paragraph in a Word document.

    :param path: path to the .docx file
    :return: list of paragraph strings, e.g. ['Test', 'hello world', ...]
    """
    doc = Document(path)
    return [para.text for para in doc.paragraphs]
def get_all_tables_text(path):
    """Collect the cell text of every table in a Word document.

    :param path: path to the .docx file
    :return: 2-D list of strings, one inner list per table row,
             e.g. [['年龄', '排序'], ['23', '00'], ...]
    """
    doc = Document(path)
    return [
        [cell.text for cell in row.cells]
        for table in doc.tables
        for row in table.rows
    ]
# Choose the folder of .docx files to scan and where to save the report.
dir_path = filedialog.askdirectory(title=u"选择被查找对象文件夹")
file_path = filedialog.asksaveasfilename(defaultextension=".txt", title=u"保存查找结果",
                                         filetypes=(("Txt Files", "*.txt"), ("All Files", "*.*")))

# encoding='utf-8' is required: the keywords are Chinese, and the platform
# default codec (e.g. cp936 on Chinese Windows) may fail to encode them.
with open(file_path, 'w', encoding='utf-8') as wf1:
    domain = os.path.abspath(dir_path)  # hoisted: invariant across the loop
    for info in os.listdir(dir_path):
        print('《' + info + '》')  # show which document is being processed
        print('------------------------------')
        info1 = os.path.join(domain, info)  # full path to the .docx file
        word_lst1 = []   # tokens of this document only
        word_dict1 = {}  # per-document frequency table
        # Reuse the helpers defined above instead of re-walking the document inline.
        for text in get_paragraphs_text(info1):
            for t in jieba.cut(text):
                if len(t) > 1:  # drop single-character tokens (mostly noise)
                    word_lst.append(t)
                    word_lst1.append(t)
        for row_cells in get_all_tables_text(info1):
            for cell_text in row_cells:
                for t in jieba.cut(cell_text):
                    if len(t) > 1:
                        word_lst.append(t)
                        word_lst1.append(t)
        # Per-document counts, most frequent first.
        word_list(word_dict1, word_lst1)
        word_sort1 = sorted(word_dict1.items(), key=lambda x: x[1], reverse=True)
        wf1.write('*' * 5 + info + '*' * 5 + '\n')  # section header: the file name
        # Emit only the words that appear in the keyword list.
        for word, count in word_sort1:
            if word in keywords:
                wf1.write(word + '\t\t' + str(count) + '\n')

# Aggregate counts over all documents (computed for parity with the original
# script; the result is not written anywhere at present).
word_list(word_dict2, word_lst)
word_sort = sorted(word_dict2.items(), key=lambda x: x[1], reverse=True)
# 使用多个关键字对word进行批量查找统计
# (page footer left over from the source web page, not code: "最新推荐文章于 2023-07-12 23:02:55 发布")