#coding: UTF-8
import jieba
import os
from docx import Document
import tkinter as tk
from tkinter import filedialog
# 创建关键词列表
from lxml.doctestcompare import strip
# Hidden Tk root window so that only the file-selection dialogs appear.
root = tk.Tk()
root.withdraw()

# Ask the user for the keyword file: a UTF-8 text file, one keyword per line.
file_path = filedialog.askopenfilename(title=u"选择要查找关键词所在txt",
                                       filetypes=(("Txt files", "*.txt"), ("all files", "*.*")))

# Register the keywords as custom dictionary entries so jieba segments them as units.
jieba.load_userdict(file_path)

# 'with' guarantees the file is closed even if reading raises.
with open(file_path, 'r', encoding='utf-8') as key_open:
    keywords = [words.strip() for words in key_open]
print(keywords)

word_dict2 = {}  # aggregate word -> count over all documents
word_lst = []    # every token collected from every document
def word_list(word_dict, word_lst):
    """Tally each word of *word_lst* into *word_dict* (mutated in place).

    :param word_dict: dict mapping word -> count; updated in place.
    :param word_lst: iterable of words to count.
    :return: the same *word_dict* object, for convenience.
    """
    for item in word_lst:
        # dict.get with a default replaces the original membership-test branch.
        word_dict[item] = word_dict.get(item, 0) + 1
    return word_dict
def get_paragraphs_text(path):
    """Collect the text of every paragraph in a Word document.

    :param path: path to the .docx file
    :return: list of paragraph strings, e.g. ['Test', 'hello world', ...]
    """
    doc = Document(path)
    return [para.text for para in doc.paragraphs]
def get_all_tables_text(path):
    """Collect the cell text of every table in a Word document.

    :param path: path to the .docx file
    :return: 2-D list of strings, one inner list per table row,
             e.g. [['年龄', '排序'], ['23', '00'], ...]
    """
    doc = Document(path)
    return [
        [cell.text for cell in row.cells]
        for table in doc.tables
        for row in table.rows
    ]
# Choose the folder of .docx files to scan and where to save the report.
dir_path = filedialog.askdirectory(title=u"选择被查找对象文件夹")
file_path = filedialog.asksaveasfilename(defaultextension=".txt", title=u"保存查找结果",
                                         filetypes=(("Txt Files", "*.txt"), ("All Files", "*.*")))

# encoding='utf-8' is required: the keywords are Chinese, and the platform
# default codec (e.g. cp936 on Chinese Windows) may fail to encode them.
with open(file_path, 'w', encoding='utf-8') as wf1:
    domain = os.path.abspath(dir_path)  # hoisted: invariant across the loop
    for info in os.listdir(dir_path):
        print('《' + info + '》')  # show which document is being processed
        print('------------------------------')
        info1 = os.path.join(domain, info)  # full path to the .docx file
        word_lst1 = []   # tokens of this document only
        word_dict1 = {}  # per-document frequency table
        # Reuse the helpers defined above instead of re-walking the document inline.
        for text in get_paragraphs_text(info1):
            for t in jieba.cut(text):
                if len(t) > 1:  # drop single-character tokens (mostly noise)
                    word_lst.append(t)
                    word_lst1.append(t)
        for row_cells in get_all_tables_text(info1):
            for cell_text in row_cells:
                for t in jieba.cut(cell_text):
                    if len(t) > 1:
                        word_lst.append(t)
                        word_lst1.append(t)
        # Per-document counts, most frequent first.
        word_list(word_dict1, word_lst1)
        word_sort1 = sorted(word_dict1.items(), key=lambda x: x[1], reverse=True)
        wf1.write('*' * 5 + info + '*' * 5 + '\n')  # section header: the file name
        # Emit only the words that appear in the keyword list.
        for word, count in word_sort1:
            if word in keywords:
                wf1.write(word + '\t\t' + str(count) + '\n')

# Aggregate counts over all documents (computed for parity with the original
# script; the result is not written anywhere at present).
word_list(word_dict2, word_lst)
word_sort = sorted(word_dict2.items(), key=lambda x: x[1], reverse=True)
# 使用多个关键字对word进行批量查找统计
# (page footer left over from the source web page, not code: "最新推荐文章于 2023-07-12 23:02:55 发布")