#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File : 文件查重.py
# Time :2023/9/2 13:28
# Author :QQ736592720
# pip install python-docx
# pip install pypiwin32
# pip install pyinstaller
# pyinstaller -F 文件查重.py
# pyinstaller -F -i 128.ico 文件查重.py
import glob
import os
import re
from docx import Document
# from win32com import client as wc
# def doc_to_docx(oldfilename, newfilename):
# word = wc.Dispatch("Word.Application")
# doc = word.Documents.Open(oldfilename)
# doc.SaveAs(newfilename, 12)
# doc.Close()
# word.Quit()
def get_docx_list(filename):
    """
    Read a .docx file and return its text as a list of sentence-sized pieces.

    Each paragraph that contains more than one sentence delimiter
    ("。" or ";") is split on those delimiters; empty fragments are dropped
    and every remaining fragment is terminated with "。". Any other
    paragraph (including an empty one) is kept unchanged as a single item.

    The file is opened for reading only and is never written back to disk.

    :param filename: path of the .docx file to read
    :return: list of str, in document order
    """
    doc = Document(filename)
    pieces = []
    for paragraph in doc.paragraphs:
        text = paragraph.text
        if len(re.findall("[。;]", text)) > 1:
            # Several sentences in one paragraph: check each one on its own.
            fragments = re.split("。|;", text)
            pieces.extend(fragment + "。" for fragment in fragments if fragment)
        else:
            pieces.append(text)
    return pieces
def get_docx_dic(filenames):
    """
    Load the reference documents into a dictionary.

    The key is the file's base name with its last five characters (the
    ".docx" suffix) removed, e.g. r"C:\\Users\\999\\Desktop\\查重\\A1.docx"
    becomes "A1". The value is the list returned by get_docx_list() for
    that file, joined with newlines.

    :param filenames: iterable of .docx file paths
    :return: dict mapping short file name -> document text
    """
    return {
        os.path.basename(path)[:-5]: "\n".join(get_docx_list(path))
        for path in filenames
    }
def find_in_dic(s, dic):
    """
    Look for the sentence ``s`` inside the reference texts held in ``dic``.

    The references are scanned in dictionary order and the search stops at
    the first text that contains ``s``; a message naming that file is
    printed.

    :param s: sentence to search for
    :param dic: dict mapping short file name -> document text
    :return: the key of the first reference containing ``s``, or None
    """
    for name, text in dic.items():
        if text.find(s) == -1:
            continue
        print("Author:QQ736592720--- repf found in : " + name + ".docx")
        return name
    return None
def save_text(filename, sss):
    """
    Write ``sss`` to ``filename`` as UTF-8 text, replacing any existing file,
    and print a confirmation message containing the file name.

    :param filename: path of the text file to create
    :param sss: text to write
    """
    with open(filename, "w", encoding="utf-8") as out:
        out.write(sss)
    notice = "Author:QQ736592720---文件保存成功 :" + filename
    print(notice)
def repf_search(ls, dic, weight):
    """
    Mark every line of the target document that also occurs in a reference.

    For each line, surrounding whitespace and all "。" / ";" characters are
    removed, and everything before the first Chinese character (leading
    numbering and the like) is skipped. If what remains is longer than
    ``weight`` characters it is looked up with find_in_dic(). On a hit the
    tag "[repf:<key>]" is added to the original line: appended when the line
    ends with a Chinese character, otherwise inserted just before the last
    character (so it lands in front of the closing punctuation).

    Tagged lines are written back into ``ls`` in place.

    :param ls: list of lines of the document being checked (modified in place)
    :param dic: dict mapping short file name -> reference text
    :param weight: minimum length a sentence must exceed to be checked
    :return: new list holding every line of ``ls``, tagged where a match was found
    """
    han = re.compile("[\u4e00-\u9fa5]")
    result = []
    for idx, raw in enumerate(ls):
        cleaned = raw.strip().replace("。", "").replace(";", "")
        key = None
        first_han = han.search(cleaned)
        if first_han is not None:
            kw = cleaned[first_han.start():]  # start at the first Chinese character
            if len(kw) > weight:
                key = find_in_dic(kw, dic)
        if key:
            tag = "[repf:" + key + "]"
            tail = raw[-1]
            if han.search(tail):
                ls[idx] = raw + tag
            else:
                ls[idx] = raw[:-1] + tag + tail
        result.append(ls[idx])
    return result
def main(root, weight):
    """
    Cross-check every .docx file in ``root`` against all the others.

    Each file in turn is the target: the remaining files are loaded as
    references, the target's sentences are searched in them, and the tagged
    text is saved next to the target as "<name>_result.txt".

    :param root: directory containing the .docx files,
                 e.g. C:\\Users\\999\\Desktop\\查重
    :param weight: minimum length a sentence must exceed to be checked
    """
    candidates = glob.glob(os.path.join(root, "*.docx"))
    # Microsoft Word keeps a "~$name.docx" owner file beside a document that
    # is currently open. It matches the pattern but is not a docx package, so
    # it is skipped instead of being opened.
    files = [
        path for path in candidates
        if not os.path.basename(path).startswith("~$")
    ]
    for file in files:
        print("Author:QQ736592720---当前目标文件:" + file)
        save_file = file[:-5] + "_result.txt"
        # Every other document in the directory acts as a reference.
        others = [path for path in files if path != file]
        dic = get_docx_dic(others)
        ls = get_docx_list(file)
        ls = repf_search(ls, dic, weight)
        save_text(save_file, "\n".join(ls))
if __name__ == '__main__':
    # Only sentences longer than this many characters are checked.
    weight = 15
    # The documents are taken from the current working directory.
    root = os.getcwd()
    print("Author:QQ736592720---程序开始......")
    # Build the message from `weight` so the log cannot drift from the value
    # that is actually used.
    print("Author:QQ736592720---当前文字起步权重设置 = " + str(weight))
    main(root, weight)
    print("Author:QQ736592720---运行结束")
'''
doc和docx格式不通用(本脚本只处理docx),字符编码出现乱码的情况没有考虑,
还有在word副本文件中把重复内容标红高亮并添加引用标记的功能尚未实现,
还有判断用的分割标记(段落、行、句号、逗号)以及句子的特征字符长短,都需要考虑。
'''
# python调用docx模块实现docx文件内容交叉查重
# 于 2023-09-02 18:58:38 首次发布