python调用docx模块实现docx文件内容交叉查重

光明有我VX16620122910

已于 2023-09-03 13:50:05 修改

阅读量296

点赞数

文章标签： python windows linux

于 2023-09-02 18:58:38 首次发布

本文链接：https://blog.csdn.net/u011619323/article/details/132641509

版权

# !/usr/bin/env python
# -*-coding:utf-8 -*-
# File       : 文件查重.py
# Time       ：2023/9/2 13:28
# Author     ：QQ736592720
# pip install python-docx
# pip install pypiwin32
# pip install pyinstaller
# pyinstaller -F 文件查重.py
# pyinstaller -F -i 128.ico 文件查重.py

import glob
import os
import re
from docx import Document


# from win32com import client as wc
# def doc_to_docx(oldfilename, newfilename):
#     word = wc.Dispatch("Word.Application")
#     doc = word.Documents.Open(oldfilename)
#     doc.SaveAs(newfilename, 12)
#     doc.Close()
#     word.Quit()


def get_docx_list(filename):
    """Read a .docx file and return its text as a list of sentence-like pieces.

    A paragraph that contains more than one sentence terminator (``。`` or
    ``；``) is split on those terminators; empty pieces are dropped and every
    remaining piece gets a ``。`` appended (so a piece that originally ended
    with ``；`` ends with ``。`` in the result). A paragraph with zero or one
    terminator is kept whole and unchanged, including empty paragraphs.

    The document is only read. It is deliberately not saved back: rewriting
    the input on every read is an unwanted side effect and fails on
    read-only or locked files.

    :param filename: path of the .docx file to read
    :return: list of str, in document order
    """
    doc = Document(filename)
    pieces = []
    for paragraph in doc.paragraphs:
        text = paragraph.text
        if len(re.findall("[。；]", text)) > 1:
            # Several sentences in one paragraph: split them apart and
            # re-terminate each non-empty piece with a full stop.
            pieces.extend(part + "。" for part in re.split("。|；", text) if part)
        else:
            pieces.append(text)
    return pieces


def get_docx_dic(filenames):
    """Load the reference documents into a ``{name: text}`` mapping.

    The key is the file's base name with its last five characters removed
    (the ".docx" suffix); the value is the pieces returned by
    ``get_docx_list`` for that file, joined with newlines.

    :param filenames: iterable of paths to the reference .docx files
    :return: dict mapping file name (without suffix) to its full text
    """
    return {
        os.path.basename(path)[:-5]: "\n".join(get_docx_list(path))
        for path in filenames
    }


def find_in_dic(s, dic):
    """Look for the sentence ``s`` inside the reference texts.

    The references are scanned in the mapping's iteration order; the first
    one whose text contains ``s`` is reported on stdout and its key returned.

    :param s: sentence to look for
    :param dic: mapping of reference name to reference text
    :return: the name of the first reference containing ``s``, or None
    """
    for name, text in dic.items():
        if s not in text:
            continue
        print("Author:QQ736592720--- repf found in :  " + name + ".docx")
        return name
    return None


def save_text(filename, sss):
    """Write ``sss`` to ``filename`` as UTF-8 text and print a confirmation.

    The file is opened in "w" mode, so an existing file is overwritten.

    :param filename: path of the text file to write
    :param sss: the text to write
    """
    with open(filename, mode="w", encoding="utf-8") as out:
        out.write(sss)
    message = "Author:QQ736592720---文件保存成功 ：" + filename
    print(message)


def repf_search(ls, dic, weight):
    """Flag the sentences of the target document that occur in a reference.

    Each entry of ``ls`` is stripped, has every ``。`` and ``；`` removed, and
    is cut so that it starts at its first Chinese character (this drops
    leading numbering and other non-characteristic text). If the remaining
    text is longer than ``weight`` characters it is looked up with
    ``find_in_dic``. On a hit the entry in ``ls`` is modified in place: a
    ``[repf:<name>]`` tag is appended when the entry ends with a Chinese
    character, otherwise the tag is inserted just before the final character.

    :param ls: list of sentences of the target document (mutated in place)
    :param dic: mapping of reference name to reference text
    :param weight: a sentence is only checked when its text is longer than this
    :return: list of the tagged sentences, in their original order
    """
    han = re.compile("[\u4e00-\u9fa5]")
    flagged = []
    for idx, raw in enumerate(ls):
        cleaned = raw.strip().replace("。", "").replace("；", "")
        first_han = han.search(cleaned)
        if first_han is None:
            # No Chinese character at all: nothing to compare.
            continue
        kw = cleaned[first_han.start():]
        long_enough = len(kw) > weight
        if not long_enough:
            continue
        key = find_in_dic(kw, dic)
        if not key:
            continue
        tag = "[repf:" + key + "]"
        sentence = ls[idx]
        if han.search(sentence[-1]):
            ls[idx] = sentence + tag
        else:
            # Keep the trailing punctuation (or other final character) last.
            ls[idx] = sentence[:-1] + tag + sentence[-1]
        flagged.append(ls[idx])
    return flagged


def main(root, weight):
    """Cross-check every .docx file in ``root`` against all the others.

    Each document in turn is the target; all remaining documents are its
    references. For every target a ``<name>_result.txt`` file is written
    next to it, containing the tagged sentences returned by ``repf_search``,
    one per line.

    Every document is parsed once and its text reused, instead of being
    re-read for every target. All documents are therefore held in memory
    for the duration of the run.

    :param root: directory that holds the .docx files
    :param weight: minimum sentence length, passed through to ``repf_search``
    """
    paths = [
        path
        for path in glob.glob(os.path.join(root, "*.docx"))
        # Word keeps "~$name.docx" owner/lock files next to open documents;
        # they match the pattern but are not parseable documents.
        if not os.path.basename(path).startswith("~$")
    ]

    # Parse up front, before any sentence gets tagged, so the reference
    # texts always reflect the untouched documents.
    sentences = {path: get_docx_list(path) for path in paths}
    texts = {path: "\n".join(lines) for path, lines in sentences.items()}

    for target in paths:
        print("Author:QQ736592720---当前目标文件：" + target)
        # Reference name = file name without the ".docx" suffix.
        references = {
            os.path.split(path)[1][:-5]: text
            for path, text in texts.items()
            if path != target
        }
        # repf_search tags its input in place, so hand it a copy.
        flagged = repf_search(list(sentences[target]), references, weight)
        save_text(target[:-5] + "_result.txt", "\n".join(flagged))


if __name__ == '__main__':
    # Script entry point: check every .docx in the current working directory.
    weight = 15  # only sentences longer than this many characters are checked
    print("Author:QQ736592720---程序开始......")
    # Report the value actually used rather than a second hard-coded copy.
    print("Author:QQ736592720---当前文字起步权重设置 = " + str(weight))
    root = os.getcwd()
    main(root, weight)
    print("Author:QQ736592720---运行结束")
'''
doc 和 docx 格式不通用；字符编码如果出现乱码，目前没有考虑；
还有：发现重复时，在 word 副本文件中标红高亮并添加引用标记；
还有：判断所用的分割标记（段落、行、句号、逗号）以及句子特征字符的长短，都需要考虑。
'''