Python实现作业抄袭比对

写好三分地
已于 2023-03-07 13:50:11 修改
阅读量462
点赞数
分类专栏： python
于 2022-12-23 22:28:43 首次发布
本文链接：https://blog.csdn.net/qq_42738639/article/details/128425024
版权
python 开发语言
python 专栏收录该内容
14 篇文章 2 订阅
订阅专栏
本脚本主要用于检测学生作业之间的互相抄袭；检测能力取决于用来比对的作业库，基本思路是逐段比较字符串的相似度。
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 23 15:19:58 2022

@author: Administrator
"""

from docx import Document
import re, sys, datetime
import os
import difflib    #比较字符串相似度
import pandas as pd


#切分句子
def cutSentence(text):
    """Split ``text`` into sentences at full-width end punctuation.

    A sentence is closed after a run of terminators (``。；！？``); consecutive
    terminators stay attached to the sentence they end. Whatever is left at
    the end of ``text`` becomes the last sentence, with or without a
    terminator.

    :param text: the string to split.
    :return: list of sentence strings in original order; ``[]`` for ``''``.
    """
    terminators = ('。', '；', '！', '？')
    final_pos = len(text) - 1
    pieces = []
    start = 0  # index where the sentence currently being read begins

    for pos, ch in enumerate(text):
        if pos == final_pos:
            # The last character always closes the current sentence.
            pieces.append(text[start:])
            break
        # Cut only after the last terminator of a run, so "？！" stays whole.
        if ch in terminators and text[pos + 1] not in terminators:
            pieces.append(text[start:pos + 1])
            start = pos + 1
    return pieces

#获取段落文本
def getText(doc):
    """Collect the sentences of every non-empty paragraph in ``doc``.

    :param doc: any object exposing ``paragraphs``, each of which has a
        ``text`` attribute.
    :return: list with one entry per non-empty paragraph; each entry is the
        list of sentences returned by ``cutSentence`` for that paragraph.
    """
    texts = []
    for para in doc.paragraphs:
        para_text = para.text
        if not para_text:
            continue
        try:
            texts.append(cutSentence(para_text))
        except Exception:
            # Best effort: show the paragraph that could not be split and
            # keep going. Catching Exception instead of using a bare except
            # lets KeyboardInterrupt / SystemExit still stop the run.
            print(para_text)
    return texts

#获取表格文本
def getTable(doc):
    """Collect the sentences found in the table cells of ``doc``.

    Cells are visited table by table, row by row. As soon as a cell contains
    one of the titles listed in the module-level ``nocompare_table``, that
    cell and everything after it in the same table is skipped; text gathered
    earlier in the table is kept.

    :param doc: object exposing ``tables``; each table has ``rows``, each row
        has ``cells``, and each cell has ``text`` and ``paragraphs``.
    :return: list of sentence lists, one per non-empty paragraph of every
        cell that was kept (same shape as the result of ``getText``).
    """
    content = []
    for table in doc.tables:
        skip_rest = False
        for row in table.rows:
            if skip_rest:
                break
            for cell in row.cells:
                cell_text = cell.text
                if not cell_text:
                    continue
                if any(title in cell_text for title in nocompare_table):
                    skip_rest = True
                    break
                try:
                    content.extend(getText(cell))
                except Exception:
                    # Fall back to the raw cell text as a single one-sentence
                    # entry. It is wrapped in a list because extending with a
                    # bare string would add one entry per character.
                    content.append([cell_text])
    return content

#判断是否是中文
def is_Chinese(word):
    """Tell whether ``word`` contains at least one character in U+4E00..U+9FFF.

    :param word: string (or any iterable of characters) to inspect.
    :return: ``True`` if any character lies in that range, else ``False``.
    """
    return any('\u4e00' <= ch <= '\u9fff' for ch in word)

def msplit(s, seperators = r',|\.|\?|，|。|？|！'):
    """Split ``s`` on any of several punctuation marks.

    :param s: the string to split.
    :param seperators: regular-expression pattern of the delimiters. The
        default matches ASCII ``,`` ``.`` ``?`` and full-width ``，。？！``.
        It is a raw string so that ``\\.`` and ``\\?`` reach the regex engine
        without triggering Python's invalid-escape-sequence warning.
    :return: list of the pieces between delimiters, as returned by
        ``re.split`` (empty strings included).
    """
    return re.split(seperators, s)

#获取文件
def readDocx(docfile):
    """Load a .docx file and return the text segments used for comparison.

    Table text is included when the module-level ``is_consider_table`` is
    true; paragraph text is included when ``is_consider_notable`` is true.
    Progress, elapsed time and size statistics are printed.

    :param docfile: path (or file-like object) accepted by ``Document``.
    :return: list of segments, each a list of strings.
    """
    doc = Document(docfile)
    print('*' * 80)
    print('文件', docfile, '加载中……')
    started = datetime.datetime.now()
    paras = getText(doc)
    segs = getTable(doc) if is_consider_table else []

    if is_consider_notable:
        for sentences in paras:
            if sentences == []:
                continue
            # NOTE(review): only the first sentence of each paragraph is split
            # into fragments; later sentences are not used - confirm intended.
            fragments = [
                piece.replace(' ', "")
                for piece in msplit(sentences[0])
                if len(piece) > 2  # length is checked before spaces are removed
            ]
            if fragments:
                segs.append(fragments)
    finished = datetime.datetime.now()
    print('加载完成，用时: ', finished - started)
    showInfo(segs, docfile)
    return segs

#显示信息
def showInfo(doc, filename = 'filename'):
    """Print how many segments, sentences and characters ``doc`` holds.

    :param doc: list of segments, each an iterable of strings.
    :param filename: accepted for the caller's convenience; not used here.
    :return: ``None``; the three statistics are written to stdout.
    """
    # One pass over every sentence; the totals are derived from the lengths.
    lengths = [len(sentence) for group in doc for sentence in group]
    print('段落数: {0:>8d} 个。'.format(len(doc)))
    print('短句数: {0:>8d} 句。'.format(len(lengths)))
    print('字符数: {0:>8d} 个。'.format(sum(lengths)))
    
#获得字符串相似度 
def string_similar(s1, s2):
    """Return a quick similarity estimate between ``s1`` and ``s2``.

    Uses ``SequenceMatcher.quick_ratio``, which difflib documents as an upper
    bound on ``ratio()``: it counts shared characters without regard to their
    order, so reordered text can still score 1.0.

    :return: float in ``[0, 1]``.
    """
    matcher = difflib.SequenceMatcher(a=s1, b=s2)
    return matcher.quick_ratio()

#找最大相似字符串
def compare_common_str(str1="", str2=""):
    """Find the longest substring(s) shared by ``str1`` and ``str2``.

    :param str1: first string.
    :param str2: second string.
    :return: the longest common substring as a ``str`` (``''`` when nothing
        is shared); when several different substrings tie for the greatest
        length, a ``set`` containing all of them is returned instead.
    """
    # Scan start positions in the shorter string; search inside the longer one.
    if len(str1) > len(str2):
        shorter, longer = str2, str1
    else:
        shorter, longer = str1, str2

    best = ""
    ties = set()  # distinct candidates sharing the current best length
    size = len(shorter)
    for start in range(size):
        # Grow the window while it still occurs somewhere in the longer string.
        end = start
        while end < size and shorter[start:end + 1] in longer:
            end += 1
        candidate = shorter[start:end]

        if len(candidate) > len(best):
            # A strictly longer match invalidates every earlier tie.
            ties = set()
            best = candidate
        elif len(candidate) == len(best) and candidate != best:
            ties.add(best)
            ties.add(candidate)

    return ties if ties else best

def get_same_sentence(p1, p2, ratio):
    """Return the text two paragraphs have in common, as a list.

    :param p1: first paragraph text.
    :param p2: second paragraph text.
    :param ratio: similarity score already computed for the pair.
    :return: ``[]`` when ``ratio`` is 0. When ``ratio`` is 1, a one-element
        list holding whichever paragraph is contained in the other (``p2`` is
        tried first), or ``[]`` if neither contains the other. Otherwise a
        one-element list holding the result of ``compare_common_str``.
    """
    if ratio == 1:
        # Full score: report the paragraph that is contained in the other.
        if p2 in p1:
            return [p2]
        if p1 in p2:
            return [p1]
        return []
    if ratio != 0:
        # Partial overlap: report the longest shared substring(s).
        return [compare_common_str(p1, p2)]
    return []

        
        
                
#比较两个文本list中的数据相似度          
def compareParagraph(doc1, i, doc2, j, min_segment = 5):
    """Compare the first sentence of segment ``i`` of ``doc1`` with the first
    sentence of segment ``j`` of ``doc2``.

    A report is printed when the pair shares more than two characters (by the
    estimate ``ratio * len(p1)``) and the similarity exceeds 0.1.

    :param doc1: first document, a list of segments (each a list of strings).
    :param i: index of the segment in ``doc1``.
    :param doc2: second document, same structure as ``doc1``.
    :param j: index of the segment in ``doc2``.
    :param min_segment: accepted but not used by this function.
    :return: ``(same_list, ratio)`` where ``same_list`` is the result of
        ``get_same_sentence`` and ``ratio`` the result of ``string_similar``;
        ``([], -1)`` when either text is empty.
    """
    p1, p2 = doc1[i][0], doc2[j][0]

    # Empty text is never compared.
    if not (len(p1) and len(p2)):
        return [], -1

    ratio = string_similar(p1, p2)
    count = ratio * len(p1)  # estimated number of shared characters
    same_list = get_same_sentence(p1, p2, ratio)

    # Stay silent for negligible overlaps.
    if count <= 2 or ratio <= 0.1:
        return same_list, ratio

    print(' 发现相同内容 '.center(80, '*'))
    print('文件1第{0:0>4d}段内容：{1}'.format(i + 1, p1))
    print('文件2第{0:0>4d}段内容：{1}'.format(j + 1, p2))
    print('完全相同内容：', same_list)
    print('相同字符比：{1:.2f}%\n相同字符数： {0}\n'.format(count, ratio * 100))
    return same_list, ratio
 
#全文比较
def compare_paper(path1, path2):
    """Compare every segment of one document against another document.

    Segments of the first document whose text contains a title listed in the
    module-level ``nocompare_table_name`` are left out. A remaining segment
    counts as duplicated when at least one segment of the second document
    yields a non-empty common text and a similarity of at least
    ``limit_ratio``. Each segment is counted at most once, so the result
    cannot exceed 1.

    :param path1: path of the document being checked.
    :param path2: path of the document it is checked against.
    :return: duplicated segments divided by compared segments, rounded to
        four decimals (0 when nothing was compared).
    """
    doc1 = readDocx(path1)
    doc2 = readDocx(path2)
    compared = 0      # segments of doc1 that took part in the comparison
    duplicated = 0    # of those, how many reached limit_ratio

    for i, segment in enumerate(doc1):
        if i % 100 == 0:
            print('处理进行中，已处理段落 {0:>4d} (总数 {1:0>4d} ） '.format(i, len(doc1)))
        # Skip excluded segments entirely; a `continue` placed inside the
        # title loop would only advance that inner loop.
        if any(title in segment[0] for title in nocompare_table_name):
            continue
        compared += 1

        is_duplicate = False
        # Every pair is still compared so that all matches are reported.
        for j in range(len(doc2)):
            same_list, ratio = compareParagraph(doc1, i, doc2, j)
            if len(same_list) > 0 and ratio >= limit_ratio:
                is_duplicate = True
        if is_duplicate:
            duplicated += 1

    # Bare file name without extension; accepts both '\\' and '/' separators.
    short_name = path1.replace('\\', '/').rsplit('/', 1)[-1].split('.')[0]
    print(short_name, '与其他文件重复片段数：{0}\n, 应比较片段数：{1}\n'
         .format(duplicated, compared))

    return round(duplicated / max(compared, 1), 4)



is_consider_table = True   # whether table contents take part in the comparison
is_consider_notable = False   # whether text outside tables takes part in the comparison
# Titles that switch off comparison for the table they appear in
nocompare_table = ['指导教师批阅意见：', '备注：', '实验目的与要求：']

# Segments containing one of these titles are not compared
nocompare_table_name = ['实验结论：']

limit_ratio = 0.9    # similarity at or above which a segment counts as duplicated


hw_path = r'C:\Users\Administrator\Desktop\作业抄袭检测'
hw_files = os.listdir(hw_path)  # every entry in the homework folder

# Keep real .docx files only: match on the extension rather than on any name
# that merely contains "docx", and leave out Word's "~$..." lock files, which
# are not documents.
docx_files = [
    name for name in hw_files
    if name.lower().endswith('.docx') and not name.startswith('~$')
]

print('开始比对...'.center(80, '*'))

# One column per checked file; each row is the file it was compared against.
check_result = pd.DataFrame()
new_index = []
for file in docx_files:
    check = os.path.join(hw_path, file)
    cs = [
        compare_paper(check, os.path.join(hw_path, second_file))
        for second_file in docx_files
    ]
    new_index.append(file)
    check_result[file] = cs

check_result.index = new_index