# 主要用于检索学生之间互相抄袭的问题,检索的能力与本身的查重对比库有关系,基本逻辑是比较字符串的相似度。
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 23 15:19:58 2022
@author: Administrator
"""
from docx import Document
import re, sys, datetime
import os
import difflib #比较字符串相似度
import pandas as pd
# Split text into sentences
def cutSentence(text):
    """Split ``text`` into sentences at end-of-sentence punctuation.

    A sentence is closed after a terminator ('。', ';', '!' or '?') that is
    not immediately followed by another terminator, so a run such as "!!"
    stays attached to the sentence it ends. Whatever is left when the end of
    the text is reached becomes the last sentence, terminated or not.

    :param text: the string to split.
    :return: list of sentence strings with their terminators kept;
        ``[]`` for an empty string.
    """
    terminators = ('。', ';', '!', '?')
    pieces = []
    start = 0
    last = len(text) - 1
    for pos, symbol in enumerate(text):
        if pos == last:
            # The final character always closes the pending sentence.
            pieces.append(text[start:])
            break
        if symbol in terminators and text[pos + 1] not in terminators:
            pieces.append(text[start:pos + 1])
            start = pos + 1
    return pieces
# Collect paragraph text
def getText(doc):
    """Return the sentences of every non-empty paragraph of ``doc``.

    :param doc: any object exposing a ``paragraphs`` iterable whose items
        carry a ``text`` attribute (getTable also passes table cells here).
    :return: list with one entry per non-empty paragraph; each entry is the
        sentence list produced by cutSentence for that paragraph.
    """
    texts = []
    for para in doc.paragraphs:
        content = para.text
        if not content:
            continue
        try:
            texts.append(cutSentence(content))
        except Exception:
            # Best effort: show the paragraph that could not be split and
            # carry on. Unlike a bare ``except`` this no longer swallows
            # KeyboardInterrupt / SystemExit.
            print(content)
    return texts
# Collect table text
def getTable(doc):
    """Return the sentence lists found in the tables of ``doc``.

    Cells are visited table by table, row by row. Empty cells are ignored.
    As soon as a cell contains one of the titles in ``nocompare_table``,
    that cell and everything after it in the same table is skipped.
    NOTE(review): the skip covers the remainder of the table, not only the
    matching cell - confirm this is the intended scope.

    :param doc: object exposing ``tables``; each table exposes ``rows`` and
        each row exposes ``cells`` with ``text`` and ``paragraphs``.
    :return: flat list with one sentence list per non-empty cell paragraph.
    """
    content = []
    for table in doc.tables:
        skip_rest = False
        for row in table.rows:
            for cell in row.cells:
                cell_text = cell.text
                if not cell_text:
                    continue
                if any(title in cell_text for title in nocompare_table):
                    skip_rest = True
                    break
                try:
                    content.extend(getText(cell))
                except Exception:
                    # Fall back to the raw cell text as a single segment.
                    # (Extending the list with the string itself would add
                    # one entry per character.)
                    content.append([cell_text])
            if skip_rest:
                break
    return content
# Check for Chinese characters
def is_Chinese(word):
    """Tell whether ``word`` contains a character in U+4E00..U+9FFF.

    :param word: string (or any iterable of characters) to inspect.
    :return: True if at least one character lies in that range, else False.
    """
    return any('\u4e00' <= symbol <= '\u9fff' for symbol in word)
def msplit(s, seperators = ',|\.|\?|,|。|?|!'):
return re.split(seperators, s)
# Load a document
def readDocx(docfile):
    """Load a .docx file and return the segments that will be compared.

    Table segments (getTable) are included when ``is_consider_table`` is
    set. Body paragraphs are included when ``is_consider_notable`` is set:
    only the first sentence of each paragraph is split with msplit, and the
    fragments longer than two characters are kept with spaces removed.
    Progress, elapsed time and a size summary (showInfo) are printed.

    :param docfile: path of the document, passed to ``Document``.
    :return: list of segments, each a list of strings.
    """
    doc = Document(docfile)
    print('*' * 80)
    print('文件', docfile, '加载中……')
    started = datetime.datetime.now()
    paras = getText(doc)
    segs = getTable(doc) if is_consider_table else []
    if is_consider_notable:
        for sentences in paras:
            if sentences == []:
                continue
            pieces = [part.replace(' ', "")
                      for part in msplit(sentences[0])
                      if len(part) > 2]
            if pieces:
                segs.append(pieces)
    finished = datetime.datetime.now()
    print('加载完成,用时: ', finished - started)
    showInfo(segs, docfile)
    return segs
# Print a size summary
def showInfo(doc, filename='filename'):
    """Print how many segments, fragments and characters ``doc`` holds.

    :param doc: list of segments, each an iterable of strings.
    :param filename: accepted for the caller's convenience; not used.
    :return: None.
    """
    lengths = [len(piece) for group in doc for piece in group]
    print('段落数: {0:>8d} 个。'.format(len(doc)))
    print('短句数: {0:>8d} 句。'.format(len(lengths)))
    print('字符数: {0:>8d} 个。'.format(sum(lengths)))
# String similarity
def string_similar(s1, s2):
    """Return the difflib ``quick_ratio`` of two strings.

    ``quick_ratio`` is a fast upper bound on ``SequenceMatcher.ratio``: it
    only compares how often each character occurs, so two strings made of
    the same characters in a different order also score 1.0.

    :param s1: first string.
    :param s2: second string.
    :return: float between 0 and 1.
    """
    matcher = difflib.SequenceMatcher(None, s1, s2)
    return matcher.quick_ratio()
# Longest common substring
def compare_common_str(str1="", str2=""):
    """Find the longest substring(s) shared by ``str1`` and ``str2``.

    The shorter string is scanned; from every position the match is grown
    one character at a time for as long as it still occurs in the longer
    string.

    :param str1: first string.
    :param str2: second string.
    :return: the longest common substring as a ``str`` ('' when nothing is
        shared), or a ``set`` of strings when several different substrings
        tie for the greatest length.
    """
    if len(str1) > len(str2):
        shorter, longer = str2, str1
    else:
        shorter, longer = str1, str2
    total = len(shorter)
    best = ""
    ties = set()  # distinct candidates sharing the current best length
    for start in range(total):
        if shorter[start] not in longer:
            continue
        end = start + 1
        while end < total and shorter[start:end + 1] in longer:
            end += 1
        candidate = shorter[start:end]
        if len(candidate) > len(best):
            ties = set()
            best = candidate
        elif len(candidate) == len(best) and candidate != best:
            ties.add(best)
            ties.add(candidate)
    return ties if ties else best
def get_same_sentence(p1, p2, ratio):
    """Return the text shared by two sentences, wrapped in a list.

    :param p1: first sentence.
    :param p2: second sentence.
    :param ratio: similarity of the two sentences as computed by the caller.
    :return: ``[]`` when ``ratio`` is 0; the contained sentence when
        ``ratio`` is 1 and one sentence contains the other; otherwise a
        one-element list holding the result of compare_common_str.
    """
    if ratio == 0:
        return []
    if ratio == 1:
        if p2 in p1:
            return [p2]
        if p1 in p2:
            return [p1]
        # A ratio of 1 does not guarantee containment (the caller's
        # similarity measure ignores character order), so fall through to
        # the common-substring search instead of reporting nothing.
    return [compare_common_str(p1, p2)]
# Compare one segment of each document
def compareParagraph(doc1, i, doc2, j, min_segment = 5):
    """Compare the first string of ``doc1[i]`` with the first of ``doc2[j]``.

    When the estimated number of matching characters exceeds 2 and the
    similarity exceeds 0.1, both texts, the shared content and the figures
    are printed.

    :param doc1: segments of the first document.
    :param i: index into ``doc1``.
    :param doc2: segments of the second document.
    :param j: index into ``doc2``.
    :param min_segment: not used by this function.
    :return: ``(same_list, ratio)`` - the shared content from
        get_same_sentence and the similarity from string_similar - or
        ``([], -1)`` when either text is empty.
    """
    first = doc1[i][0]
    second = doc2[j][0]
    # Empty strings are not compared
    if not first or not second:
        return [], -1
    ratio = string_similar(first, second)
    matched = ratio * len(first)
    same_list = get_same_sentence(first, second, ratio)
    worth_reporting = matched > 2 and ratio > 0.1
    if worth_reporting:
        print(' 发现相同内容 '.center(80, '*'))
        print('文件1第{0:0>4d}段内容:{1}'.format(i + 1, first))
        print('文件2第{0:0>4d}段内容:{1}'.format(j + 1, second))
        print('完全相同内容:', same_list)
        print('相同字符比:{1:.2f}%\n相同字符数: {0}\n'.format(matched, ratio * 100))
    return same_list, ratio
# Whole-document comparison
def compare_paper(path1, path2):
    """Return the share of segments of ``path1`` that also occur in ``path2``.

    Segments of the first document whose first string contains one of the
    titles in ``nocompare_table_name`` are excluded: they are neither
    compared nor counted in the denominator. Every remaining segment is
    compared with every segment of the second document and counts once as
    duplicated if any comparison reaches ``limit_ratio``, so the result
    stays within 0..1.

    :param path1: path of the document being checked.
    :param path2: path of the document it is checked against.
    :return: duplicated segments / compared segments, rounded to 4 places.
    """
    doc1 = readDocx(path1)
    doc2 = readDocx(path2)
    ratio_count = 0
    all_count = len(doc1)
    no_count = 0
    for i in range(all_count):
        head = doc1[i][0]
        if any(title in head for title in nocompare_table_name):
            # Skip the segment itself; a ``continue`` inside the title loop
            # would only move on to the next title.
            no_count += 1
            continue
        if i % 100 == 0:
            print('处理进行中,已处理段落 {0:>4d} (总数 {1:0>4d} ) '.format(i, all_count))
        duplicated = False
        # No early exit: compareParagraph prints every match it finds.
        for j in range(len(doc2)):
            _, ratio = compareParagraph(doc1, i, doc2, j)
            if ratio >= limit_ratio:
                duplicated = True
        if duplicated:
            ratio_count += 1
    print(path1.split('\\')[-1].split('.')[0], '与其他文件重复片段数:{0}\n, 应比较片段数:{1}\n'
          .format(ratio_count, all_count - no_count))
    return round(ratio_count / max(all_count - no_count, 1), 4)
is_consider_table = True  # whether table content is taken into account
is_consider_notable = False  # whether non-table content is taken into account
# Titles marking table content that is not compared
nocompare_table = ['指导教师批阅意见:', '备注:', '实验目的与要求:']
# Titles that are themselves not compared
nocompare_table_name = ['实验结论:']
limit_ratio = 0.9  # similarity from which a segment counts as duplicated
hw_path = r'C:\Users\Administrator\Desktop\作业抄袭检测'
hw_files = os.listdir(hw_path)  # names of all entries in the folder
print('开始比对...'.center(80, '*'))
# Keep real Word documents only: match on the extension rather than on the
# substring "docx", and skip the "~$..." owner files Word creates next to
# an open document (they are not valid .docx packages).
docx_files = [name for name in hw_files
              if name.lower().endswith('.docx') and not name.startswith('~$')]
# One column per checked file, one row per file it was compared against.
check_result = pd.DataFrame()
new_index = []
for file in docx_files:
    check = os.path.join(hw_path, file)
    new_index.append(file)
    cs = []
    for second_file in docx_files:
        compare = os.path.join(hw_path, second_file)
        cs.append(compare_paper(check, compare))
    check_result[file] = cs
check_result.index = new_index