# 翟天临的噩梦：怎样用Python检测抄袭行为？

pip install -U scikit-learn


.
├── app.py
├── fatma.txt
├── image.png
├── john.txt
└── juma.txt


·        首先载入所有必要的模块

import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


·        用列表推导式（List Comprehension）读取所有文本文件

def _read_text(path):
    """Return the full contents of the text file at *path*.

    The file is opened in a ``with`` block so the handle is closed as soon
    as the contents have been read.
    """
    with open(path) as handle:
        return handle.read()


# Names of every ``.txt`` file in the current working directory.
student_files = [doc for doc in os.listdir() if doc.endswith(".txt")]

# Contents of each file, in the same order as ``student_files``; the
# vectorization step further down reads this list.
student_notes = [_read_text(name) for name in student_files]


·        使用 lambda 函数对文本进行向量化，并计算文本之间的相似度。

def vectorize(Text):
    """Convert a collection of documents into TF-IDF feature vectors.

    Args:
        Text: iterable of raw document strings (the capitalised parameter
            name is kept so existing keyword callers keep working).

    Returns:
        The dense array produced by fitting a fresh ``TfidfVectorizer`` on
        the documents and calling ``toarray()`` on the result.
    """
    return TfidfVectorizer().fit_transform(Text).toarray()


def similarity(doc1, doc2):
    """Return the cosine-similarity matrix for two document vectors.

    Args:
        doc1: feature vector of the first document.
        doc2: feature vector of the second document.

    Returns:
        The matrix returned by ``cosine_similarity([doc1, doc2])``; callers
        read the score between the two documents from ``[0][1]``.
    """
    return cosine_similarity([doc1, doc2])


·        将文本数据向量化

# One TF-IDF vector per document, in the same order as ``student_notes``.
vectors = vectorize(student_notes)

# Pair each file name with its vector so results can be reported by name.
s_vectors = list(zip(student_files, vectors))


def check_plagiarism():
    """Score the similarity of every pair of student documents.

    Reads the module-level ``s_vectors`` list of ``(file_name, vector)``
    pairs and calls the module-level ``similarity`` helper on each pair.

    Returns:
        A set of ``(name_a, name_b, score)`` tuples, one per pair of
        documents. The two names are in sorted order and ``score`` is the
        ``[0][1]`` entry of the matrix returned by ``similarity``.
    """
    plagiarism_results = set()
    for index, (student_a, text_vector_a) in enumerate(s_vectors):
        # Compare only against later entries: each unordered pair is scored
        # once and a document is never compared with itself.
        for student_b, text_vector_b in s_vectors[index + 1:]:
            sim_score = similarity(text_vector_a, text_vector_b)[0][1]
            student_pair = sorted((student_a, student_b))
            plagiarism_results.add(
                (student_pair[0], student_pair[1], sim_score)
            )
    return plagiarism_results


# Let's print plagiarism results
for data in check_plagiarism():
    print(data)


·        最终代码

import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def _read_text(path):
    """Return the full contents of the text file at *path*.

    The file is opened in a ``with`` block so the handle is closed as soon
    as the contents have been read.
    """
    with open(path) as handle:
        return handle.read()


def vectorize(Text):
    """Convert a collection of documents into TF-IDF feature vectors.

    Args:
        Text: iterable of raw document strings (the capitalised parameter
            name is kept so existing keyword callers keep working).

    Returns:
        The dense array produced by fitting a fresh ``TfidfVectorizer`` on
        the documents and calling ``toarray()`` on the result.
    """
    return TfidfVectorizer().fit_transform(Text).toarray()


def similarity(doc1, doc2):
    """Return the cosine-similarity matrix for two document vectors.

    Args:
        doc1: feature vector of the first document.
        doc2: feature vector of the second document.

    Returns:
        The matrix returned by ``cosine_similarity([doc1, doc2])``; callers
        read the score between the two documents from ``[0][1]``.
    """
    return cosine_similarity([doc1, doc2])


# Names of every ``.txt`` file in the current working directory.
student_files = [doc for doc in os.listdir() if doc.endswith(".txt")]

# Contents of each file, in the same order as ``student_files``.
student_notes = [_read_text(name) for name in student_files]

# One TF-IDF vector per document, in the same order as ``student_notes``.
vectors = vectorize(student_notes)

# Pair each file name with its vector so results can be reported by name.
s_vectors = list(zip(student_files, vectors))
def check_plagiarism():
    """Score the similarity of every pair of student documents.

    Reads the module-level ``s_vectors`` list of ``(file_name, vector)``
    pairs and calls the module-level ``similarity`` helper on each pair.

    Returns:
        A set of ``(name_a, name_b, score)`` tuples, one per pair of
        documents. The two names are in sorted order and ``score`` is the
        ``[0][1]`` entry of the matrix returned by ``similarity``.
    """
    plagiarism_results = set()
    for index, (student_a, text_vector_a) in enumerate(s_vectors):
        # Compare only against later entries: each unordered pair is scored
        # once and a document is never compared with itself.
        for student_b, text_vector_b in s_vectors[index + 1:]:
            sim_score = similarity(text_vector_a, text_vector_b)[0][1]
            student_pair = sorted((student_a, student_b))
            plagiarism_results.add(
                (student_pair[0], student_pair[1], sim_score)
            )
    # Return only after every document has been processed; returning inside
    # the outer loop would report just the first document's pairs.
    return plagiarism_results
# Print each (file, file, score) record on its own line.
results = check_plagiarism()
for record in results:
    print(record)


·        输出：

$ python app.py
__________RESULT ___________
('john.txt', 'juma.txt', 0.5465972177348937)
('fatma.txt', 'john.txt', 0.14806887549598566)
('fatma.txt', 'juma.txt', 0.18643448370323362)


https://hackernoon.com/how-to-detect-plagiarism-in-text-using-python-zn213tw7

ACL2018论文集50篇解读

EMNLP2017论文集28篇论文解读

2018年AI三大顶会中国学术成果全链接

ACL2017论文集：34篇解读干货全在这里

10篇AAAI2017经典论文回顾

06-19 2891
06-15 568
06-11 3286
12-24 5981
03-07 1430
07-09 7170
03-22 1万+
03-25
09-28
04-28