如前序文章所述，本章讲述如何通过分词、去除标点符号、去除黑名单词、统计词频、获取词汇合集、构建特征向量，最后利用余弦定理计算两篇文章的相似性。
相对而言,该相似性计算逻辑、算法简单,能够快速得到两篇文章的相似性,但用途较局限,准确率待提高,后面会逐步比较其它相似性算法。(可参考http://bigdata-madesimple.com/implementing-the-five-most-popular-similarity-measures-in-python/)
以下代码的实现环境为 Python 3.7.2，测试数据集见 cosine_test.rar。
# -*- coding: UTF-8 -*- import jieba import sys import os import re import math
##读取文章内容并分词 def get_context(file_path): file = open(file_path, 'r', encoding='UTF-8-sig', errors='ignore') try: file_context = file.read() finally: file.close() return file_context ##定义分词函数 def cut(context): seglist = [] seg = jieba.cut(context, cut_all=False) for line in seg: seglist.append(line) seg_list=sorted(seglist, key=lambda x: x[0]) return seg_list ##定义去除标点符号函数 def remove_punc(text): punc = '~`!#$%^&*()_+-=|\';":/.,?><~·!@#¥%……&*()——+-=“:’;、。,?》”《{}、\n a-zA-Z0-9 ' new_text = re.sub(r"[%s]+" %punc, "",str(text)) return new_text ##去除黑名单 def remove_backlist(text): black_text=text blacklist=get_context('d:/share/data/black_list.txt') for i in blacklist: ss=i.strip().split('\n') if ss in black_text: black_text.remove(ss) return black_text ##word count def count_word(context): ##先添加数值对,建立a\t 1 init_count=[] for i in context: j=i.strip() kv=j+'\t'+"1" init_count.append(kv) #统计词频 current_word = None count_pool = [] sum = 0 word_count = {} for words in init_count: ss = words.strip().split('\t') if len(ss) != 2: continue word = words[0] if current_word == None: current_word = word if current_word != word: for count in count_pool: sum += count word_count [current_word] = sum current_word = word count_pool = [] sum = 0 count_pool.append(1) for count in count_pool: sum += count word_count [current_word] = sum return word_count #定义取合集函数 def merge(w1,w2): h=0 w=w1 if h < len(w2): for i in w2: if i not in w1: w.append(i) h+=1 return w #定义获取特征向量函数 def get_map(word_map1, word_map2, word_map ) : map1=[] map2=[] for wd in word_map: if wd in word_map1: map1.append(str(word_map1.get(wd))) else: map1.append('0') if wd in word_map2: map2.append(str(word_map2.get(wd))) else: map2.append('0') return map1, map2 #定义余弦计算函数 def cosine(v1,v2): sum = 0 s1 = 0 s2 = 0 for x in range(0, len(v1) - 1): xi = float(v1[x]) yi = float(v2[x]) s1 += xi * xi s2 += yi * yi sum += xi * yi similarity = abs(sum / (math.sqrt(s1) * math.sqrt(s2))) return similarity #定义文本路径 file_path1 = 
'd:/share/data/article1.txt' file_path2 = 'd:/share/data/article2.txt' #读取文本 init_context1 = get_context(file_path1) init_context2= get_context(file_path2) ##去除标点符号 old_context1 = remove_punc(init_context1) old_context2 = remove_punc(init_context2) # 中文分词 new_context1 = cut(old_context1) new_context2 = cut(old_context2) #去除黑名单 context1=remove_backlist(new_context1) context2=remove_backlist(new_context2) #增加键值对并统计词频 count_word1=count_word(context1) count_word2=count_word(context2) #将两篇文章取合集 words_all=merge(context1,context2) #按照合集序列分别对两篇文章词汇组进行查找,得到同一维度的特征向量 vector1, vector2=get_map(count_word1,count_word2,words_all) #利用余弦定理计算相似度 result=cosine(vector1,vector2) print(result)