# -*- coding: utf-8 -*-
import math

import jieba
import jieba.analyse


def sentence_resemble():
    '''
    Compute the similarity of two sentences:
    1. Segment both input sentences into words
    2. Take the union of the two word lists (deduplicated)
    3. Count the word frequencies of each sentence
    4. Build the word-frequency vectors
    5. Apply the cosine formula to get the similarity
    The closer the cosine is to 1, the closer the angle is to 0 degrees and the
    more similar the two vectors are; this is called "cosine similarity".
    :return:
    '''
    str1 = "我喜欢看电视,不喜欢看电影"
    str2 = "我不喜欢看电视,也不喜欢看电影"
    # Segment with jieba, dropping the full-width comma tokens
    str1_array = [w for w in jieba.cut(str1) if w != ","]
    str2_array = [w for w in jieba.cut(str2) if w != ","]
    # Union of the two word lists (deduplicated)
    all_words = sorted(set(str1_array + str2_array))
    # Word frequencies of each sentence
    str1_num_dic = num_count(str1_array)
    str2_num_dic = num_count(str2_array)
    # Apply the cosine formula to get the similarity
    cos = resemble_cal(all_words, str1_num_dic, str2_num_dic)
    print(cos)


def num_count(a):
    '''Count how many times each element appears in the list.'''
    return {k: a.count(k) for k in set(a)}


def article_resemble():
    all_key = set()
    with open("article_1.txt", "r", encoding="utf-8") as f:
        article1_dic = analyse_word(f.read())
    all_key.update(article1_dic)
    with open("article_2.txt", "r", encoding="utf-8") as f:
        article2_dic = analyse_word(f.read())
    all_key.update(article2_dic)
    cos = resemble_cal(all_key, article1_dic, article2_dic)
    print(cos)


def resemble_cal(all_key, article1_dic, article2_dic):
    # Build the frequency vectors over the shared vocabulary
    str1_vector = [article1_dic.get(i, 0) for i in all_key]
    str2_vector = [article2_dic.get(i, 0) for i in all_key]
    # Vector magnitudes: square root of the sum of squares
    str1_mod = math.sqrt(sum(x * x for x in str1_vector))
    str2_mod = math.sqrt(sum(x * x for x in str2_vector))
    # Dot product of the two vectors
    vector_multi = sum(x * y for x, y in zip(str1_vector, str2_vector))
    # Cosine: dot product divided by the product of the magnitudes
    return vector_multi / (str1_mod * str2_mod)


def analyse_word(content):
    '''Extract the article's keywords and return {keyword: frequency}.'''
    zidian = {}
    return_dic = {}
    # Segment the content (search-engine mode gives finer-grained tokens)
    for fc in jieba.cut_for_search(content):
        zidian[fc] = zidian.get(fc, 0) + 1
    topK = 30  # number of top keywords to extract
    # TF-IDF extraction returns (word, weight) tuples
    tfidf = jieba.analyse.extract_tags(content, topK=topK, withWeight=True)
    with open("stop.txt", encoding="utf-8") as f:
        stopkeyword = [line.strip() for line in f]
    for word, weight in tfidf:
        if word in stopkeyword:
            continue
        # Frequency of the keyword in the article (0 if absent)
        return_dic[word] = zidian.get(word, 0)
    return return_dic


if __name__ == "__main__":
    # Compare the similarity of two sentences
    sentence_resemble()
    # Compare the similarity of two articles
    article_resemble()
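As a quick sanity check of the cosine step, the sketch below feeds resemble_cal hand-built frequency dictionaries for the two sample sentences. The token lists are an assumption about how jieba segments them (the exact split depends on jieba's dictionary); the arithmetic in the comments is worked by hand.

# Sketch: hand-built frequencies, assuming jieba yields
# 我/喜欢/看/电视/不/喜欢/看/电影 and 我/不/喜欢/看/电视/也/不/喜欢/看/电影.
vocab = ["我", "喜欢", "看", "电视", "电影", "不", "也"]
freq1 = {"我": 1, "喜欢": 2, "看": 2, "电视": 1, "电影": 1, "不": 1}
freq2 = {"我": 1, "喜欢": 2, "看": 2, "电视": 1, "电影": 1, "不": 2, "也": 1}
# Dot product: 1 + 4 + 4 + 1 + 1 + 2 + 0 = 13
# Magnitudes: sqrt(1+4+4+1+1+1) = sqrt(12) and sqrt(1+4+4+1+1+4+1) = 4
print(resemble_cal(vocab, freq1, freq2))  # 13 / (sqrt(12) * 4) ≈ 0.938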
Sample articles: see http://pan.baidu.com/s/1qXRIGUS for details.
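The article comparison reads article_1.txt, article_2.txt and the stop-word list stop.txt from the working directory. If the sample files above aren't available, a minimal sketch like this creates toy placeholders so article_resemble() can run; the file contents and stop words here are made-up stand-ins, not the Baidu samples.

# Toy placeholder inputs (made-up contents, not the Baidu sample articles)
with open("article_1.txt", "w", encoding="utf-8") as f:
    f.write("我喜欢看电视,不喜欢看电影。" * 20)
with open("article_2.txt", "w", encoding="utf-8") as f:
    f.write("我不喜欢看电视,也不喜欢看电影。" * 20)
with open("stop.txt", "w", encoding="utf-8") as f:
    f.write("的\n了\n是\n")  # a few common stop words as stand-ins
article_resemble()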