# NOTE: this is plain text matching, not semantic matching.
# References:
# https://zhuanlan.zhihu.com/p/43396514
# https://www.geeksforgeeks.org/python-measure-similarity-between-two-sentences-using-cosine-similarity/
# https://baike.baidu.com/item/%E4%BD%99%E5%BC%A6%E7%9B%B8%E4%BC%BC%E5%BA%A6/17509249
# https://github.com/nltk/nltk
# http://www.nltk.org/
# https://github.com/fxsjy/jieba
#
# similarity = cosine = (A.B) / (||A|| * ||B||), where A and B are vectors (Euclidean dot product formula).
#
# Setup:
#   pip install nltk
#   import nltk
#   nltk.download('all')  # or individual packages: 'punkt', 'stopwords', ...; 'all' seems to download 300+ MB of data
#
# nltk.tokenize: used for tokenization, the process of splitting a large quantity of text
#   into smaller parts called tokens. word_tokenize(X) splits the given sentence X into
#   words and returns a list.
# nltk.corpus: used in this program to get a list of stopwords. A stop word is a commonly
#   used word (such as "the", "a", "an", "in").
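#
# A tiny worked example of the cosine formula above (illustrative numbers only, not taken
# from the code below):
#   A = [1, 1, 0], B = [1, 0, 1]
#   A.B = 1*1 + 1*0 + 0*1 = 1
#   ||A|| = sqrt(1^2 + 1^2 + 0^2) = sqrt(2), ||B|| = sqrt(2)
#   cosine = 1 / (sqrt(2) * sqrt(2)) = 0.5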
import math

import jieba
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


def check_zh_similarity(s1, s2):
    # Tokenize with jieba, drop stopwords, and merge the words; each distinct word is one dimension.
    sw = ['。', ',']
    s1_cut = [w for w in jieba.cut(s1, cut_all=True) if w not in sw]
    s2_cut = [w for w in jieba.cut(s2, cut_all=True) if w not in sw]
    word_set = set(s1_cut).union(set(s2_cut))
    print('cut s1: %s ' % s1_cut)
    print('cut s2: %s ' % s2_cut)
    print('word set: %s \n' % word_set)
    # Map every word to its position in the set (i.e. the dimension index assigned to that word).
    word_dict = {word: i for i, word in enumerate(word_set)}
    print('word dict: %s \n' % word_dict)
    # Encode s1 against the dimensions and count s1's term frequency per dimension.
    s1_cut_word_code = [word_dict[word] for word in s1_cut]
    print('s1 appear code: %s ' % s1_cut_word_code)
    s1_cut_word_fq = [0] * len(word_dict)
    for word in s1_cut:
        s1_cut_word_fq[word_dict[word]] += 1
    print('s1 word appear on each code: %s ' % s1_cut_word_fq)
    # Encode s2 against the dimensions and count s2's term frequency per dimension.
    s2_cut_word_code = [word_dict[word] for word in s2_cut]
    print('s2 appear code: %s ' % s2_cut_word_code)
    s2_cut_word_fq = [0] * len(word_dict)
    for word in s2_cut:
        s2_cut_word_fq[word_dict[word]] += 1
    print('s2 word appear on each code: %s \n' % s2_cut_word_fq)
    # Compute the cosine similarity of the two frequency vectors.
    sigma_sum, sq1, sq2 = 0, 0, 0
    for i in range(len(word_dict)):
        sigma_sum += s1_cut_word_fq[i] * s2_cut_word_fq[i]
        sq1 += pow(s1_cut_word_fq[i], 2)
        sq2 += pow(s2_cut_word_fq[i], 2)
    try:
        cosine = round(float(sigma_sum) / (math.sqrt(sq1) * math.sqrt(sq2)), 6)
    except ZeroDivisionError:
        cosine = 0.0
    print('cosine similarity: %s \n\n' % cosine)
    return cosine
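

# The same cosine computation can be written more compactly with collections.Counter,
# which builds the term-frequency vectors directly. This is a minimal sketch of that
# alternative (the helper name cosine_from_counts is mine, not part of the original
# script); it is not called below.
from collections import Counter


def cosine_from_counts(tokens1, tokens2):
    # Sparse term-frequency vectors: word -> count.
    c1, c2 = Counter(tokens1), Counter(tokens2)
    # Dot product only needs the words the two vectors share.
    dot = sum(c1[w] * c2[w] for w in c1.keys() & c2.keys())
    norm1 = math.sqrt(sum(v * v for v in c1.values()))
    norm2 = math.sqrt(sum(v * v for v in c2.values()))
    return round(dot / (norm1 * norm2), 6) if norm1 and norm2 else 0.0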


def check_en_similarity1(s1, s2):
    # Tokenize with nltk, drop stopwords, and merge the words; each distinct word is one dimension.
    sw = stopwords.words('english')
    others = ['!', ',', '.', '?', '-s', '-ly', '</s>', 's']
    sw.extend(others)
    s1_cut = word_tokenize(s1)
    s2_cut = word_tokenize(s2)
    # Keep these as lists (not sets) so repeated words still contribute to the term frequencies below.
    s1_cut = [w for w in s1_cut if w.lower() not in sw]
    s2_cut = [w for w in s2_cut if w.lower() not in sw]
    word_set = set(s1_cut).union(set(s2_cut))
    print('cut s1: %s ' % s1_cut)
    print('cut s2: %s ' % s2_cut)
    print('word set: %s \n' % word_set)
    # Map every word to its position in the set (i.e. the dimension index assigned to that word).
    word_dict = {word: i for i, word in enumerate(word_set)}
    print('word dict: %s \n' % word_dict)
    # Encode s1 against the dimensions and count s1's term frequency per dimension.
    s1_cut_word_code = [word_dict[word] for word in s1_cut]
    print('s1 appear code: %s ' % s1_cut_word_code)
    s1_cut_word_fq = [0] * len(word_dict)
    for word in s1_cut:
        s1_cut_word_fq[word_dict[word]] += 1
    print('s1 word appear on each code: %s ' % s1_cut_word_fq)
    # Encode s2 against the dimensions and count s2's term frequency per dimension.
    s2_cut_word_code = [word_dict[word] for word in s2_cut]
    print('s2 appear code: %s ' % s2_cut_word_code)
    s2_cut_word_fq = [0] * len(word_dict)
    for word in s2_cut:
        s2_cut_word_fq[word_dict[word]] += 1
    print('s2 word appear on each code: %s \n' % s2_cut_word_fq)
    # Compute the cosine similarity of the two frequency vectors.
    sigma_sum, sq1, sq2 = 0, 0, 0
    for i in range(len(word_dict)):
        sigma_sum += s1_cut_word_fq[i] * s2_cut_word_fq[i]
        sq1 += pow(s1_cut_word_fq[i], 2)
        sq2 += pow(s2_cut_word_fq[i], 2)
    try:
        cosine = round(float(sigma_sum) / (math.sqrt(sq1) * math.sqrt(sq2)), 6)
    except ZeroDivisionError:
        cosine = 0.0
    print('cosine similarity: %s \n\n' % cosine)
    return cosine
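

# For reference, scikit-learn bundles the same bag-of-words + cosine pipeline. A minimal
# sketch, assuming scikit-learn is installed (it is not a dependency of the rest of this
# script); the function name check_en_similarity_sklearn is mine, and it is not called below.
def check_en_similarity_sklearn(s1, s2):
    # Local imports so the rest of the script runs without scikit-learn installed.
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    # Note: CountVectorizer's default tokenizer lowercases and drops punctuation and
    # single-character tokens, so the score can differ slightly from check_en_similarity1.
    vectors = CountVectorizer().fit_transform([s1, s2])
    cosine = round(float(cosine_similarity(vectors[0], vectors[1])[0][0]), 6)
    print('sklearn cosine similarity: %s ' % cosine)
    return cosine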


def check_en_similarity2(s1, s2):
    # Tokenization.
    x_list = word_tokenize(s1)
    y_list = word_tokenize(s2)
    # sw contains the list of stopwords.
    sw = stopwords.words('english')
    others = ['!', ',', '.', '?', '-s', '-ly', '</s>', 's']
    sw.extend(others)
    l1 = []
    l2 = []
    # Remove stopwords from the strings.
    x_set = {w for w in x_list if w.lower() not in sw}
    y_set = {w for w in y_list if w.lower() not in sw}
    # Form a set containing the keywords of both strings.
    rvector = x_set.union(y_set)
    for w in rvector:
        if w in x_set:
            l1.append(1)  # build a binary presence vector
        else:
            l1.append(0)
        if w in y_set:
            l2.append(1)
        else:
            l2.append(0)
    # Cosine formula; for 0/1 vectors sum(l1) == ||l1||^2, so the denominator is ||l1|| * ||l2||.
    c = 0
    for i in range(len(rvector)):
        c += l1[i] * l2[i]
    try:
        cosine = round(c / float((sum(l1) * sum(l2)) ** 0.5), 6)
    except ZeroDivisionError:
        cosine = 0.0
    print("similarity: ", cosine)
    return cosine


if __name__ == '__main__':
    # Chinese sentence test.
    # sentence1 = '这只皮靴号码大了。那只号码合适'
    # sentence2 = '这只皮靴号码不小,那只更合适'
    # check_zh_similarity(sentence1, sentence2)
    # English sentence test.
    # sentence3 = 'How to create ticket in ticket system?'
    # sentence4 = 'How can I create ticket?'
    sentence3 = 'Do you know I love you?'
    sentence4 = 'I love you so much?'
    check_en_similarity1(sentence3, sentence4)
    check_en_similarity2(sentence3, sentence4)
    print('end!')