文本相似度

  1 import jieba
  2 import math
  3 
  4 sentenceA = "这只皮靴号码大了,那只号码合适"
  5 sentenceB = "这只皮靴号码不小,那只更合适"
  6 
  7 jieba.add_word('这只')
  8 jieba.add_word('那只')
  9 jieba.add_word('大了')
 10 jieba.add_word('不')
 11 jieba.add_word('小')
 12 jieba.add_word('更')
 13 
 14 wordA = jieba.lcut(sentenceA, cut_all=True)
 15 wordB = jieba.lcut(sentenceB)
 16 word_list = wordA + wordB
 17 
 18 def word_set_ac(word_list):
 19     word_set = set()
 20     for word in word_list:
 21         if word.strip() != ',':
 22             word_set.add(word.strip())
 23     return word_set
 24 
 25 def word_id(word_set):
 26     word_dict_id = {}
 27     for i, word in enumerate(word_set):
 28         word_dict_id[word] = i
 29     return word_dict_id
 30 
 31 def number_list(wordX, word_dict_id):
 32     number = []
 33     for word in wordX:
 34         if word.strip() != ',':
 35             number.append(word_dict_id[word.strip()])
 36     return number
 37 
 38 def word_vector(number_list, word_set):
 39     word_vector = [0] * len(word_set)
 40     for number in number_list:
 41         word_vector[number] += 1
 42     return word_vector
 43 
 44 def tf_word_vector(number_list, word_vector):
 45     word_tf_vector = []
 46     for i, val in enumerate(word_vector):
 47         word_tf_vector.append( val / float(len(number_list)))
 48     return word_tf_vector
 49 
 50 
 51 def cos_sim(vectorA, vectorB):
 52     up = 0.0
 53     down1 = 0.0
 54     down2 = 0.0
 55     for x, y in zip(vectorA, vectorB):
 56         up += x * y
 57         down1 += x ** 2
 58         down2 += y ** 2
 59     return up / (math.sqrt(down1) * math.sqrt(down2))
 60 
 61 def jaccard_sim(vectorA, vectorB):
 62     up = 0.0
 63     down = 0.0
 64     for x, y in zip(vectorA, vectorB):
 65         if x == y or (x > 0 and y> 0):
 66             up += 1
 67         if x > 0 or y > 0:
 68             down += 1
 69     return up / down
 70 
 71 def euclidean_sim(vectorA, vectorB):
 72     value = 0.0
 73     for x, y in zip(vectorA, vectorB):
 74         value += pow((x - y), 2)
 75     return math.sqrt(value)
 76
 77 if  __name__ == '__main__':
 78 
 79     print(wordA)
 80     print(wordB)
 81     word_set = word_set_ac(word_list)
 82     word_id = word_id(word_set)
 83     print(word_id)
 84 
 85     number_listA = number_list(wordA, word_id)
 86     number_listB = number_list(wordB, word_id)
 87     print(number_listA)
 88     print(number_listB)
 89     word_vectorA = word_vector(number_listA, word_set)
 90     word_vectorB = word_vector(number_listB, word_set)
 91     tf_word_vectorA = tf_word_vector(number_listA, word_vectorA)
 92     tf_word_vectorB = tf_word_vector(number_listB, word_vectorB)
 93 
 94     print('word_vectorA:', word_vectorA)
 95     print('tf_word_vectorA:', tf_word_vectorA)
 96     print('word_vectorB:', word_vectorB)
 97     print('tf_word_vectorB:', tf_word_vectorB)
 98     print('vector_cos:', cos_sim(word_vectorA, word_vectorB))
 99     print('tf_vector_cos:', cos_sim(tf_word_vectorA, tf_word_vectorB))
100     print('vector_jaccard:', jaccard_sim(word_vectorA, word_vectorB))
101     print('tf_vector_jaccard:', jaccard_sim(tf_word_vectorA, tf_word_vectorB))
102     print('vector_o:', euclidean_sim(word_vectorA, word_vectorB))
103     print('tf_vector_o:', euclidean_sim(tf_word_vectorA, tf_word_vectorB))
['这只', '皮靴', '号码', '大了', ',', '那只', '号码', '合适']
['这只', '皮靴', '号码', '不小', ',', '那只', '更', '合适']
{'那只': 0, '皮靴': 1, '不小': 2, '号码': 3, '合适': 4, '大了': 5, '更': 6, '这只': 7}
[7, 1, 3, 5, 0, 3, 4]
[7, 1, 3, 2, 0, 6, 4]
word_vectorA: [1, 1, 0, 2, 1, 1, 0, 1]
tf_word_vectorA: [0.14285714285714285, 0.14285714285714285, 0.0, 0.2857142857142857, 0.14285714285714285, 0.14285714285714285, 0.0, 0.14285714285714285]
word_vectorB: [1, 1, 1, 1, 1, 0, 1, 1]
tf_word_vectorB: [0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.0, 0.14285714285714285, 0.14285714285714285]
vector_cos: 0.7559289460184544
tf_vector_cos: 0.7559289460184545
vector_jaccard: 0.625
tf_vector_jaccard: 0.625
vector_o: 2.0
tf_vector_o: 0.2857142857142857

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值