1 import jieba
2 import math
3
# The two example sentences whose similarity is measured by this script.
sentenceA = "这只皮靴号码大了,那只号码合适"
sentenceB = "这只皮靴号码不小,那只更合适"

# Register custom words with jieba before segmenting the sentences.
jieba.add_word('这只')
jieba.add_word('那只')
jieba.add_word('大了')
jieba.add_word('不')
jieba.add_word('小')
jieba.add_word('更')

# Segment BOTH sentences with the same (default) mode. Previously sentenceA
# was cut with cut_all=True while sentenceB used the default, so the two
# token lists came from different segmentation modes and the vectors built
# from them were not comparable.
wordA = jieba.lcut(sentenceA)
wordB = jieba.lcut(sentenceB)

# Combined token stream of both sentences, used to build the vocabulary.
word_list = wordA + wordB
17
def word_set_ac(word_list):
    """Collect the distinct tokens of ``word_list`` into a set.

    Every token is whitespace-stripped first; a token whose stripped form is
    exactly a comma (``','``) is left out of the result.

    Args:
        word_list: Iterable of string tokens.

    Returns:
        A set containing the stripped, non-comma tokens.
    """
    stripped_tokens = (token.strip() for token in word_list)
    return {token for token in stripped_tokens if token != ','}
24
def word_id(word_set):
    """Assign an integer id to every element of ``word_set``.

    Ids are the positions (starting at 0) at which the elements are produced
    when ``word_set`` is iterated.

    Args:
        word_set: Iterable of hashable tokens.

    Returns:
        A dict mapping each token to its iteration position.
    """
    return {token: position for position, token in enumerate(word_set)}
30
def number_list(wordX, word_dict_id):
    """Translate a token sequence into the corresponding sequence of ids.

    Each token is whitespace-stripped; tokens that strip down to a single
    comma (``','``) are skipped, every other token is looked up in
    ``word_dict_id``.

    Args:
        wordX: Iterable of string tokens for one sentence.
        word_dict_id: Mapping from stripped token to integer id.

    Returns:
        A list of ids in the same order as the kept tokens.

    Raises:
        KeyError: If a kept token is missing from ``word_dict_id``.
    """
    cleaned = (token.strip() for token in wordX)
    return [word_dict_id[token] for token in cleaned if token != ',']
37
def word_vector(number_list, word_set):
    """Build an occurrence-count vector from a list of token ids.

    Args:
        number_list: Iterable of integer ids, each used as an index into the
            result.
        word_set: The vocabulary; only its length is used, to size the
            result.

    Returns:
        A list of ``len(word_set)`` integers where position ``i`` holds how
        many times ``i`` appears in ``number_list``.
    """
    counts = [0 for _ in range(len(word_set))]
    for token_id in number_list:
        counts[token_id] = counts[token_id] + 1
    return counts
43
def tf_word_vector(number_list, word_vector):
    """Convert raw occurrence counts into term frequencies.

    Args:
        number_list: Sequence of token ids for one sentence; only its length
            (the total number of tokens) is used.
        word_vector: Iterable of per-id occurrence counts for that sentence.

    Returns:
        A list of floats, one per entry of ``word_vector``, each equal to the
        count divided by ``len(number_list)``. When ``number_list`` is empty
        every frequency is ``0.0``.
    """
    total = float(len(number_list))
    if not total:
        # No tokens at all: return zeros instead of raising
        # ZeroDivisionError on the division below.
        return [0.0 for _ in word_vector]
    return [count / total for count in word_vector]
49
50
def cos_sim(vectorA, vectorB):
    """Return the cosine similarity of two numeric vectors.

    The vectors are paired element by element with ``zip``, so if their
    lengths differ the extra trailing elements of the longer one are ignored.

    Args:
        vectorA: Iterable of numbers.
        vectorB: Iterable of numbers.

    Returns:
        The dot product divided by the product of the two Euclidean norms,
        as a float. If either norm is zero (an all-zero or empty vector) the
        similarity is undefined and ``0.0`` is returned.
    """
    dot = 0.0
    norm_a_sq = 0.0
    norm_b_sq = 0.0
    for x, y in zip(vectorA, vectorB):
        dot += x * y
        norm_a_sq += x ** 2
        norm_b_sq += y ** 2
    denominator = math.sqrt(norm_a_sq) * math.sqrt(norm_b_sq)
    if denominator == 0.0:
        # Zero-norm vector: avoid ZeroDivisionError.
        return 0.0
    return dot / denominator
60
def jaccard_sim(vectorA, vectorB):
    """Return the Jaccard similarity of two count/weight vectors.

    A position counts as "present" in a vector when its value is greater
    than zero. The result is the number of positions present in both vectors
    divided by the number of positions present in at least one of them.

    The vectors are paired with ``zip``, so trailing elements of a longer
    vector are ignored.

    Args:
        vectorA: Iterable of numbers.
        vectorB: Iterable of numbers.

    Returns:
        A float in ``[0.0, 1.0]``. If no position is present in either
        vector (empty union) ``0.0`` is returned.
    """
    intersection = 0.0
    union = 0.0
    for x, y in zip(vectorA, vectorB):
        # Only positions present in BOTH vectors belong to the intersection.
        # Counting every x == y position would also count shared zeros,
        # which are not part of the union and could push the ratio above 1.
        if x > 0 and y > 0:
            intersection += 1
        if x > 0 or y > 0:
            union += 1
    if union == 0:
        # Nothing present on either side: avoid ZeroDivisionError.
        return 0.0
    return intersection / union
70
def euclidean_sim(vectorA, vectorB):
    """Return the Euclidean distance between two numeric vectors.

    Elements are paired with ``zip``; the squared differences of each pair
    are accumulated and the square root of the total is returned.

    Args:
        vectorA: Iterable of numbers.
        vectorB: Iterable of numbers.

    Returns:
        The square root of the summed squared differences, as a float.
    """
    squared_total = 0.0
    for a, b in zip(vectorA, vectorB):
        diff = a - b
        squared_total += diff ** 2
    return math.sqrt(squared_total)
76
if __name__ == '__main__':
    # Show the token lists produced for each sentence.
    print(wordA)
    print(wordB)

    # Build the shared vocabulary and its word -> id mapping. The mapping is
    # stored as ``word_dict_id`` (not ``word_id``) so that the module-level
    # ``word_id`` function is not overwritten by its own return value.
    word_set = word_set_ac(word_list)
    word_dict_id = word_id(word_set)
    print(word_dict_id)

    # Encode each sentence as a list of vocabulary ids.
    number_listA = number_list(wordA, word_dict_id)
    number_listB = number_list(wordB, word_dict_id)
    print(number_listA)
    print(number_listB)

    # Count vectors and their term-frequency versions.
    word_vectorA = word_vector(number_listA, word_set)
    word_vectorB = word_vector(number_listB, word_set)
    tf_word_vectorA = tf_word_vector(number_listA, word_vectorA)
    tf_word_vectorB = tf_word_vector(number_listB, word_vectorB)

    print('word_vectorA:', word_vectorA)
    print('tf_word_vectorA:', tf_word_vectorA)
    print('word_vectorB:', word_vectorB)
    print('tf_word_vectorB:', tf_word_vectorB)

    # Report each measure on both the raw-count and the TF vectors.
    print('vector_cos:', cos_sim(word_vectorA, word_vectorB))
    print('tf_vector_cos:', cos_sim(tf_word_vectorA, tf_word_vectorB))
    print('vector_jaccard:', jaccard_sim(word_vectorA, word_vectorB))
    print('tf_vector_jaccard:', jaccard_sim(tf_word_vectorA, tf_word_vectorB))
    print('vector_o:', euclidean_sim(word_vectorA, word_vectorB))
    print('tf_vector_o:', euclidean_sim(tf_word_vectorA, tf_word_vectorB))
['这只', '皮靴', '号码', '大了', ',', '那只', '号码', '合适']
['这只', '皮靴', '号码', '不小', ',', '那只', '更', '合适']
{'那只': 0, '皮靴': 1, '不小': 2, '号码': 3, '合适': 4, '大了': 5, '更': 6, '这只': 7}
[7, 1, 3, 5, 0, 3, 4]
[7, 1, 3, 2, 0, 6, 4]
word_vectorA: [1, 1, 0, 2, 1, 1, 0, 1]
tf_word_vectorA: [0.14285714285714285, 0.14285714285714285, 0.0, 0.2857142857142857, 0.14285714285714285, 0.14285714285714285, 0.0, 0.14285714285714285]
word_vectorB: [1, 1, 1, 1, 1, 0, 1, 1]
tf_word_vectorB: [0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.0, 0.14285714285714285, 0.14285714285714285]
vector_cos: 0.7559289460184544
tf_vector_cos: 0.7559289460184545
vector_jaccard: 0.625
tf_vector_jaccard: 0.625
vector_o: 2.0
tf_vector_o: 0.2857142857142857