def _compute_cosine(text_a, text_b):
words1 = text_a.strip('http').strip('https').split('/')
words2 = text_b.strip('http').strip('https').split('/')
words1_dict = {}
words2_dict = {}
for word in words1:
word = re.sub('[^a-zA-Z]', '', word)
word = word.lower()
if word != '' and word in words1_dict:
num = words1_dict[word]
words1_dict[word] = num + 1
elif word != '':
words1_dict[word] = 1
else:
continue
for word in words2:
word = re.sub('[^a-zA-Z]', '', word)
word = word.lower()
if word != '' and word in words2_dict:
num = words2_dict[word]
words2_dict[word] = num + 1
elif word != '':
words2_dict[word] = 1
else:
continue
dic1 = sorted(words1_dict.items(), key=lambda asd: asd[1], reverse=True)
dic2 = sorted(words2_dict.items(), key=lambda asd: asd[1], reverse=True)
words_key = []
for i in range(len(dic1)):
words_key.append(dic1[i][0])
for i in range(len(dic2)):
if dic2[i][0] in words_key:
pass
else:
words_key.append(dic2[i][0])
vect1 = []
vect2 = []
for word in words_key:
if word in words1_dict:
vect1.append(words1_dict[word])
else:
vect1.append(0)
if word in words2_dict:
vect2.append(words2_dict[word])
else:
vect2.append(0)
sum = 0
sq1 = 0
sq2 = 0
for i in range(len(vect1)):
sum += vect1[i] * vect2[i]
sq1 += pow(vect1[i], 2)
sq2 += pow(vect2[i], 2)
try:
result = round(float(sum) / (math.sqrt(sq1) * math.sqrt(sq2)), 2)
except ZeroDivisionError:
result = 0.0
return result
# url相似度 (URL similarity)
# 最新推荐文章于 2024-05-30 07:32:10 发布 (latest recommended article published 2024-05-30 07:32:10)