(1)余弦相似性
余弦相似性通过测量两个向量夹角的余弦值来度量它们之间的相似性:cos(θ) = (A·B) / (‖A‖·‖B‖)。0度角的余弦值是1,而其他任何角度的余弦值都不大于1,其最小值是-1。因此,两个向量夹角的余弦值可以反映它们是否大致指向相同的方向。由于该度量只与向量的方向有关、与向量的长度无关,它通常用于文本(文档)之间的相似度比较。
详见百科介绍(点击打开链接)
(2)算法实现中未使用权重(IDF——逆文档频率),而是直接使用词项的出现次数作为向量各维度的值。
python实现
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import re
from math import sqrt
class Similarity(object):
    """Cosine similarity between two texts based on raw term counts.

    Each text is split into terms, the terms are counted, and the two
    count dictionaries are treated as vectors over the union of their
    terms. No IDF or other weighting is applied; the vector components
    are the plain occurrence counts.

    Attributes set by :meth:`vector` / :meth:`mix`:
        vdict1: term -> count mapping for ``target1``.
        vdict2: term -> count mapping for ``target2``.
    """

    # A term is a maximal run of ASCII letters, digits and the characters
    # "_", ".", "&", "%". Compiled once so both texts share the pattern.
    _TOKEN_RE = re.compile(r'[a-zA-Z0-9_.&%]+')

    def __init__(self, target1, target2):
        """Store the two texts to compare.

        Args:
            target1: First text (string).
            target2: Second text (string).
        """
        self.target1 = target1
        self.target2 = target2

    @classmethod
    def _count_terms(cls, text):
        """Return a dict mapping each term found in *text* to its count."""
        counts = {}
        for term in cls._TOKEN_RE.findall(text):
            counts[term] = counts.get(term, 0) + 1
        return counts

    def vector(self):
        """Build the term-count dictionaries ``vdict1`` and ``vdict2``.

        Any previously computed dictionaries are replaced.
        """
        self.vdict1 = self._count_terms(self.target1)
        self.vdict2 = self._count_terms(self.target2)

    def mix(self):
        """Give both dictionaries the same key set.

        Every term that appears in only one text is added to the other
        dictionary with a count of 0, so the two dictionaries describe
        vectors in the same space. Requires :meth:`vector` to have run.
        """
        for term in self.vdict1:
            self.vdict2.setdefault(term, 0)
        for term in self.vdict2:
            self.vdict1.setdefault(term, 0)

    def similar(self):
        """Compute the cosine similarity of the two texts.

        Returns:
            A float: the dot product of the two count vectors divided by
            the product of their Euclidean norms. Because counts are
            non-negative the value lies in [0.0, 1.0]. If either text
            contains no terms (a zero vector), 0.0 is returned instead of
            dividing by zero.
        """
        self.vector()
        self.mix()

        dot = sum(self.vdict1[term] * self.vdict2[term] for term in self.vdict1)
        squares1 = sum(count * count for count in self.vdict1.values())
        squares2 = sum(count * count for count in self.vdict2.values())

        # Multiply the exact integer sums before taking one square root:
        # sqrt(a) * sqrt(b) can round so that identical texts score
        # slightly above 1.0.
        denominator = sqrt(squares1 * squares2)
        if denominator == 0:
            # At least one text has no terms; similarity is undefined,
            # report "nothing in common".
            return 0.0
        return dot / denominator
if __name__ == '__main__':
    # Demo: the two texts share two of their three terms ("aa", "bb"),
    # so the printed similarity is about 0.667.
    t1 = "aa bb cc"
    t2 = "aa bb ee"
    s = Similarity(t1, t2)
    # A parenthesized single argument prints identically on Python 2 and 3.
    print(s.similar())
--------------------------------------------------------------------------------------------------------------------------
Java实现