要对字符串聚类,字符串的距离度量是一个问题.
使用最长公共子序列或者最长公共子串都有问题.
看到字符串核函数,感觉还不错,没找到它的实现,自己实现了一个效率低下的版本.
def list2dict(li):
    """Group ``[key, value]`` pairs into a dict mapping each key to its values.

    Example: ``[[3, 4], [3, 5], [4, 2]] -> {3: [4, 5], 4: [2]}``

    Args:
        li: Iterable of indexable pairs; element 0 is used as the key and
            element 1 as the value.

    Returns:
        A dict whose keys appear in first-seen order and whose values are
        lists of the paired values in input order.
    """
    di = {}
    for pair in li:
        key, value = pair[0], pair[1]
        # Single pass: the input is consumed only once, so one-shot
        # iterables (e.g. generators) are grouped correctly as well.
        di.setdefault(key, []).append(value)
    return di
def nIncreasingIndex(s, e, n):
    """Return every strictly increasing list of ``n`` indices from ``range(s, e)``.

    Args:
        s: First index (inclusive).
        e: End index (exclusive).
        n: Number of indices in each result list.

    Returns:
        A list of lists, in lexicographic order. Empty when ``n < 1`` or when
        ``range(s, e)`` holds fewer than ``n`` values.
    """
    from itertools import combinations

    # combinations() would yield a single empty tuple for n == 0 and raise
    # for negative n; this function returns an empty list in both cases.
    if n < 1:
        return []
    return [list(combo) for combo in combinations(range(s, e), n)]
def getItemsOfIndexs(li, indexs):
    """Collect ``li[i]`` for each ``i`` in ``indexs``, preserving their order.

    Args:
        li: Any object supporting subscription (``li[i]``).
        indexs: Iterable of subscripts to look up in ``li``.

    Returns:
        A new list with one looked-up element per entry of ``indexs``.
    """
    picked = []
    for position in indexs:
        picked.append(li[position])
    return picked
def stringKernel(a, b, n, lambdaa):