vsm向量空间模型实现

最新推荐文章于 2020-05-28 22:34:28 发布

1313123131312

最新推荐文章于 2020-05-28 22:34:28 发布

阅读量884

点赞数 1

文章标签： vsm 向量空间模型 Python实现

简介
代码实现
总结

一.简介
在检索当中，主要涉及了两个核心问题:
I.相似度计算
II.索引的建立
索引建立参考链接：

http://blog.csdn.net/malefactor/article/details/7256305

这里我们重点讲解第一个问题
1.1整体流程如图：
这里写图片描述
1.2在整个流程当中，第二步骤和第三步骤对效果的影响很大，因此非常关键
相似度的vsm经典模型流程如图：

经过第三步骤处理后，文档在词典当中均有唯一的表示-表示为一个长向量的形式
第四步骤参考链接：

http://blog.csdn.net/u010598982/article/details/50876831

二.代码实现

#!/usr/bin/python
# copyright(c) youfuwen
# Date:2016.03.26
# E-Mail:yfwen@bjtu.edu.cn
# first: count word frequencies
import math
import ast
from collections import Counter
wordsCount=0  # running total of words read by every CountKeyByWen call
def CountKeyByWen(fileName1):
    """Count how often each whitespace-separated word occurs in a file.

    Args:
        fileName1: path of the text file to read.

    Returns:
        A list of ``(word, count)`` tuples ordered by descending count.

    Side effects:
        Adds the number of words read to the module-level ``wordsCount``.
    """
    global wordsCount
    counts = Counter()
    # The context manager closes the file even if reading fails.
    with open(fileName1, 'r') as handle:
        for line in handle:
            # split() with no argument drops the trailing newline and never
            # yields empty strings for runs of blanks, so neither is counted
            # as a word.
            counts.update(line.split())
    wordsCount += sum(counts.values())
    return counts.most_common()
# second: create vocabulary
def CreateVocabulary(dic1=None, dic2=None):
    """Build the list of distinct words found in two frequency lists.

    Args:
        dic1: iterable of ``(word, count)`` pairs for the first document;
            ``None`` is treated as empty.
        dic2: iterable of ``(word, count)`` pairs for the second document;
            ``None`` is treated as empty.

    Returns:
        A list of words without duplicates, in first-seen order (all of
        ``dic1`` first, then the words only present in ``dic2``).
    """
    vocabulary = []
    seen = set()  # O(1) membership test instead of scanning the list
    for dic in (dic1, dic2):
        for dicEle in dic or ():
            word = dicEle[0]
            if word not in seen:
                seen.add(word)
                vocabulary.append(word)
    return vocabulary
# third:compute TF-IDF output: a vector
# In this code we just use TF for computing similarity
def ComputeVector(dic1=None,vocabulary=None):
    """Turn a document's word counts into a term-frequency vector.

    Args:
        dic1: the document's word counts, either an iterable of
            ``(word, count)`` pairs or a ``{word: count}`` dict; ``None`` is
            treated as empty.
        vocabulary: iterable of words that must all appear in the vector;
            ``None`` is treated as empty.

    Returns:
        A dict mapping every vocabulary word to its count in the document
        (0 when the word does not occur). Words of ``dic1`` that are missing
        from ``vocabulary`` are kept with their count.
    """
    vector = dict.fromkeys(vocabulary or [], 0)
    # dict() accepts both a list of pairs and a dict, so the words become the
    # keys and the counts the values.
    for word, count in dict(dic1 or {}).items():
        vector[word] = vector.get(word, 0) + count
    return vector
# fourth: compute cosine similarity
def ComputeSimlirity(dic1Vector=None,dic2Vector=None):
    """Return the cosine similarity of two term-frequency vectors.

    Args:
        dic1Vector: ``{word: count}`` dict of the first document.
        dic2Vector: ``{word: count}`` dict of the second document.

    Returns:
        dot(v1, v2) / sqrt(|v1|^2 * |v2|^2) as a float, or 0.0 when either
        vector is empty or all zeros (the similarity is undefined there).
    """
    dic1Vector = dic1Vector or {}
    dic2Vector = dic2Vector or {}
    # Dividing every component by a common word total scales numerator and
    # denominator equally, so raw counts give the same cosine.
    x = 0.0   # numerator: dot product
    y1 = 0.0  # squared norm of the first vector
    y2 = 0.0  # squared norm of the second vector
    for k in dic1Vector:
        temp1 = float(dic1Vector[k])
        x += temp1 * float(dic2Vector.get(k, 0))
        y1 += temp1 * temp1
    for value in dic2Vector.values():
        y2 += float(value) ** 2
    if y1 == 0.0 or y2 == 0.0:
        return 0.0
    return x/math.sqrt(y1*y2)

if __name__=='__main__':
    # Both names point at the same file, so the expected similarity is 1.0.
    fileName1='amanda_all.txt'
    fileName2='amanda_all.txt'
    dic1 = CountKeyByWen(fileName1)
    dic2 = CountKeyByWen(fileName2)
    vocabulary = CreateVocabulary(dic1, dic2)
    dic1Vector = ComputeVector(dic1, vocabulary)
    dic2Vector = ComputeVector(dic2, vocabulary)
    for word in dic1Vector:
        print("<" + word + " , " + str(dic1Vector[word]) + ">")
    sim=ComputeSimlirity(dic1Vector,dic2Vector)
    print(sim)
   
   1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75

tips:上面的code有人提示楼主有问题，遂做修改，欢迎大家批评指正哈！修改代码如下：

#!/usr/bin/python
# copyright(c) youfuwen
# Date:2016.03.29revised version
# E-Mail:yfwen@bjtu.edu.cn
# first: count words frequency
import math
import ast
from collections import Counter
wordsCount=0  # running total of words read by every CountKeyByWen call
def CountKeyByWen(fileName1):
    """Count how often each whitespace-separated word occurs in a file.

    Args:
        fileName1: path of the text file to read.

    Returns:
        A ``{word: count}`` dict.

    Side effects:
        Adds the number of words read to the module-level ``wordsCount``.
    """
    global wordsCount
    counts = Counter()
    # The context manager closes the file even if reading fails.
    with open(fileName1, 'r') as handle:
        for line in handle:
            # split() with no argument drops the trailing newline and never
            # yields empty strings for runs of blanks, so neither is counted
            # as a word.
            counts.update(line.split())
    wordsCount += sum(counts.values())
    # Callers get a plain dict, as before, rather than a Counter.
    return dict(counts)
# second: create vocabulary
def CreateVocabulary(dic1=None, dic2=None):
    """Build the list of distinct words found in two frequency tables.

    Args:
        dic1: ``{word: count}`` dict (or any iterable of words) for the first
            document; ``None`` is treated as empty.
        dic2: same, for the second document.

    Returns:
        A list of words without duplicates, in first-seen order (all of
        ``dic1`` first, then the words only present in ``dic2``).
    """
    vocabulary = []
    seen = set()  # O(1) membership test instead of scanning the list
    for dic in (dic1, dic2):
        for dicEle in dic or ():
            if dicEle not in seen:
                seen.add(dicEle)
                vocabulary.append(dicEle)
    return vocabulary
# third:compute TF-IDF output: a vector
# In this code we just use TF for computing similarity
def union_dict(*objs):
    """Merge dicts by summing the values stored under the same key.

    Args:
        *objs: any number of dicts with numeric values.

    Returns:
        A new dict holding every key found in ``objs``; each value is the sum
        of that key's values across all the dicts. Returns ``{}`` when called
        without arguments.
    """
    _total = {}
    # One pass over every item; avoids concatenating the key lists, which is
    # quadratic and fails where keys() is a view rather than a list.
    for obj in objs:
        for _key, _value in obj.items():
            _total[_key] = _total.get(_key, 0) + _value
    return _total
def ComputeVector(dic1=None,vocabulary=None):
    """Turn a document's word counts into a term-frequency vector.

    Args:
        dic1: the document's word counts, either a ``{word: count}`` dict or
            an iterable of ``(word, count)`` pairs; ``None`` is treated as
            empty.
        vocabulary: iterable of words that must all appear in the vector;
            ``None`` is treated as empty.

    Returns:
        A dict mapping every vocabulary word to its count in the document
        (0 when the word does not occur). Words of ``dic1`` that are missing
        from ``vocabulary`` are kept with their count.
    """
    # Start with a zero for every vocabulary word, then add the real counts.
    dicTemp = dict.fromkeys(vocabulary or [], 0)
    for word, count in dict(dic1 or {}).items():
        dicTemp[word] = dicTemp.get(word, 0) + count
    return dicTemp
# fourth: compute cosine similarity between the two TF vectors
def ComputeSimlirity(dic1Vector=None,dic2Vector=None):
    """Return the cosine similarity of two term-frequency vectors.

    Args:
        dic1Vector: ``{word: count}`` dict of the first document.
        dic2Vector: ``{word: count}`` dict of the second document.

    Returns:
        dot(v1, v2) / sqrt(|v1|^2 * |v2|^2) as a float, or 0.0 when either
        vector is empty or all zeros (the similarity is undefined there).
    """
    dic1Vector = dic1Vector or {}
    dic2Vector = dic2Vector or {}
    # Dividing every component by a common word total scales numerator and
    # denominator equally, so raw counts give the same cosine.
    x = 0.0   # numerator: dot product
    y1 = 0.0  # squared norm of the first vector
    y2 = 0.0  # squared norm of the second vector
    for k in dic1Vector:
        temp1 = float(dic1Vector[k])
        x += temp1 * float(dic2Vector.get(k, 0))
        y1 += temp1 * temp1
    for value in dic2Vector.values():
        y2 += float(value) ** 2
    if y1 == 0.0 or y2 == 0.0:
        return 0.0
    return x/math.sqrt(y1*y2)

if __name__=='__main__':
    # Script entry point: compare two text files and print the first file's
    # term-frequency vector followed by the similarity of the two files.
    fileName1='a.txt'
    fileName2='b.txt'
    dic1 = CountKeyByWen(fileName1)
    dic2 = CountKeyByWen(fileName2)
    vocabulary = CreateVocabulary(dic1, dic2)
    dic1Vector = ComputeVector(dic1, vocabulary)
    dic2Vector = ComputeVector(dic2, vocabulary)
    for elem in dic1Vector:
        # Single-argument print() emits the same text on Python 2 and 3.
        print("<" + elem + " , " + str(dic1Vector[elem]) + ">")
    sim=ComputeSimlirity(dic1Vector,dic2Vector)
    print("similarity="+str(sim))
    #####################################
   
   1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85

三.总结
I.任何事情均通于天道地道，都有一个积累的过程，努力是成功的必要前提条件
II.让我们一同努力，明天会更好！

转载地址

1313123131312

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
vsm向量空间模型实现

简介代码实现总结一.简介在检索当中，主要涉及了两个核心问题: I.相似度计算 II.索引的建立索引建立参考链接：http://blog.csdn.net/malefactor/article/details/7256305这里我们重点讲解第一个问题 1.1整体流程如图： 1.2在整个流程当中，第二步骤和第三步骤对于效果影响很多，故此很关键
复制链接

扫一扫