向量空间模型的实现
使用Python构建向量空间表示的基本步骤
import math
from collections import Counter

import jieba
from gensim.corpora.dictionary import Dictionary
# Module-level cache of stop words; filled from disk on the first call to
# remove_stopword() and reused afterwards.
stopwords = []
# Total number of documents; assigned once the corpus has been processed.
zong = 0


# Load the stop-word list (lazily) and filter tokens against it
def remove_stopword(word_list):
    """Return the tokens of *word_list* that are not stop words.

    While the module-level ``stopwords`` list is empty, it is populated from
    ``stopword.txt`` (UTF-8, one entry per line, surrounding whitespace
    stripped), opened relative to the current working directory.

    Args:
        word_list: Iterable of tokens to filter.

    Returns:
        A new list with the non-stop-word tokens in their original order.

    Raises:
        OSError: If ``stopword.txt`` has to be read and cannot be opened.
    """
    if not stopwords:
        with open('stopword.txt', "r", encoding="utf-8") as word_input:
            stopwords.extend(line.strip() for line in word_input)
    # A set gives O(1) membership tests; scanning the list for every token
    # costs O(len(stopwords)) per lookup.
    stopword_set = set(stopwords)
    return [word for word in word_list if word not in stopword_set]
# Load the four test documents
docs = [
    "新型互联网大数据技术研究",
    "大数据采集技术与应用方法",
    "一种互联网技术研究方法",
    "计算机系统的分析与设计技术",
]
# Load the custom user dictionary
# NOTE(review): absolute, machine-specific path; the script fails on any
# machine where this file does not exist.
jieba.load_userdict(r"C:\Users\qssss\PycharmProjects\pythonProject\信息检索\实验二\一、文本预处理\udict.txt")
# Tokenise each document and drop stop words
words = []  # every kept token of every document, in reading order
num_word = {}  # word -> number of documents the word appears in
wen_word = {}  # document index -> list of kept tokens
# word -> occurrences inside a document. Keyed by word only, so a later
# document containing the same word overwrites the earlier count.
pin_word = {}
for num, doc in enumerate(docs):
    ac = remove_stopword(list(jieba.cut(doc)))
    wen_word[num] = ac
    words.extend(ac)
    # Counter yields each distinct token once per document, so num_word is a
    # true document frequency even when a token repeats inside one document.
    for word, occurrences in Counter(ac).items():
        num_word[word] = num_word.get(word, 0) + 1
        pin_word[word] = occurrences
zong = len(docs)
print(wen_word)
# Corpus-wide occurrence count of every token, skipping the newline and the
# two punctuation tokens below. Counter makes a single pass over `words`
# instead of rescanning the whole list once per distinct token.
cut_words = {
    word: occurrences
    for word, occurrences in Counter(words).items()
    if word not in ('\n', '。', ',')
}
# Compute the TF-IDF weight of every term in every document.
# tf_idf[n] maps each term of document n to tf * log10(1 + N / df), where
# N is the number of documents (zong) and df comes from num_word.
tf_idf = []
for i, wordss in enumerate(wen_word.values(), start=1):
    # Term frequency is counted inside this document. The shared pin_word
    # table is keyed by word only and would return another document's count.
    term_counts = Counter(wordss)
    tf_words = {
        word: occurrences * math.log10(1 + zong / num_word[word])
        for word, occurrences in term_counts.items()
    }
    print("第{}个文档的TF-IDF统计信息为:".format(i))
    print(tf_words)
    tf_idf.append(tf_words)
print(tf_idf)
# Compute the cosine similarity of every pair of documents.
# The vocabulary is sorted so the vector columns, and therefore the printed
# output, are reproducible; set iteration order for strings varies per run.
words = sorted(set(words))
nword = len(words)
print(words)
# d[n] is document n's TF-IDF vector over the vocabulary (0 for absent terms).
d = [[tf_idf[num].get(term, 0) for term in words] for num in range(zong)]
print(d)
# mu[n] is the Euclidean norm of d[n].
mu = {num: math.sqrt(sum(weight * weight for weight in d[num])) for num in range(zong)}
print(mu)
for i in range(zong):
    for j in range(i + 1, zong):
        dot = sum(a * b for a, b in zip(d[i], d[j]))
        norm_product = mu[i] * mu[j]
        # A document left with no terms has a zero vector. Its cosine is
        # undefined, so report 0.0 instead of raising ZeroDivisionError.
        temp = dot / norm_product if norm_product else 0.0
        print("第{}文档与第{}文档的余弦相似度为:".format(i+1, j+1), end='')
        print(temp)