某农业大学信息搜索与引擎-第3次实验

qssssss79

已于 2023-06-19 19:11:40 修改

阅读量178

点赞数

分类专栏：信息搜索与引擎　文章标签：数据结构 python 信息检索

于 2023-06-19 18:57:14 首次发布

本文链接：https://blog.csdn.net/qssssss79/article/details/131293429

版权

信息搜索与引擎专栏收录该内容

4 篇文章 0 订阅

订阅专栏

该代码示例展示了如何使用Python进行文本预处理：利用jieba库进行分词并去除停用词。接着，它计算每个词的TF-IDF值，并用余弦相似度衡量文档之间的相似性；示例文档主要关注IT领域的技术，如大数据和互联网技术。

摘要由CSDN通过智能技术生成

向量空间模型的实现

使用Python构建向量空间表示的基本步骤

import math
import jieba
from gensim.corpora.dictionary import Dictionary

stopwords = []  # stop-word cache; filled from stopword.txt by remove_stopword()
zong = 0  # number of documents in the corpus; reassigned after tokenization


def remove_stopword(word_list):
    """Return the tokens of *word_list* that are not stop words.

    The stop-word list is read from ``stopword.txt`` (one word per line,
    resolved against the current working directory) whenever this function
    is called while the module-level ``stopwords`` cache is still empty.

    Args:
        word_list: Iterable of token strings.

    Returns:
        A new list holding the non-stop-word tokens in their original order.

    Raises:
        OSError: If ``stopword.txt`` has to be loaded but cannot be opened.
    """
    if not stopwords:
        # utf-8-sig reads plain UTF-8 unchanged but also swallows a leading
        # BOM; with plain utf-8 the BOM would stay glued to the first stop
        # word, which would then never match any token.
        with open('stopword.txt', "r", encoding="utf-8-sig") as word_input:
            for word in word_input:
                stopwords.append(word.split("\n")[0].strip())
    # One set per call gives O(1) membership tests instead of scanning the
    # whole stop-word list for every token.
    stopword_set = set(stopwords)
    return [word for word in word_list if word not in stopword_set]


# The four test documents that make up the corpus
docs = [
    "新型互联网大数据技术研究",
    "大数据采集技术与应用方法",
    "一种互联网技术研究方法",
    "计算机系统的分析与设计技术",
]


# Register the custom user dictionary with jieba before the documents are cut.
# NOTE(review): machine-specific absolute path — presumably has to be adjusted
# on any other machine; confirm where udict.txt is meant to live.
user_dict_path = r"C:\Users\qssss\PycharmProjects\pythonProject\信息检索\实验二\一、文本预处理\udict.txt"
jieba.load_userdict(user_dict_path)


# Tokenize every document, drop stop words and collect corpus statistics
words = []  # every remaining token of every document, in reading order
num_word = {}  # term -> number of documents that contain the term
wen_word = {}  # document index -> token list of that document
# term -> occurrences of the term inside the last processed document that
# contains it (keyed by term only, so it is NOT a per-document table)
pin_word = {}
for doc_index, doc in enumerate(docs):
    ac = remove_stopword(list(jieba.cut(doc)))
    wen_word[doc_index] = ac
    words.extend(ac)
    for term in set(ac):
        # Iterating the de-duplicated tokens counts each document once per
        # term; incrementing per occurrence would inflate the document
        # frequency of a term that repeats inside a single document.
        num_word[term] = num_word.get(term, 0) + 1
        pin_word[term] = ac.count(term)
num = len(wen_word)
zong = num  # total number of documents
print(wen_word)

# Corpus-wide occurrence count of every distinct token; the newline and the
# two punctuation tokens below are left out of the table.
skipped_tokens = ('\n', '。', '，')
cut_words = {
    term: words.count(term)
    for term in set(words)
    if term not in skipped_tokens
}


# Compute the TF-IDF weight of every term in every document
tf_idf = []  # one {term: weight} dict per document, in document order
for i, wordss in enumerate(wen_word.values(), start=1):
    # Raw term frequency, counted inside this document only. The shared
    # pin_word table is keyed by term alone and is overwritten by every later
    # document, so it cannot supply a per-document count.
    term_counts = {}
    for word in wordss:
        term_counts[word] = term_counts.get(word, 0) + 1
    tf_words = {}
    for word, aa in term_counts.items():
        # Smoothed inverse document frequency: log10(1 + N / df)
        kk = math.log10(1+zong/num_word[word])
        tf_words[word] = aa*kk
    print("第{}个文档的TF-IDF统计信息为:".format(i))
    print(tf_words)
    tf_idf.append(tf_words)
print(tf_idf)


# Pairwise cosine similarity between the documents' TF-IDF vectors
words = list(set(words))  # vocabulary; its order fixes the vector dimensions
nword = len(words)
print(words)
# d[n] is the TF-IDF vector of document n, with 0 for terms it does not contain
d = [[tf_idf[num].get(term, 0) for term in words] for num in range(zong)]
print(d)

# mu[n] is the Euclidean norm of d[n]. Plain accumulation loops are used here
# and for the dot product below so the float results do not depend on how the
# running interpreter implements sum().
mu = {}
for num in range(zong):
    squared = 0
    for weight in d[num]:
        squared += weight * weight
    mu[num] = math.sqrt(squared)
print(mu)

for i in range(zong):
    for j in range(i + 1, zong):
        norm_product = mu[i] * mu[j]
        if norm_product == 0:
            # A document left with no tokens has a zero vector; report 0.0
            # instead of dividing by zero.
            temp = 0.0
        else:
            temp = 0
            for left, right in zip(d[i], d[j]):
                temp += left * right
            temp = temp/norm_product
        print("第{}文档与第{}文档的余弦相似度为：".format(i+1, j+1), end='')
        print(temp)