智能信息检索 1：构建倒排表

XuMian11

已于 2024-07-09 11:20:35 修改

阅读量26

点赞数

分类专栏： Python 文章标签：前端数据库 python

于 2024-06-26 08:43:59 首次发布

本文链接：https://blog.csdn.net/Xm041206/article/details/139975577

版权

Python 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

辅助函数源文件：

read_file.py

import json
import os
import string
import jieba
from pypinyin import lazy_pinyin

def read_data(directory_path=r"E:\Procedure\Project\IR\data"):
    """Read every regular file in a directory and return the file contents.

    Files are visited in sorted filename order so that each document's
    position in the returned list (and therefore any document ID derived
    from that position) is reproducible across runs; ``os.listdir`` alone
    returns entries in arbitrary order. Sub-directories are skipped because
    they cannot be opened as text files.

    Args:
        directory_path: Directory that holds the documents. Defaults to the
            location the original implementation hard-coded.

    Returns:
        A list of strings, one per file, in sorted filename order.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        UnicodeDecodeError: If a file is valid in neither UTF-8 nor GBK.
    """
    text_files_content = []
    for filename in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, filename)
        if not os.path.isfile(file_path):
            # Directories (and other non-file entries) are not documents.
            continue
        try:
            # Prefer UTF-8 ...
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except UnicodeDecodeError:
            # ... and fall back to GBK for legacy Chinese-encoded files.
            with open(file_path, 'r', encoding='gbk') as f:
                content = f.read()
        text_files_content.append(content)
    return text_files_content


def store_table(inverted_index, output_path='inverted_index.json'):
    """Write the index entries to a file, one JSON object per line.

    The resulting file is in JSON Lines form (each line is an independent
    JSON value), not a single JSON document, so it must be read back line
    by line.

    Args:
        inverted_index: Iterable of JSON-serializable entries; each entry is
            written on its own line in iteration order.
        output_path: Destination file. Defaults to ``'inverted_index.json'``
            in the current working directory.

    Raises:
        OSError: If the output file cannot be opened or written.
        TypeError: If an entry is not JSON-serializable.
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        for data in inverted_index:
            # ensure_ascii=False keeps Chinese terms human-readable.
            json.dump(data, f, ensure_ascii=False)
            f.write('\n')
    print(f"倒排索引已保存到 '{output_path}'")


def sort_key1(inverted_index):
    """Return the index as a list of single-entry dicts ordered by term.

    Terms are compared by their ``str()`` form in ascending order, i.e. by
    Unicode code point for string terms.
    """
    ordered_entries = []
    for term in sorted(inverted_index, key=str):
        ordered_entries.append({term: inverted_index[term]})
    return ordered_entries
def sort_key(inverted_index):
    """Return the index as a list of single-entry dicts ordered by pinyin.

    Each term is converted with ``lazy_pinyin`` and the resulting syllables
    are concatenated into one string; entries are sorted ascending by that
    string. Terms whose concatenated pinyin is equal keep their original
    relative order because the sort is stable.
    """
    def romanize(term):
        # Concatenated pinyin syllables, used as the sort key.
        return ''.join(lazy_pinyin(term))

    ordered_terms = sorted(inverted_index, key=romanize)
    return [{term: inverted_index[term]} for term in ordered_terms]

def op_str():
    """Return ASCII punctuation followed by common Chinese punctuation marks."""
    cjk_marks = "，。！？“”‘’；：【】、《》（）"
    return f"{string.punctuation}{cjk_marks}"


def test_txt():
    """Return a small fixed corpus of pre-tokenized sample documents.

    Each string is one document whose terms are separated by single spaces.
    A new list is returned on every call.
    """
    # Returned directly: the original bound it to a local named ``list``,
    # which shadowed the builtin.
    return [
        "密码 但是 容易 日 哈哈",
        "完美 定要 对方 对",
        "强烈 当 当然 打出",
        "后来 怎么办 怕 才能 打",
    ]

主函数：

注意：本次实验使用的文档已经预先完成分词处理，因此主程序直接按空白字符切分词项。

import json
import read_file
import string

# Load the corpus. Each document is split on whitespace below, so it is
# expected to be tokenized already.
tokenized_docs = read_file.read_data()
# For a quick local check, the built-in sample corpus can be used instead:
# tokenized_docs = read_file.test_txt()

# Inverted index: maps each term to a dict with two fields:
#   'df'      - number of documents that contain the term
#   'tf_list' - list of (doc_id, tf) pairs, one per containing document
inverted_index = {}

# Document IDs start at 1 and follow the order of tokenized_docs.
# NOTE: punctuation is not stripped here; read_file.op_str() returns a
# punctuation character set that could be used for that.
for doc_id, doc in enumerate(tokenized_docs, start=1):
    # Per-document term frequencies: {term: tf}
    term_counts = {}
    for term in doc.split():
        term_counts[term] = term_counts.get(term, 0) + 1

    # Merge this document's counts into the global index.
    for term, count in term_counts.items():
        posting = inverted_index.setdefault(term, {'df': 0, 'tf_list': []})
        posting['df'] += 1
        posting['tf_list'].append((doc_id, count))

# Order the entries by the pinyin of their terms.
index = read_file.sort_key(inverted_index)

# Persist the sorted entries to disk.
read_file.store_table(inverted_index=index)
#