智能信息检索：布尔查询_智能检索布尔排序-CSDN博客

本文链接：https://blog.csdn.net/Xm041206/article/details/140291000

函数介绍：

本次实验利用了上一次实验构建成功的倒排表

send为评测提交函数

and_search为AND逻辑布尔查询

and_search_sort增添了相关度排序的功能

mul_search函数接受不同参数来确定查询的逻辑

test支持用户手动输入测试

代码实现：

bool_search.py

import json
import requests
def send():
    """Submit the result file ``xxx.txt`` for evaluation and print the reply.

    The file is opened in binary mode and posted as the multipart form
    field ``file``; the response body is written to stdout.
    """
    # The context manager guarantees the handle is closed even when the
    # request raises (the original left the file open).
    with open(r'xxx.txt', "rb") as f:
        files = {'file': f}
        r = requests.post(url="http://121.37.1.35:5001/detectfile", files=files)
    print(r.text)
def and_search(query_str, inverted_index):
    """Return the IDs of the documents that contain every query term (AND).

    Args:
        query_str: Query terms separated by whitespace.
        inverted_index: Mapping ``term -> {'df': ..., 'tf_list': [[doc_id, tf], ...]}``.

    Returns:
        A list of matching document IDs in no particular order. The list is
        empty when the query has no terms or when any term is absent from
        the index (an absent term cannot occur in any document).
    """
    result_docs = None
    for word in query_str.split():
        posting = inverted_index.get(word)
        if posting is None:
            # Unknown term: the conjunction can never be satisfied.
            return []
        word_docs = {doc_id for doc_id, _ in posting.get('tf_list', [])}
        if result_docs is None:
            # First term seeds the candidate set.
            result_docs = word_docs
        else:
            # Keep only documents that also contain this term.
            result_docs &= word_docs
        if not result_docs:
            # Nothing left to intersect with; stop early.
            break
    # ``result_docs`` is still None when the query contained no terms.
    return list(result_docs) if result_docs else []


def read_index(file_path='inverted_index.json'):
    """Load the inverted index from a JSON-lines file.

    Every non-blank line of the file is expected to hold one JSON object
    that maps a term to its posting record, for example::

        {"term1": {"df": 2, "tf_list": [[1, 2], [2, 1]]}}
        {"term2": {"df": 1, "tf_list": [[1, 1]]}}

    Args:
        file_path: Path of the index file (UTF-8). Defaults to
            ``inverted_index.json`` in the current working directory.

    Returns:
        A dict mapping each term to its posting record.
    """
    inverted_index = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # json.loads raises on an empty string; tolerate blank lines.
                continue
            record = json.loads(line)
            if record:
                # Merge every pair on the line; popitem() would keep only
                # the last pair and silently drop the others.
                inverted_index.update(record)
    return inverted_index

def and_search_sort(query_str, inverted_index):
    """AND query whose hits are ranked by summed term frequency.

    Args:
        query_str: Query terms separated by whitespace.
        inverted_index: Mapping ``term -> {'df': ..., 'tf_list': [[doc_id, tf], ...]}``.

    Returns:
        Document IDs that contain every query term, ordered by the sum of
        the terms' frequencies in the document, highest first. Empty when
        the query has no terms, a term is missing from the index, or no
        document contains all terms.
    """
    # None means "no term processed yet". An empty dict means "the
    # intersection is already empty" -- the two must not be confused,
    # otherwise a later term would re-seed the result set.
    doc_frequencies = None
    for word in query_str.split():
        t_freq = {}
        for entry in inverted_index.get(word, {}).get('tf_list', []):
            if entry:
                t_freq[entry[0]] = entry[1]
        if doc_frequencies is None:
            doc_frequencies = t_freq
        else:
            # Keep only documents that also contain this term and
            # accumulate their frequencies.
            doc_frequencies = {
                doc_id: total + t_freq[doc_id]
                for doc_id, total in doc_frequencies.items()
                if doc_id in t_freq
            }
        if not doc_frequencies:
            # No document can satisfy the conjunction any more.
            return []
    if doc_frequencies is None:
        return []
    # sorted() is stable, so ties keep their posting-list order.
    ranked = sorted(doc_frequencies.items(), key=lambda item: item[1], reverse=True)
    return [doc_id for doc_id, _ in ranked]

def read_query():
    """Run every query in ``query-2024.txt`` and save the ranked hits to ``xxx.txt``.

    The query file is read as GBK with one query per line. Each query is
    answered with ``and_search_sort`` against the index returned by
    ``read_index``; the resulting document IDs are written tab-separated,
    one output line per query, encoded as UTF-8.
    """
    query_path = 'query-2024.txt'
    index = read_index()
    output_lines = []
    with open(query_path, 'r', encoding='gbk') as queries:
        # Each line of the query file is a single query.
        for query in queries:
            ranked_ids = and_search_sort(query_str=query, inverted_index=index)
            output_lines.append('\t'.join(map(str, ranked_ids)))
    # All queries are answered before the output file is created.
    result_path = 'xxx.txt'
    with open(result_path, 'w', encoding='utf-8') as out:
        for text in output_lines:
            out.write(text)
            out.write('\n')

#read_query()
#send()

增加OR逻辑与NOT逻辑：

import bool_search
# Universe of document IDs (1..44972); the base set for complement queries.
all_docs = set(range(1, 44973))


def mul_search(query_words, inverted_index, condition):
    """Boolean query over whitespace-separated terms.

    Args:
        query_words: Query terms separated by whitespace.
        inverted_index: Mapping ``term -> {'df': ..., 'tf_list': [[doc_id, tf], ...]}``.
        condition: ``"AND"`` intersects the terms' document sets and
            ``"OR"`` unions them. Any other value (e.g. ``"NOT"``) returns
            the complement, within ``all_docs``, of the union of the terms.

    Returns:
        A list of document IDs for ``"AND"`` / ``"OR"``; a set of document
        IDs for every other condition. A term that is missing from the
        index contributes an empty document set, and a query without terms
        matches nothing.
    """
    result_docs = None
    for word in query_words.split():
        # Unknown terms yield an empty posting list instead of a KeyError.
        postings = inverted_index.get(word, {}).get('tf_list', [])
        word_docs = {doc_id for doc_id, _ in postings}
        if result_docs is None:
            result_docs = word_docs
        elif condition == "AND":
            result_docs &= word_docs
        else:
            # OR, and also the union that the complement is taken from.
            result_docs |= word_docs
    if result_docs is None:
        # Query contained no terms.
        result_docs = set()
    if condition == "AND" or condition == "OR":
        return list(result_docs)
    return all_docs - result_docs

def test():
    """Interactively run one boolean query and print the result.

    Loads the index through ``bool_search.read_index``, reads the query
    type and then the query terms from stdin, and prints whatever
    ``mul_search`` returns for them.
    """
    inverted_index = bool_search.read_index()
    condition = input("输入查询类型:")
    query_words = input()
    res = mul_search(query_words, inverted_index, condition)
    print(res)


# Only prompt when executed as a script, so importing this module does not
# block on input().
if __name__ == "__main__":
    test()