函数介绍:
本次实验利用了上一次实验构建成功的倒排表
send为评测提交函数
and_search为AND逻辑布尔查询
and_search_sort增添了相关度排序的功能
mul_search函数根据condition参数确定查询逻辑:AND取各词文档集合的交集,OR取并集,其余取值按NOT处理(对各词文档并集取补集)
test支持用户手动输入测试
代码实现:
bool_search.py
import json
import requests
def send():
    """Submit the result file ``xxx.txt`` to the evaluation endpoint.

    The file is uploaded as the multipart form field ``file`` and the
    response body is printed to stdout.
    """
    # Use a context manager so the file handle is closed even when the
    # request raises; the original left the handle open.
    with open(r'xxx.txt', "rb") as f:
        files = {'file': f}
        r = requests.post(url="http://121.37.1.35:5001/detectfile", files=files)
    print(r.text)
def and_search(query_str, inverted_index):
    """Return the IDs of documents that contain every term of the query.

    Args:
        query_str: Query terms separated by whitespace.
        inverted_index: Mapping of term to a record whose ``'tf_list'``
            entry is a list of ``[doc_id, tf]`` pairs.

    Returns:
        A list of matching document IDs in arbitrary order. The list is
        empty when the query has no terms or when any term is absent from
        the index.
    """
    result_docs = None
    for word in query_str.split():
        # A term that is not in the index has no postings, so an AND query
        # containing it matches nothing (instead of raising KeyError).
        postings = inverted_index.get(word, {}).get('tf_list', [])
        word_docs = {doc_id for doc_id, _ in postings}
        if result_docs is None:
            # First term seeds the result set.
            result_docs = word_docs
        else:
            # Keep only documents that also contain this term.
            result_docs &= word_docs
        # Once the intersection is empty no later term can revive it.
        if not result_docs:
            break
    # result_docs is still None for an empty query; report no matches.
    return list(result_docs) if result_docs else []
def read_index(file_path='inverted_index.json'):
    """Load the inverted index from a JSON-lines file.

    Every non-blank line is parsed as one JSON object whose keys are terms
    and whose values are posting records, for example::

        {"term1": {"df": 2, "tf_list": [[1, 2], [2, 1]]}}
        {"term2": {"df": 1, "tf_list": [[1, 1]]}}

    Args:
        file_path: Path of the index file. Defaults to
            ``'inverted_index.json'`` in the current working directory.

    Returns:
        A dict mapping each term to its posting record.
    """
    inverted_index = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # Blank lines (e.g. a trailing newline) are not valid JSON.
            if not line:
                continue
            record = json.loads(line)
            if record:
                # Merge every term of the record; popitem() would keep
                # only the last pair and silently drop the others.
                inverted_index.update(record)
    return inverted_index
def and_search_sort(query_str, inverted_index):
    """Run an AND query and rank the hits by summed term frequency.

    Args:
        query_str: Query terms separated by whitespace.
        inverted_index: Mapping of term to a record whose ``'tf_list'``
            entry is a list of ``[doc_id, tf]`` pairs.

    Returns:
        Document IDs that contain every query term, ordered by the sum of
        the terms' frequencies in the document, highest first. Empty when
        the query has no terms or any term matches no document.
    """
    # None means "no term processed yet". An empty dict means "no document
    # matches", which must not be mistaken for the initial state: otherwise
    # a later term would re-seed the result after an empty intersection.
    doc_frequencies = None
    for word in query_str.split():
        t_freq = {}
        for posting in inverted_index.get(word, {}).get('tf_list', []):
            if posting:
                t_freq[posting[0]] = posting[1]
        if doc_frequencies is None:
            doc_frequencies = t_freq
        else:
            # Keep only documents that also contain this term and add its tf.
            doc_frequencies = {
                doc_id: total + t_freq[doc_id]
                for doc_id, total in doc_frequencies.items()
                if doc_id in t_freq
            }
        # An empty intersection cannot grow again under AND semantics.
        if not doc_frequencies:
            return []
    if not doc_frequencies:
        return []
    # sorted() is stable, so ties keep the order of the first term's postings.
    ranked = sorted(doc_frequencies.items(), key=lambda item: item[1], reverse=True)
    return [doc_id for doc_id, _ in ranked]
def read_query():
    """Answer every query in ``query-2024.txt`` and save the results to ``xxx.txt``.

    The index is loaded with ``read_index()``. Each line of the GBK-encoded
    query file is run through ``and_search_sort``; the resulting document
    IDs are written tab-separated, one output line per query, as UTF-8.
    """
    inverted = read_index()
    output_rows = []
    with open('query-2024.txt', 'r', encoding='gbk') as query_file:
        # One query per line.
        for query in query_file:
            ranked_ids = and_search_sort(query_str=query, inverted_index=inverted)
            output_rows.append('\t'.join(map(str, ranked_ids)))
    with open('xxx.txt', 'w', encoding='utf-8') as out_file:
        for row in output_rows:
            out_file.write(row + '\n')
#read_query()
#send()
增加OR逻辑与Not逻辑:
import bool_search
# Universe of document IDs (1..44972 inclusive); NOT queries are
# complemented against this set.
all_docs = set(range(1, 44973))


def mul_search(query_words, inverted_index, condition):
    """Run a boolean query whose logic is selected by *condition*.

    Args:
        query_words: Query terms separated by whitespace.
        inverted_index: Mapping of term to a record whose ``'tf_list'``
            entry is a list of ``[doc_id, tf]`` pairs.
        condition: ``"AND"`` intersects the terms' document sets and
            ``"OR"`` unions them. Any other value is treated as NOT: the
            complement, within ``all_docs``, of the union of the terms'
            document sets.

    Returns:
        A list of document IDs in arbitrary order, for every condition.
    """
    result_docs = None
    for word in query_words.split():
        # Terms absent from the index contribute an empty document set
        # instead of raising KeyError.
        postings = inverted_index.get(word, {}).get('tf_list', [])
        word_docs = {doc_id for doc_id, _ in postings}
        if result_docs is None:
            result_docs = word_docs
        elif condition == "AND":
            result_docs &= word_docs
        else:
            # OR and NOT both accumulate the union of the terms' documents.
            result_docs |= word_docs
    # An empty query matched no term at all.
    if result_docs is None:
        result_docs = set()
    if condition in ("AND", "OR"):
        return list(result_docs)
    # Return a list here as well so all branches share one return type.
    return list(all_docs - result_docs)
def test():
    """Interactively run one boolean query and print the matching documents.

    Reads the query type from the first input line and the query terms
    from the second, then prints whatever ``mul_search`` returns.
    """
    # Load the index before prompting so the user does not wait after typing.
    index = bool_search.read_index()
    mode = input("输入查询类型:")
    terms = input()
    print(mul_search(terms, index, mode))
# Start the interactive query session as soon as this module is loaded.
test()
优化:
测试时,一定要先读入倒排表,再读取用户的输入;否则用户输入查询后还需等待耗时较长的倒排表读取,影响用户体验。