Design approach
- Use jieba for word segmentation
- Remove stopwords
- Tried nltk for stemming/lemmatization and similar preprocessing (the results were poor)
- Use gensim's BM25 model to build the index (see the sketch after these notes)
- Set a confidence interval for each query and run the queries one by one
- Write the search results into a dictionary
- Output the dictionary contents in the required format

(This could of course also be built on Elasticsearch.)
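A minimal sketch of the BM25 indexing and thresholded-search steps above, assuming gensim 3.8 (gensim.summarization.bm25 required an extra average_idf argument in older 3.x releases and was removed entirely in 4.0). The queries format, threshold value, and result-dictionary shape are illustrative, not taken from the original code; cutsentence is the helper defined in the code below:

from gensim.summarization.bm25 import BM25

def search(corpus, dataid, queries, stops, threshold=5.0):
    # corpus: list of token lists from pretreatment(); dataid: parallel doc ids.
    bm25 = BM25(corpus)
    results = {}
    for qid, query in queries:  # queries: iterable of (query id, query text)
        words = cutsentence(query, stops)
        scores = bm25.get_scores(words)
        # Keep only documents whose score clears the confidence threshold,
        # best matches first.
        hits = [(dataid[i], s) for i, s in enumerate(scores) if s > threshold]
        hits.sort(key=lambda x: x[1], reverse=True)
        results[qid] = hits
    return results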
# -*- coding:utf-8 -*-
import nltk.tokenize  # only used by the commented-out tokenizer experiment below
import time
import jieba
"""
———————————————————————————————————————————————————
2020信息检索期末考试
凌珑
————————————————————————————————————————————————————
"""
'''Build the index'''
def readfile(filename):
    # Open the raw data file; callers iterate over it line by line.
    file = open(filename, 'r', encoding='UTF-8')
    print("File opened successfully")
    return file
def makestops():
    # Load the stopword list: one stopword per line in stopwords.txt.
    stopwords = set()
    with open('stopwords.txt', 'r', encoding='UTF-8') as f:
        for line in f:
            stopwords.add(line.strip('\n'))
    return stopwords
def cutsentence(sen, stops):
    # Earlier attempts, kept for reference (the nltk preprocessing noted
    # above worked poorly on this data):
    # words = sen.split()
    # words = nltk.tokenize.word_tokenize(sen)
    # Search-mode segmentation also emits overlapping sub-words, which
    # trades precision for recall at query time.
    words = jieba.lcut_for_search(sen.strip(), HMM=True)
    words = [i for i in words if i not in stops]
    return words
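# Illustrative only: for a sentence like "自然语言处理很有趣", search-mode
# segmentation typically yields overlapping tokens such as
# ['自然', '语言', '自然语言', '处理'] after stopword filtering; the exact
# split depends on jieba's dictionary and the HMM flag.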
def pretreatment(datafile, corpus, dataid):
    # Tokenize every document and fill the parallel corpus/dataid lists
    # that the BM25 index is built from.
    begin = time.time()
    stops = makestops()
    for sentence in datafile:
        sen = sentence.split("\t", 1)  # each line: "<doc id>\t<text>"
        words = cutsentence(sen[1], stops)
        corpus.append(words)
        dataid.append(sen[0])
    print("Preprocessing took %.2fs" % (time.time() - begin))
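# A hedged usage sketch: 'data.txt' is a hypothetical file name (the
# original section cuts off before the driver code), with one
# "<doc id>\t<text>" record per line.
corpus, dataid = [], []
datafile = readfile('data.txt')
pretreatment(datafile, corpus, dataid)
datafile.close()
# corpus/dataid then feed the BM25 search sketched after the design notes.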