创建索引前可以先分词,再用 KEYWORD 类型的字段存储分词结果。commas=True 表示关键词之间以逗号分隔(不设置时以空格分隔),可按需配置;stored=True 用于配置该字段的值是否随文档一起存储,存储后可在搜索结果中一并返回。
# Build the Whoosh index from the CSV on first use; on later runs just reopen
# the index that already exists in index_dir.
if not exists_in(index_dir):
    data_dir = '***'
    df_csv = pd.read_csv(data_dir, usecols=['***', '***', '***', '***', '***', '***'])
    # wid/catagory/classes/level are stored, so they come back with each hit.
    # keywords/simwords are comma-separated KEYWORD fields and are not passed
    # stored=True here.
    schema = Schema(wid=ID(stored=True), catagory=TEXT(stored=True),
                    classes=TEXT(stored=True), level=TEXT(stored=True),
                    keywords=KEYWORD(commas=True), simwords=KEYWORD(commas=True))
    ix = create_in(index_dir, schema)

    # Strips square brackets and single quotes so only the comma-separated
    # terms remain (presumably the cells hold list-like text such as
    # "['a', 'b']" -- confirm against the CSV).
    list_punctuation = re.compile(r"[\[\]\']")

    # Walk the six columns in lockstep instead of indexing each cell by label.
    rows = zip(df_csv['***'], df_csv['***'], df_csv['***'],
               df_csv['***'], df_csv['***'], df_csv['***'])

    # As a context manager the writer commits on a clean exit and cancels
    # (releasing the index write lock) if any row raises.
    with ix.writer() as writer:
        for wid, catagory, classes, level, keywords, simwords in rows:
            writer.add_document(wid=str(wid), catagory=catagory,
                                classes=classes,
                                level=level,
                                keywords=list_punctuation.sub('', keywords),
                                simwords=list_punctuation.sub('', simwords))
else:
    ix = open_dir(index_dir)

# Parse `word` against the default search field and print the stored fields
# of every matching document (limit=None returns all hits).
with ix.searcher() as searcher:
    query = QueryParser('***', ix.schema).parse(word)
    results = searcher.search(query, limit=None)
    for result in results:
        print(result.fields())
参考文档:https://whoosh.readthedocs.io/en/latest/api/fields.html#whoosh.fields.ID
commas=True:KEYWORD 字段的参数,表示关键词之间以逗号分隔(默认以空格分隔)。