1.设计schema
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer
# A simple schema for indexing e-mail messages: sender/recipient addresses
# are exact-match IDs, the subject is indexed and stored, the body is
# indexed (not stored) with stemming, and tags are space-separated keywords.
schema = Schema(
    from_addr=ID(stored=True),
    to_addr=ID(stored=True),
    subject=TEXT(stored=True),
    body=TEXT(analyzer=StemmingAnalyzer()),
    tags=KEYWORD,
)
fields: 可以被索引和存储；声明为 stored 的域, 搜索结果中可以取回其域值。常用类型包括 TEXT(analyzer, phrase, stored)、KEYWORD(lowercase, commas, scorable)、ID。
- 创建索引后修改schema:
# Modify the schema after the index has been created: fields can be added
# or removed through a writer.
writer = ix.writer()
# NOTE(review): the original snippet used `fields.TEXT`, but this file only
# imports TEXT directly from whoosh.fields, so `fields` would be undefined;
# use the imported TEXT instead.
writer.add_field("fieldname", TEXT(stored=True))
writer.remove_field("content")
writer.commit()

# Field boosts: a term matched in `title` scores twice as much as the same
# term matched in `body`.
schema = Schema(title=TEXT(field_boost=2.0), body=TEXT)
2.index
# Create an index. The target directory must already exist before
# create_in() is called.
import os, os.path
from whoosh import index

if not os.path.exists("indexdir"):
    os.mkdir("indexdir")
ix = index.create_in("indexdir", schema)

# Open an existing index.
import whoosh.index as index

ix = index.open_dir("indexdir")

# Add documents to the index through a writer; commit() makes them visible.
writer = ix.writer()
writer.add_document(
    title=u"My document",
    content=u"This is my document!",
    path=u"/a",
    tags=u"first short",
    icon=u"/icons/star.png",
)
writer.add_document(
    title=u"Second try",
    content=u"This is the second example.",
    path=u"/b",
    tags=u"second short",
    icon=u"/icons/sheep.png",
)
writer.commit()
# A field can be indexed and stored with *different* values: the original
# value is analyzed and indexed, while the _stored_* value is what search
# results return.
writer.add_document(title=u"Title to be indexed", _stored_title=u"Stored title")

# Delete every document whose field contains the given term — typically
# used on ID or KEYWORD fields.
ix.delete_by_term('fieldname', u'termtext')

# Update a document; requires at least one field in the schema to be
# declared unique=True so the old version can be located.
# Fix: the replacement content must be a unicode string like every other
# field value in this file (the original passed a plain byte string).
writer.update_document(path=u"/a", content=u"Replacement for the first document")
# Incremental indexing: re-index only documents that changed. Each
# document's last-modified time is stored so we can later check whether
# the file on disk is newer than what the index holds.
def get_schema():
    """Return the schema used for incremental indexing.

    `path` is unique and stored (used to update/delete by path),
    `time` is stored only (not searchable), and `content` is the
    searchable text.
    """
    return Schema(path=ID(unique=True, stored=True), time=STORED, content=TEXT)
def add_doc(writer, path):
    """Add one file to the index via *writer*.

    Records the file's path, raw content and last-modified time so that
    incremental_index() can later detect whether the file changed.
    """
    # `with` guarantees the file is closed even if read() raises
    # (the original leaked the handle on error).
    with open(path, "rb") as fileobj:
        content = fileobj.read()
    # NOTE(review): content is bytes here ("rb") while TEXT fields normally
    # expect unicode — kept as in the original; confirm against the schema.
    modtime = os.path.getmtime(path)
    writer.add_document(path=path, content=content, time=modtime)
def index_my_docs(dirname, clean=False):
    """Index the documents under *dirname*.

    When *clean* is true the whole index is rebuilt from scratch
    (via clean_index, defined elsewhere); otherwise only new or
    changed files are (re)indexed.
    """
    if clean:
        clean_index(dirname)
    else:
        incremental_index(dirname)
def incremental_index(dirname):
    """Incrementally re-index the documents under *dirname*.

    Deletes index entries for files that no longer exist on disk,
    re-indexes files whose modification time is newer than the stored
    `time`, and indexes files never seen before.
    """
    ix = index.open_dir(dirname)

    # The set of all paths in the index
    indexed_paths = set()
    # The set of all paths we need to re-index
    to_index = set()

    with ix.searcher() as searcher:
        writer = ix.writer()

        # Loop over the stored fields in the index
        for fields in searcher.all_stored_fields():
            indexed_path = fields['path']
            indexed_paths.add(indexed_path)

            if not os.path.exists(indexed_path):
                # This file was deleted since it was indexed
                writer.delete_by_term('path', indexed_path)
            else:
                # Check if this file was changed since it was indexed
                indexed_time = fields['time']
                mtime = os.path.getmtime(indexed_path)
                if mtime > indexed_time:
                    # The file has changed: delete the stale entry and
                    # queue the path for re-indexing
                    writer.delete_by_term('path', indexed_path)
                    to_index.add(indexed_path)

        # Loop over the files in the filesystem. my_docs() (defined
        # elsewhere) gathers the filenames of the documents to be indexed.
        for path in my_docs():
            if path in to_index or path not in indexed_paths:
                # Either a file that's changed, or a new file that wasn't
                # indexed before — so index it
                add_doc(writer, path)

        writer.commit()