whoosh-learning1 schema&index

1.设计schema

from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer
# Simple schema for indexing e-mail messages:
#   from_addr, to_addr - ID fields, stored
#   subject            - TEXT field, stored
#   body               - TEXT field analyzed with StemmingAnalyzer (no stored flag given)
#   tags               - KEYWORD field with its default options
schema = Schema(from_addr=ID(stored=True),
                to_addr=ID(stored=True),
                subject=TEXT(stored=True),
                body=TEXT(analyzer=StemmingAnalyzer()),
                tags=KEYWORD)

fields:域(field)可以被索引,也可以被存储;被存储的域,其值可以在搜索结果中取回。常用的域类型包括 TEXT(analyzer, phrase, stored)、KEYWORD(lowercase, commas, scorable)和 ID。

  • 创建索引后修改schema:
    # Open a writer on the existing index `ix` in order to change its schema.
    writer = ix.writer()
    # Add a new stored TEXT field called "fieldname".
    # NOTE(review): `fields` is presumably the whoosh.fields module; it is not
    # imported by the snippets shown in this document -- confirm.
    writer.add_field("fieldname", fields.TEXT(stored=True))
    # Remove the field called "content".
    writer.remove_field("content")
    # Commit the writer so the schema changes are applied.
    writer.commit()
  • field boosts:让在 title 中匹配到的词的权重是在 body 中匹配到的两倍
    # `title` is given a field boost of 2.0; `body` is declared without an
    # explicit boost.
    schema = Schema(title=TEXT(field_boost=2.0), body=TEXT)

2.index

# Create the index: make sure the "indexdir" directory exists, then create an
# index in it using `schema`.
import os, os.path
from whoosh import index
if not os.path.exists("indexdir"):
    os.mkdir("indexdir")
ix = index.create_in("indexdir", schema)
# Open an index that already exists.
import whoosh.index as index
ix = index.open_dir("indexdir")
# Add documents to the index.
# NOTE(review): the field names used below (title, content, path, tags, icon)
# do not match the e-mail schema defined earlier in this document; presumably
# a different schema is intended for this snippet -- confirm.
writer = ix.writer()
writer.add_document(title=u"My document", content=u"This is my document!",path=u"/a", tags=u"first short", icon=u"/icons/star.png")
writer.add_document(title=u"Second try", content=u"This is the second example.",path=u"/b", tags=u"second short", icon=u"/icons/sheep.png")
writer.commit()
# A field can be indexed and stored with different values: the regular keyword
# value is analyzed and indexed, while the `_stored_<fieldname>` value is what
# search results return.
# NOTE(review): `writer` was already committed above; presumably a fresh
# ix.writer() is required before this call -- confirm.
writer.add_document(title=u"Title to be indexed", _stored_title=u"Stored title")
# Delete documents whose given field contains the given term; typically used
# with ID or KEYWORD fields.
ix.delete_by_term('fieldname', u'termtext')
# Update a document; this requires at least one field in the schema to be
# unique.
# NOTE(review): the `content` literal has no u prefix, unlike the other string
# literals in this snippet.
writer.update_document(path=u"/a", content="Replacement for the first document")
# Incremental indexing: only documents that changed are re-indexed. Each
# document's last-modified time is stored so that a later run can tell whether
# the file has changed.
def get_schema():
    """Return the schema used for incremental indexing.

    Fields:
        path: unique, stored ID field identifying the document.
        time: STORED field holding the document's modification time.
        content: TEXT field with the document body.
    """
    return Schema(path=ID(unique=True, stored=True), time=STORED, content=TEXT)
def add_doc(writer, path):
    """Read the file at *path* and add it to the index through *writer*.

    Args:
        writer: object exposing ``add_document(**fields)``.
        path: filesystem path of the file to index.

    The document is added with three fields: ``path`` (the path as given),
    ``content`` (the raw file contents) and ``time`` (the file's
    modification time from ``os.path.getmtime``).
    """
    # The context manager closes the file even if read() raises.
    # NOTE(review): the file is opened in binary mode, so `content` is a bytes
    # object; confirm the target field accepts that or decode first.
    with open(path, "rb") as fileobj:
        content = fileobj.read()
    modtime = os.path.getmtime(path)
    writer.add_document(path=path, content=content, time=modtime)
  def index_my_docs(dirname, clean=False):
  if clean:
    clean_index(dirname)
  else:
    incremental_index(dirname)
def incremental_index(dirname):
    """Bring the index stored in *dirname* up to date with the files on disk.

    For every document already in the index:
      * if its file no longer exists, the document is deleted;
      * if its file's modification time is newer than the stored ``time``,
        the document is deleted and queued for re-indexing.
    Then every path yielded by ``my_docs()`` that is either queued or not yet
    indexed is added with ``add_doc``.

    Args:
        dirname: directory containing the index to update.
    """
    ix = index.open_dir(dirname)

    # Every path currently present in the index.
    indexed_paths = set()
    # Paths whose file changed on disk and must be indexed again.
    to_index = set()

    # Using the writer as a context manager commits on a clean exit and
    # cancels (releasing the write lock) if an exception escapes the block.
    with ix.searcher() as searcher, ix.writer() as writer:
        # Walk the stored fields of every indexed document.
        for stored in searcher.all_stored_fields():
            indexed_path = stored['path']
            indexed_paths.add(indexed_path)

            if not os.path.exists(indexed_path):
                # The file was deleted after it was indexed.
                writer.delete_by_term('path', indexed_path)
                continue

            # Compare the on-disk modification time with the stored one.
            indexed_time = stored['time']
            mtime = os.path.getmtime(indexed_path)
            if mtime > indexed_time:
                # The file changed: drop the stale document and remember to
                # index the file again below.
                writer.delete_by_term('path', indexed_path)
                to_index.add(indexed_path)

        # Walk the files on disk. my_docs() is assumed to be a function that
        # yields the paths of all documents that should be indexed.
        for path in my_docs():
            if path in to_index or path not in indexed_paths:
                # Either a changed file or one that was never indexed.
                add_doc(writer, path)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值