1.设计schema
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer
# A simple schema for indexing e-mail messages: sender/recipient addresses
# are exact-match IDs, the subject is indexed and stored, the body is
# indexed (not stored) with stemming, and tags are space-separated keywords.
schema = Schema(
    from_addr=ID(stored=True),
    to_addr=ID(stored=True),
    subject=TEXT(stored=True),
    body=TEXT(analyzer=StemmingAnalyzer()),
    tags=KEYWORD,
)
fields: 可以被索引和存储；声明为 stored 的域, 搜索结果中可以取回其域值。常用类型包括 TEXT(analyzer, phrase, stored)、KEYWORD(lowercase, commas, scorable)、ID。
- 创建索引后修改schema:
# Modify the schema after the index has been created: fields can be added
# or removed through a writer.
writer = ix.writer()
# NOTE(review): the original snippet used `fields.TEXT`, but this file only
# imports TEXT directly from whoosh.fields, so `fields` would be undefined;
# use the imported TEXT instead.
writer.add_field("fieldname", TEXT(stored=True))
writer.remove_field("content")
writer.commit()

# Field boosts: a term matched in `title` scores twice as much as the same
# term matched in `body`.
schema = Schema(title=TEXT(field_boost=2.0), body=TEXT)
2.index
# Create an index. The target directory must already exist before
# create_in() is called.
import os, os.path
from whoosh import index

if not os.path.exists("indexdir"):
    os.mkdir("indexdir")
ix = index.create_in("indexdir", schema)

# Open an existing index.
import whoosh.index as index

ix = index.open_dir("indexdir")

# Add documents to the index through a writer; commit() makes them visible.
writer = ix.writer()
writer.add_document(
    title=u"My document",
    content=u"This is my document!",
    path=u"/a",
    tags=u"first short",
    icon=u"/icons/star.png",
)
writer.add_document(
    title=u"Second try",
    content=u"This is the second example.",
    path=u"/b",
    tags=u"second short",
    icon=u"/icons/sheep.png",
)
writer.commit()
# A field can be indexed and stored with *different* values: the original
# value is analyzed and indexed, while the _stored_* value is what search
# results return.
writer.add_document(title=u"Title to be indexed", _stored_title=u"Stored title")

# Delete every document whose field contains the given term — typically
# used on ID or KEYWORD fields.
ix.delete_by_term('fieldname', u'termtext')

# Update a document; requires at least one field in the schema to be
# declared unique=True so the old version can be located.
# Fix: the replacement content must be a unicode string like every other
# field value in this file (the original passed a plain byte string).
writer.update_document(path=u"/a", content=u"Replacement for the first document")
# Incremental indexing: re-index only documents that changed. Each
# document's last-modified time is stored so we can later check whether
# the file on disk is newer than what the index holds.
def get_schema():
    """Return the schema used for incremental indexing.

    `path` is unique and stored (used to update/delete by path),
    `time` is stored only (not searchable), and `content` is the
    searchable text.
    """
    return Schema(path=ID(unique=True, stored=True), time=STORED, content=TEXT)
def add_doc(writer, path):
    """Add one file to the index via *writer*.

    Records the file's path, raw content and last-modified time so that
    incremental_index() can later detect whether the file changed.
    """
    # `with` guarantees the file is closed even if read() raises
    # (the original leaked the handle on error).
    with open(path, "rb") as fileobj:
        content = fileobj.read()
    # NOTE(review): content is bytes here ("rb") while TEXT fields normally
    # expect unicode — kept as in the original; confirm against the schema.
    modtime = os.path.getmtime(path)
    writer.add_document(path=path, content=content, time=modtime)
def index_my_docs(dirname, clean=False):
    """Index the documents under *dirname*.

    When *clean* is true the whole index is rebuilt from scratch
    (via clean_index, defined elsewhere); otherwise only new or
    changed files are (re)indexed.
    """
    if clean:
        clean_index(dirname)
    else:
        incremental_index(dirname)
def incremental_index(dirname):
    """Incrementally re-index the documents under *dirname*.

    Deletes index entries for files that no longer exist on disk,
    re-indexes files whose modification time is newer than the stored
    `time`, and indexes files never seen before.
    """
    ix = index.open_dir(dirname)

    # The set of all paths in the index
    indexed_paths = set()
    # The set of all paths we need to re-index
    to_index = set()

    with ix.searcher() as searcher:
        writer = ix.writer()

        # Loop over the stored fields in the index
        for fields in searcher.all_stored_fields():
            indexed_path = fields['path']
            indexed_paths.add(indexed_path)

            if not os.path.exists(indexed_path):
                # This file was deleted since it was indexed
                writer.delete_by_term('path', indexed_path)
            else:
                # Check if this file was changed since it was indexed
                indexed_time = fields['time']
                mtime = os.path.getmtime(indexed_path)
                if mtime > indexed_time:
                    # The file has changed: delete the stale entry and
                    # queue the path for re-indexing
                    writer.delete_by_term('path', indexed_path)
                    to_index.add(indexed_path)

        # Loop over the files in the filesystem. my_docs() (defined
        # elsewhere) gathers the filenames of the documents to be indexed.
        for path in my_docs():
            if path in to_index or path not in indexed_paths:
                # Either a file that's changed, or a new file that wasn't
                # indexed before — so index it
                add_doc(writer, path)

        writer.commit()