Sphinx Learning (3): Basic Configuration - index definition

index definition

index test1
{
	# index type
	# optional, default is 'plain'
	# known values are 'plain', 'distributed', and 'rt' (see samples below)
	# type			= plain

	# document source(s) to index
	# multi-value, mandatory
	# document IDs must be globally unique across all sources
	source			= src1
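	# note: "src1" must be declared as its own source block elsewhere in the
	# config file; a minimal sketch (an assumption, not part of the original
	# sample) for a MySQL-backed source could look like:
	#
	# source src1
	# {
	# 	type		= mysql
	# 	sql_host	= localhost
	# 	sql_user	= test
	# 	sql_pass	=
	# 	sql_db		= test
	# 	sql_query	= SELECT id, group_id, title, content FROM documents
	# 	sql_attr_uint	= group_id
	# }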

	# index files path and file name, without extension
	# mandatory, path must be writable, extensions will be auto-appended
	path			= /usr/local/sphinx/var/data/test1

	# document attribute values (docinfo) storage mode
	# optional, default is 'extern'
	# known values are 'none', 'extern' and 'inline'
	docinfo			= extern

	# dictionary type, 'crc' or 'keywords'
	# crc is faster to index when no substring/wildcard searches are needed
	# crc with substrings might be faster to search but is much slower to index
	# (because all substrings are pre-extracted as individual keywords)
	# keywords is much faster to index with substrings, and index is much (3-10x) smaller
	# keywords supports wildcards, crc does not, and never will
	# optional, default is 'keywords'
	dict			= keywords

	# memory locking for cached data (.spa and .spi), to prevent swapping
	# optional, default is 0 (do not mlock)
	# requires searchd to be run from root
	mlock			= 0

	# a list of morphology preprocessors to apply
	# optional, default is empty
	#
	# builtin preprocessors are 'none', 'stem_en', 'stem_ru', 'stem_enru',
	# 'soundex', and 'metaphone'; additional preprocessors available from
	# libstemmer are 'libstemmer_XXX', where XXX is algorithm code
	# (see libstemmer_c/libstemmer/modules.txt)
	#
	# morphology		= stem_en, stem_ru, soundex
	# morphology		= libstemmer_german
	# morphology		= libstemmer_sv
	morphology		= none

	# minimum word length at which to enable stemming
	# optional, default is 1 (stem everything)
	#
	# min_stemming_len	= 1


	# stopword files list (space separated)
	# optional, default is empty
	# contents are plain text, charset_table and stemming are both applied
	#
	# stopwords		= /usr/local/sphinx/var/data/stopwords.txt
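	# the file itself is just whitespace-separated words; an illustrative
	# stopwords.txt sketch (not from the original post):
	#
	#   a an and are as at be by for the to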


	# wordforms file, in "mapfrom > mapto" plain text format
	# optional, default is empty
	#
	# wordforms		= /usr/local/sphinx/var/data/wordforms.txt
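	# an illustrative wordforms.txt sketch (not from the original post),
	# one "mapfrom > mapto" rule per line:
	#
	#   walks > walk
	#   walked > walk
	#   core 2 duo > c2d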


	# tokenizing exceptions file
	# optional, default is empty
	#
	# plain text, case sensitive, space insensitive in map-from part
	# one "Map Several Words => ToASingleOne" entry per line
	#
	# exceptions		= /usr/local/sphinx/var/data/exceptions.txt
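	# an illustrative exceptions.txt sketch (not from the original post):
	#
	#   at & t => at&t
	#   AT&T => AT&T
	#   MS Windows => ms windows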


	# embedded file size limit
	# optional, default is 16K
	#
	# exceptions, wordforms, and stopwords files smaller than this limit
	# are stored in the index; otherwise, their paths and sizes are stored
	#
	# embedded_limit		= 16K

	# minimum indexed word length
	# default is 1 (index everything)
	min_word_len		= 1


	# ignored characters list
	# optional, default value is empty
	#
	# ignore_chars		= U+00AD


	# minimum word prefix length to index
	# optional, default is 0 (do not index prefixes)
	#
	# min_prefix_len		= 0


	# minimum word infix length to index
	# optional, default is 0 (do not index infixes)
	#
	# min_infix_len		= 0
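	# usage sketch (an assumption, not from the original post): with
	# dict = keywords, setting e.g. min_infix_len = 2 enables wildcard
	# queries such as MATCH('run*') or MATCH('*base*') to match substrings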


	# maximum substring (prefix or infix) length to index
	# optional, default is 0 (do not limit substring length)
	#
	# max_substring_len	= 8


	# list of fields to limit prefix/infix indexing to
	# optional, default value is empty (index all fields in prefix/infix mode)
	#
	# prefix_fields		= filename
	# infix_fields		= url, domain


	# expand keywords with exact forms and/or stars when searching suitable indexes
	# search-time only, does not affect indexing, can be 0 or 1
	# optional, default is 0 (do not expand keywords)
	#
	# expand_keywords		= 1

	
	# n-gram length to index, for CJK indexing
	# only supports 0 and 1 for now, other lengths to be implemented
	# optional, default is 0 (disable n-grams)
	#
	# ngram_len		= 1


	# n-gram characters list, for CJK indexing
	# optional, default is empty
	#
	# ngram_chars		= U+3000..U+2FA1F


	# phrase boundary characters list
	# optional, default is empty
	#
	# phrase_boundary		= ., ?, !, U+2026 # horizontal ellipsis


	# phrase boundary word position increment
	# optional, default is 0
	#
	# phrase_boundary_step	= 100


	# blended characters list
	# blended chars are indexed both as separators and valid characters
	# for instance, AT&T will result in 3 tokens ("at", "t", and "at&t")
	# optional, default is empty
	#
	# blend_chars		= +, &, U+23


	# blended token indexing mode
	# a comma separated list of blended token indexing variants
	# known variants are trim_none, trim_head, trim_tail, trim_both, skip_pure
	# optional, default is trim_none
	#
	# blend_mode		= trim_tail, skip_pure


	# whether to strip HTML tags from incoming documents
	# known values are 0 (do not strip) and 1 (do strip)
	# optional, default is 0
	html_strip		= 0

	# what HTML attributes to index if stripping HTML
	# optional, default is empty (do not index anything)
	#
	# html_index_attrs	= img=alt,title; a=title;


	# what HTML elements contents to strip
	# optional, default is empty (do not strip element contents)
	#
	# html_remove_elements	= style, script


	# whether to preopen index data files on startup
	# optional, default is 0 (do not preopen), searchd-only
	#
	# preopen			= 1


	# whether to enable in-place inversion (2x less disk, 90-95% speed)
	# optional, default is 0 (use separate temporary files), indexer-only
	#
	# inplace_enable		= 1


	# in-place fine-tuning options
	# optional, defaults are listed below
	#
	# inplace_hit_gap		= 0 # preallocated hitlist gap size
	# inplace_docinfo_gap	= 0 # preallocated docinfo gap size
	# inplace_reloc_factor	= 0.1 # relocation buffer size within arena
	# inplace_write_factor	= 0.1 # write buffer size within arena


	# whether to index original keywords along with stemmed versions
	# enables "=exactform" operator to work
	# optional, default is 0
	#
	# index_exact_words	= 1
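	# query sketch (an assumption, not from the original post): with exact
	# words indexed, prefixing a keyword with "=" matches the original form
	# only, e.g. MATCH('=runs') will not match "running" even though both
	# stem to "run" under stem_en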


	# position increment on overshort (less than min_word_len) words
	# optional, allowed values are 0 and 1, default is 1
	#
	# overshort_step		= 1


	# position increment on stopword
	# optional, allowed values are 0 and 1, default is 1
	#
	# stopword_step		= 1


	# hitless words list
	# positions for these keywords will not be stored in the index
	# optional, allowed values are 'all', or a list file name
	#
	# hitless_words		= all
	# hitless_words		= hitless.txt


	# detect and index sentence and paragraph boundaries
	# required for the SENTENCE and PARAGRAPH operators to work
	# optional, allowed values are 0 and 1, default is 0
	#
	# index_sp			= 1


	# index zones, delimited by HTML/XML tags
	# a comma separated list of tags and wildcards
	# required for the ZONE operator to work
	# optional, default is empty string (do not index zones)
	#
	# index_zones		= title, h*, th
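	# query sketch (an assumption, not from the original post): with index_sp
	# and index_zones enabled, the extended query syntax supports, for example,
	# MATCH('one SENTENCE "for all"') to require keywords within one sentence,
	# and MATCH('ZONE:(h1,h2) hello world') to search only inside those zones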


	# index per-document and average per-index field lengths, in tokens
	# required for the BM25A(), BM25F() in expression ranker
	# optional, default is 0 (do not index field lengths)
	#
	# index_field_lengths	= 1
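	# usage sketch (an assumption, not from the original post): with field
	# lengths indexed, the expression ranker can use BM25F, e.g. via SphinxQL:
	#
	#   SELECT id, WEIGHT() FROM test1 WHERE MATCH('example')
	#   OPTION ranker=expr('sum(lcs*user_weight)*1000 + bm25f(1.2, 0.75, {title=3})')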


	# regular expressions (regexps) to filter the fields and queries with
	# gets applied to data source fields when indexing
	# gets applied to search queries when searching
	# multi-value, optional, default is empty list of regexps
	#
	# regexp_filter		= \b(\d+)\" => \1inch
	# regexp_filter		= (blue|red) => color


	# list of the words considered frequent with respect to bigram indexing
	# optional, default is empty
	#
	# bigram_freq_words	= the, a, i, you, my


	# bigram indexing mode
	# known values are none, all, first_freq, both_freq
	# optional, default is none (do not index bigrams)
	#
	# bigram_index		= both_freq


	# snippet document file name prefix
	# prepended to file names when generating snippets using load_files option
	# WARNING, this is a prefix (not a path), trailing slash matters!
	# optional, default is empty
	#
	# snippets_file_prefix	= /mnt/mydocs/server1


	# whether to apply stopwords before or after stemming
	# optional, default is 0 (apply stopwords after stemming)
	#
	# stopwords_unstemmed	= 0


	# path to a global (cluster-wide) keyword IDFs file
	# optional, default is empty (use local IDFs)
	#
	# global_idf		= /usr/local/sphinx/var/global.idf
}
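
The type comment near the top of the block mentions 'distributed' and 'rt' samples that are not included in this excerpt. As a rough sketch based on standard Sphinx directives (an assumption, not taken from the original post), a minimal real-time index could look like:

index rt_test
{
	type		= rt
	path		= /usr/local/sphinx/var/data/rt_test

	rt_field	= title
	rt_field	= content
	rt_attr_uint	= group_id
}

Unlike a plain index, an rt index is not built by indexer; documents are pushed in at runtime over SphinxQL, e.g. INSERT INTO rt_test (id, title, content, group_id) VALUES (1, 'hello', 'hello world', 1); and then queried with SELECT * FROM rt_test WHERE MATCH('hello');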

 

Reposted from: https://my.oschina.net/yunjie/blog/774825
