sphinx / coreseek 配置文件详解

source src1
{
  # 数据源类型,需强制指定,没有默认值
  # 可选类型: mysql, pgsql, mssql, xmlpipe, xmlpipe2, odbc
  type			= mysql # 这里我们使用的 MySQL

  # 这里是简单易懂的 SQL 连接参数全家桶
  sql_host		= localhost # host
  sql_user		= test # 用户
  sql_pass		= # 密码,注意当没有密码的时候空着,不可以写 '' 或者 ""
  sql_db			= test # 数据库
  sql_port		= 3306	# 可选, 默认是 3306

  # 也可以通过 socket 连接
  # 可选,默认是空
  # Linux 常见路径 '/var/lib/mysql/mysql.sock'
  # FreeBSD 常见路径 '/tmp/mysql.sock'
  #
  # sql_sock		= /tmp/mysql.sock


  # MySQL 指定客户端连接的一些标识
  # 可选,默认是 0
  # indexer 和 mysql 之间的交互,需要考虑效率和安全性。
  # 有 0/32/2048/32768 可选值, 依次表示 无/使用压缩协议/握手后切换到 ssl/Mysql4.1版本身份认证
  # mysql_connect_flags	= 32 # enable compression

  # 若上文中指定 ssl 协议连接时这里才需要填写
  # 可选,默认是空
  #
  # mysql_ssl_cert		= /etc/ssl/client-cert.pem
  # mysql_ssl_key		= /etc/ssl/client-key.pem
  # mysql_ssl_ca		= /etc/ssl/cacert.pem

  # mssql 特有,是否使用 windows 登录
  # 可选,默认是 0
  #
  # mssql_winauth		= 1 # 使用当前登录的用户认证登录


  # MS SQL特定的Unicode索引标志
  # 可选,默认值是 0 (需要 SBCS - 单字节数据)
  #
  # mssql_unicode		= 1 # 从服务器请求 Unicode 数据


  # 指定 ODBC  DSN
  # 强制指定 odbc 数据源类型,没有默认值
  #
  # odbc_dsn		= DBQ=C:\data;DefaultDir=C:\data;Driver={Microsoft Text Driver (*.txt; *.csv)};
  # sql_query		= SELECT id, data FROM documents.csv

  # ODBC 或 MS SQL 指定, 字段预读缓冲大小
  # 为什么要有这么一种缓冲呢?
  # 有的字符串,虽然长度很长,但是实际上并没有使用那么长的字符,所以在 Sphinx 并不会收录所有的字符,而是给每个属性一个缓冲作为长度限制。
  # 默认情况下非字符类型的属性限制是 1K,字符类型的限制是 1M
  # 可选,默认值是自动检测
  #
  # sql_column_buffers	= content=12M, comments=1M


  # sql_query 中的语句执行前,需要先执行这个 (可以用于完成一些前置配置工作)
  #
  # sql_query_pre		= SET NAMES utf8
  # sql_query_pre		= SET SESSION query_cache_type=OFF


  # 主文档获取语句
  # 强制, 整型的 id 必须出现在 select 语句中 select 关键词后第一个位置
  sql_query		= \
    SELECT id, group_id, UNIX_TIMESTAMP(date_added) AS date_added, title, content \
    FROM documents

  
  # joined/payload 查询
  ## 有的时候有多个表,我们想要查询的字段在其他表中。这个时候就需要对 sql_query 进行 join 操作。
  ## sql_joined_field 是增加一个字段, 这个字段是从其他表中查询出来的。
  ## 这里对分号后面的查询语句是有要求的,如果是 query, 则返回 id 和查询字段,如果是 payload-query, 则返回 id, 查询字段和权重。
  ## 并且这里的后一个查询需要按照 id 进行升序排列。
  # sql_joined_field = tags from query; select docid, concat('tag', tagid) from tags order by docid asc
  # sql_joined_field = wtags from payload-query; select docid, tag, tagweight from tags order by docid asc

  # 外部文件字段,在表中一个字段存的是外部文章地址,但是实际上字段内容在文件中。比如这个字段 content_file_path。
  # 当 indexer 建立索引的时候,查询这个字段,就读取这个文件,并进行分词和索引建立
  # 最大的文件大小通过 indexer 中 max_file_field_buffer 指定,文件 IO 错误是非致命错误,并被当做警告处理。
  # sql_file_field		= content_file_path

  
  # 当数据源数据太大的时候,一条 sql 语句一次查询完往往很有可能引起锁表等操作。
  ## 那么就可以使用多次查询,那么这个多次查询就需要有个范围和步长, sql_query_range 和 sql_range_step 就是做这个的。
  ## 获取最大值和最小的id, 然后根据步长来获取数据。比如下面的例子,如果有 4500 条数据,这个表建立索引的时候就会进行 5 次 sql 查询。
  # 而 5 次 sql 查询每次的间隔时间是 sql_ranged_throttle 来进行设置的。单位是毫秒。
  # sql_query 需要和 $start 、$end 标记的区间关联
  # sql_query		= \
  #	SELECT doc.id, doc.id AS group, doc.title, doc.data \
  #	FROM documents doc \
  #	WHERE id>=$start AND id<=$end
  #
  # sql_query_range		= SELECT MIN(id),MAX(id) FROM documents

  
  # 步长
  # 可选,默认值: 1024
  #
  # sql_range_step		= 1000

  ## 下面是不同属性
  ## 属性是存在索引中的,它不进行全文索引,但是可以用于过滤和排序

  # 无符号整型属性定义
  # 可以定义多个值(可以指定任意数量的属性), 可选
  # 可选字节大小可以被指定,默认是 32
  # sql_attr_uint		= author_id
  # sql_attr_uint		= forum_id:9 # 9 bits for forum_id
  sql_attr_uint		= group_id

  # 布尔值属性定义
  # 可以定义多个值,可选
  # 等同于 sql_attr_uint = 1
  # sql_attr_bool		= is_deleted

  # bigint 类型属性定义
  # 可以定义多个值,可选
  # 声明一个有符号的(与 uint 不同)64 位属性
  # bigint attribute declaration
  # multi-value (an arbitrary number of attributes is allowed), optional
  # declares a signed (unlike uint!) 64-bit attribute
  #
  # sql_attr_bigint		= my_bigint_id


  # UNIX 时间戳属性定义
  # 可以定义多个值,可选
  # 和 integer 类似,但可以用于 date 函数
  # UNIX timestamp attribute declaration
  # multi-value (an arbitrary number of attributes is allowed), optional
  # similar to integer, but can also be used in date functions
  #
  # sql_attr_timestamp	= posted_ts
  # sql_attr_timestamp	= last_edited_ts
  sql_attr_timestamp	= date_added

  # 字符串排序属性。一般我们按照字符串排序的话,我们会将这个字符串存下来进入到索引中,然后在查询的时候比较索引中得字符大小进行排序。
  # 但是这个时候的索引就会很大,于是我们就想到一个方法,我们在建立索引的时候,先将字符串值从数据库中取出、暂存、排序。
  # 然后给排序后的数据分配一个序号,然后建立索引的时候,就将这个序号存入到索引中去。这样在查询的时候也就能完成字符串排序的操作。
  # string ordinal attribute declaration
  # multi-value (an arbitrary number of attributes is allowed), optional
  # sorts strings (bytewise), and stores their indexes in the sorted list
  # sorting by this attr is equivalent to sorting by the original strings
  #
  # sql_attr_str2ordinal	= author_name

  # 浮点数属性,经常在查询地理经纬度的时候用到
  # floating point attribute declaration
  # multi-value (an arbitrary number of attributes is allowed), optional
  # values are stored in single precision, 32-bit IEEE 754 format
  #
  # sql_attr_float		= lat_radians
  # sql_attr_float		= long_radians

  ## MVA 属性
  # 试想一下,有一个文章系统,每篇文章都有多个标签,这种可以存多个值的标签字段就叫做多值属性。
  # 我要对某个标签进行查询过滤,那么在建立查询的时候就应该把这个标签的值放入到索引中。
  # multi-valued attribute (MVA) attribute declaration
  # multi-value (an arbitrary number of attributes is allowed), optional
  # MVA values are variable length lists of unsigned 32-bit integers
  #
  # syntax is ATTR-TYPE ATTR-NAME 'from' SOURCE-TYPE [;QUERY] [;RANGE-QUERY]
  # ATTR-TYPE is 'uint' or 'timestamp'
  # SOURCE-TYPE is 'field', 'query', or 'ranged-query'
  # QUERY is SQL query used to fetch all ( docid, attrvalue ) pairs
  # RANGE-QUERY is SQL query used to fetch min and max ID values, similar to 'sql_query_range'
  #
  # sql_attr_multi		= uint tag from query; SELECT docid, tagid FROM tags
  # sql_attr_multi		= uint tag from ranged-query; \
  #	SELECT docid, tagid FROM tags WHERE id>=$start AND id<=$end; \
  #	SELECT MIN(docid), MAX(docid) FROM tags


  # 字符串属性定义
  # string attribute declaration
  # multi-value (an arbitrary number of these is allowed), optional
  # lets you store and retrieve strings
  #
  # sql_attr_string		= stitle


  # 文档词汇数记录属性。如下就是在索引建立的时候增加一个词汇数的字段
  # wordcount attribute declaration
  # multi-value (an arbitrary number of these is allowed), optional
  # lets you count the words at indexing time
  #
  # sql_attr_str2wordcount	= stitle


  # 字段和属性定义组合
  # 将字段存储为属性,但同时也为其建立全文索引
  # combined field plus attribute declaration (from a single column)
  # stores column as an attribute, but also indexes it as a full-text field
  #
  # sql_field_string	= author
  # sql_field_str2wordcount	= title

  # 取后查询,在 sql_query 执行后立即操作
  # 它和 sql_query_post_index 的区别就是执行时间不同
  # sql_query_post 是在 sql_query 执行后执行,而 sql_query_post_index 是在索引建立完成后才执行。
  # 所以如果要记录最后索引执行时间,那么应该在 sql_query_post_index 中执行.
  # post-query, executed on sql_query completion
  # optional, default is empty
  #
  # sql_query_post		=

  # 参数 sql_query_post_index 的说明
  # post-index-query, executed on successful indexing completion
  # optional, default is empty
  # $maxid expands to max document ID actually fetched from DB
  #
  # sql_query_post_index	= REPLACE INTO counters ( id, val ) \
  #	VALUES ( 'max_indexed_id', $maxid )

  
  # ranged query 	throttling, in milliseconds
  # optional, default is 0 which means no delay
  # enforces given delay before each query step
  sql_ranged_throttle	= 0

  ## 命令行获取信息查询
  # 我们进行索引一般只会返回主键 id, 而不会返回表中的所有字段。
  # 但是在调试的时候,我们一般需要返回表中的字段,那这个时候,就需要使用 sql_query_info。
  # 同时这个配置只在命令行搜索工具(控制台)中有效,在 api 中是无效的。
  # 注意: 一定要含有 $id 宏,通过它获取数据
  sql_query_info		= SELECT * FROM documents WHERE id=$id

  # 比如有两个索引,一个索引比较旧,一个索引比较新,那么旧索引中就会有数据是旧的。
  # 当我要对两个索引进行搜索的时候,哪些数据要按照新的索引来进行查询呢。
  # kill-list query, fetches the document IDs for kill-list
  # k-list will suppress matches from preceding indexes in the same query
  # optional, default is empty
  #
  # sql_query_killlist	= SELECT id FROM documents WHERE edited>=@last_reindex
  # 详见: http://web.archive.org/web/20150403083256/http://www.coreseek.cn:80/docs/coreseek_4.1-sphinx_2.0.1-beta.html#conf-sql-query-killlist

  # 索引时, SQL 数据源的解压字段设置 - 当建立索引的动作发生在数据库所在机器以外的机器时,这个选项会降低数据库的负载,并节约网络带宽。要想使用这个特性,就必须保证在建立时zlib和zlib-devel都是可用的。
  # columns to unpack on indexer side when indexing
  # multi-value, optional, default is empty list
  #
  # unpack_zlib		= zlib_column
  # unpack_mysqlcompress	= compressed_column
  # unpack_mysqlcompress	= compressed_column_2


  # maximum unpacked length allowed in MySQL COMPRESS() unpacker
  # optional, default is 16M
  #
  # unpack_mysqlcompress_maxsize	= 16M


  #####################################################################
  ## xmlpipe2 settings
  #####################################################################

  # xmlpipe 的数据源就是一个 xml 文档
  # type			= xmlpipe

  # 读取数据源的 shell 命令
  # shell command to invoke xmlpipe stream producer
  # mandatory
  #
  # xmlpipe_command		= cat /usr/local/coreseek/var/test.xml

  # xmlpipe2 字段定义
  # xmlpipe2 field declaration
  # multi-value, optional, default is empty
  #
  # xmlpipe_field		= subject
  # xmlpipe_field		= content

  # 属性定义
  # xmlpipe2 attribute declaration
  # multi-value, optional, default is empty
  # all xmlpipe_attr_XXX options are fully similar to sql_attr_XXX
  #
  # xmlpipe_attr_timestamp	= published
  # xmlpipe_attr_uint	= author_id

  # UTF-8验证和修复设置
  # 只适用 xmlpipe2 数据源,数据源中有可能有非 utf-8 的字符,这个时候解析容易出错
  # optional, default is 0
  #
  # xmlpipe_fixup_utf8	= 1
}

#############################################################################
## index definition
#############################################################################

# local index example
#
# this is an index which is stored locally in the filesystem
#
# all indexing-time options (such as morphology and charsets)
# are configured per local index
index test1
{
  # 索引类型,包括 plain, distributed 和 rt。分别是普通索引/分布式索引/实时索引。默认 plain。
  # index type
  # optional, default is 'plain'
  # known values are 'plain', 'distributed', and 'rt' (see samples below)
  # type			= plain

  # 数据源
  # document source(s) to index
  # multi-value, mandatory
  # document IDs must be globally unique across all sources
  source			= src1

  # 索引文件存放的路径
  # index files path and file name, without extension
  # mandatory, path must be writable, extensions will be auto-appended
  path			= /usr/local/coreseek/var/data/test1

  # 文档信息的存储模式,包括 none, extern, inline。默认是 extern.
  # docinfo 指的就是数据的所有属性(field)构成的一个集合。
  # 首先文档id是存储在一个文件中的(spa)
  # 当使用 inline 的时候,文档的属性和文档id都是存放在 spa 中的,所以进行查询过滤的时候,不需要进行额外的操作。
  # 当使用 extern 的时候,文档的属性是存放在另外一个文件 (spd) 中的,但是当启动 searchd 的时候,会把这个文件加载到内存中。
  # extern 就意味着每次做查询过滤的时候,除了文档id之外,还需要去内存中根据属性进行过滤。
  # 但是即使这样, extern 由于文件小,效率也不低。所以不是有特殊要求,一般都是使用 extern。
  # document attribute values (docinfo) storage mode
  # optional, default is 'extern'
  # known values are 'none', 'extern' and 'inline'
  docinfo			= extern

  # 缓冲内存锁定
  # searchd 会将 spa 和 spi 预读到内存中。但是如果这部分内存数据长时间没有访问,则它会被交换到磁盘上。
  # 可以通过设置 mlock 来避免出现这个情况。
  # memory locking for cached data (.spa and .spi), to prevent swapping
  # optional, default is 0 (do not mlock)
  # requires searchd to be run from root
  mlock			= 0

  # 词形处理器
  # 词形处理是什么意思呢, 比如在英语中, dogs 是 dog 的复数, 所以 dog 是 dogs 的词干,这两个词实际上是同一个词。
  # 所以英语的词形处理器会将 dogs 当做 dog 来处理。
  # a list of morphology preprocessors to apply
  # optional, default is empty
  #
  # builtin preprocessors are 'none', 'stem_en', 'stem_ru', 'stem_enru',
  # 'soundex', and 'metaphone'; additional preprocessors available from
  # libstemmer are 'libstemmer_XXX', where XXX is algorithm code
  # (see libstemmer_c/libstemmer/modules.txt)
  #
  # morphology		= stem_en, stem_ru, soundex
  # morphology		= libstemmer_german
  # morphology		= libstemmer_sv
  morphology		= none

  # 词形处理有时候会出现问题,比如将 gps 处理成 gp, 这个设置可以允许根据词的长度来决定是否要使用词形处理器
  # minimum word length at which to enable stemming
  # optional, default is 1 (stem everything)
  #
  # min_stemming_len	= 1

  # 停止词,停止词就是不被索引的词
  # stopword files list (space separated)
  # optional, default is empty
  # contents are plain text, charset_table and stemming are both applied
  #
  # stopwords		= /usr/local/coreseek/var/data/stopwords.txt

  # 自定义词形字典
  # wordforms file, in "mapfrom > mapto" plain text format
  # optional, default is empty
  #
  # wordforms		= /usr/local/coreseek/var/data/wordforms.txt

  # 词汇特殊处理
  # 有的一些特殊词我们希望把它当做另外一个词来处理。比如: c++ => cplusplus来处理。
  # tokenizing exceptions file
  # optional, default is empty
  #
  # plain text, case sensitive, space insensitive in map-from part
  # one "Map Several Words => ToASingleOne" entry per line
  #
  # exceptions		= /usr/local/coreseek/var/data/exceptions.txt

  # 最小索引词长度,小于这个长度的词不会被索引。
  # minimum indexed word length
  # default is 1 (index everything)
  min_word_len		= 1

  # 字符编码类型,可以为 sbcs,utf-8。对于 coreseek, 还可以有 zh_cn.utf-8, zh_cn.gbk, zh_cn.big5
  # charset encoding type
  # optional, default is 'sbcs'
  # known types are 'sbcs' (Single Byte CharSet) and 'utf-8'
  charset_type		= sbcs

  # 字符表和大小写转换规则。对于 coreseek, 这个字段无效。
  # charset definition and case folding rules "table"
  # optional, default value depends on charset_type
  #
  # defaults are configured to include English and Russian characters only
  # you need to change the table to include additional ones
  # this behavior MAY change in future versions
  #
  # 'sbcs' default value is
  # charset_table		= 0..9, A..Z->a..z, _, a..z, U+A8->U+B8, U+B8, U+C0..U+DF->U+E0..U+FF, U+E0..U+FF
  #
  # 'utf-8' default value is
  # charset_table		= 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F

  # 忽略字符表。在忽略字符表中的前后缀会被连起来当做一个单独关键词处理。
  # ignored characters list
  # optional, default value is empty
  #
  # ignore_chars		= U+00AD

  # min_prefix_len,min_infix_len,prefix_fields,infix_fields 都是 enable_star 开启的时候 才有效果。
  # 最小前缀索引长度
  # 为什么要有这个配置项呢?
  # 首先这个是当启用通配符配置的前提下说的,前缀索引使得一个关键词产生了多个索引项,导致索引文件体积和搜索时间增加巨大。
  # 那么我们就有必要限制下前缀索引的长度,比如 example, 当前缀索引长度设置为 5 的时候,它只会分解为 exampl, example 了
  # minimum word prefix length to index
  # optional, default is 0 (do not index prefixes)
  #
  # min_prefix_len		= 0

  # 最小索引中缀长度
  # minimum word infix length to index
  # optional, default is 0 (do not index infixes)
  #
  # min_infix_len		= 0

  # 前缀索引和中缀索引字段列表。并不是所有的字段都需要进行前缀和中缀索引。
  # list of fields to limit prefix/infix indexing to
  # optional, default value is empty (index all fields in prefix/infix mode)
  #
  # prefix_fields		= filename
  # infix_fields		= url, domain

  # 是否启用通配符,默认是 0,不启用。
  # enable star-syntax (wildcards) when searching prefix/infix indexes
  # search-time only, does not affect indexing, can be 0 or 1
  # optional, default is 0 (do not use wildcard syntax)
  #
  # enable_star		= 1

  # 词汇展开
  # 搜索时是否尽可能展开关键字的精确形式或者星号(通配符)形式
  # expand keywords with exact forms and/or stars when searching fit indexes
  # search-time only, does not affect indexing, can be 0 or 1
  # optional, default is 0 (do not expand keywords)
  #
  # expand_keywords		= 1

  # n-gram 索引的分词技术
  # n-gram 是指不按照词典,而是按照字长来分词,这个主要是针对非英文体系的一些语言来做的(中文、韩文、日文)
  # 对 coreseek 来说,这两个配置项可以忽略。
  # n-gram length to index, for CJK indexing
  # only supports 0 and 1 for now, other lengths to be implemented
  # optional, default is 0 (disable n-grams)
  #
  # ngram_len		= 1


  # n-gram characters list, for CJK indexing
  # optional, default is empty
  #
  # ngram_chars		= U+3000..U+2FA1F

  # 词组边界列表和步长
  # 哪些字符被看做分割不同词组的边界
  # phrase boundary characters list
  # optional, default is empty
  #
  # phrase_boundary		= ., ?, !, U+2026 # horizontal ellipsis

  
  # phrase boundary word position increment
  # optional, default is 0
  #
  # phrase_boundary_step	= 100

  # 混合字符列表
  # blended characters list
  # blended chars are indexed both as separators and valid characters
  # for instance, AT&T will results in 3 tokens ("at", "t", and "at&t")
  # optional, default is empty
  #
  # blend_chars		= +, &, U+23


  # blended token indexing mode
  # a comma separated list of blended token indexing variants
  # known variants are trim_none, trim_head, trim_tail, trim_both, skip_pure
  # optional, default is trim_none
  #
  # blend_mode		= trim_tail, skip_pure

  # html 标记清理,是否从输出全文数据中去除 html 标记
  # whether to strip HTML tags from incoming documents
  # known values are 0 (do not strip) and 1 (do strip)
  # optional, default is 0
  html_strip		= 0

  # html 标记清理,是否从输出全文数据中去除 html 标记。
  # what HTML attributes to index if stripping HTML
  # optional, default is empty (do not index anything)
  #
  # html_index_attrs	= img=alt,title; a=title;

  # 需要清理的 html 元素
  # what HTML elements contents to strip
  # optional, default is empty (do not strip element contents)
  #
  # html_remove_elements	= style, script

  # searchd 是预先打开全部索引还是将它预先缓冲在内存中
  # whether to preopen index data files on startup
  # optional, default is 0 (do not preopen), searchd-only
  #
  # preopen			= 1

  # 字典文件是保持在磁盘上,预载入内存。
  # whether to keep dictionary (.spi) on disk, or cache it in RAM
  # optional, default is 0 (cache in RAM), searchd-only
  #
  # ondisk_dict		= 1

  # 由于在索引建立的时候,需要建立临时文件和副本,还有旧的索引
  # 这个时候磁盘使用量暴增,于是有个方法是临时文件的重复利用
  # 这个配置会极大的减少建立索引的时候的磁盘压力,代价是 索引建立速度变慢。
  # whether to enable in-place inversion (2x less disk, 90-95% speed)
  # optional, default is 0 (use separate temporary files), indexer-only
  #
  # inplace_enable		= 1


  # in-place fine-tuning options
  # optional, defaults are listed below
  #
  # inplace_hit_gap		= 0 # preallocated hitlist gap size
  # inplace_docinfo_gap	= 0 # preallocated docinfo gap size
  # inplace_reloc_factor	= 0.1 # relocation buffer size within arena
  # inplace_write_factor	= 0.1 # write buffer size within arena


  # whether to index original keywords along with stemmed versions
  # enables "=exactform" operator to work
  # optional, default is 0
  #
  # index_exact_words	= 1

  # 在经过过短的位置后增加位置值
  # position increment on overshort (less that min_word_len) words
  # optional, allowed values are 0 and 1, default is 1
  #
  # overshort_step		= 1

  # 在经过停用词后增加位置值
  # position increment on stopword
  # optional, allowed values are 0 and 1, default is 1
  #
  # stopword_step		= 1

  # 位置忽略词汇列表
  # hitless words list
  # positions for these keywords will not be stored in the index
  # optional, allowed values are 'all', or a list file name
  #
  # hitless_words		= all
  # hitless_words		= hitless.txt

  # 是否检测并索引句子和段落边界。
  # detect and index sentence and paragraph boundaries
  # required for the SENTENCE and PARAGRAPH operators to work
  # optional, allowed values are 0 and 1, default is 0
  #
  # index_sp			= 1

  # 字段内需要索引的 html / xml 区域的标签列表。
  # index zones, delimited by HTML/XML tags
  # a comma separated list of tags and wildcards
  # required for the ZONE operator to work
  # optional, default is empty string (do not index zones)
  #
  # index_zones		= title, h*, th
}

# realtime index example
#
# you can run INSERT, REPLACE, and DELETE on this index on the fly
# using MySQL protocol (see 'listen' directive below)
index rt
{
  # 'rt' index type must be specified to use RT index
  type			= rt

  # index files path and file name, without extension
  # mandatory, path must be writable, extensions will be auto-appended
  path			= /usr/local/coreseek/var/data/rt

  # RT 内存索引限制
  # RAM chunk size limit
  # RT index will keep at most this much data in RAM, then flush to disk
  # optional, default is 32M
  #
  # rt_mem_limit		= 512M

  # 全文字段定义
  # full-text field declaration
  # multi-value, mandatory
  rt_field		= title
  rt_field		= content

  # 无符号整数属性定义
  # unsigned integer attribute declaration
  # multi-value (an arbitrary number of attributes is allowed), optional
  # declares an unsigned 32-bit attribute
  rt_attr_uint		= gid

  # 各种属性定义
  # RT indexes currently support the following attribute types:
  # uint, bigint, float, timestamp, string
  #
  # rt_attr_bigint		= guid
  # rt_attr_float		= gpa
  # rt_attr_timestamp	= ts_added
  # rt_attr_string		= author
}

#############################################################################
## indexer settings
#############################################################################

indexer
{
  # 建立索引的时候,索引内存限制
  # memory limit, in bytes, kiloytes (16384K) or megabytes (256M)
  # optional, default is 32M, max is 2047M, recommended is 256M to 1024M
  mem_limit		= 32M

  # 每秒最大 I/O 操作次数,用于限制 I/O 操作
  # maximum IO calls per second (for I/O throttling)
  # optional, default is 0 (unlimited)
  #
  # max_iops		= 40

  # 最大允许的 I/O 操作大小,以字节为单位,用于 I/O 节流
  # maximum IO call size, bytes (for I/O throttling)
  # optional, default is 0 (unlimited)
  #
  # max_iosize		= 1048576

  # 对于 xmlpipe2 数据源允许的最大的字段大小,以字节为单位
  # maximum xmlpipe2 field length, bytes
  # optional, default is 2M
  #
  # max_xmlpipe2_field	= 4M

  # 写缓冲区大小,单位是字节
  # write buffer size, bytes
  # several (currently up to 4) buffers will be allocated
  # write buffers are allocated in addition to mem_limit
  # optional, default is 1M
  #
  # write_buffer		= 1M

  # 文件字段可用的最大缓冲区大小,字节为单位
  # maximum file field adaptive buffer size
  # optional, default is 8M, minimum is 1M
  #
  # max_file_field_buffer	= 32M
}

#############################################################################
## searchd settings
#############################################################################

searchd
{
  # [hostname:]port[:protocol], or /unix/socket/path to listen on
  # known protocols are 'sphinx' (SphinxAPI) and 'mysql41' (SphinxQL)
  #
  # multi-value, multiple listen points are allowed
  # optional, defaults are 9312:sphinx and 9306:mysql41, as below
  #
  # 监听端口
  # listen			= 127.0.0.1
  # listen			= 192.168.0.1:9312
  # listen			= 9312
  # listen			= /var/run/searchd.sock
  listen			= 9312
  listen			= 9306:mysql41

  # 监听日志
  # log file, searchd run info is logged here
  # optional, default is 'searchd.log'
  log			= /usr/local/coreseek/var/log/searchd.log

  # 查询日志
  # query log file, all search queries are logged here
  # optional, default is empty (do not log queries)
  query_log		= /usr/local/coreseek/var/log/query.log

  # 客户端读超时时间
  # client read timeout, seconds
  # optional, default is 5
  read_timeout		= 5

  # 客户端持久连接超时时间
  # request timeout, seconds
  # optional, default is 5 minutes
  client_timeout		= 300

  # 搜索并发数
  # maximum amount of children to fork (concurrent searches to run)
  # optional, default is 0 (unlimited)
  max_children		= 30

  # 进程id文件
  # PID file, searchd process ID file name
  # mandatory
  pid_file		= /usr/local/coreseek/var/log/searchd.pid

  # 守护进程在内存中为每个索引保持并返回给客户端的最大值
  # max amount of matches the daemon ever keeps in RAM, per-index
  # WARNING, THERE'S ALSO PER-QUERY LIMIT, SEE SetLimits() API CALL
  # default is 1000 (just like Google)
  max_matches		= 1000

  # 无缝轮转。防止 searchd 轮转在需要预取大量数据的索引时停止响应
  # 当进行索引轮转的时候,可能需要消耗大量的时候在轮转索引上。
  # 但是启动了无缝轮转,就以消耗内存为代价减少轮转时间
  # seamless rotate, prevents rotate stalls if precaching huge datasets
  # optional, default is 1
  seamless_rotate		= 1

  # 索引预开启,是否强制重新打开所有索引文件
  # whether to forcibly preopen all indexes on startup
  # optional, default is 1 (preopen everything)
  preopen_indexes		= 1

  # 索引轮转成功之后,是否删除以 .old 为扩展名的索引拷贝
  # whether to unlink .old index copies on succesful rotation.
  # optional, default is 1 (do unlink)
  unlink_old		= 1

  # 属性刷新周期
  # 就是使用 UpdateAttributes() 更新的文档属性每隔多少时间时间写回磁盘中
  # attribute updates periodic flush timeout, seconds
  # updates will be automatically dumped to disk this frequently
  # optional, default is 0 (disable periodic flush)
  #
  # attr_flush_period	= 900

  # 索引字典存储方式
  # instance-wide ondisk_dict defaults (per-index value take precedence)
  # optional, default is 0 (precache all dictionaries in RAM)
  #
  # ondisk_dict_default	= 1

  # 用于多值属性 MVA 更新的存储空间的内存共享池大小
  # MVA updates pool size
  # shared between all instances of searchd, disables attr flushes!
  # optional, default size is 1M
  mva_updates_pool	= 1M

  # 网络通讯是允许的最大的包的大小
  # max allowed network packet size
  # limits both query packets from clients, and responses from agents
  # optional, default size is 8M
  max_packet_size		= 8M
  
  # 崩溃日志文件
  # crash log path
  # searchd will (try to) log crashed query to 'crash_log_path.PID' file
  # optional, default is empty (do not create crash logs)
  #
  # crash_log_path		= /usr/local/coreseek/var/log/crash

  # 每次查询允许设置的过滤器的最大个数
  # max allowed per-query filter count
  # optional, default is 256
  max_filters		= 256

  # 单个过滤器允许的值的最大个数
  # max allowed per-filter values count
  # optional, default is 4096
  max_filter_values	= 4096

  # tcp  监听待处理队列长度
  # socket listen queue length
  # optional, default is 5
  #
  # listen_backlog		= 5

  # 每个关键字的读缓冲取的大小
  # per-keyword read buffer size
  # optional, default is 256K
  #
  # read_buffer		= 256K

  # 无匹配时读取操作的大小
  # unhinted read size (currently used when reading hits)
  # optional, default is 32K
  #
  # read_unhinted		= 32K

  # 每次批量查询的查询数限制
  # max allowed per-batch query count (aka multi-query count)
  # optional, default is 32
  max_batch_queries	= 32

  # 每个查询的公共子树文档缓冲大小
  # max common subtree document cache size, per-query
  # optional, default is 0 (disable subtree optimization)
  #
  # subtree_docs_cache	= 4M

  # 每个公共子树命中缓存大小
  # max common subtree hit cache size, per-query
  # optional, default is 0 (disable subtree optimization)
  #
  # subtree_hits_cache	= 8M

  # 多处理模式(MPM)。可选项: 可用值 none、fork、prefork 以及 threads。默认在 unix 类系统为 fork, windows 系统为 threads。
  # multi-processing mode (MPM)
  # known values are none, fork, prefork, and threads
  # optional, default is fork
  #
  workers			= threads # for RT to work

  # 并发查询线程数
  # max threads to create for searching local parts of a distributed index
  # optional, default is 0, which means disable multi-threaded searching
  # should work with all MPMs (ie. does NOT require workers=threads)
  #
  # dist_threads		= 4

  # 二进制日志路径
  # binlog files path; use empty string to disable binlog
  # optional, default is build-time configured data directory
  #
  # binlog_path		= # disable logging
  # binlog_path		= /usr/local/coreseek/var/data # binlog.001 etc will be created there

  # 二进制日志刷新
  # binlog flush/sync mode
  # 0 means flush and sync every second
  # 1 means flush and sync every transaction
  # 2 means flush every transaction, sync every second
  # optional, default is 2
  #
  # binlog_flush		= 2

  # 二进制日志大小限制
  # binlog per-file size limit
  # optional, default is 128M, 0 means no limit
  #
  # binlog_max_log_size	= 256M

  # 线程堆栈大小
  # per-thread stack size, only affects workers=threads mode
  # optional, default is 64K
  #
  # thread_stack			= 128K

  # 关键字展开限制
  # per-keyword expansion limit (for dict=keywords prefix searches)
  # optional, default is 0 (no limit)
  #
  # expansion_limit		= 1000

  # RT 索引刷新周期
  # RT RAM chunks flush period
  # optional, default is 0 (no periodic flush)
  #
  # rt_flush_period		= 900

  # 查询日志格式
  # query log file format
  # optional, known values are plain and sphinxql, default is plain
  #
  # query_log_format		= sphinxql

  # MySQL 版本设置
  # version string returned to MySQL network protocol clients
  # optional, default is empty (use Sphinx version)
  #
  # mysql_version_string	= 5.0.37

  # 插件目录
  # trusted plugin directory
  # optional, default is empty (disable UDFs)
  #
  # plugin_dir			= /usr/local/sphinx/lib

  # 服务端默认排序规则 (collation)
  # default server-wide collation
  # optional, default is libc_ci
  # collation_server		= utf8_general_ci

  # 基于 libc 的排序规则所使用的服务端区域设置 (locale)
  # server-wide locale for libc based collations
  # optional, default is C
  #
  # collation_libc_locale	= ru_RU.UTF-8

  # 线程服务看守
  # threaded server watchdog (only used in workers=threads mode)
  # optional, values are 0 and 1, default is 1 (watchdog on)
  #
  # watchdog				= 1

  # 兼容模式
  # SphinxQL compatibility mode (legacy columns and their names)
  # optional, default is 0 (SQL compliant syntax and result sets)
  #
  # compat_sphinxql_magics	= 1
}
参考:

https://www.cnblogs.com/yjf512/p/3598332.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

胡德咏

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值