Sphinx+Mysql+shell搜索引擎架构 AND 安装

作者:lxm      时间:2012/11/20 (最后整理修改)

Sphinx能干什么?

高速索引 (在新款CPU上,近10 MB/秒);
高速搜索 (2-4G的文本量中平均查询速度不到0.1秒);
高可用性 (单CPU上最大可支持100 GB的文本,100M文档);
提供良好的相关性排名
支持分布式搜索;
提供文档摘要生成;
提供从MySQL内部的插件式存储引擎上搜索
支持布尔,短语, 和近义词查询;
支持每个文档多个全文检索域(默认最大32个);
支持每个文档多属性;
支持断词;
支持单字节编码与UTF-8编码;

        Sphinx不负责文本字段的存储。假设将数据库的id、date、title、body字段,用sphinx建立搜索索引。根据关键字、时间、类别、范围等信息查询一下sphinx,sphinx只会将查询结果的ID号告诉我们。要显示title、body等信息,还需要根据此ID号去查询MySQL数据库,或者从Memcachedb等其他的存储中取得。安装SphinxSE作为MySQL的存储引擎,将MySQL与Sphinx结合起来,是一种便捷的方法。
        创建一张Sphinx类型表,将MyISAM表的主键ID和Sphinx表的ID作一个JOIN联合查询。这样,对于MyISAM表来说,只相当于一个WHERE id=...的主键查询,WHERE后的条件都交给Sphinx去处理,可以充分发挥两者的优势,实现高速搜索查询。

Sphinx配置
source 源名称1{
 …
}
index 索引名称1{
 source=源名称1
 …
}
source 源名称2{
 …
}
index 索引名称2{
 source = 源名称2
 …
}
indexer{
 …
}
searchd{
 …
}

从组成我们可以发现sphinx可以定义多个索引与数据源,不同的索引与数据源可以应用到不同表或不同应用的全文检索。

根据前面的实例,我们配置出我们需要的sphinx.conf,如下:

source cgfinal
{
 type = mysql
 strip_html = 0
 index_html_attrs =
 sql_host = localhost
 sql_user = root
 sql_pass = admin
 sql_db = test
 sql_port= 3306 # optional, default is 3306
 sql_query_pre=  SET NAMES utf8

 sql_query = SELECT ARTICLESID,TITLE,CONTENTS,AUTHOR,CATALOGID,ADDTIME,EDITUSERID,\
HITS FROM a.eht_news_articles 
 #sql_query = SELECT * FROM a.eht_news_articles 
 sql_attr_uint= CATALOGID
 sql_attr_uint= EDITUSERID
 sql_attr_uint = HITS
 sql_attr_timestamp = ADDTIME
 
 sql_query_post  =
 sql_ranged_throttle= 0
 #sql_query_info = SELECT * FROM a.eht_news_articles WHERE ARTICLESID=$id
}
index cgfinal
{
 source   = cgfinal
 path   = d:/sphinx/data/cgfinal
 docinfo   = extern
 mlock   = 0
 morphology   = none
 stopwords   =
 min_word_len  = 1
 charset_type  = utf-8
 charset_table = U+FF10..U+FF19->0..9, 0..9, U+FF41..U+FF5A->a..z, U+FF21..U+FF3A->a..z,\
A..Z->a..z, a..z, U+0149, U+017F, U+0138, U+00DF, U+00FF, U+00C0..U+00D6->U+00E0..U+00F6,\
U+00E0..U+00F6, U+00D8..U+00DE->U+00F8..U+00FE, U+00F8..U+00FE, U+0100->U+0101, U+0101,\
U+0102->U+0103, U+0103, U+0104->U+0105, U+0105, U+0106->U+0107, U+0107, U+0108->U+0109,\
U+0109, U+010A->U+010B, U+010B, U+010C->U+010D, U+010D, U+010E->U+010F, U+010F,\
U+0110->U+0111, U+0111, U+0112->U+0113, U+0113, U+0114->U+0115, U+0115, \
U+0116->U+0117,U+0117, U+0118->U+0119, U+0119, U+011A->U+011B, U+011B, U+011C->U+011D,\
 U+011D,U+011E->U+011F, U+011F, U+0130->U+0131, U+0131, U+0132->U+0133, U+0133, \
U+0134->U+0135,U+0135, U+0136->U+0137, U+0137, U+0139->U+013A, U+013A, U+013B->U+013C, \
U+013C,U+013D->U+013E, U+013E, U+013F->U+0140, U+0140, U+0141->U+0142, U+0142, \
U+0143->U+0144,U+0144, U+0145->U+0146, U+0146, U+0147->U+0148, U+0148, U+014A->U+014B, \
U+014B,U+014C->U+014D, U+014D, U+014E->U+014F, U+014F, U+0150->U+0151, U+0151, \
U+0152->U+0153,U+0153, U+0154->U+0155, U+0155, U+0156->U+0157, U+0157, U+0158->U+0159,\
 U+0159,U+015A->U+015B, U+015B, U+015C->U+015D, U+015D, U+015E->U+015F, U+015F, \
U+0160->U+0161,U+0161, U+0162->U+0163, U+0163, U+0164->U+0165, U+0165, U+0166->U+0167, \
U+0167,U+0168->U+0169, U+0169, U+016A->U+016B, U+016B, U+016C->U+016D, U+016D, \
U+016E->U+016F,U+016F, U+0170->U+0171, U+0171, U+0172->U+0173, U+0173, U+0174->U+0175,\
 U+0175,U+0176->U+0177, U+0177, U+0178->U+00FF, U+00FF, U+0179->U+017A, U+017A, \
U+017B->U+017C,U+017C, U+017D->U+017E, U+017E, U+0410..U+042F->U+0430..U+044F, \
U+0430..U+044F,U+05D0..U+05EA, U+0531..U+0556->U+0561..U+0586, U+0561..U+0587, \
U+0621..U+063A, U+01B9,U+01BF, U+0640..U+064A, U+0660..U+0669, U+066E, U+066F, \
U+0671..U+06D3, U+06F0..U+06FF,U+0904..U+0939, U+0958..U+095F, U+0960..U+0963, \
U+0966..U+096F, U+097B..U+097F,U+0985..U+09B9, U+09CE, U+09DC..U+09E3, U+09E6..U+09EF, \
U+0A05..U+0A39, U+0A59..U+0A5E,U+0A66..U+0A6F, U+0A85..U+0AB9, U+0AE0..U+0AE3, \
U+0AE6..U+0AEF, U+0B05..U+0B39,U+0B5C..U+0B61, U+0B66..U+0B6F, U+0B71, U+0B85..U+0BB9, \
U+0BE6..U+0BF2, U+0C05..U+0C39,U+0C66..U+0C6F, U+0C85..U+0CB9, U+0CDE..U+0CE3, \
U+0CE6..U+0CEF, U+0D05..U+0D39, U+0D60,U+0D61, U+0D66..U+0D6F, U+0D85..U+0DC6, \
U+1900..U+1938, U+1946..U+194F, U+A800..U+A805,U+A807..U+A822, U+0386->U+03B1, \
U+03AC->U+03B1, U+0388->U+03B5, U+03AD->U+03B5,U+0389->U+03B7, U+03AE->U+03B7, \
U+038A->U+03B9, U+0390->U+03B9, U+03AA->U+03B9,U+03AF->U+03B9, U+03CA->U+03B9, \
U+038C->U+03BF, U+03CC->U+03BF, U+038E->U+03C5,U+03AB->U+03C5, U+03B0->U+03C5, \
U+03CB->U+03C5, U+03CD->U+03C5, U+038F->U+03C9,U+03CE->U+03C9, U+03C2->U+03C3, \
U+0391..U+03A1->U+03B1..U+03C1,U+03A3..U+03A9->U+03C3..U+03C9, U+03B1..U+03C1, \
U+03C3..U+03C9, U+0E01..U+0E2E,U+0E30..U+0E3A, U+0E40..U+0E45, U+0E47, U+0E50..U+0E59, \
U+A000..U+A48F, U+4E00..U+9FBF,U+3400..U+4DBF, U+20000..U+2A6DF, U+F900..U+FAFF, \
U+2F800..U+2FA1F, U+2E80..U+2EFF,U+2F00..U+2FDF, U+3100..U+312F, U+31A0..U+31BF, \
U+3040..U+309F, U+30A0..U+30FF,U+31F0..U+31FF, U+AC00..U+D7AF, U+1100..U+11FF, \
U+3130..U+318F, U+A000..U+A48F,U+A490..U+A4CF 
 min_prefix_len  = 0
 min_infix_len  = 1
 ngram_len = 1

 ngram_chars = U+4E00..U+9FBF, U+3400..U+4DBF, U+20000..U+2A6DF, U+F900..U+FAFF,\
U+2F800..U+2FA1F, U+2E80..U+2EFF, U+2F00..U+2FDF, U+3100..U+312F, U+31A0..U+31BF,\
U+3040..U+309F, U+30A0..U+30FF, U+31F0..U+31FF, U+AC00..U+D7AF, U+1100..U+11FF,\
U+3130..U+318F, U+A000..U+A48F, U+A490..U+A4CF
}
indexer
{
 mem_limit   = 32M
}
searchd
{
 # address    = 0.0.0.0
 port    = 3312
 log     = d:/sphinx/log/searchd.log
 query_log   = d:/sphinx/log/query.log
 read_timeout  = 5
 max_children  = 30
 pid_file   = d:/sphinx/log/searchd.pid
 max_matches   = 1000
 seamless_rotate  = 1
}

相关配置项说明

#type 数据库类型,目前支持mysql与pgsql
#strip_html 是否去掉html标签
#sql_host 数据库主机地址
#sql_user 数据库用户名
#sql_pass 数据库密码
#sql_db   数据库名称
#sql_port 数据库采用的端口
#sql_query_pre 执行sql前要设置的字符集,用utf8必须SET NAMES utf8
#sql_query  全文检索要显示的内容,在这里尽可能不使用where或group by,将where与groupby的内容交给sphinx,由sphinx进行条件过滤与groupby效率会更高
#注意:select 出来的字段必须至少包括一个唯一主键(ARTICLESID)以及要全文检索的字段,你计划原本在where中要用到的字段也要select出来
#这里不要使用order by
#sql_attr_开头的表示一些属性字段,你原计划要用在where,orderby,groupby中的字段要在这里定义
#根据我们原先的SQL:
#select * from eht_articles where title like ? and catalogid=? And edituserid=?  And addtime between ? and ? order by hits desc
#我们需要对catalogid,edituserid,addtime,hits进行属性定义(这四个字段也要在select的字段列表中),定义时不同的字段类型有不同的属性名称,具体可以见sphinx.conf.in中的说明

index部分配置项说明

#source 数据源名
#path   索引记录存放目录,如d:/sphinx/data/cgfinal,实际存放时会放在d:/sphinx/data目录下,并创建多个以cgfinal为名称、扩展名各不相同的索引文件。
#其他的配置如min_word_len,charset_type,charset_table,ngram_chars,ngram_len这些则是支持中文检索需要设置的内容。
#如果检索的不是中文,则charset_table,ngram_chars,min_word_len就要设置不同的内容,具体官方网站的论坛中有很多,大家可以去搜索看看。
#
# Sphinx configuration file sample
#
# WARNING! While this sample file mentions all available options,
# it contains (very) short helper descriptions only. Please refer to
# doc/sphinx.html for details.
#

附加一份网上找到非常详细的配置说明案例:

#############################################################################
## data source definition
#############################################################################

source src1
{
    # data source type. mandatory, no default value
    # known types are mysql, pgsql, mssql, xmlpipe, xmlpipe2, odbc
    type            = mysql

    #####################################################################
    ## SQL settings (for 'mysql' and 'pgsql' types)
    #####################################################################

    # some straightforward parameters for SQL source types
    sql_host        = localhost
    sql_user        = root
    sql_pass        = ****** 
    sql_db            = ******
    sql_port        = 3306    # optional, default is 3306

    # UNIX socket name
    # optional, default is empty (reuse client library defaults)
    # usually '/var/lib/mysql/mysql.sock' on Linux
    # usually '/tmp/mysql.sock' on FreeBSD
    #
    sql_sock        = /tmp/mysql.sock


    # MySQL specific client connection flags
    # optional, default is 0
    # 数据传输方式
    # mysql_connect_flags    = 32 # enable compression

    # MySQL specific SSL certificate settings
    # optional, defaults are empty
    # SSL连接
    # mysql_ssl_cert        = /etc/ssl/client-cert.pem
    # mysql_ssl_key        = /etc/ssl/client-key.pem
    # mysql_ssl_ca        = /etc/ssl/cacert.pem

    # MS SQL specific Windows authentication mode flag
    # MUST be in sync with charset_type index-level setting
    # optional, default is 0
    #
    # mssql_winauth        = 1 # use currently logged on user credentials


    # MS SQL specific Unicode indexing flag
    # optional, default is 0 (request SBCS data)
    #
    # mssql_unicode        = 1 # request Unicode data from server


    # ODBC specific DSN (data source name)
    # mandatory for odbc source type, no default value
    #
    # odbc_dsn        = DBQ=C:\data;DefaultDir=C:\data;Driver={Microsoft Text Driver (*.txt; *.csv)};
    # sql_query        = SELECT id, data FROM documents.csv


    # ODBC and MS SQL specific, per-column buffer sizes
    # optional, default is auto-detect
    #
    # sql_column_buffers    = content=12M, comments=1M


    # pre-query, executed before the main fetch query
    # multi-value, optional, default is empty list of queries
    # 发送SQL语句前发送
    sql_query_pre        = SET NAMES utf8
    sql_query_pre        = SET SESSION query_cache_type=OFF


    # main document fetch query
    # mandatory, integer document ID field MUST be the first selected column
    # 需要查询的表 构建查询
    sql_query        = \
        SELECT id,target_type,genre,stars,sub_title,sports_team,music_band,music_album \
        FROM ko_link
    #如果多个数据源并要在一个索引,必须要保持字段的顺序数量跟数据都要一致,否则将出错


    # joined/payload field fetch query
    # joined fields let you avoid (slow) JOIN and GROUP_CONCAT
    # payload fields let you attach custom per-keyword values (eg. for ranking)
    #
    # syntax is FIELD-NAME 'from'  ( 'query' | 'payload-query' ); QUERY
    # joined field QUERY should return 2 columns (docid, text)
    # payload field QUERY should return 3 columns (docid, keyword, weight)
    #
    # REQUIRES that query results are in ascending document ID order!
    # multi-value, optional, default is empty list of queries
    #  添加字段,来源与表 自动连接
    # 字段结果集保持为
    # (1,tags1)
    # (1,tags2)
    # (2,tags3)
    # (2,tags4)
    # 添加字段将用于搜索,结果如有第3个字段,第3个字段表示该记录的权重,权重为大于1的值
    # sql_joined_field    = tags from query; SELECT docid, CONCAT('tag',tagid) FROM tags ORDER BY docid ASC
    # sql_joined_field    = wtags from payload-query; SELECT docid, tag, tagweight FROM tags ORDER BY docid ASC


    # file based field declaration
    #
    # content of this field is treated as a file name
    # and the file gets loaded and indexed in place of a field
    #
    # max file size is limited by max_file_field_buffer indexer setting
    # file IO errors are non-fatal and get reported as warnings
    # 把字段声明放入文件
    # sql_file_field        = content_file_path


    # range query setup, query that must return min and max ID values
    # optional, default is empty
    # 
    # sql_query will need to reference $start and $end boundaries
    # if using ranged query:
    # 分区查询,防止MYSQL死锁
    # sql_query        = \
    #    SELECT doc.id, doc.id AS group, doc.title, doc.data \
    #    FROM documents doc \
    #    WHERE id>=$start AND id<=$end
    #
    # sql_query_range        = SELECT MIN(id),MAX(id) FROM documents


    # range query step
    # optional, default is 1024
    # 分区查询跳步
    # sql_range_step        = 1000

    
    
    
    # unsigned integer attribute declaration
    # multi-value (an arbitrary number of attributes is allowed), optional
    # optional bit size can be specified, default is 32
    # 声明无符号数字段
    #sql_attr_uint        = target_type
    # sql_attr_uint        = forum_id:9 # 9 bits for forum_id
    #sql_attr_uint        = group_id
    #声明BOOL字段
    # boolean attribute declaration
    # multi-value (an arbitrary number of attributes is allowed), optional
    # equivalent to sql_attr_uint with 1-bit size
    #
    # sql_attr_bool        = is_deleted


    # bigint attribute declaration
    # multi-value (an arbitrary number of attributes is allowed), optional
    # declares a signed (unlike uint!) 64-bit attribute
    # 声明长整字段
    # sql_attr_bigint        = my_bigint_id


    # UNIX timestamp attribute declaration
    # multi-value (an arbitrary number of attributes is allowed), optional
    # similar to integer, but can also be used in date functions
    # 声明时间字段
    # sql_attr_timestamp    = posted_ts
    # sql_attr_timestamp    = last_edited_ts
    #sql_attr_timestamp    = date_added
    
    # string ordinal attribute declaration
    # multi-value (an arbitrary number of attributes is allowed), optional
    # sorts strings (bytewise), and stores their indexes in the sorted list
    # sorting by this attr is equivalent to sorting by the original strings
    # 声明字符串字段 用于排序等,但此字段不会被存储
    # sql_attr_str2ordinal    = author_name


    # floating point attribute declaration
    # multi-value (an arbitrary number of attributes is allowed), optional
    # values are stored in single precision, 32-bit IEEE 754 format
    # 声明浮点字段
    # sql_attr_float        = lat_radians
    # sql_attr_float        = long_radians


    # multi-valued attribute (MVA) attribute declaration
    # multi-value (an arbitrary number of attributes is allowed), optional
    # MVA values are variable length lists of unsigned 32-bit integers
    #
    # syntax is ATTR-TYPE ATTR-NAME 'from' SOURCE-TYPE [;QUERY] [;RANGE-QUERY]
    # ATTR-TYPE is 'uint' or 'timestamp'
    # SOURCE-TYPE is 'field', 'query', or 'ranged-query'
    # QUERY is SQL query used to fetch all ( docid, attrvalue ) pairs
    # RANGE-QUERY is SQL query used to fetch min and max ID values, similar to 'sql_query_range'
    # 声明复合字段
    # sql_attr_multi        = uint tag from query; SELECT docid, tagid FROM tags
    # sql_attr_multi        = uint tag from ranged-query; \
    #    SELECT docid, tagid FROM tags WHERE id>=$start AND id<=$end; \
    #    SELECT MIN(docid), MAX(docid) FROM tags


    # string attribute declaration
    # multi-value (an arbitrary number of these is allowed), optional
    # lets you store and retrieve strings
    # 只是把数据存储,但不会索引该字段
    # sql_attr_string        = stitle


    # wordcount attribute declaration
    # multi-value (an arbitrary number of these is allowed), optional
    # lets you count the words at indexing time
    # 将转化成关键字的字段,用于提高匹配率
    # sql_attr_str2wordcount    = stitle


    # combined field plus attribute declaration (from a single column)
    # stores column as an attribute, but also indexes it as a full-text field
    # 跟sql_attr_string不同是该属性加入索引
    # sql_field_string    = author
    # sql_field_str2wordcount    = title

    
    # post-query, executed on sql_query completion
    # optional, default is empty
    # 取后查询
    # sql_query_post        =

    
    # post-index-query, executed on successful indexing completion
    # optional, default is empty
    # $maxid expands to max document ID actually fetched from DB
    # 索引后查询
    # sql_query_post_index    = REPLACE INTO counters ( id, val ) \
    #    VALUES ( 'max_indexed_id', $maxid )


    # ranged query throttling, in milliseconds
    # optional, default is 0 which means no delay
    # enforces given delay before each query step
    #分区查询的时间间隔
    sql_ranged_throttle    = 0

    # document info query, ONLY for CLI search (ie. testing and debugging)
    # optional, default is empty
    # must contain $id macro and must fetch the document by that id
    #命令行调试查询结果用
    sql_query_info        = SELECT * FROM ko_link WHERE id=$id

    # kill-list query, fetches the document IDs for kill-list
    # k-list will suppress matches from preceding indexes in the same query
    # optional, default is empty
    ##清理指定查询ID列表,对于数据的更改
    # sql_query_killlist    = SELECT id FROM documents WHERE edited>=@last_reindex


    # columns to unpack on indexer side when indexing
    # multi-value, optional, default is empty list
    # 启用ZIP压缩 可以降低系统负载 但必须保证zlib库zlib-dev库可用
    # unpack_zlib        = zlib_column
    # unpack_mysqlcompress    = compressed_column
    # unpack_mysqlcompress    = compressed_column_2


    # maximum unpacked length allowed in MySQL COMPRESS() unpacker
    # optional, default is 16M
    # 压缩缓存区大小 不能小于字段存储值
    # unpack_mysqlcompress_maxsize    = 16M


    #####################################################################
    ## xmlpipe2 配置
    #####################################################################

    # type            = xmlpipe

    # shell command to invoke xmlpipe stream producer
    # mandatory
    #
    # xmlpipe_command        = cat /usr/local/sphinx/var/test.xml

    # xmlpipe2 field declaration
    # multi-value, optional, default is empty
    #
    # xmlpipe_field        = subject
    # xmlpipe_field        = content


    # xmlpipe2 attribute declaration
    # multi-value, optional, default is empty
    # all xmlpipe_attr_XXX options are fully similar to sql_attr_XXX
    #
    # xmlpipe_attr_timestamp    = published
    # xmlpipe_attr_uint    = author_id


    # perform UTF-8 validation, and filter out incorrect codes
    # avoids XML parser choking on non-UTF-8 documents
    # optional, default is 0
    #
    # xmlpipe_fixup_utf8    = 1
}


# inherited source example
# 继承数据源
# all the parameters are copied from the parent source,
# and may then be overridden in this source definition
#source src1throttled : src1
#{
#    sql_ranged_throttle    = 100
#}

#############################################################################
## index definition
#############################################################################

# local index example
#
# this is an index which is stored locally in the filesystem
#
# all indexing-time options (such as morphology and charsets)
# are configured per local index
index test1
{
    # index type
    # optional, default is 'plain'
    # known values are 'plain', 'distributed', and 'rt' (see samples below) 
    #索引类型 本地 分布式 
    # type            = plain

    # document source(s) to index
    # multi-value, mandatory
    # document IDs must be globally unique across all sources
    #数据源,可以多个数据源
    source            = src1

    # index files path and file name, without extension
    # mandatory, path must be writable, extensions will be auto-appended
    # 索引保存路径
    path            = /usr/local/sphinx/var/data/test1

    # document attribute values (docinfo) storage mode
    # optional, default is 'extern'
    # known values are 'none', 'extern' and 'inline'
    #索引存储方式
    docinfo            = extern

    # memory locking for cached data (.spa and .spi), to prevent swapping
    # optional, default is 0 (do not mlock)
    # requires searchd to be run from root
    #内存锁定 需要保证足够权限
    mlock            = 0

    # a list of morphology preprocessors to apply
    # optional, default is empty
    #
    # builtin preprocessors are 'none', 'stem_en', 'stem_ru', 'stem_enru',
    # 'soundex', and 'metaphone'; additional preprocessors available from
    # libstemmer are 'libstemmer_XXX', where XXX is algorithm code
    # (see libstemmer_c/libstemmer/modules.txt)
    # 词语提取器
    # morphology        = stem_en, stem_ru, soundex
    # morphology        = libstemmer_german
    # morphology        = libstemmer_sv
    morphology        = stem_en

    # minimum word length at which to enable stemming
    # optional, default is 1 (stem everything)
    # 词干化的最小词长
    # min_stemming_len    = 1


    # stopword files list (space separated)
    # optional, default is empty
    # contents are plain text, charset_table and stemming are both applied
    # 停用搜索词
    # stopwords        = /usr/local/sphinx/var/data/stopwords.txt


    # wordforms file, in "mapfrom > mapto" plain text format
    # optional, default is empty
    # 词型字典 可用spelldump工具生成
    # wordforms        = /usr/local/sphinx/var/data/wordforms.txt


    # tokenizing exceptions file
    # optional, default is empty
    #Token特例文件,就是有些词是完整词意,不能拆分索引如a&t 跟a & t
    # plain text, case sensitive, space insensitive in map-from part
    # one "Map Several Words => ToASingleOne" entry per line
    #
    # exceptions        = /usr/local/sphinx/var/data/exceptions.txt


    # minimum indexed word length
    # default is 1 (index everything)
    #  最小索引长度,就是小于指定长度的词不被索引
    min_word_len        = 1

    # charset encoding type
    # optional, default is 'sbcs'
    # known types are 'sbcs' (Single Byte CharSet) and 'utf-8'
    # 字符编码
    charset_type        = utf-8

    # charset definition and case folding rules "table"
    # optional, default value depends on charset_type
    #
    # defaults are configured to include English and Russian characters only
    # you need to change the table to include additional ones
    # this behavior MAY change in future versions
    #
    # 'sbcs' default value is
    # charset_table        = 0..9, A..Z->a..z, _, a..z, U+A8->U+B8, U+B8, U+C0..U+DF->U+E0..U+FF, U+E0..U+FF
    # 转换字符表
    # 'utf-8' default value is
    # charset_table        = 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F


    # ignored characters list
    # optional, default value is empty
    #  忽略字符表
    # ignore_chars        = U+00AD


    # minimum word prefix length to index
    # optional, default is 0 (do not index prefixes)
    #索引的最小前缀长度,小心使用,索引和搜索的时间皆会恶化
    # min_prefix_len        = 0


    # minimum word infix length to index
    # optional, default is 0 (do not index infixes)
    #索引的最小中缀长度 小心使用,索引和搜索的时间皆会恶化
    # min_infix_len        = 0


    # list of fields to limit prefix/infix indexing to
    # optional, default value is empty (index all fields in prefix/infix mode)
    # 未知
    # prefix_fields        = filename
    # infix_fields        = url, domain


    # enable star-syntax (wildcards) when searching prefix/infix indexes
    # search-time only, does not affect indexing, can be 0 or 1
    # optional, default is 0 (do not use wildcard syntax)
    # 启用星号语法
    # enable_star        = 1


    # expand keywords with exact forms and/or stars when searching fit indexes
    # search-time only, does not affect indexing, can be 0 or 1
    # optional, default is 0 (do not expand keywords)
    # 扩大搜索关键字 形式如: running -> ( running | *running* | =running )
    # expand_keywords        = 1

    
    # n-gram length to index, for CJK indexing
    # only supports 0 and 1 for now, other lengths to be implemented
    # optional, default is 0 (disable n-grams)
    # 中文等其他语言的基本支持
    # ngram_len        = 1


    # n-gram characters list, for CJK indexing
    # optional, default is empty
    #中文或其他语言的值范围
    # ngram_chars        = U+3000..U+2FA1F


    # phrase boundary characters list
    # optional, default is empty
    # 边界符
    # phrase_boundary        = ., ?, !, U+2026 # horizontal ellipsis


    # phrase boundary word position increment
    # optional, default is 0
    # 边界符增量
    # phrase_boundary_step    = 100


    # blended characters list
    # blended chars are indexed both as separators and valid characters
    # for instance, AT&T will results in 3 tokens ("at", "t", and "at&t")
    # optional, default is empty
    # 混合字符列表
    # blend_chars        = +, &, U+23


    # blended token indexing mode
    # a comma separated list of blended token indexing variants
    # known variants are trim_none, trim_head, trim_tail, trim_both, skip_pure
    # optional, default is trim_none
    #未知
    # blend_mode        = trim_tail, skip_pure


    # whether to strip HTML tags from incoming documents
    # known values are 0 (do not strip) and 1 (do strip)
    # optional, default is 0
    # 删除HTML标签 (小心文本被删除)
    html_strip        = 0

    # what HTML attributes to index if stripping HTML
    # optional, default is empty (do not index anything)
    # 保留的HTML标签
    # html_index_attrs    = img=alt,title; a=title;


    # what HTML elements contents to strip
    # optional, default is empty (do not strip element contents)
    # 不但删除标签,其包含的文本也将删除
    # html_remove_elements    = style, script


    # whether to preopen index data files on startup
    # optional, default is 0 (do not preopen), searchd-only
    # 预先打开索引,还是每次查询的时候再打开索引
    # preopen            = 1


    # whether to keep dictionary (.spi) on disk, or cache it in RAM
    # optional, default is 0 (cache in RAM), searchd-only
    # 将字典文件是否保存在内存中
    # ondisk_dict        = 1


    # whether to enable in-place inversion (2x less disk, 90-95% speed)
    # optional, default is 0 (use separate temporary files), indexer-only
    # 是否启用原地索引倒转 减少磁盘使用 性能会有一点损失
    # inplace_enable        = 1


    # in-place fine-tuning options
    # optional, defaults are listed below
    #微调原地倒转
    # inplace_hit_gap        = 0 # preallocated hitlist gap size 
    # inplace_docinfo_gap    = 0 # preallocated docinfo gap size
    # inplace_reloc_factor    = 0.1 # relocation buffer size within arena
    # inplace_write_factor    = 0.1 # write buffer size within arena


    # whether to index original keywords along with stemmed versions
    # enables "=exactform" operator to work
    # optional, default is 0
    # 是否在索引原关键词的词干化/重映射后的形式的同时也索引原词
    # index_exact_words    = 1


    # position increment on overshort (less that min_word_len) words
    # optional, allowed values are 0 and 1, default is 1
    #在经过过短的词(比 min_word_len短的词)处后增加位置值
    # overshort_step        = 1


    # position increment on stopword
    # optional, allowed values are 0 and 1, default is 1
    #在经过 停用词 处后增加位置值可选选项
    # stopword_step        = 1


    # hitless words list
    # positions for these keywords will not be stored in the index
    # optional, allowed values are 'all', or a list file name
    # 不能中断的字符列表
    # hitless_words        = all
    # hitless_words        = hitless.txt #字符文件


    # detect and index sentence and paragraph boundaries
    # required for the SENTENCE and PARAGRAPH operators to work
    # optional, allowed values are 0 and 1, default is 0
    # 是否检查标签合并 针对HTML
    # index_sp            = 1


    # index zones, delimited by HTML/XML tags
    # a comma separated list of tags and wildcards
    # required for the ZONE operator to work
    # optional, default is empty string (do not index zones)
    # 对HTML标签的权重
    # index_zones        = title, h*, th
}


# inherited index example
# 索引继承
# all the parameters are copied from the parent index,
# and may then be overridden in this index definition
#index test1stemmed : test1
#{
#    path            = /usr/local/sphinx/var/data/test1stemmed
#    morphology        = stem_en
#}


# distributed index example
#
# this is a virtual index which can NOT be directly indexed,
# and only contains references to other local and/or remote indexes
#index dist1
#{
#分布式索引配置
    # 'distributed' index type MUST be specified
#    type            = distributed

    # local index to be searched
    # there can be many local indexes configured
#    local            = test1
#    local            = test1stemmed

    # remote agent
    # multiple remote agents may be specified
    # syntax for TCP connections is 'hostname:port:index1,[index2[,...]]'
    # syntax for local UNIX connections is '/path/to/socket:index1,[index2[,...]]'
#    agent            = localhost:9313:remote1
#    agent            = localhost:9314:remote2,remote3
    # agent            = /var/run/searchd.sock:remote4

    # blackhole remote agent, for debugging/testing
    # network errors and search results will be ignored
    #
    # agent_blackhole        = testbox:9312:testindex1,testindex2


    # remote agent connection timeout, milliseconds
    # optional, default is 1000 ms, ie. 1 sec
#    agent_connect_timeout    = 1000

    # remote agent query timeout, milliseconds
    # optional, default is 3000 ms, ie. 3 sec
#    agent_query_timeout    = 3000
#}


# realtime index example
#
# you can run INSERT, REPLACE, and DELETE on this index on the fly
# using MySQL protocol (see 'listen' directive below)
#index rt
#{
    # 'rt' index type must be specified to use RT index
#    type            = rt

    # index files path and file name, without extension
    # mandatory, path must be writable, extensions will be auto-appended

#    path            = /usr/local/sphinx/var/data/rt

    # RAM chunk size limit
    # RT index will keep at most this much data in RAM, then flush to disk
    # optional, default is 32M
    #
    # rt_mem_limit        = 512M

    # full-text field declaration
    # multi-value, mandatory
#    rt_field        = title
#    rt_field        = content

    # unsigned integer attribute declaration
    # multi-value (an arbitrary number of attributes is allowed), optional
    # declares an unsigned 32-bit attribute
#    rt_attr_uint        = gid

    # RT indexes currently support the following attribute types:
    # uint, bigint, float, timestamp, string
    #
    # rt_attr_bigint        = guid
    # rt_attr_float        = gpa
    # rt_attr_timestamp    = ts_added
    # rt_attr_string        = author
#}

#############################################################################
## indexer settings
#############################################################################

indexer
{
    # memory limit, in bytes, kilobytes (16384K) or megabytes (256M)
    # optional, default is 32M, max is 2047M, recommended is 256M to 1024M
    # Memory usage limit for the indexing process.
    mem_limit        = 32M

    # maximum IO calls per second (for I/O throttling)
    # optional, default is 0 (unlimited)
    # Maximum I/O operations per second, used to throttle disk I/O.
    # max_iops        = 40


    # maximum IO call size, bytes (for I/O throttling)
    # optional, default is 0 (unlimited)
    # Maximum allowed size of a single I/O operation, in bytes, for I/O throttling.
    # max_iosize        = 1048576


    # maximum xmlpipe2 field length, bytes
    # optional, default is 2M
    # Maximum field size allowed for xmlpipe2 data sources.
    # max_xmlpipe2_field    = 4M


    # write buffer size, bytes
    # several (currently up to 4) buffers will be allocated
    # write buffers are allocated in addition to mem_limit
    # optional, default is 1M
    # Write buffer size, in bytes. Optional, default is 1MB.
    # write_buffer        = 1M


    # maximum file field adaptive buffer size
    # optional, default is 8M, minimum is 1M
    #
    # max_file_field_buffer    = 32M
}

#############################################################################
## searchd settings
#############################################################################

searchd
{
    # [hostname:]port[:protocol], or /unix/socket/path to listen on
    # known protocols are 'sphinx' (SphinxAPI) and 'mysql41' (SphinxQL)
    #
    # multi-value, multiple listen points are allowed
    # optional, defaults are 9312:sphinx and 9306:mysql41, as below
    #
    # listen            = 127.0.0.1
    # listen            = 192.168.0.1:9312
    # listen            = 9312
    # listen            = /var/run/searchd.sock
    listen            = 9312
    #listen            = 9306:mysql41

    # log file, searchd run info is logged here
    # optional, default is 'searchd.log'
    # All searchd run-time events are recorded in this log file.
    log            = /usr/local/sphinx/var/log/searchd.log

    # query log file, all search queries are logged here
    # optional, default is empty (do not log queries)
    # All search queries are recorded in this file.
    query_log        = /usr/local/sphinx/var/log/query.log

    # client read timeout, seconds
    # optional, default is 5
    # Read timeout for network client requests, in seconds.
    read_timeout        = 5

    # request timeout, seconds
    # optional, default is 5 minutes
    # Maximum time to wait between two queries on a persistent connection, in seconds.
    client_timeout        = 300

    # maximum amount of children to fork (concurrent searches to run)
    # optional, default is 0 (unlimited)
    # Maximum number of child processes, used to control server load; no more
    # concurrent searches than this can ever run. When the limit is reached,
    # new clients are rejected with a temporary-failure (SEARCHD_RETRY) status
    # code and a message saying the server has hit its connection limit.
    max_children        = 30

    # PID file, searchd process ID file name
    # mandatory
    # Process ID file.
    pid_file        = /usr/local/sphinx/var/log/searchd.pid

    # max amount of matches the daemon ever keeps in RAM, per-index
    # WARNING, THERE'S ALSO PER-QUERY LIMIT, SEE SetLimits() API CALL
    # default is 1000 (just like Google)
    # Maximum number of matches the daemon keeps in RAM per index and can
    # return to the client.
    max_matches        = 1000

    # seamless rotate, prevents rotate stalls if precaching huge datasets
    # optional, default is 1
    # Prevents searchd rotation from stalling when precaching indexes with
    # large amounts of data. Optional, default is 1 (seamless rotation on).
    seamless_rotate        = 1

    # whether to forcibly preopen all indexes on startup
    # optional, default is 1 (preopen everything)
    # Whether to forcibly (re)open all index files on startup.
    # Optional, default is 1 (preopen everything).
    preopen_indexes        = 1

    # whether to unlink .old index copies on succesful rotation.
    # optional, default is 1 (do unlink)
    # Whether to delete index copies with the .old extension after a
    # successful rotation. Optional, default is 1 (delete the copies).
    unlink_old        = 1

    # attribute updates periodic flush timeout, seconds
    # updates will be automatically dumped to disk this frequently
    # optional, default is 0 (disable periodic flush)
    # How often updates made via UpdateAttributes() are flushed to disk.
    # attr_flush_period    = 900


    # instance-wide ondisk_dict defaults (per-index value take precedence)
    # optional, default is 0 (precache all dictionaries in RAM)
    # Instance-wide default for the ondisk_dict directive.
    # Optional, default is 0 (precache all dictionaries in RAM).
    # ondisk_dict_default    = 1


    # MVA updates pool size
    # shared between all instances of searchd, disables attr flushes!
    # optional, default size is 1M
    # Shared pool size used to store multi-valued attribute (MVA) updates.
    mva_updates_pool    = 1M

    # max allowed network packet size
    # limits both query packets from clients, and responses from agents
    # optional, default size is 8M
    # Maximum allowed network packet size.
    max_packet_size        = 8M

    # crash log path
    # searchd will (try to) log crashed query to 'crash_log_path.PID' file
    # optional, default is empty (do not create crash logs)
    # Path for crash log files.
    # crash_log_path        = /usr/local/sphinx/var/log/crash


    # max allowed per-query filter count
    # optional, default is 256
    # Maximum number of filters allowed per query. Used only for internal
    # sanity checks; does not directly affect memory use or performance.
    max_filters        = 256

    # max allowed per-filter values count
    # optional, default is 4096
    # Maximum number of values allowed in a single filter. Used only for
    # internal sanity checks; does not directly affect memory use or performance.
    max_filter_values    = 4096


    # socket listen queue length
    # optional, default is 5
    # TCP listen backlog length. Requests that cannot be queued fail
    # immediately with a "connection refused" error.
    # listen_backlog        = 5


    # per-keyword read buffer size
    # optional, default is 256K
    # Per-keyword read buffer size. Optional, default is 256K.
    # read_buffer        = 256K


    # unhinted read size (currently used when reading hits)
    # optional, default is 32K
    # Unhinted read size. Optional, default is 32K.
    # read_unhinted        = 32K


    # max allowed per-batch query count (aka multi-query count)
    # optional, default is 32
    # Limits the number of queries per batch (i.e. after a single OPEN).
    max_batch_queries    = 32


    # max common subtree document cache size, per-query
    # optional, default is 0 (disable subtree optimization)
    #
    # subtree_docs_cache    = 4M


    # max common subtree hit cache size, per-query
    # optional, default is 0 (disable subtree optimization)
    # Limits RAM used by the common-subtree optimization; disabled by default.
    # subtree_hits_cache    = 8M


    # multi-processing mode (MPM)
    # known values are none, fork, prefork, and threads
    # optional, default is fork
    # Worker mode.
    workers            = threads # for RT to work


    # max threads to create for searching local parts of a distributed index
    # optional, default is 0, which means disable multi-threaded searching
    # should work with all MPMs (ie. does NOT require workers=threads)
    #
    # dist_threads        = 4


    # binlog files path; use empty string to disable binlog
    # optional, default is build-time configured data directory
    # Binary log path.
    # binlog_path        = # disable logging
    # binlog_path        = /usr/local/sphinx/var/data # binlog.001 etc will be created there


    # binlog flush/sync mode
    # 0 means flush and sync every second
    # 1 means flush and sync every transaction
    # 2 means flush every transaction, sync every second
    # optional, default is 2
    # Binlog flush mode.
    # binlog_flush        = 2


    # binlog per-file size limit
    # optional, default is 128M, 0 means no limit
    # Maximum size of a single binlog file.
    # binlog_max_log_size    = 256M


    # per-thread stack size, only affects workers=threads mode
    # optional, default is 64K
    # Stack size per thread.
    # thread_stack            = 128K


    # per-keyword expansion limit (for dict=keywords prefix searches)
    # optional, default is 0 (no limit)
    # Maximum number of keywords a single wildcard may expand to.
    # expansion_limit        = 1000


    # RT RAM chunks flush period
    # optional, default is 0 (no periodic flush)
    # How often RT index RAM chunks are flushed to disk.
    # rt_flush_period        = 900


    # query log file format
    # optional, known values are plain and sphinxql, default is plain
    # Query log format.
    # query_log_format        = sphinxql


    # version string returned to MySQL network protocol clients
    # optional, default is empty (use Sphinx version)
    # MySQL version string reported to clients.
    # mysql_version_string    = 5.0.37


    # trusted plugin directory
    # optional, default is empty (disable UDFs)
    # Plugin directory.
    # plugin_dir            = /usr/local/sphinx/lib


    # default server-wide collation
    # optional, default is libc_ci
    # Server-wide connection collation.
    # collation_server        = utf8_general_ci


    # server-wide locale for libc based collations
    # optional, default is C
    # Locale for libc-based collations.
    # collation_libc_locale    = ru_RU.UTF-8


    # threaded server watchdog (only used in workers=threads mode)
    # optional, values are 0 and 1, default is 1 (watchdog on)
    # Whether to enable the server watchdog process.
    # watchdog                = 1

    
    # SphinxQL compatibility mode (legacy columns and their names)
    # optional, default is 0 (SQL compliant syntax and result sets)
    # SphinxQL compatibility mode.
    # compat_sphinxql_magics    = 1
}

# --eof--


增量索引使用

(1).在Mysql中建立一个增量表,凡是更新操作(添加,修改,删除),都会将相关ID更新到到增量表;表结构很简单,只记录ID.
(2).修改Sphinx的配置文件

sql_query = SELECT 0 AS in_update FROM documents
sql_attr_uint = in_update


增加了一个字段in_update,用来标记主索引的这条记录,是否在增量表中.
(3).产品的更新操作时,除了更新增量表,同时也利用Sphinx API的
UpdateAttributes方法,将主索引中的相关记录的in_update属性设置成 "1"
$Sphinx->UpdateAttributes( $index, array( 'in_update' ), array( $id => '1' ) );
更新完了属性后,发出增量索引更新通知,可以是写队列,写文件等方式.
(4).在查询时,增加过滤器.

$Sphinx->SetFilter( 'in_update', array(0) );
这样,就不会使用到主索引中的处于增量索引中的doc,以免搜索到错误的编辑前的数据.
(5).守护进程接到增量索引更新通知,重建增量索引.
(6).每天某时段更新主索引,清空增量表,清空增量索引…
 
删除索引:
可以在创建索引时,配置文件中设置以下属性:
sql_attr_bool = is_deleted
is_deleted 默认值为 0 ,当记录要删除时,调用接口中的 UpdateAttributes设置其属性值为 1,在搜索和合并索引时增加过滤器只让is_deleted为 0 的记录通过
select id, title, content,1 as is_deleted from table
$sphinx->UpdateAttributes($index, array('is_deleted'), array($id => array(1)));
$sphinx->SetFilter('is_deleted', array(0));
设置过滤器只允许“is_deleted”为 0 的那些条件通过,而去除所有标记为已删除(“is_deleted” 为非0)的记录

“主索引+增量索引”更新方式

XX网的特征:发帖较为频繁;刚发完的帖被用户自己编辑、修改的可能性大;新发的帖会被编辑审核(删除、标记为中介等);两天以前的老帖变动性较小。
 基于这个特征,我设计了Sphinx主索引和增量索引。对于前天17:00之前的记录建立主索引,每天凌晨自动重建一次主索引;对于前天17:00之后到当前最新的记录,间隔1分钟自动重建一次增量索引。

=================================================================================================================

sphinx全文搜索引擎架构案例

1、创建sphinx配置文件

      在安装sphinx的目录(我的是E:\sphinx),打开sphinx.conf配置文件

     

# Main data source: only rows strictly older than the recorded
# main-index build time (stored in sphinx_time) are indexed here.
source test1
{
         type       = mysql       # data source type; MySQL here
         sql_host    = 127.0.0.1  # database host
          sql_user    = test        # database user
          sql_pass    = test        # database password
          sql_db      = test        # database name
          sql_port      = 3306         
        sql_query_pre  = SET NAMES utf8 # connection character set
	   sql_query_pre  = REPLACE INTO sphinx_time SELECT 1,UNIX_TIMESTAMP(NOW())  # record the main-index build time
	   sql_query      = SELECT type,id,title,content,price,updatetime,uid,0 AS in_update,0 AS in_delete FROM test1 WHERE updateTime < ( SELECT sqltime FROM sphinx_time where counterid=1)
	   sql_attr_uint        = in_update
	   sql_attr_uint        = in_delete
	   sql_attr_uint        = type
	   sql_attr_uint        = uid
	   sql_attr_float	      = price
	   sql_attr_timestamp   = updatetime
}
# Delta data source: inherits the MySQL connection settings and the
# attribute declarations from "test1", and re-reads only the rows changed
# at or after the recorded main-index build time.
# Renamed from "item_delta : pcore_item_item" — that parent source is not
# defined anywhere in this file, and the delta index below expects a source
# named "test1_delta".
source test1_delta : test1{
	 # throttle ranged fetches so delta rebuilds do not overload MySQL
	 sql_ranged_throttle = 100
	 sql_query_pre       = SET NAMES utf8
	 sql_query_pre       = SET SESSION query_cache_type=OFF
	 sql_query = SELECT type,id,title,content,price,updatetime,uid,0 AS in_update,0 AS in_delete FROM test1 WHERE updateTime >= ( SELECT sqltime FROM sphinx_time where counterid=1) 
}
# Main full-text index, built from the "test1" source.
index test1{
        source = test1
		path = /data/test1
		docinfo = extern ##### attribute (document info) storage mode
		mlock = 0 # whether to lock cached data in memory (0 = no)
		morphology = none #### morphology preprocessor (not applicable to Chinese)
		stopwords =
		min_word_len   = 1 #### minimum indexed word length
		charset_type   = utf-8
		# UTF-8 character folding table
		charset_table = U+FF10..U+FF19->0..9, 0..9, U+FF41..U+FF5A->a..z, U+FF21..U+FF3A->a..z,\
		A..Z->a..z, a..z, U+0149, U+017F, U+0138, U+00DF, U+00FF, U+00C0..U+00D6->U+00E0..U+00F6,\
		U+00E0..U+00F6, U+00D8..U+00DE->U+00F8..U+00FE, U+00F8..U+00FE, U+0100->U+0101, U+0101,\
		U+0102->U+0103, U+0103, U+0104->U+0105, U+0105, U+0106->U+0107, U+0107, U+0108->U+0109,\
		U+0109, U+010A->U+010B, U+010B, U+010C->U+010D, U+010D, U+010E->U+010F, U+010F,\
		U+0110->U+0111, U+0111, U+0112->U+0113, U+0113, U+0114->U+0115, U+0115, \
		U+0116->U+0117,U+0117, U+0118->U+0119, U+0119, U+011A->U+011B, U+011B, U+011C->U+011D,\
		U+011D,U+011E->U+011F, U+011F, U+0130->U+0131, U+0131, U+0132->U+0133, U+0133, \
		U+0134->U+0135,U+0135, U+0136->U+0137, U+0137, U+0139->U+013A, U+013A, U+013B->U+013C, \
		U+013C,U+013D->U+013E, U+013E, U+013F->U+0140, U+0140, U+0141->U+0142, U+0142, \
		U+0143->U+0144,U+0144, U+0145->U+0146, U+0146, U+0147->U+0148, U+0148, U+014A->U+014B, \
		U+014B,U+014C->U+014D, U+014D, U+014E->U+014F, U+014F, U+0150->U+0151, U+0151, \
		U+0152->U+0153,U+0153, U+0154->U+0155, U+0155, U+0156->U+0157, U+0157, U+0158->U+0159,\
		U+0159,U+015A->U+015B, U+015B, U+015C->U+015D, U+015D, U+015E->U+015F, U+015F, \
		U+0160->U+0161,U+0161, U+0162->U+0163, U+0163, U+0164->U+0165, U+0165, U+0166->U+0167, \
		U+0167,U+0168->U+0169, U+0169, U+016A->U+016B, U+016B, U+016C->U+016D, U+016D, \
		U+016E->U+016F,U+016F, U+0170->U+0171, U+0171, U+0172->U+0173, U+0173, U+0174->U+0175,\
		U+0175,U+0176->U+0177, U+0177, U+0178->U+00FF, U+00FF, U+0179->U+017A, U+017A, \
		U+017B->U+017C,U+017C, U+017D->U+017E, U+017E, U+0410..U+042F->U+0430..U+044F, \
		U+0430..U+044F,U+05D0..U+05EA, U+0531..U+0556->U+0561..U+0586, U+0561..U+0587, \
		U+0621..U+063A, U+01B9,U+01BF, U+0640..U+064A, U+0660..U+0669, U+066E, U+066F, \
		U+0671..U+06D3, U+06F0..U+06FF,U+0904..U+0939, U+0958..U+095F, U+0960..U+0963, \
		U+0966..U+096F, U+097B..U+097F,U+0985..U+09B9, U+09CE, U+09DC..U+09E3, U+09E6..U+09EF, \
		U+0A05..U+0A39, U+0A59..U+0A5E,U+0A66..U+0A6F, U+0A85..U+0AB9, U+0AE0..U+0AE3, \
		U+0AE6..U+0AEF, U+0B05..U+0B39,U+0B5C..U+0B61, U+0B66..U+0B6F, U+0B71, U+0B85..U+0BB9, \
		U+0BE6..U+0BF2, U+0C05..U+0C39,U+0C66..U+0C6F, U+0C85..U+0CB9, U+0CDE..U+0CE3, \
		U+0CE6..U+0CEF, U+0D05..U+0D39, U+0D60,U+0D61, U+0D66..U+0D6F, U+0D85..U+0DC6, \
		U+1900..U+1938, U+1946..U+194F, U+A800..U+A805,U+A807..U+A822, U+0386->U+03B1, \
		U+03AC->U+03B1, U+0388->U+03B5, U+03AD->U+03B5,U+0389->U+03B7, U+03AE->U+03B7, \
		U+038A->U+03B9, U+0390->U+03B9, U+03AA->U+03B9,U+03AF->U+03B9, U+03CA->U+03B9, \
		U+038C->U+03BF, U+03CC->U+03BF, U+038E->U+03C5,U+03AB->U+03C5, U+03B0->U+03C5, \
		U+03CB->U+03C5, U+03CD->U+03C5, U+038F->U+03C9,U+03CE->U+03C9, U+03C2->U+03C3, \
		U+0391..U+03A1->U+03B1..U+03C1,U+03A3..U+03A9->U+03C3..U+03C9, U+03B1..U+03C1, \
		U+03C3..U+03C9, U+0E01..U+0E2E,U+0E30..U+0E3A, U+0E40..U+0E45, U+0E47, U+0E50..U+0E59, \
		U+A000..U+A48F, U+4E00..U+9FBF,U+3400..U+4DBF, U+20000..U+2A6DF, U+F900..U+FAFF, \
		U+2F800..U+2FA1F, U+2E80..U+2EFF,U+2F00..U+2FDF, U+3100..U+312F, U+31A0..U+31BF, \
		U+3040..U+309F, U+30A0..U+30FF,U+31F0..U+31FF, U+AC00..U+D7AF, U+1100..U+11FF, \
		U+3130..U+318F, U+A000..U+A48F,U+A490..U+A4CF 
		min_prefix_len   = 0
		min_infix_len   = 1
		# N-gram length; only 0 and 1 are supported — set to 1 to search Chinese
		ngram_len = 1
		# Characters subject to N-gram segmentation.
		# NOTE(review): the ngram_chars directive itself is missing here, so
		# with ngram_len=1 no CJK characters are actually segmented — confirm
		# and add e.g. "ngram_chars = U+4E00..U+9FBF" for Chinese search.

		#-----------------  
		html_strip      = 0  

}
index test1_delta: test1{# delta index; inherits all settings from test1
           source = test1_delta
         path     = /data/test1_delta
}

  #//......
  #//多个索引源 + 索引 配置方式如上 一个source{}对应一个index {},一个配置文件可以有多个索引


# Resource limits for the indexer program.
indexer
{
	 # indexing memory limit
	 mem_limit            = 256M  
    # maximum I/O operations per second (0 = unlimited)
    max_iops             = 0  
    # write buffer size
    write_buffer         = 4M  

} 
# searchd daemon settings.
searchd
{
	listen              = 9312   
    read_timeout        = 5  
    max_children        = 256  
    max_matches         = 100000   # max matches kept per query; can be set high here and controlled dynamically via the API  
    seamless_rotate     = 1  
    preopen_indexes     = 0  
    unlink_old          = 1  
    max_packet_size     = 8M  
    read_buffer         = 1M  
    pid_file            = /usr/local/sphinx/log/searchd.pid  
    log                 = /usr/local/sphinx/log/searchd.log  
    query_log           = /usr/local/sphinx/log/query.log  
    #binlog_path        = /usr/local/sphinx/log/  
}

 

2、初始化 sphinx.conf 配置中的全部索引

在sphinx/bin目录下 使用如下命令:
indexer -c E:\sphinx\sphinx.conf --all

如果searchd处于运行状态,则执行

indexer -c E:\sphinx\sphinx.conf --all --rotate

 

3、启动 sphinx

searchd -c E:\sphinx\sphinx.conf

注意:Linux 或 Windows 下的启动方式可能有所不同,如何启动sphinx服务请自行查阅相关文档,这里就不多说了。

 

4、通过shell脚本来重建索引

    1)创建脚本build_main_index.sh:
    

#!/bin/sh
# Rebuild the main indexes; --rotate lets the running searchd swap them in
# seamlessly.
/usr/local/sphinx/bin/indexer --rotate \
test1 \
test2

# Truncate the search logs.
:>/usr/local/sphinx/var/log/query.log
:>/usr/local/sphinx/var/log/searchd.log


    2)创建增量索引脚本build_delta_index.sh:

#!/bin/sh
#while true
#do
#---------- loop start ----------
#/usr/local/mysql-search/bin/mysql -uroot -pi0705tsstyl -e "stop slave;" --socket=/mysql/3406/mysql.sock
# Rebuild the delta indexes; --rotate lets the running searchd swap them in.
/usr/local/sphinx/bin/indexer --rotate \
test1_delta \
test2_delta 

#/usr/local/mysql-search/bin/mysql -uroot -pi0705tsstyl -e "start slave;" --socket=/mysql/3406/mysql.sock
#---------- loop end ----------
#sleep 300
#done


   3)赋予主索引更新脚本可执行权限
  

chmod +x /usr/local/sphinx/bin/build_main_index.sh



    4)每天凌晨定时重建主索引:

 
#每天凌晨4点19分重建一次搜索引擎的主索引
19 4 * * * /bin/sh /usr/local/sphinx/bin/build_main_index.sh

   5)每1分钟自动重建一次搜索引擎的增量索引
#每1分钟自动重建一次搜索引擎的增量索引
*/1 * * * * /bin/sh /usr/local/sphinx/bin/build_delta_index.sh

~~~~

附加一个网上找到的PHPApi客户端,非常详细

<?php 

include_once 'sphinxapi.php';

// Create a client and point it at the local searchd daemon.
$client = new SphinxClient();
$client->setServer("localhost", 9312);
$client->SetConnectTimeout ( 1 ); // connection timeout, seconds


/*
Other useful API calls, for reference:
$client->AddQuery();                             // queue a query for batched execution
$client->RunQueries ();                          // run all queued queries
$client->ResetFilters();                         // clear all filter conditions
$client->BuildExcerpts($docs, $index, $words);   // build text snippets
$client->BuildKeywords($query, $index, $hits);   // extract keywords from a query
$client->GetLastError();                         // last error message
$client->GetLastWarning();                       // last warning message
$client->FlushAttributes();                      // flush attribute updates to disk
$client->IsConnectError();                       // was the last error a connection error?
$client->ResetGroupBy();                         // reset grouping settings

$client->SetFieldWeights(array('sub_title'=>1)); // per-field weights, minimum is 1
$client->SetIDRange($min, $max);                 // restrict the document ID range
$client->SetIndexWeights(array('test1'=>1));     // per-index weights
$client->Status();                               // is the service available?
$client->UpdateAttributes($index, $attrs, $values); // update on-disk attributes
*/
/*
Reference: http://www.coreseek.cn/docs/coreseek_4.1-sphinx_2.0.1-beta.html#matching-modes
SPH_MATCH_ALL, matches all query words (default mode);
SPH_MATCH_ANY, matches any of the query words;
SPH_MATCH_PHRASE, matches query as a phrase, requiring perfect match;
SPH_MATCH_BOOLEAN, matches query as a boolean expression;
SPH_MATCH_EXTENDED, matches query in the Sphinx internal query language; as of
0.9.9 superseded by SPH_MATCH_EXTENDED2 and retained for legacy application code;
SPH_MATCH_EXTENDED2, matches query using the second version of the Extended mode;
SPH_MATCH_FULLSCAN, full scan mode.
*/
$client->setMatchMode(SPH_MATCH_ANY); // matching mode
$client->setMaxQueryTime(3);          // maximum query time
//$client->SetSelect ( $select );     // columns/expressions to return
/*
$client->SetSelect ( "*, @weight+(user_karma+ln(pageviews))*0.1 AS myweight" );
$client->SetSelect ( "exp_years, salary_gbp*{$gbp_usd_rate} AS salary_usd,
   IF(age>40,1,0) AS over40" );
$client->SetSelect ( "*, AVG(price) AS avgprice" );
 */

/*
$client->SetGroupBy ( "category", SPH_GROUPBY_ATTR, "@count desc" );
$client->SetGroupDistinct ( "vendor" );
==
SELECT id, weight, all-attributes,
    COUNT(DISTINCT vendor) AS @distinct,
    COUNT(*) AS @count
FROM products
GROUP BY category
ORDER BY @count DESC
*/
//$client->SetGroupBy ( $groupby, SPH_GROUPBY_ATTR, $groupsort ); // grouping
//$client->SetGroupDistinct ( $distinct );                        // distinct-count column

$client->SetArrayResult ( true ); // return matches as a plain array instead of keyed by ID

/*
    SPH_SORT_RELEVANCE mode, that sorts by relevance in descending order (best matches first);
    SPH_SORT_ATTR_DESC mode, that sorts by an attribute in descending order (bigger attribute values first);
    SPH_SORT_ATTR_ASC mode, that sorts by an attribute in ascending order (smaller attribute values first);
    SPH_SORT_TIME_SEGMENTS mode, that sorts by time segments (last hour/day/week/month) in descending order, and then by relevance in descending order;
    SPH_SORT_EXTENDED mode, that sorts by SQL-like combination of columns in ASC/DESC order;
    SPH_SORT_EXPR mode, that sorts by an arithmetic expression.
 */
//$client->SetSortMode ( SPH_SORT_EXTENDED, $sortby ); // sort mode

/*
$client->SetOverride($attrname, $attrtype, $values);
$client->ResetOverrides();*/
/*
$client->SetRetries($count);      // retry count on failure
$client->SetRankingMode($ranker); // ranking mode, applies to SPH_MATCH_EXTENDED2 searches

// When the 3rd argument is true the filter means $attribute != $value; default is false.
$client->SetFilter ( 'target_type', $filtervals );    // filter by a list of values
$client->SetFilterFloatRange($attribute, $min, $max); // float range
$client->SetFilterRange($attribute, $min, $max);      // integer range
$client->SetGeoAnchor($attrlat, $attrlong, $lat, $long);
*/
//link
//$client->SetFilter ( 'target_type', array(1),true );


$client->SetLimits ( 0, 10 ); // paging: offset, limit
$result = $client->query("good","team"); // run the search


print_r($result);


 

===============PS

还算不错的一个sphinx API 中文手册

http://www.open-open.com/doc/view/a4dc3ede0bbb452a9440fc670838fb0e

 

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值