下载sphinx-chinese包
下载字典xdict
1.安装g++
sudo aptitude install build-essential
2.安装libexpat-dev
sudo aptitude install libexpat-dev
3.解压sphinx-for-chinese-2.0.2-dev-r2894.tar.gz
sudo tar xzvf sphinx-for-chinese-2.0.2-dev-r2894.tar.gz
4.配置(cd 到sphinx-for-chinese-2.0.2-dev-r2894目录下)
./configure --without-mysql
5. 编译:
make
6.安装
sudo make install
7.配置文件放到 /usr/local/etc/
sudo cp ~/sphinx/sphinx.conf /usr/local/etc/
8.mkdict词典(先解压)
a. 建目录
sudo mkdir sphinx
cd sphinx
sudo mkdir dict
b.mkdict解压的词典并拷贝到/usr/local/share/sphinx/dict/
mkdict /home/sam/sphinx/xdict.txt xdict
sudo cp xdict /usr/local/share/sphinx/dict/
9.把数据源放到/opt/
sudo cp ~/sphinx/data.xml /opt/
10.建索引
sudo indexer --all
11.查找
search "要查找的词"
sphinx.conf:
#
# Sphinx configuration file sample
#
# WARNING! While this sample file mentions all available options,
# it contains (very) short helper descriptions only. Please refer to
# doc/sphinx.html for details.
#
#############################################################################
## data source definition
#############################################################################
source src1
{
# data source type. mandatory, no default value
# known types are mysql, pgsql, mssql, xmlpipe, xmlpipe2, odbc
#####################################################################
## xmlpipe2 settings
#####################################################################
type = xmlpipe2
# shell command to invoke xmlpipe stream producer
# mandatory
#
xmlpipe_command = cat /opt/data.xml
# xmlpipe2 field declaration
# multi-value, optional, default is empty
#
# xmlpipe_field = subject
# xmlpipe_field = content
# xmlpipe2 attribute declaration
# multi-value, optional, default is empty
# all xmlpipe_attr_XXX options are fully similar to sql_attr_XXX
#
# xmlpipe_attr_timestamp = published
# xmlpipe_attr_uint = author_id
# perform UTF-8 validation, and filter out incorrect codes
# avoids XML parser choking on non-UTF-8 documents
# optional, default is 0
#
# xmlpipe_fixup_utf8 = 1
}
# inherited source example
#
# all the parameters are copied from the parent source,
# and may then be overridden in this source definition
#source src1throttled : src1
#{
# sql_ranged_throttle = 100
#}
#############################################################################
## index definition
#############################################################################
# local index example
#
# this is an index which is stored locally in the filesystem
#
# all indexing-time options (such as morphology and charsets)
# are configured per local index
index test1
{
# index type
# optional, default is 'plain'
# known values are 'plain', 'distributed', and 'rt' (see samples below)
# type = plain
# document source(s) to index
# multi-value, mandatory
# document IDs must be globally unique across all sources
source = src1
# index files path and file name, without extension
# mandatory, path must be writable, extensions will be auto-appended
path = /usr/local/var/data/test1
# document attribute values (docinfo) storage mode
# optional, default is 'extern'
# known values are 'none', 'extern' and 'inline'
docinfo = extern
# memory locking for cached data (.spa and .spi), to prevent swapping
# optional, default is 0 (do not mlock)
# requires searchd to be run from root
mlock = 0
# a list of morphology preprocessors to apply
# optional, default is empty
#
# builtin preprocessors are 'none', 'stem_en', 'stem_ru', 'stem_enru',
# 'soundex', and 'metaphone'; additional preprocessors available from
# libstemmer are 'libstemmer_XXX', where XXX is algorithm code
# (see libstemmer_c/libstemmer/modules.txt)
#
# morphology = stem_en, stem_ru, soundex
# morphology = libstemmer_german
# morphology = libstemmer_sv
morphology = none
# minimum word length at which to enable stemming
# optional, default is 1 (stem everything)
#
# min_stemming_len = 1
# stopword files list (space separated)
# optional, default is empty
# contents are plain text, charset_table and stemming are both applied
#
# stopwords = /usr/local/var/data/stopwords.txt
# wordforms file, in "mapfrom > mapto" plain text format
# optional, default is empty
#
# wordforms = /usr/local/var/data/wordforms.txt
# tokenizing exceptions file
# optional, default is empty
#
# plain text, case sensitive, space insensitive in map-from part
# one "Map Several Words => ToASingleOne" entry per line
#
# exceptions = /usr/local/var/data/exceptions.txt
# minimum indexed word length
# default is 1 (index everything)
min_word_len = 1
# charset encoding type
# optional, default is 'sbcs'
# known types are 'sbcs' (Single Byte CharSet) and 'utf-8'
charset_type = utf-8
chinese_dictionary = /usr/local/share/sphinx/dict/xdict
# charset definition and case folding rules "table"
# optional, default value depends on charset_type
#
# defaults are configured to include English and Russian characters only
# you need to change the table to include additional ones
# this behavior MAY change in future versions
#
# 'sbcs' default value is
# charset_table = 0..9, A..Z->a..z, _, a..z, U+A8->U+B8, U+B8, U+C0..U+DF->U+E0..U+FF, U+E0..U+FF
#
# 'utf-8' default value is
charset_table = 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F
# ignored characters list
# optional, default value is empty
#
# ignore_chars = U+00AD
# minimum word prefix length to index
# optional, default is 0 (do not index prefixes)
#
min_prefix_len = 0
# minimum word infix length to index
# optional, default is 0 (do not index infixes)
#
# min_infix_len = 0
# list of fields to limit prefix/infix indexing to
# optional, default value is empty (index all fields in prefix/infix mode)
#
# prefix_fields = filename
# infix_fields = url, domain
# enable star-syntax (wildcards) when searching prefix/infix indexes
# search-time only, does not affect indexing, can be 0 or 1
# optional, default is 0 (do not use wildcard syntax)
#
# enable_star = 1
# expand keywords with exact forms and/or stars when searching fit indexes
# search-time only, does not affect indexing, can be 0 or 1
# optional, default is 0 (do not expand keywords)
#
# expand_keywords = 1
# n-gram length to index, for CJK indexing
# only supports 0 and 1 for now, other lengths to be implemented
# optional, default is 0 (disable n-grams)
#
ngram_len = 1
# n-gram characters list, for CJK indexing
# optional, default is empty
#
ngram_chars = U+3000..U+2FA1F
# phrase boundary characters list
# optional, default is empty
#
# phrase_boundary = ., ?, !, U+2026 # horizontal ellipsis
# phrase boundary word position increment
# optional, default is 0
#
# phrase_boundary_step = 100
# blended characters list
# blended chars are indexed both as separators and valid characters
# for instance, AT&T will results in 3 tokens ("at", "t", and "at&t")
# optional, default is empty
#
# blend_chars = +, &, U+23
# blended token indexing mode
# a comma separated list of blended token indexing variants
# known variants are trim_none, trim_head, trim_tail, trim_both, skip_pure
# optional, default is trim_none
#
# blend_mode = trim_tail, skip_pure
# whether to strip HTML tags from incoming documents
# known values are 0 (do not strip) and 1 (do strip)
# optional, default is 0
html_strip = 1
# what HTML attributes to index if stripping HTML
# optional, default is empty (do not index anything)
#
# html_index_attrs = img=alt,title; a=title;
# what HTML elements contents to strip
# optional, default is empty (do not strip element contents)
#
# html_remove_elements = style, script
# whether to preopen index data files on startup
# optional, default is 0 (do not preopen), searchd-only
#
# preopen = 1
# whether to keep dictionary (.spi) on disk, or cache it in RAM
# optional, default is 0 (cache in RAM), searchd-only
#
# ondisk_dict = 1
# whether to enable in-place inversion (2x less disk, 90-95% speed)
# optional, default is 0 (use separate temporary files), indexer-only
#
# inplace_enable = 1
# in-place fine-tuning options
# optional, defaults are listed below
#
# inplace_hit_gap = 0 # preallocated hitlist gap size
# inplace_docinfo_gap = 0 # preallocated docinfo gap size
# inplace_reloc_factor = 0.1 # relocation buffer size within arena
# inplace_write_factor = 0.1 # write buffer size within arena
# whether to index original keywords along with stemmed versions
# enables "=exactform" operator to work
# optional, default is 0
#
# index_exact_words = 1
# position increment on overshort (less that min_word_len) words
# optional, allowed values are 0 and 1, default is 1
#
# overshort_step = 1
# position increment on stopword
# optional, allowed values are 0 and 1, default is 1
#
# stopword_step = 1
# hitless words list
# positions for these keywords will not be stored in the index
# optional, allowed values are 'all', or a list file name
#
# hitless_words = all
# hitless_words = hitless.txt
# detect and index sentence and paragraph boundaries
# required for the SENTENCE and PARAGRAPH operators to work
# optional, allowed values are 0 and 1, default is 0
#
# index_sp = 1
# index zones, delimited by HTML/XML tags
# a comma separated list of tags and wildcards
# required for the ZONE operator to work
# optional, default is empty string (do not index zones)
#
# index_zones = title, h*, th
}
# inherited index example
#
# all the parameters are copied from the parent index,
# and may then be overridden in this index definition
#index test1stemmed : test1
#{
# path = /usr/local/var/data/test1stemmed
# morphology = stem_en
#}
# distributed index example
#
# this is a virtual index which can NOT be directly indexed,
# and only contains references to other local and/or remote indexes
#index dist1
#{
# 'distributed' index type MUST be specified
# type = distributed
# local index to be searched
# there can be many local indexes configured
# local = test1
# local = test1stemmed
# remote agent
# multiple remote agents may be specified
# syntax for TCP connections is 'hostname:port:index1,[index2[,...]]'
# syntax for local UNIX connections is '/path/to/socket:index1,[index2[,...]]'
# agent = localhost:9313:remote1
# agent = localhost:9314:remote2,remote3
# agent = /var/run/searchd.sock:remote4
# blackhole remote agent, for debugging/testing
# network errors and search results will be ignored
#
# agent_blackhole = testbox:9312:testindex1,testindex2
# remote agent connection timeout, milliseconds
# optional, default is 1000 ms, ie. 1 sec
# agent_connect_timeout = 1000
# remote agent query timeout, milliseconds
# optional, default is 3000 ms, ie. 3 sec
# agent_query_timeout = 3000
#}
# realtime index example
#
# you can run INSERT, REPLACE, and DELETE on this index on the fly
# using MySQL protocol (see 'listen' directive below)
#index rt
#{
# 'rt' index type must be specified to use RT index
# type = rt
# index files path and file name, without extension
# mandatory, path must be writable, extensions will be auto-appended
# path = /usr/local/var/data/rt
# RAM chunk size limit
# RT index will keep at most this much data in RAM, then flush to disk
# optional, default is 32M
#
# rt_mem_limit = 512M
# full-text field declaration
# multi-value, mandatory
# rt_field = title
# rt_field = content
# unsigned integer attribute declaration
# multi-value (an arbitrary number of attributes is allowed), optional
# declares an unsigned 32-bit attribute
# rt_attr_uint = gid
# RT indexes currently support the following attribute types:
# uint, bigint, float, timestamp, string
#
# rt_attr_bigint = guid
# rt_attr_float = gpa
# rt_attr_timestamp = ts_added
# rt_attr_string = author
#}
#############################################################################
## indexer settings
#############################################################################
indexer
{
# memory limit, in bytes, kiloytes (16384K) or megabytes (256M)
# optional, default is 32M, max is 2047M, recommended is 256M to 1024M
mem_limit = 128M
# maximum IO calls per second (for I/O throttling)
# optional, default is 0 (unlimited)
#
# max_iops = 40
# maximum IO call size, bytes (for I/O throttling)
# optional, default is 0 (unlimited)
#
# max_iosize = 1048576
# maximum xmlpipe2 field length, bytes
# optional, default is 2M
#
# max_xmlpipe2_field = 4M
# write buffer size, bytes
# several (currently up to 4) buffers will be allocated
# write buffers are allocated in addition to mem_limit
# optional, default is 1M
#
# write_buffer = 1M
# maximum file field adaptive buffer size
# optional, default is 8M, minimum is 1M
#
# max_file_field_buffer = 32M
}
#############################################################################
## searchd settings
#############################################################################
searchd
{
# [hostname:]port[:protocol], or /unix/socket/path to listen on
# known protocols are 'sphinx' (SphinxAPI) and 'mysql41' (SphinxQL)
#
# multi-value, multiple listen points are allowed
# optional, defaults are 9312:sphinx and 9306:mysql41, as below
#
# listen = 127.0.0.1
# listen = 192.168.0.1:9312
# listen = 9312
# listen = /var/run/searchd.sock
listen = 9312
# listen = 9306:mysql41
# log file, searchd run info is logged here
# optional, default is 'searchd.log'
log = /usr/local/var/log/searchd.log
# query log file, all search queries are logged here
# optional, default is empty (do not log queries)
query_log = /usr/local/var/log/query.log
# client read timeout, seconds
# optional, default is 5
read_timeout = 5
# request timeout, seconds
# optional, default is 5 minutes
client_timeout = 300
# maximum amount of children to fork (concurrent searches to run)
# optional, default is 0 (unlimited)
max_children = 30
# PID file, searchd process ID file name
# mandatory
pid_file = /usr/local/var/log/searchd.pid
# max amount of matches the daemon ever keeps in RAM, per-index
# WARNING, THERE'S ALSO PER-QUERY LIMIT, SEE SetLimits() API CALL
# default is 1000 (just like Google)
max_matches = 1000
# seamless rotate, prevents rotate stalls if precaching huge datasets
# optional, default is 1
seamless_rotate = 1
# whether to forcibly preopen all indexes on startup
# optional, default is 1 (preopen everything)
preopen_indexes = 1
# whether to unlink .old index copies on succesful rotation.
# optional, default is 1 (do unlink)
unlink_old = 1
# attribute updates periodic flush timeout, seconds
# updates will be automatically dumped to disk this frequently
# optional, default is 0 (disable periodic flush)
#
# attr_flush_period = 900
# instance-wide ondisk_dict defaults (per-index value take precedence)
# optional, default is 0 (precache all dictionaries in RAM)
#
# ondisk_dict_default = 1
# MVA updates pool size
# shared between all instances of searchd, disables attr flushes!
# optional, default size is 1M
mva_updates_pool = 1M
# max allowed network packet size
# limits both query packets from clients, and responses from agents
# optional, default size is 8M
max_packet_size = 8M
# crash log path
# searchd will (try to) log crashed query to 'crash_log_path.PID' file
# optional, default is empty (do not create crash logs)
#
# crash_log_path = /usr/local/var/log/crash
# max allowed per-query filter count
# optional, default is 256
max_filters = 256
# max allowed per-filter values count
# optional, default is 4096
max_filter_values = 4096
# socket listen queue length
# optional, default is 5
#
# listen_backlog = 5
# per-keyword read buffer size
# optional, default is 256K
#
# read_buffer = 256K
# unhinted read size (currently used when reading hits)
# optional, default is 32K
#
# read_unhinted = 32K
# max allowed per-batch query count (aka multi-query count)
# optional, default is 32
max_batch_queries = 32
# max common subtree document cache size, per-query
# optional, default is 0 (disable subtree optimization)
#
# subtree_docs_cache = 4M
# max common subtree hit cache size, per-query
# optional, default is 0 (disable subtree optimization)
#
# subtree_hits_cache = 8M
# multi-processing mode (MPM)
# known values are none, fork, prefork, and threads
# optional, default is fork
#
workers = threads # for RT to work
# max threads to create for searching local parts of a distributed index
# optional, default is 0, which means disable multi-threaded searching
# should work with all MPMs (ie. does NOT require workers=threads)
#
# dist_threads = 4
# binlog files path; use empty string to disable binlog
# optional, default is build-time configured data directory
#
# binlog_path = # disable logging
# binlog_path = /usr/local/var/data # binlog.001 etc will be created there
# binlog flush/sync mode
# 0 means flush and sync every second
# 1 means flush and sync every transaction
# 2 means flush every transaction, sync every second
# optional, default is 2
#
# binlog_flush = 2
# binlog per-file size limit
# optional, default is 128M, 0 means no limit
#
# binlog_max_log_size = 256M
# per-thread stack size, only affects workers=threads mode
# optional, default is 64K
#
# thread_stack = 128K
# per-keyword expansion limit (for dict=keywords prefix searches)
# optional, default is 0 (no limit)
#
# expansion_limit = 1000
# RT RAM chunks flush period
# optional, default is 0 (no periodic flush)
#
# rt_flush_period = 900
# query log file format
# optional, known values are plain and sphinxql, default is plain
#
# query_log_format = sphinxql
# version string returned to MySQL network protocol clients
# optional, default is empty (use Sphinx version)
#
# mysql_version_string = 5.0.37
# trusted plugin directory
# optional, default is empty (disable UDFs)
#
# plugin_dir = /usr/local/sphinx/lib
# default server-wide collation
# optional, default is libc_ci
#
# collation_server = utf8_general_ci
# server-wide locale for libc based collations
# optional, default is C
#
# collation_libc_locale = ru_RU.UTF-8
# threaded server watchdog (only used in workers=threads mode)
# optional, values are 0 and 1, default is 1 (watchdog on)
#
# watchdog = 1
# SphinxQL compatibility mode (legacy columns and their names)
# optional, default is 0 (SQL compliant syntax and result sets)
#
# compat_sphinxql_magics = 1
}
# --eof--
xml:
<sphinx:docset>
<sphinx:schema>
<sphinx:field name="title"/>
</sphinx:schema>
<sphinx:document id="3835035"><title>卡米复古彩虹岛 KAMI RETRO</title><description>卡米复古彩虹岛化</description></sphinx:document>
</sphinx:docset>