在 Ubuntu 下安装 sphinx-for-chinese，用于索引 XML（xmlpipe2）数据源

最新推荐文章于 2021-05-25 12:37:13 发布

sam_cjz

最新推荐文章于 2021-05-25 12:37:13 发布

阅读量964

点赞数

文章标签： ubuntu collation file optimization buffer path

本文链接：https://blog.csdn.net/sam_cjz/article/details/6921397

版权

下载 sphinx-for-chinese 源码包（本文使用 sphinx-for-chinese-2.0.2-dev-r2894.tar.gz）

下载字典xdict

1.安装g++

sudo aptitude install build-essential

2.安装libexpat-dev

sudo aptitude install libexpat-dev

3.解压sphinx-for-chinese-2.0.2-dev-r2894.tar.gz

sudo tar xzvf sphinx-for-chinese-2.0.2-dev-r2894.tar.gz

4.配置（cd 到sphinx-for-chinese-2.0.2-dev-r2894目录下）

./configure --without-mysql

5. 编译：
make

6.安装

sudo make install

7.配置文件放到 /usr/local/etc/

sudo cp ~/sphinx/sphinx.conf /usr/local/etc/

8.用 mkdict 生成词典（先解压下载的 xdict 词典包，得到 xdict.txt）

a. 建目录

sudo mkdir -p /usr/local/share/sphinx/dict

b. 用 mkdict 把解压得到的 xdict.txt 编译成词典文件 xdict，并拷贝到 /usr/local/share/sphinx/dict/

mkdict /home/sam/sphinx/xdict.txt xdict

sudo cp xdict /usr/local/share/sphinx/dict/

9.把数据源放到/opt/

sudo cp ~/sphinx/data.xml /opt/

10.建索引

sudo indexer --all

11.查找

search "要查找的词"

sphinx.conf:

#
# Sphinx configuration file sample
#
# WARNING! While this sample file mentions all available options,
# it contains (very) short helper descriptions only. Please refer to
# doc/sphinx.html for details.
#

#############################################################################
## data source definition
#############################################################################

source src1
{
   # data source type. mandatory, no default value
   # known types are mysql, pgsql, mssql, xmlpipe, xmlpipe2, odbc

   #####################################################################
   ## xmlpipe2 settings
   #####################################################################

   type           = xmlpipe2

   # shell command to invoke xmlpipe stream producer
   # mandatory
   #
   xmlpipe_command       = cat /opt/data.xml

   # xmlpipe2 field declaration
   # multi-value, optional, default is empty
   #
   # xmlpipe_field       = subject
   # xmlpipe_field       = content

   # xmlpipe2 attribute declaration
   # multi-value, optional, default is empty
   # all xmlpipe_attr_XXX options are fully similar to sql_attr_XXX
   #
   # xmlpipe_attr_timestamp   = published
   # xmlpipe_attr_uint   = author_id

   # perform UTF-8 validation, and filter out incorrect codes
   # avoids XML parser choking on non-UTF-8 documents
   # optional, default is 0
   #
   # xmlpipe_fixup_utf8   = 1
}

# inherited source example
#
# all the parameters are copied from the parent source,
# and may then be overridden in this source definition
#source src1throttled : src1
#{
#   sql_ranged_throttle   = 100
#}

#############################################################################
## index definition
#############################################################################

# local index example
#
# this is an index which is stored locally in the filesystem
#
# all indexing-time options (such as morphology and charsets)
# are configured per local index
index test1
{
   # index type
   # optional, default is 'plain'
   # known values are 'plain', 'distributed', and 'rt' (see samples below)
   # type           = plain

   # document source(s) to index
   # multi-value, mandatory
   # document IDs must be globally unique across all sources
   source           = src1

   # index files path and file name, without extension
   # mandatory, path must be writable, extensions will be auto-appended
   path           = /usr/local/var/data/test1

   # document attribute values (docinfo) storage mode
   # optional, default is 'extern'
   # known values are 'none', 'extern' and 'inline'
   docinfo           = extern

   # memory locking for cached data (.spa and .spi), to prevent swapping
   # optional, default is 0 (do not mlock)
   # requires searchd to be run from root
   mlock           = 0

   # a list of morphology preprocessors to apply
   # optional, default is empty
   #
   # builtin preprocessors are 'none', 'stem_en', 'stem_ru', 'stem_enru',
   # 'soundex', and 'metaphone'; additional preprocessors available from
   # libstemmer are 'libstemmer_XXX', where XXX is algorithm code
   # (see libstemmer_c/libstemmer/modules.txt)
   #
   # morphology       = stem_en, stem_ru, soundex
   # morphology       = libstemmer_german
   # morphology       = libstemmer_sv
   morphology       = none

   # minimum word length at which to enable stemming
   # optional, default is 1 (stem everything)
   #
   # min_stemming_len   = 1

   # stopword files list (space separated)
   # optional, default is empty
   # contents are plain text, charset_table and stemming are both applied
   #
   # stopwords       = /usr/local/var/data/stopwords.txt

   # wordforms file, in "mapfrom > mapto" plain text format
   # optional, default is empty
   #
   # wordforms       = /usr/local/var/data/wordforms.txt

   # tokenizing exceptions file
   # optional, default is empty
   #
   # plain text, case sensitive, space insensitive in map-from part
   # one "Map Several Words => ToASingleOne" entry per line
   #
   # exceptions       = /usr/local/var/data/exceptions.txt

   # minimum indexed word length
   # default is 1 (index everything)
   min_word_len       = 1

   # charset encoding type
   # optional, default is 'sbcs'
   # known types are 'sbcs' (Single Byte CharSet) and 'utf-8'
   charset_type       = utf-8
   chinese_dictionary = /usr/local/share/sphinx/dict/xdict

   # charset definition and case folding rules "table"
   # optional, default value depends on charset_type
   #
   # defaults are configured to include English and Russian characters only
   # you need to change the table to include additional ones
   # this behavior MAY change in future versions
   #
   # 'sbcs' default value is
   # charset_table       = 0..9, A..Z->a..z, _, a..z, U+A8->U+B8, U+B8, U+C0..U+DF->U+E0..U+FF, U+E0..U+FF
   #
   # 'utf-8' default value is
   charset_table       = 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F

   # ignored characters list
   # optional, default value is empty
   #
   # ignore_chars       = U+00AD

   # minimum word prefix length to index
   # optional, default is 0 (do not index prefixes)
   #
   min_prefix_len       = 0

   # minimum word infix length to index
   # optional, default is 0 (do not index infixes)
   #
   # min_infix_len       = 0

   # list of fields to limit prefix/infix indexing to
   # optional, default value is empty (index all fields in prefix/infix mode)
   #
   # prefix_fields       = filename
   # infix_fields       = url, domain

   # enable star-syntax (wildcards) when searching prefix/infix indexes
   # search-time only, does not affect indexing, can be 0 or 1
   # optional, default is 0 (do not use wildcard syntax)
   #
   # enable_star       = 1

   # expand keywords with exact forms and/or stars when searching fit indexes
   # search-time only, does not affect indexing, can be 0 or 1
   # optional, default is 0 (do not expand keywords)
   #
   # expand_keywords       = 1


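   # n-gram length to index, for CJK indexing
   # only supports 0 and 1 for now, other lengths to be implemented
   # optional, default is 0 (disable n-grams)
   #
   # NOTE(review): chinese_dictionary is also set in this index; per-character
   # n-gram indexing may bypass dictionary-based word segmentation -- verify
   # whether this should be 0 when the dictionary is in use
   ngram_len       = 1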

   # n-gram characters list, for CJK indexing
   # optional, default is empty
   #
   ngram_chars       = U+3000..U+2FA1F

   # phrase boundary characters list
   # optional, default is empty
   #
   # phrase_boundary       = ., ?, !, U+2026 # horizontal ellipsis

   # phrase boundary word position increment
   # optional, default is 0
   #
   # phrase_boundary_step   = 100

   # blended characters list
   # blended chars are indexed both as separators and valid characters
   # for instance, AT&T will result in 3 tokens ("at", "t", and "at&t")
   # optional, default is empty
   #
   # blend_chars       = +, &, U+23

   # blended token indexing mode
   # a comma separated list of blended token indexing variants
   # known variants are trim_none, trim_head, trim_tail, trim_both, skip_pure
   # optional, default is trim_none
   #
   # blend_mode       = trim_tail, skip_pure

   # whether to strip HTML tags from incoming documents
   # known values are 0 (do not strip) and 1 (do strip)
   # optional, default is 0
   html_strip       = 1

   # what HTML attributes to index if stripping HTML
   # optional, default is empty (do not index anything)
   #
   # html_index_attrs   = img=alt,title; a=title;

   # what HTML elements contents to strip
   # optional, default is empty (do not strip element contents)
   #
   # html_remove_elements   = style, script

   # whether to preopen index data files on startup
   # optional, default is 0 (do not preopen), searchd-only
   #
   # preopen           = 1

   # whether to keep dictionary (.spi) on disk, or cache it in RAM
   # optional, default is 0 (cache in RAM), searchd-only
   #
   # ondisk_dict       = 1

   # whether to enable in-place inversion (2x less disk, 90-95% speed)
   # optional, default is 0 (use separate temporary files), indexer-only
   #
   # inplace_enable       = 1

   # in-place fine-tuning options
   # optional, defaults are listed below
   #
   # inplace_hit_gap       = 0 # preallocated hitlist gap size
   # inplace_docinfo_gap   = 0 # preallocated docinfo gap size
   # inplace_reloc_factor   = 0.1 # relocation buffer size within arena
   # inplace_write_factor   = 0.1 # write buffer size within arena

   # whether to index original keywords along with stemmed versions
   # enables "=exactform" operator to work
   # optional, default is 0
   #
   # index_exact_words   = 1

   # position increment on overshort (less than min_word_len) words
   # optional, allowed values are 0 and 1, default is 1
   #
   # overshort_step       = 1

   # position increment on stopword
   # optional, allowed values are 0 and 1, default is 1
   #
   # stopword_step       = 1

   # hitless words list
   # positions for these keywords will not be stored in the index
   # optional, allowed values are 'all', or a list file name
   #
   # hitless_words       = all
   # hitless_words       = hitless.txt

   # detect and index sentence and paragraph boundaries
   # required for the SENTENCE and PARAGRAPH operators to work
   # optional, allowed values are 0 and 1, default is 0
   #
   # index_sp           = 1

   # index zones, delimited by HTML/XML tags
   # a comma separated list of tags and wildcards
   # required for the ZONE operator to work
   # optional, default is empty string (do not index zones)
   #
   # index_zones       = title, h*, th
}

# inherited index example
#
# all the parameters are copied from the parent index,
# and may then be overridden in this index definition
#index test1stemmed : test1
#{
#   path           = /usr/local/var/data/test1stemmed
#   morphology       = stem_en
#}

# distributed index example
#
# this is a virtual index which can NOT be directly indexed,
# and only contains references to other local and/or remote indexes
#index dist1
#{
   # 'distributed' index type MUST be specified
#   type           = distributed

   # local index to be searched
   # there can be many local indexes configured
#   local           = test1
#   local           = test1stemmed

   # remote agent
   # multiple remote agents may be specified
   # syntax for TCP connections is 'hostname:port:index1,[index2[,...]]'
   # syntax for local UNIX connections is '/path/to/socket:index1,[index2[,...]]'
#   agent           = localhost:9313:remote1
#   agent           = localhost:9314:remote2,remote3
   # agent           = /var/run/searchd.sock:remote4

   # blackhole remote agent, for debugging/testing
   # network errors and search results will be ignored
   #
   # agent_blackhole       = testbox:9312:testindex1,testindex2

   # remote agent connection timeout, milliseconds
   # optional, default is 1000 ms, ie. 1 sec
#   agent_connect_timeout   = 1000

   # remote agent query timeout, milliseconds
   # optional, default is 3000 ms, ie. 3 sec
#   agent_query_timeout   = 3000
#}

# realtime index example
#
# you can run INSERT, REPLACE, and DELETE on this index on the fly
# using MySQL protocol (see 'listen' directive below)
#index rt
#{
   # 'rt' index type must be specified to use RT index
#   type           = rt

   # index files path and file name, without extension
   # mandatory, path must be writable, extensions will be auto-appended
#   path           = /usr/local/var/data/rt

   # RAM chunk size limit
   # RT index will keep at most this much data in RAM, then flush to disk
   # optional, default is 32M
   #
   # rt_mem_limit       = 512M

   # full-text field declaration
   # multi-value, mandatory
#   rt_field       = title
#   rt_field       = content

   # unsigned integer attribute declaration
   # multi-value (an arbitrary number of attributes is allowed), optional
   # declares an unsigned 32-bit attribute
#   rt_attr_uint       = gid

   # RT indexes currently support the following attribute types:
   # uint, bigint, float, timestamp, string
   #
   # rt_attr_bigint       = guid
   # rt_attr_float       = gpa
   # rt_attr_timestamp   = ts_added
   # rt_attr_string       = author
#}

#############################################################################
## indexer settings
#############################################################################

indexer
{
   # memory limit, in bytes, kilobytes (16384K) or megabytes (256M)
   # optional, default is 32M, max is 2047M, recommended is 256M to 1024M
   mem_limit       = 128M

   # maximum IO calls per second (for I/O throttling)
   # optional, default is 0 (unlimited)
   #
   # max_iops       = 40

   # maximum IO call size, bytes (for I/O throttling)
   # optional, default is 0 (unlimited)
   #
   # max_iosize       = 1048576

   # maximum xmlpipe2 field length, bytes
   # optional, default is 2M
   #
   # max_xmlpipe2_field   = 4M

   # write buffer size, bytes
   # several (currently up to 4) buffers will be allocated
   # write buffers are allocated in addition to mem_limit
   # optional, default is 1M
   #
   # write_buffer       = 1M

   # maximum file field adaptive buffer size
   # optional, default is 8M, minimum is 1M
   #
   # max_file_field_buffer   = 32M
}

#############################################################################
## searchd settings
#############################################################################

searchd
{
   # [hostname:]port[:protocol], or /unix/socket/path to listen on
   # known protocols are 'sphinx' (SphinxAPI) and 'mysql41' (SphinxQL)
   #
   # multi-value, multiple listen points are allowed
   # optional, defaults are 9312:sphinx and 9306:mysql41, as below
   #
   # listen           = 127.0.0.1
   # listen           = 192.168.0.1:9312
   # listen           = 9312
   # listen           = /var/run/searchd.sock
   listen           = 9312
   # listen           = 9306:mysql41

   # log file, searchd run info is logged here
   # optional, default is 'searchd.log'
   log           = /usr/local/var/log/searchd.log

   # query log file, all search queries are logged here
   # optional, default is empty (do not log queries)
   query_log       = /usr/local/var/log/query.log

   # client read timeout, seconds
   # optional, default is 5
   read_timeout       = 5

   # request timeout, seconds
   # optional, default is 5 minutes
   client_timeout       = 300

   # maximum amount of children to fork (concurrent searches to run)
   # optional, default is 0 (unlimited)
   max_children       = 30

   # PID file, searchd process ID file name
   # mandatory
   pid_file       = /usr/local/var/log/searchd.pid

   # max amount of matches the daemon ever keeps in RAM, per-index
   # WARNING, THERE'S ALSO PER-QUERY LIMIT, SEE SetLimits() API CALL
   # default is 1000 (just like Google)
   max_matches       = 1000

   # seamless rotate, prevents rotate stalls if precaching huge datasets
   # optional, default is 1
   seamless_rotate       = 1

   # whether to forcibly preopen all indexes on startup
   # optional, default is 1 (preopen everything)
   preopen_indexes       = 1

   # whether to unlink .old index copies on successful rotation.
   # optional, default is 1 (do unlink)
   unlink_old       = 1

   # attribute updates periodic flush timeout, seconds
   # updates will be automatically dumped to disk this frequently
   # optional, default is 0 (disable periodic flush)
   #
   # attr_flush_period   = 900

   # instance-wide ondisk_dict defaults (per-index value take precedence)
   # optional, default is 0 (precache all dictionaries in RAM)
   #
   # ondisk_dict_default   = 1

   # MVA updates pool size
   # shared between all instances of searchd, disables attr flushes!
   # optional, default size is 1M
   mva_updates_pool   = 1M

   # max allowed network packet size
   # limits both query packets from clients, and responses from agents
   # optional, default size is 8M
   max_packet_size       = 8M

   # crash log path
   # searchd will (try to) log crashed query to 'crash_log_path.PID' file
   # optional, default is empty (do not create crash logs)
   #
   # crash_log_path       = /usr/local/var/log/crash

   # max allowed per-query filter count
   # optional, default is 256
   max_filters       = 256

   # max allowed per-filter values count
   # optional, default is 4096
   max_filter_values   = 4096

   # socket listen queue length
   # optional, default is 5
   #
   # listen_backlog       = 5

   # per-keyword read buffer size
   # optional, default is 256K
   #
   # read_buffer       = 256K

   # unhinted read size (currently used when reading hits)
   # optional, default is 32K
   #
   # read_unhinted       = 32K

   # max allowed per-batch query count (aka multi-query count)
   # optional, default is 32
   max_batch_queries   = 32

   # max common subtree document cache size, per-query
   # optional, default is 0 (disable subtree optimization)
   #
   # subtree_docs_cache   = 4M

   # max common subtree hit cache size, per-query
   # optional, default is 0 (disable subtree optimization)
   #
   # subtree_hits_cache   = 8M

   # multi-processing mode (MPM)
   # known values are none, fork, prefork, and threads
   # optional, default is fork
   #
   workers           = threads # for RT to work

   # max threads to create for searching local parts of a distributed index
   # optional, default is 0, which means disable multi-threaded searching
   # should work with all MPMs (ie. does NOT require workers=threads)
   #
   # dist_threads       = 4

   # binlog files path; use empty string to disable binlog
   # optional, default is build-time configured data directory
   #
   # binlog_path       = # disable logging
   # binlog_path       = /usr/local/var/data # binlog.001 etc will be created there

   # binlog flush/sync mode
   # 0 means flush and sync every second
   # 1 means flush and sync every transaction
   # 2 means flush every transaction, sync every second
   # optional, default is 2
   #
   # binlog_flush       = 2

   # binlog per-file size limit
   # optional, default is 128M, 0 means no limit
   #
   # binlog_max_log_size   = 256M

   # per-thread stack size, only affects workers=threads mode
   # optional, default is 64K
   #
   # thread_stack           = 128K

   # per-keyword expansion limit (for dict=keywords prefix searches)
   # optional, default is 0 (no limit)
   #
   # expansion_limit       = 1000

   # RT RAM chunks flush period
   # optional, default is 0 (no periodic flush)
   #
   # rt_flush_period       = 900

   # query log file format
   # optional, known values are plain and sphinxql, default is plain
   #
   # query_log_format       = sphinxql

   # version string returned to MySQL network protocol clients
   # optional, default is empty (use Sphinx version)
   #
   # mysql_version_string   = 5.0.37

   # trusted plugin directory
   # optional, default is empty (disable UDFs)
   #
   # plugin_dir           = /usr/local/sphinx/lib

   # default server-wide collation
   # optional, default is libc_ci
   #
   # collation_server       = utf8_general_ci

   # server-wide locale for libc based collations
   # optional, default is C
   #
   # collation_libc_locale   = ru_RU.UTF-8

   # threaded server watchdog (only used in workers=threads mode)
   # optional, values are 0 and 1, default is 1 (watchdog on)
   #
   # watchdog               = 1


   # SphinxQL compatibility mode (legacy columns and their names)
   # optional, default is 0 (SQL compliant syntax and result sets)
   #
   # compat_sphinxql_magics   = 1
}

# --eof--

data.xml（数据源示例，对应上面 xmlpipe_command 读取的 /opt/data.xml）:

<?xml version="1.0" encoding="UTF-8"?>
<sphinx:docset>
<sphinx:schema>
<sphinx:field name="title"/>
<sphinx:field name="description"/>
</sphinx:schema>

<sphinx:document id="3835035"><title>卡米复古彩虹岛 KAMI RETRO</title><description>卡米复古彩虹岛化</description></sphinx:document>

</sphinx:docset>

sam_cjz

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
在ubuntu下安装sphinx-chinese 用于xml

下载sphinx-chinese包下载字典xdict1.安装g++ sudo aptitude install build-essential2.安装libexpat-devsudo aptitude install libexpat-dev3.解压sphinx-for-chinese-2.0.2-dev-r2894.tar.gz sudo tar xzvf sp
复制链接

扫一扫