本次文档检测系统的前期测试数据库表为

/*
 * Everything below targets MySQL.
 * Create the sougouLIB database.
 */
SET NAMES utf8;
SET SQL_MODE='';

/* Set the database's default character set and collation. */
CREATE DATABASE IF NOT EXISTS `sougouLIB`
    DEFAULT CHARACTER SET utf8
    COLLATE utf8_general_ci;

USE `sougouLIB`;

/*
 * create table
 */

/* doc_info: per-document fields (title, author, post_time, come_from). */
DROP TABLE IF EXISTS `doc_info`;
/* The table's collation is the same as the database's collation. */
CREATE TABLE `doc_info` (
    `doc_id`    INT(11)      NOT NULL AUTO_INCREMENT,
    `title`     VARCHAR(255) NOT NULL,
    `author`    VARCHAR(30)  NOT NULL,
    `post_time` INT(12)      NOT NULL,
    `come_from` VARCHAR(255) NOT NULL,
    PRIMARY KEY (`doc_id`)
) ENGINE=MYISAM DEFAULT CHARSET=utf8;
/* Equivalent explicit form:
   ENGINE=MYISAM DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci */

/*
 * para: text fragments, each carrying a doc_id.
 * NOTE(review): doc_id presumably refers to doc_info.doc_id; no foreign key
 * is declared (MyISAM would not enforce one) -- confirm with the loader.
 */
DROP TABLE IF EXISTS `para`;
CREATE TABLE `para` (
    `id`           INT(11) NOT NULL AUTO_INCREMENT,
    `para_content` TEXT,
    `doc_id`       INT(11) NOT NULL,
    PRIMARY KEY (`id`)
) ENGINE=MYISAM DEFAULT CHARSET=utf8;

/*
 * delta_counter: bookkeeping rows written by the Sphinx sql_query_pre
 * statements in the configuration below. Each main source records the highest
 * id it has indexed in its own row:
 *   counter_id = 1 -> para.id
 *   counter_id = 2 -> doc_info.doc_id
 */
DROP TABLE IF EXISTS `delta_counter`;
CREATE TABLE `delta_counter` (
    `counter_id` INT(11) NOT NULL AUTO_INCREMENT,
    `max_doc_id` INT(11) NOT NULL,
    PRIMARY KEY (`counter_id`)
) ENGINE=MYISAM DEFAULT CHARSET=utf8;

本次我们的文档集合为搜狗语料库,其Sphinx的配置文件为:sougouLIB.conf

# Connection settings shared by every source below.
# NOTE(review): root credentials are stored in plain text here; consider a
# dedicated least-privilege account and restricting read access to this file.
source base
{
    type                = mysql
    sql_host            = localhost
    sql_user            = root
    sql_pass            = zhang15987
    # Must match the case used in CREATE DATABASE: MySQL database names are
    # case-sensitive on case-sensitive filesystems.
    sql_db              = sougouLIB
    sql_port            = 3306
    sql_query_pre       = SET NAMES UTF8
    sql_range_step      = 1024
    sql_query_info_pre  = SET NAMES UTF8
}

# Main source for paragraphs: records the current MAX(id) in delta_counter
# row 1, then indexes every row up to that id.
source para : base
{
    # COALESCE keeps the NOT NULL column valid when para is empty.
    sql_query_pre       = REPLACE INTO delta_counter (counter_id, max_doc_id) SELECT 1, COALESCE(MAX(id), 0) FROM para
    # A trailing backslash continues the value on the next line.
    sql_query           = SELECT id,para_content,doc_id FROM para \
        WHERE id <= (SELECT max_doc_id FROM delta_counter WHERE counter_id = 1)
    # sql_query_post    =
    sql_attr_uint       = doc_id
    sql_query_info      = SELECT * FROM para WHERE id=$id
    # sql_query_range   = SELECT MIN(id),MAX(id) FROM para
}

# Delta source for paragraphs: only rows added after the last main run.
source para_delta : para
{
    # Overriding sql_query_pre discards every inherited pre-query, which is
    # what prevents the parent's REPLACE from moving the counter here; the
    # charset setup therefore has to be restated.
    sql_query_pre       = SET NAMES UTF8
    sql_query           = SELECT id,para_content,doc_id FROM para \
        WHERE id > (SELECT max_doc_id FROM delta_counter WHERE counter_id = 1)
}

# Main source for document metadata.
source doc_info : base
{
    # Adjusted so that this source works with conditional (delta) indexing.
    # Uses its own delta_counter row (2) so it does not overwrite the value
    # recorded by the para source in row 1.
    sql_query_pre       = REPLACE INTO delta_counter (counter_id, max_doc_id) SELECT 2, COALESCE(MAX(doc_id), 0) FROM doc_info
    sql_query           = SELECT doc_id,title,author,post_time,come_from FROM doc_info \
        WHERE doc_id <= (SELECT max_doc_id FROM delta_counter WHERE counter_id = 2)
    # sql_query_post    = DROP TABLE
    sql_attr_uint       = post_time
    # sql_attr_str2ordinal = comfrom
    sql_query_info      = SELECT * FROM doc_info WHERE doc_id=$id
    #sql_query_range    = SELECT MIN(doc_id),MAX(doc_id) FROM doc_info
}

# Delta source for document metadata: only rows added after the last main run.
source doc_info_delta : doc_info
{
    # See para_delta: the override drops the inherited REPLACE, so SET NAMES
    # is restated explicitly.
    sql_query_pre       = SET NAMES UTF8
    sql_query           = SELECT doc_id,title,author,post_time,come_from FROM doc_info \
        WHERE doc_id > (SELECT max_doc_id FROM delta_counter WHERE counter_id = 2)
    sql_query_post      =
}

# Index definitions
index para
{
    source              = para
    path                = var/data/document/para
    docinfo             = extern
    mlock               = 0
    morphology          = none
    # stopwords         =
    min_word_len        = 1
    html_strip          = 1
    charset_dictpath    = /usr/local/mmseg/etc/
    charset_type        = zh_cn.utf-8
}

index para_delta : para
{
    source              = para_delta
    path                = var/data/document/para_delta
}

index doc_info
{
    source              = doc_info
    path                = var/data/document/doc_info
    docinfo             = extern
    mlock               = 0
    morphology          = none
    # stopwords         =
    min_word_len        = 1
    html_strip          = 1
    charset_dictpath    = /usr/local/mmseg/etc/
    charset_type        = zh_cn.utf-8
}

index doc_info_delta : doc_info
{
    source              = doc_info_delta
    path                = var/data/document/doc_info_delta
}

indexer
{
    mem_limit           = 256M
}

searchd
{
    listen              = 9312
    read_timeout        = 5
    max_children        = 30
    max_matches         = 1000
    seamless_rotate     = 0
    preopen_indexes     = 0
    unlink_old          = 1
    pid_file            = var/log/searchd_mysql.pid
    log                 = var/log/searchd_mysql.log
    query_log           = var/log/query_mysql.log
}