配置coreseek
说容易也容易,说麻烦也麻烦的工作,照贴一份吧。前面说过,我用的是python数据源。
/usr/local/coreseek/etc/coreseek.conf
# Settings for the Python data-source layer.
python
{
path = /usr/local/coreseek/DBSource # directory where the Python data-source scripts are stored
}
source Blog # index source 1: "Blog"
{
type = python # the source type is python
name = Blog.MainSource # the Python class to call
}
# Index definition built from the "Blog" source.
index Blog
{
source = Blog # name of the data source
path = /data/sphinx/Blog # where the index data files are stored
docinfo = extern
mlock = 0
morphology = none
min_word_len = 1
html_strip = 0
charset_dictpath = /usr/local/mmseg3/etc/ # location of the mmseg Chinese word-segmentation dictionary; this is the reason the article's opening said the mmseg install path must not be changed
charset_type = zh_cn.utf-8
}
source UserInfo # index source 2: "UserInfo"; everything else is the same as for "Blog"
{
type = python
name = UserInfo.MainSource
}
# Index definition built from the "UserInfo" source; same settings as the
# "Blog" index apart from the source name and the data path.
index UserInfo
{
source = UserInfo
path = /data/sphinx/UserInfo
docinfo = extern
mlock = 0
morphology = none
min_word_len = 1
html_strip = 0
charset_dictpath = /usr/local/mmseg3/etc/
charset_type = zh_cn.utf-8
}
searchd # server configuration
{
listen = 9312 # the default listening port
listen = 172.18.196.90:3306:mysql41 # MySQL-protocol-compatible listener; php-sphinx is already configured, so apart from command-line access this is of little use and can be turned off
pid_file = /var/run/coreseek.pid
}
python数据源脚本示意
贴过来供各位参考。需要注意的是,返回的“字段名”必须全部小写。
#!/usr/bin/env python
from DBConfig import DBConfig
import MySQLdb
class MainSource(object):
    """Coreseek Python data source that serves blog rows one document at a time.

    After ``Connected()`` has loaded the table configuration, every call to
    ``NextDocument()`` exposes one row through the attributes ``id``, ``uid``
    and ``blogcontent``. The attribute names match the (all-lowercase) field
    names declared by ``GetScheme()``.
    """

    def __init__(self, conf):
        """Store the configuration passed in and start with empty buffers."""
        self.conf = conf
        self.data = []  # rows fetched from MySQL but not yet handed out
        self.idx = 0  # number of documents handed out so far
        # Created here (not only in Connected()) so that NextDocument() simply
        # reports "no more documents" when it is called before Connected().
        self.TableList = []

    def GetScheme(self):
        """Return the document structure as ``(field name, options)`` pairs."""
        return [
            ('id', {'docid': True}),  # the id must be a positive integer
            ('uid', {'type': 'text'}),
            ('blogcontent', {'type': 'text'}),
        ]

    def GetFieldOrder(self):
        """Return the field ordering: ``id`` first, then ``uid``."""
        return ['id', 'uid']

    def Connected(self):
        """Load the "BlogContent" table list and prefetch the first table."""
        self.TableList = DBConfig().getDBTableConfig("BlogContent")
        self.getTableData()

    def NextDocument(self):
        """Advance to the next row and publish it on ``self``.

        Returns:
            True when ``id``, ``uid`` and ``blogcontent`` were updated with a
            new row, False once the buffer and every remaining table are
            exhausted.
        """
        # Loop (rather than recurse once per table) so that a long run of
        # empty tables cannot exceed the interpreter's recursion limit.
        while not self.data:
            if not self.TableList:
                return False
            self.getTableData()

        item = self.data.pop()
        self.id = item["id"]
        self.uid = item["uid"]
        self.blogcontent = item["blogcontent"]
        self.idx += 1
        return True

    def getTableData(self):
        """Pop one table configuration and append all of its rows to ``self.data``.

        The last two characters of the table name are parsed as a hexadecimal
        table number. Each document id is ``blogId * 1000 + table number``,
        which keeps ids from different tables apart.

        Raises:
            IndexError: if ``self.TableList`` is empty.
            ValueError: if the table-name suffix is not valid hexadecimal.
        """
        dConfig = self.TableList.pop()
        tableName = dConfig["tableName"]
        tableNumber = int(tableName[-2:], 16)
        sSQL = "SELECT blogId, uid, blogContent FROM " + tableName

        mysqlHandle = MySQLdb.connect(host=dConfig["host"], user=dConfig["user"],
                                      passwd=dConfig["passwd"], db=dConfig["dbName"])
        try:
            mysqlCursor = mysqlHandle.cursor()
            mysqlCursor.execute(sSQL)
            lResultList = mysqlCursor.fetchall()
        finally:
            # Close the connection even when the query fails.
            mysqlHandle.close()

        self.data.extend(
            {
                "id": int(lRow[0]) * 1000 + tableNumber,
                "uid": lRow[1],
                "blogcontent": lRow[2],
            }
            for lRow in lResultList
        )
if __name__ == "__main__":
    # Manual smoke test: walk through every document and print its text.
    conf = {}
    source = MainSource(conf)
    source.Connected()
    while source.NextDocument():
        # NextDocument() stores the text in the all-lowercase attribute
        # `blogcontent`; `blogContent` does not exist and raised AttributeError.
        print(source.blogcontent)
pass
开启服务器
建立索引:
/usr/local/coreseek/bin/indexer -c /usr/local/coreseek/etc/sphinx.conf --all 正常情况下,你会在配置文件设置的/data/sphinx目录下看到有索引文件生成。
开启服务器:
/usr/local/coreseek/bin/searchd -c /usr/local/coreseek/etc/sphinx.conf 你可以用 netstat -an | grep 9312 检查端口是否已打开
设置定期更新索引:
coreseek的索引机制决定了它必须定期遍历全表。当然,也可以通过修改数据源的方式实现增量索引,然后合并索引,这样更靠谱;不过为了防止伸手党,在这里暂时不做介绍。
crontab -e
*/15 * * * * /usr/local/coreseek/bin/indexer -c /usr/local/coreseek/etc/sphinx.conf --all --rotate #15分钟索引一次