sqlite3自定义分词器

      sqlite3通过使用fts3虚表支持全文搜索,默认支持simple和porter两种分词器,并提供了接口来自定义分词器。这里我们利用mmseg来构造自定义的中文分词器。
     虽然sqlite在fts3_tokenizer.h中提供了各种接口供用户自定义分词器,但其并未提供c函数供用户来注册自定义的分词器,分词器的注册必须使用sql语句来完成。
    SELECT fts3_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>);
   其中tokenizer-name是分词器的名称,sqlite3_tokenizer_module ptr是一个指向sqlite3_tokenizer_module结构的指针,并且编码为SQL blob。下面是官方给出的注册函数:
/*
 * Register a tokenizer module on a database connection.
 *
 * Registration goes through the SQL function fts3_tokenizer(<name>, <blob>):
 * the second argument is a BLOB whose bytes are the value of the module
 * pointer itself (hence &p / sizeof(p) below).
 *
 * db    - connection on which the tokenizer is registered
 * zName - name under which the tokenizer is registered
 * p     - tokenizer module to register
 *
 * Returns SQLITE_OK on success, otherwise the SQLite error code of the
 * prepare/bind call that failed or the code reported by sqlite3_finalize().
 */
int registerTokenizer(
       sqlite3 *db,
       char *zName,
       const sqlite3_tokenizer_module *p
       ){
    int rc;
    sqlite3_stmt *pStmt = 0;
    const char *zSql = "SELECT fts3_tokenizer(?, ?)";

    rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
    if( rc != SQLITE_OK ){
        return rc;
    }

    /* SQLITE_STATIC is safe here: zName and the local p both outlive pStmt. */
    rc = sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
    if( rc == SQLITE_OK ){
        rc = sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
    }
    if( rc != SQLITE_OK ){
        sqlite3_finalize(pStmt);
        return rc;
    }

    /* Any error raised by the step is reported by sqlite3_finalize(). */
    sqlite3_step(pStmt);
    return sqlite3_finalize(pStmt);
}

   要想实现自定义的分词器,最关键的是得到指向sqlite3_tokenizer_module结构的一个指针,sqlite3_tokenizer_module结构体定义如下:
struct sqlite3_tokenizer_module {
int iVersion; //版本号,必须设置为0
int (*xCreate)( //创建虚表时自动调用并创建分词器
    intargc,                           
    const char*const*argv,            
   sqlite3_tokenizer**ppTokenizer    
);
int (*xDestroy)(sqlite3_tokenizer *pTokenizer);//数据库连接关闭时自动调用,用于销毁资源
int (*xOpen)( //插入数据或检索时自动调用以进行分词
    sqlite3_tokenizer*pTokenizer,      
    const char*pInput, intnBytes,     
   sqlite3_tokenizer_cursor **ppCursor 
);

int(*xClose)(sqlite3_tokenizer_cursor *pCursor); //分词结果提取完毕后自动调用

int (*xNext)( //逐个提取分词结果
   sqlite3_tokenizer_cursor*pCursor,   
    const char**ppToken, int *pnBytes,
    int*piStartOffset, 
    int*piEndOffset, 
    int*piPosition 
);
};
   有几点需要注意的是:
    1分词引擎使用sql语句注册意味着每建立一个sqlite连接都必须注册一次分词器,对于需要使用词库的中文分词器来说也意味着巨大的内存消耗。
    2在检索时分词结果的提取和语义的解析是交替进行的。例如我们搜索"kanif OR sqlite"的时候,引擎先将全部传入到分词器,在调用一次next获取到词kanif后,再将词sqlite传入到分词器,直到全部解析完毕。
    3由于中文分词本身的特殊性,例如"北京市"很有可能视为一个完整的词,这样在搜索"北京"的时候就无法获取到结果。如果分词器支持将"北京市"切分为"北京市"和"北京"或者将十一月切分为"11月"和"十一",那么需注意(*xNext)函数中的piStartOffset和piEndOffset参数。经测试在插入数据的时候这两个参数无实际用途,但在查询的时候这两个参数决定了下一次的输入串。

附:
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <sys/types.h>

#include"fts3_tokenizer.h"
#include "mmseg/mmseg.cpp"

/* True while the mmseg dictionaries still need loading; initmmseg() tests it so the load runs only once. */
static bool loadDic = true;

/* Tokenizer object allocated by cusCreate(); it holds nothing beyond the base member. */
typedef struct cus_tokenizer{
sqlite3_tokenizer base;   /* cusCreate() hands &base back to the caller; cusDestroy() frees through it */
} cus_tokenizer;

/* Per-input cursor allocated by cusOpen() and released by cusClose(). */
typedef struct cus_tokenizer_cursor{
sqlite3_tokenizer_cursor base;   /* cusOpen() returns &base through ppCursor */
char *pInput;                    /* only ever set to NULL in the visible code; cusClose() frees it if non-NULL */
int nBytes;                      /* input length in bytes as resolved by cusOpen() */
int iToken;                      /* running token index reported through piPosition */
char *pToken;                    /* sqlite3_malloc'd, NUL-terminated copy of the current token; freed by the next cusNext() or by cusClose() */
rmmseg::Algorithm *pAlgor;       /* mmseg segmenter created in cusOpen() and destroyed in cusClose() */
} cus_tokenizer_cursor;

void initmmseg(void){
   if(!loadDic)
       return;
   mmseg_load_words("chars.dic");
   mmseg_load_words("words.dic");
    loadDic =False;
}

static int cusCreate(
int argc, const char * const *argv,
sqlite3_tokenizer **ppTokenizer
){
cus_tokenizer *t;
t = (cus_tokenizer *) sqlite3_malloc(sizeof(*t));
if( t==NULL ) return SQLITE_NOMEM;
memset(t, 0, sizeof(*t));
initmmseg();
*ppTokenizer = &t->base;
return SQLITE_OK;
}

/*
 * xDestroy callback: release the tokenizer allocated by cusCreate().
 * Always returns SQLITE_OK.
 */
static int cusDestroy(sqlite3_tokenizer *pTokenizer){
    sqlite3_free(pTokenizer);
    return SQLITE_OK;
}

static int cusOpen(
sqlite3_tokenizer*pTokenizer,        
const char *pInput, intnBytes,       
sqlite3_tokenizer_cursor**ppCursor   
){
cus_tokenizer_cursor *c;
if(pInput == 0){
    nBytes =0;
}else if(nBytes < 0)
    nBytes = (int)strlen(pInput);

c = (cus_tokenizer_cursor *)sqlite3_malloc(sizeof(*c));
if(c == NULL)
     return SQLITE_NOMEM;

c->iToken =c->nBytes = 0;
c->pInput = c->pToken =NULL;
c->pAlgor = mmseg_algor_create(pInput,nBytes);
c->nBytes = nBytes;
*ppCursor = &c->base;
return SQLITE_OK;
}

/*
 * xClose callback: release everything owned by a cursor created in cusOpen(),
 * then the cursor itself. Always returns SQLITE_OK.
 */
static int cusClose(sqlite3_tokenizer_cursor *pCursor){
    cus_tokenizer_cursor *c = (cus_tokenizer_cursor *)pCursor;

    /* sqlite3_free() is a harmless no-op on NULL, so no guards are needed. */
    sqlite3_free(c->pInput);
    sqlite3_free(c->pToken);

    if( c->pAlgor != NULL ){
        mmseg_algor_destroy(c->pAlgor);
    }

    sqlite3_free(c);
    return SQLITE_OK;
}


static int cusNext(
sqlite3_tokenizer_cursor *pCursor,
const char**ppToken,              
int*pnBytes,                      
int*piStartOffset,                
int*piEndOffset,                  
int*piPosition                    
){
cus_tokenizer_cursor *c = (cus_tokenizer_cursor *)pCursor;
cus_tokenizer *t = (cus_tokenizer *)pCursor->pTokenizer;
if(c->pToken != NULL){
   sqlite3_free(c->pToken);
   c->pToken = NULL;
}
struct Token token =mmseg_next_token(c->pAlgor);
if(token.length != 0 ){
    int l =token.length;
   c->pToken = (char *)sqlite3_malloc(l+1);
   if(c->pToken == NULL)
       return SQLITE_NOMEM;
   c->pToken[l] = 0;
   memcpy(c->pToken, token.text, l);
    *ppToken =c->pToken;
    *pnBytes =l;
   *piStartOffset = token.offset;
    *piEndOffset= token.offset + token.length;
    *piPosition= c->iToken++;
    returnSQLITE_OK;
}
//一般来说只有插入数据时才会进入到这里
return SQLITE_DONE;
}
/* Method table for the mmseg-backed tokenizer; main() passes its address to registerTokenizer(). */
static const sqlite3_tokenizer_module cusTokenizerModule ={
0,            /* iVersion */
cusCreate,    /* xCreate  */
cusDestroy,   /* xDestroy */
cusOpen,      /* xOpen    */
cusClose,     /* xClose   */
cusNext,      /* xNext    */
};

int registerTokenizer(
       sqlite3 *db,
       char *zName,
       const sqlite3_tokenizer_module *p
       ){
    intrc;
    sqlite3_stmt*pStmt;
    const char*zSql = "SELECT fts3_tokenizer(?, ?)";
    rc =sqlite3_prepare_v2(db, zSql, -1, &pStmt,0);
    if(rc!=SQLITE_OK ){
       return rc;
    }
   sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
   sqlite3_bind_blob(pStmt, 2, &p, sizeof(p),SQLITE_STATIC);
   sqlite3_step(pStmt);
    returnsqlite3_finalize(pStmt);
}

int main(){
    constsqlite3_tokenizer_module *ptr =&cusTokenizerModule;
    sqlite3*pDB;
    sqlite3_stmt* stmt;
    char *errMsg = NULL;
    const char*zTail;

    int rc =sqlite3_open("test.sqlite3", &pDB);
   if(rc){
       printf("create error. %s\n",sqlite3_errmsg(pDB));
       return rc;
    }
    chartoken_name[] = "custoken";
   registerTokenizer(pDB, token_name, ptr);

    rc =sqlite3_exec(pDB, "CREATE VIRTUAL TABLE foo USINGfts3(tokenize=custoken)", 0, 0, &errMsg);
    if(rc !=SQLITE_OK){
       printf("create virtual error, %s\n", errMsg);
    if(rc !=SQLITE_OK){
       printf("create virtual error, %s\n", errMsg);
       return rc;
    }
    rc =sqlite3_exec(pDB, "INSERT INTO fooVALUES('\xe5\x8c\x97\xe4\xba\xac\xe5\xb8\x82')", 0, 0,&errMsg);
    if(rc !=SQLITE_OK){
       printf("insert value error, %s\n", errMsg);
       return rc;
    }
    int nrow =0, ncolumn = 0;
    char**azResult; //二维数组存放结果
   sqlite3_get_table(pDB , "SELECT * FROM foo WHERE content MATCH'\xe5\x8c\x97\xe4\xba\xac\xe5\xb8\x82'" , &azResult, &nrow , &ncolumn ,&errMsg );
    int i = 0;
    printf("row:%d column=%d \n" , nrow , ncolumn );
    printf("\nThe result of querying is : \n" );
    for( i=0 ;i<( nrow + 1 ) * ncolumn ; i++ )
         printf( "azResult[%d] = %s\n", i , azResult[i] );
   sqlite3_free_table( azResult );
   sqlite3_close(pDB);
    return0;
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值