CLucene 源码阅读记录

[b]0\ iconv 的使用[/b][url]http://www.gnu.org/software/libc/manual/html_node/iconv-Examples.html#iconv-Examples[/url]
[b]1\ TestUtf8 中的测试:把 GBK 编码的数据转换为 Unicode,然后建立索引[/b]
/**
 * Indexes one GBK-encoded language sample file as a single document.
 *
 * Builds the path "<clucene_data_location>/utf8text/<file>_gbk.txt", asserts
 * through CuTest that the directory and the file exist, then adds a document
 * with a "language" keyword field (the file name) and a "contents" text field
 * read through a jstreams::FileReader opened with the "GBK" encoding.
 *
 * @param tc   CuTest case used for the existence assertions.
 * @param ndx  Index writer that receives the document.
 * @param file Base name of the language file (without the "_gbk.txt" suffix).
 */
void _Index(CuTest *tc, IndexWriter* ndx,char* file){
    char path[CL_MAX_PATH];
    // Zero-filled so the language name stays terminated even when truncated.
    TCHAR tlang[20] = {0};

    // NOTE(review): path is assembled with unbounded strcpy/strcat; a very long
    // data location or file name could exceed CL_MAX_PATH.
    strcpy(path,clucene_data_location);
    strcat(path,"/utf8text");
    CuAssert(tc,_T("Utf8 directory does not exist"),Misc::dir_Exists(path));
    strcat(path,"/");
    strcat(path,file);
    /*strcat(path,"_utf8.txt");*/
    strcat(path,"_gbk.txt");

    CuAssert(tc,_T("Language file does not exist"),Misc::dir_Exists(path));

    // Bound the copy by the capacity of tlang (the previous limit, CL_MAX_PATH,
    // is larger than the buffer); the last slot is kept for the terminator.
    STRCPY_AtoT(tlang,file,(sizeof(tlang)/sizeof(tlang[0])) - 1);

    Document doc;
    doc.add(*Field::Keyword(_T("language"),tlang));

    // The reader decodes the file contents from GBK.
    jstreams::FileReader* fr = new jstreams::FileReader(path, "GBK");

    // StandardAnalyzer analyzer;
    // TokenStream* stream = analyzer.tokenStream(NULL, _CLNEW CL_NS(util)::Reader(fr,true));

    // NOTE(review): the `true` flag presumably hands ownership of fr to the
    // Reader wrapper - confirm against CL_NS(util)::Reader.
    doc.add(*Field::Text(_T("contents"), _CLNEW CL_NS(util)::Reader(fr,true) ));
    ndx->addDocument(&doc);
}

[b]2\ CLucene 的 util 库代码中的 FileInputStream:一个从文件中读出数据并填充到缓存中的方法[/b]
/**
 * Reads up to `space` bytes from the underlying file into `start`.
 *
 * @param start Destination buffer.
 * @param space Maximum number of bytes to read.
 * @return The number of bytes read, or -1 when the file is already closed or
 *         a read error occurred. On a read error the error message and status
 *         are set and the file is closed; at end of file the file is closed
 *         but the bytes read by this call are still returned.
 */
int32_t FileInputStream::fillBuffer(char* start, int32_t space) {
    // A closed (or never opened) file has nothing left to deliver.
    if (file == 0) {
        return -1;
    }

    int32_t bytesRead = fread(start, 1, space, file);

    if (!ferror(file)) {
        // Successful read: release the handle as soon as the end is reached.
        if (feof(file)) {
            fclose(file);
            file = 0;
        }
        return bytesRead;
    }

    // Read failure: record the reason, release the handle and flag the stream.
    error = "Could not read from file '" + filepath + "'.";
    fclose(file);
    file = 0;
    status = Error;
    return -1;
}

[b]3\ 与缓存有关的代码:InputStreamBuffer<T>::setSize,调整缓存大小并保持读取位置[/b]

template <class T> template <class char>
void InputStreamBuffer<T>::setSize(int32_t size) {
// store pointer information
int32_t offset = (int32_t)(readPos - start);

// allocate memory in the buffer
if ( start == 0 )
start = (T*)malloc(size*sizeof(T));
else
start = (T*)realloc(start, size*sizeof(T));
this->size = size;

// restore pointer information
readPos = start + offset;
}


[b]4\GBK转UCS-2码,然后索引[/b]
 iconv_t converter = iconv_open("UCS-2-INTERNAL", "GBK");
//iconv_t converter = iconv_open("UCS-2-INTERNAL", "UTF-8");
// iconv_t converter = iconv_open("UCS-2-INTERNAL", "ASCII");

const char *inbuf ="我喜欢你欧阳";
size_t inbytesleft = strlen(inbuf);
wchar_t* start=(wchar_t*)malloc(inbytesleft*sizeof(wchar_t));
memset(start,0,inbytesleft*sizeof(wchar_t));
size_t outbytesleft = sizeof(wchar_t)*inbytesleft;
char *outbuf = (char*)start;
size_t r = iconv(converter, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
iconv_close(converter);
doc.add(*Field::Text(_T("contents"),start));



[b]5\ 从 GB2312 转 UTF-8,再从 UTF-8 转 UCS-2[/b]
/**
 * Code conversion: converts a byte string from one character set to another
 * using iconv.
 *
 * The output buffer is zero-filled before converting, so the result is
 * NUL-terminated whenever it is shorter than `outlen`.
 *
 * @param from_charset Name of the source encoding.
 * @param to_charset   Name of the target encoding.
 * @param inbuf        Input bytes.
 * @param inlen        Number of input bytes.
 * @param outbuf       Output buffer.
 * @param outlen       Size of the output buffer in bytes.
 * @return 0 on success, -1 if the converter cannot be opened or the
 *         conversion fails.
 */
int code_convert(const char *from_charset,const char *to_charset,const char *inbuf,size_t inlen,char *outbuf,size_t outlen)
{
    // iconv() declares its input pointer as `char **` on some platforms and as
    // `const char **` on others; this adapter converts implicitly to either.
    struct InputArg {
        const char **ptr;
        operator const char **() const { return ptr; }
        operator char **() const { return const_cast<char **>(ptr); }
    };

    if (from_charset == NULL || to_charset == NULL || outbuf == NULL) return -1;

    // iconv_open signals failure with (iconv_t)-1, not with 0.
    iconv_t cd = iconv_open(to_charset,from_charset);
    if (cd == (iconv_t)-1) return -1;

    memset(outbuf,0,outlen);
    InputArg pin = { &inbuf };
    char **pout = &outbuf;
    const size_t rc = iconv(cd,pin,&inlen,pout,&outlen);

    // Release the descriptor on every path, including a failed conversion.
    iconv_close(cd);
    return (rc == (size_t)-1) ? -1 : 0;
}

/**
 * Converts UTF-8 text to GB2312.
 *
 * @return 0 on success, -1 on failure (see code_convert).
 */
int u2g(const char *inbuf,size_t inlen,char *outbuf,size_t outlen)
{
    return code_convert("utf-8","gb2312",inbuf,inlen,outbuf,outlen);
}

/**
 * Converts GB2312 text to UTF-8.
 *
 * @return 0 on success, -1 on failure (see code_convert).
 */
int g2u(const char *inbuf,size_t inlen,char *outbuf,size_t outlen)
{
    return code_convert("gb2312","utf-8",inbuf,inlen,outbuf,outlen);
}

iconv_t converter = iconv_open("UCS-2-INTERNAL", "UTF-8");

//gb2312码转为unicode码
char *in_gb2312="我喜欢你欧阳";
size_t tmpleng = strlen(in_gb2312);
// char out[500];

char* out=(char*)malloc(3*tmpleng*sizeof(char));
int rec = g2u(in_gb2312,strlen(in_gb2312),out,3*tmpleng);
const char * inbuf=out;
size_t inbytesleft = strlen(inbuf);
wchar_t* start=(wchar_t*)malloc(inbytesleft*sizeof(wchar_t));
memset(start,0,inbytesleft*sizeof(wchar_t));
size_t outbytesleft = sizeof(wchar_t)*inbytesleft;
char *outbuf = (char*)start;
size_t r = iconv(converter, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
iconv_close(converter);

doc.add(*Field::Text(_T("contents"),start));
大名鼎鼎的clucene,是lucenec++ 版; CLucene README ============== ------------------------------------------------------ CLucene is a C++ port of Lucene. It is a high-performance, full-featured text search engine written in C++. CLucene is faster than lucene as it is written in C++. ------------------------------------------------------ CLucene has contributions from many, see AUTHORS CLucene is distributed under the GNU Lesser General Public License (LGPL) *or* the Apache License, Version 2.0 See the LGPL.license and APACHE.license for the respective license information. Read COPYING for more about the license. Installation ------------ * For Linux, MacOSX, cygwin and MinGW build information, read INSTALL. * Boost.Jam files are provided in the root directory and subdirectories. * Microsoft Visual Studio (6&7) are provided in the win32 folder. Mailing List ------------ Questions and discussion should be directed to the CLucene mailing list at clucene-developers@lists.sourceforge.net Find subscription instructions at http://lists.sourceforge.net/lists/listinfo/clucene-developers Suggestions and bug reports can be made on our bug tracking database (http://sourceforge.net/tracker/?group_id=80013&atid=558446) The latest version ------------------ Details of the latest version can be found on the CLucene sourceforge project web site: http://www.sourceforge.net/projects/clucene Documentation ------------- Documentation is provided at http://clucene.sourceforge.net/doc/doxygen/html/ You can also build your own documentation by running doxygen from the root directory of clucene. CLucene is a very close port of Java Lucene, so you can also try looking at the Java Docs on http://lucene.apache.org/java/ Performance ----------- Very little benchmarking has been done on clucene. Andi Vajda posted some limited statistics on the clucene list a while ago with the following results. There are 250 HTML files under $JAVA_HOME/docs/api/java/util for about 6108kb of HTML
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值