Clucene C++编码转换

最新推荐文章于 2024-03-08 18:16:46 发布

sealbird

最新推荐文章于 2024-03-08 18:16:46 发布

阅读量185

点赞数

分类专栏： clucene 文章标签： C C++ C# lucene Unix

本文链接：https://blog.csdn.net/sealbird/article/details/83798907

版权

clucene 专栏收录该内容

11 篇文章 0 订阅

订阅专栏

Clucene C++编码转换
在做 CLucene 与 Lucene 生成的索引（Index）文件相互兼容时，遇到了编码转换问题。对于非英文的编码，两者之间的兼容可能都会存在这样的问题。经过跟踪 CLucene 程序，发现它用的是 Unicode 编码方式存储，因此，要先把字符串或文件转换成 Unicode 编码，然后再进行其它处理。

转换的具体代码如下(Linux与vc6.0测试通过)：


#ifndef _UNIX
/*
 * Translate a code-page name into a numeric code-page identifier.
 *
 * A string made only of decimal digits (for example "936" or "65001") whose
 * value fits in 0..65535 is returned as that number. Anything else -- NULL,
 * an empty string, a symbolic name such as "GBK", or an out-of-range
 * number -- falls back to 936.
 */
static inline int codepage(const char* code_page)
{
    const int fallback = 936;//"GBK"
    int value = 0;
    const char* p = code_page;

    if (!p || !*p)
        return fallback;

    for (; *p; ++p) {
        if (*p < '0' || *p > '9')
            return fallback;
        value = value * 10 + (*p - '0');
        /* Bail out early so the accumulator can never overflow. */
        if (value > 65535)
            return fallback;
    }
    return value;
}
#endif
/*
 * Convert a multi-byte string to wide characters.
 *
 * code_page  Source encoding. With _UNIX defined it is passed to iconv_open()
 *            as a charset name; otherwise it is passed to codepage().
 * in         Multi-byte input.
 * in_len     Input length in bytes, or a negative value when `in` is
 *            NUL-terminated (the terminator is then converted as well).
 * out        Destination buffer; may be NULL when out_max is 0.
 * out_max    Capacity of `out` in wide characters. 0 is a size query:
 *            nothing is stored and the required count is returned.
 *
 * Returns the number of wide characters written (or required for a size
 * query), or 0 on failure. The _UNIX branch mirrors the calling convention
 * of MultiByteToWideChar so both branches can be used the same way.
 */
static inline int mb2wc(const char* code_page,/*in*/const char* in,int in_len,
  /*out*/wchar_t* out,int out_max)
{
#ifdef _UNIX
 const size_t int_limit = (size_t)(~0u >> 1);
 size_t in_left = 0;   /* bytes of input still to convert */
 size_t produced = 0;  /* wide characters produced so far */
 int failed = 0;
 char* src = (char*)in;
 iconv_t env;

 if (!code_page || !in || out_max < 0 || (out_max > 0 && !out))
  return 0;

 if (in_len < 0) {
  const char* p = in;
  while (*p)
   ++p;
  in_left = (size_t)(p - in) + 1;
 } else {
  in_left = (size_t)in_len;
 }

 env = iconv_open("WCHAR_T",code_page);
 if (env == (iconv_t)-1)
  return 0;

 if (out_max > 0) {
  /* iconv counts bytes, so use real size_t objects instead of aliasing ints. */
  char* dst = (char*)out;
  size_t out_left = (size_t)out_max * sizeof(wchar_t);
  if (in_left > 0 && iconv(env,&src,&in_left,&dst,&out_left) == (size_t)-1)
   failed = 1;
  produced = (size_t)out_max - out_left / sizeof(wchar_t);
 } else {
  /* Size query: convert chunk by chunk into scratch space and only count. */
  wchar_t scratch[256];
  while (in_left > 0) {
   char* dst = (char*)scratch;
   size_t out_left = sizeof(scratch);
   size_t before = in_left;
   size_t rc = iconv(env,&src,&in_left,&dst,&out_left);
   produced += (sizeof(scratch) - out_left) / sizeof(wchar_t);
   /* A full scratch buffer still consumes input; no progress is a real error. */
   if (rc == (size_t)-1 && in_left == before) {
    failed = 1;
    break;
   }
  }
 }
 iconv_close(env);

 if (failed || produced > int_limit)
  return 0;
 return (int)produced;
#else
 return ::MultiByteToWideChar(codepage(code_page),0,in,in_len,out,out_max);
#endif
}
/*
 * Convert a wide-character string to a multi-byte string.
 *
 * code_page  Target encoding. With _UNIX defined it is passed to iconv_open()
 *            as a charset name; otherwise it is passed to codepage().
 * in         Wide-character input.
 * in_len     Input length in wide characters, or a negative value when `in`
 *            is NUL-terminated (the terminator is then converted as well).
 * out        Destination buffer; may be NULL when out_max is 0.
 * out_max    Capacity of `out` in bytes. 0 is a size query: nothing is
 *            stored and the required byte count is returned.
 *
 * Returns the number of bytes written (or required for a size query), or 0
 * on failure. The _UNIX branch mirrors the calling convention of
 * WideCharToMultiByte so both branches can be used the same way.
 */
static inline int wc2mb(const char* code_page,/*in*/const wchar_t* in,int in_len,
  /*out*/char* out,int out_max)
{
#ifdef _UNIX
 const size_t int_limit = (size_t)(~0u >> 1);
 size_t in_left = 0;   /* bytes of input still to convert */
 size_t produced = 0;  /* bytes produced so far */
 int failed = 0;
 char* src = (char*)in;
 iconv_t env;

 if (!code_page || !in || out_max < 0 || (out_max > 0 && !out))
  return 0;

 if (in_len < 0) {
  const wchar_t* p = in;
  while (*p)
   ++p;
  in_left = ((size_t)(p - in) + 1) * sizeof(wchar_t);
 } else {
  in_left = (size_t)in_len * sizeof(wchar_t);
 }

 env = iconv_open(code_page,"WCHAR_T");
 if (env == (iconv_t)-1)
  return 0;

 if (out_max > 0) {
  /* iconv counts bytes, so use real size_t objects instead of aliasing ints. */
  char* dst = out;
  size_t out_left = (size_t)out_max;
  if (in_left > 0 && iconv(env,&src,&in_left,&dst,&out_left) == (size_t)-1)
   failed = 1;
  produced = (size_t)out_max - out_left;
 } else {
  /* Size query: convert chunk by chunk into scratch space and only count. */
  /* NOTE(review): shift sequences of stateful encodings are not flushed here. */
  char scratch[512];
  while (in_left > 0) {
   char* dst = scratch;
   size_t out_left = sizeof(scratch);
   size_t before = in_left;
   size_t rc = iconv(env,&src,&in_left,&dst,&out_left);
   produced += sizeof(scratch) - out_left;
   /* A full scratch buffer still consumes input; no progress is a real error. */
   if (rc == (size_t)-1 && in_left == before) {
    failed = 1;
    break;
   }
  }
 }
 iconv_close(env);

 if (failed || produced > int_limit)
  return 0;
 return (int)produced;
#else
 /* Forward in_len instead of a hard-coded -1 so explicit lengths are honored. */
 return ::WideCharToMultiByte(codepage(code_page),0,in,in_len,out,out_max, NULL, NULL);
#endif
}
void str_to_UnicodeChar(const char* strIn,TCHAR* &strOut){
 if(!strIn)
  return;
 int  i=  mb2wc("936",(char*)strIn, -1, NULL, 0);
 strOut = (TCHAR*)malloc(sizeof(TCHAR)*i);
 mb2wc("936",(char*)strIn, -1, strOut, i); 
}
/*
 * Convert a NUL-terminated wide string to a newly allocated GBK string.
 *
 * strIn   Wide-character input; when NULL the function returns without
 *         touching strOut.
 * strOut  Receives a new[]-allocated, NUL-terminated buffer that the caller
 *         must release with delete[]. If the conversion fails the buffer
 *         holds an empty string.
 */
void UnicodeChar_to_str(const TCHAR* strIn,char* &strOut){
 if(!strIn)
  return;

 /* "GBK" is a valid iconv charset name; the non-iconv branch of wc2mb resolves it to code page 936. */
 int needed = wc2mb("GBK",strIn,-1,NULL,0);
 if(needed < 0)
  needed = 0; /* a negative size would index before the buffer below */

 strOut = new char[needed+1];
 if(needed == 0 || wc2mb("GBK", strIn, -1, strOut, needed) <= 0){
  strOut[0] = 0;
  return;
 }
 strOut[needed] = 0;
}
/*
 * Narrow a NUL-terminated TCHAR string to char by plain truncating
 * assignment, one element at a time (no character-set conversion).
 *
 * strIn   Input string; when NULL the function returns without touching
 *         strOut.
 * strOut  Receives a new[]-allocated, NUL-terminated buffer that the caller
 *         must release with delete[]. The buffer holds at least 1024 bytes,
 *         as before, and grows when the input needs more.
 */
void tchar_to_str(const TCHAR* strIn ,char* &strOut){
 if(!strIn)
  return ;

 /* Measure first so the buffer can never be overrun by long input. */
 size_t length = 0;
 while(strIn[length])
  length++;

 size_t capacity = length + 1;
 if(capacity < 1024)
  capacity = 1024;
 strOut = new char[capacity];

 for(size_t k = 0; k < length; k++)
  strOut[k] = (char)strIn[k];
 strOut[length] = '\0';
}