Kmeans聚类之建立词袋子模型

最新推荐文章于 2023-05-07 13:27:52 发布

weixin_34347651

最新推荐文章于 2023-05-07 13:27:52 发布

阅读量145

点赞数

文章标签：数据结构与算法 c/c++ python

作者：finallyliuyu （转载请注明出处）

最近打算将自己的工作平台由C#，python等迁移到C++。这是我的第一个C++工作程序吧。

IDE:VS2008

language: C++

library:boost（安装boost库之前要先安装python，安装方法见《boost库安装方法》）

tools:weka

C++程序完成的功能：从数据库中读出文章-》分词（调用ICTCLAS）-》特征词选择（DF法）->VSM模型建立->把文章写成weka数据格式arff文件（此处写成的是稀疏数据的储存格式。weka教程见《教程》）

首先给出构造停用词集合的代码：

构造停用词表

/**
 * Load the stop-word list.
 *
 * Reads whitespace-separated tokens from "stopwords.txt" (resolved against
 * the current working directory) and collects them into a set.
 *
 * @return the set of stop words; empty when the file cannot be opened or
 *         contains no tokens.
 */
std::set<std::string> MakeStopSet()
{
    std::set<std::string> stopwordsSet;

    // The published listing showed the name padded with blanks
    // (" stopwords.txt "), which would not match the real file.
    std::ifstream ifile("stopwords.txt");

    // Loop on the extraction itself instead of eof(): the eof() form never
    // terminates when the file fails to open and inserts an empty word after
    // the last token. operator>> already skips surrounding whitespace, so no
    // extra trimming is needed.
    std::string word;
    while (ifile >> word)
    {
        stopwordsSet.insert(word);
    }
    return stopwordsSet;
}

然后我们给出调用ICTCLAS进行分词的代码。注意：工程中调用ICTCLAS时，要把data文件夹、config文件、ictclas30.h、ICTCLAS30.dll、ICTCLAS30.LIB放在工程所在的文件夹；将ictclas30.h加入工程，并在调用ICTCLAS30.DLL的cpp文件的头部加上#pragma comment(lib, "ICTCLAS30.lib")

调用ICTCLAS分词

************************************************************************/
/* c字符创形式的输入，string格式的输出，此函数用于调用ICTCLAS完成分词功能
/*
/*********************************************************************** */
string ICTsplit( const char * sInput)
{
     if ( ! ICTCLAS_Init())
    {
        printf( " ICTCLAS INIT FAILED!\n " );
         string strerr(sInput);
         return strerr;
    }
    ICTCLAS_SetPOSmap(ICT_POS_MAP_SECOND);
     // 导入用户词典后
     /* printf("\n导入用户词典后：\n");
    int nCount = ICTCLAS_ImportUserDict("userdic.txt");//覆盖以前的用户词典
    //保存用户词典
    ICTCLAS_SaveTheUsrDic();
    printf("导入%d个用户词。\n", nCount); */

     const char * sResult = ICTCLAS_ParagraphProcess(sInput, 0 );
     string strresult(sResult);
     // printf("%s\n", sResult);
     // 把字符串转化成宽字符串
    wstring wsResult = myMultibyteToWideChar(strresult);
    boost::wregex wreg(L " \\s+ " );
    wsResult = boost::regex_replace(wsResult,wreg,wstring(L " | " ));
    strresult = myWideCharToMultibyte(wsResult);



     // ofile<<str1;
     // ofile.close();
     // cout<<str1<<endl;
     // ICTCLAS_FileProcess("text.txt","test_result.txt",1);
    ICTCLAS_Exit();

     return strresult;
}

ICTCLAS分词结果默认的分隔符是空格，在以上函数中，我们改成了“|”作为分隔符，字符串替换考虑用boost的正则表达式库。因为我们要处理的是汉字字符串，所以要进行宽字符串与窄字符串之间的转化。我采用的是利用win32函数的方法，更多方法请见《boost正则表达式处理汉字字符串》。

宽窄字符串互转函数

/* ********************************************************************** */
/*   功能：将窄字符转化成宽字符，string->wstring                          */
/* ********************************************************************** */
wstring myMultibyteToWideChar( string sResult)
{
     int iWLen = MultiByteToWideChar( CP_ACP, 0 , sResult.c_str(), sResult.size(), 0 , 0 ); // 计算转换后宽字符串的长度。（不包含字符串结束符）
    wchar_t * lpwsz = new wchar_t [iWLen + 1 ];
    MultiByteToWideChar( CP_ACP, 0 , sResult.c_str(), sResult.size(), lpwsz, iWLen ); // 正式转换。
    lpwsz[iWLen] = L ' \0 ' ;
    wstring wsResult(lpwsz);
    delete []lpwsz;
     return wsResult;
}
/* ********************************************************************** */
/* 将宽字符串转化成窄字符串用于输出                                      */
/* ********************************************************************** */
string myWideCharToMultibyte(wstring wsResult)
{     string sResult;
     int iLen = WideCharToMultiByte( CP_ACP, NULL, wsResult.c_str(), - 1 , NULL, 0 , NULL, FALSE ); // 计算转换后字符串的长度。（包含字符串结束符）
     char * lpsz = new char [iLen];
    WideCharToMultiByte( CP_OEMCP, NULL, wsResult.c_str(), - 1 , lpsz, iLen, NULL, FALSE); // 正式转换。
    sResult.assign( lpsz, iLen - 1 ); // 对string对象进行赋值。
    delete []lpsz;
     return sResult;

}

有了以上的功能，我们现在编写一个函数，函数的输入是一篇文章，输出是一个词的集合。该词集合保存的是初步去掉噪声词后的“好词”

代码如下

对每篇文章初步过滤形成词集合

/* ********************************************************************** */
/* 返回一篇文章中的好词                                                  */
/* ********************************************************************** */
vector < string > goodWordsinPieceArticle( string rawtext, set < string > stopwords)
{
    vector < wstring > goodWordstemp;
    vector < string > goodWords;
     const char * sInput = rawtext.c_str();
     string sResult = ICTsplit(sInput);
    wstring wsResult = myMultibyteToWideChar(sResult);
    boost::wregex wreg(L " \\d+ " ); // 去掉中文空格
    wsResult = boost::regex_replace(wsResult,wreg,wstring(L "" ));
     // boost::regex_split(back_inserter(goodWordstemp),wsResult,wreg);
    boost::split(goodWordstemp,wsResult,boost::is_any_of( " | " ));

     for (vector < wstring > ::iterator it = goodWordstemp.begin();it != goodWordstemp.end();it ++ )
    {
         string temp = myWideCharToMultibyte( * it);
        trim(temp, " " );
         if ( ! stopwords.count(temp) &&! temp.empty())
        {
            goodWords.push_back(temp);
        }


    }

     return goodWords;


}

上面的这个函数可以说是我们建立词袋子模型的基本单元：给上面的函数输入文章内容（rawtext）以及停用词表，它将返回一个词集合。下面我们开始构造词袋子模型。在构造词袋子模型之前，先说明一下词袋子模型的格式map<string,vector<pair<int,int>>>：主键为该词，pair中的第一个int为文章编号，第二个int为该词在该文中出现的次数，vector<pair<int,int>>统计的是这个词在哪些文章中出现、各出现过几次。因为数据量比较大，所以词袋子模型map采用引用传参；如果是值传参的话，会在内存中产生拷贝，浪费内存。

下面是从数据库中读文章建立词袋子模型的代码

建立词袋子模型

/* ***********************************构建倒排表： key=word,val= a list of pairs which consists of articleid ,and count, count=tf************************************************************ */
int ConstructMap(map < string ,vector < pair < int , int >>>& mymap, int beginindex, int endindex)
{

//     vector<string> mySplit(string s);
      set < string > MakeStopSet();
    vector < string > goodWordsinPieceArticle( string rawtext, set < string > stopwords);
    CoInitialize(NULL);
    _ConnectionPtr pConn(__uuidof(Connection));
    _RecordsetPtr pRst(__uuidof(Recordset));
     char * select = new char [ 5000 ];
    memset(select, 0 , 5000 );
     char * firstpart = " select CKeyWord,ArticleId,CAbstract from Article where ArticleId between " ;
     char * lastpart = " order by ArticleId " ;
     char middlepart1[ 100 ];
     char middlepart2[ 100 ];
    sprintf_s(middlepart1, sizeof (middlepart1), " %d " ,beginindex);
    sprintf_s(middlepart2, sizeof (middlepart2), " %d " ,endindex);
    strcat(select,firstpart);
    strcat(select,middlepart1);
    strcat(select, " and " );
    strcat(select,middlepart2);
    strcat(select,lastpart);
    pConn -> ConnectionString = " Provider=SQLOLEDB.1;Password=xxxxxx;Persist Security Info=True; User ID=sa;Initial Catalog=ArticleCollection " ;
    pConn -> Open( "" , "" , "" ,adConnectUnspecified);
    pRst = pConn -> Execute(select,NULL,adCmdText);
     set < string > stopwords = MakeStopSet();
     while ( ! pRst -> rsEOF)
    {    vector < string > wordcollection;
         // string keywordstr=(_bstr_t)pRst->GetCollect("CKeyWord");
         string rawtext = (_bstr_t)pRst -> GetCollect( " CAbstract " );
         if (rawtext != "" )
        {
                wordcollection = goodWordsinPieceArticle(rawtext,stopwords);
                 string tempid = (_bstr_t)pRst -> GetCollect( " ArticleId " );
                 int articleid = atoi(tempid.c_str());
                 for (vector < string > ::iterator strit = wordcollection.begin();strit != wordcollection.end();strit ++ )
                {
                    vector < pair < int , int >> ::iterator it;
                     if (mymap[ * strit].empty())
                    {
                        pair < int , int > mytemppair = make_pair(articleid, 1 );
                        mymap[ * strit].push_back(mytemppair);

                    }
                     else
                    {
                         for (it = mymap[ * strit].begin();it != mymap[ * strit].end();it ++ )
                        {
                             if (it -> first == articleid)
                            {
                                it -> second =++ (it -> second);
                                 break ;
                            }

                        }
                         if (it == mymap[ * strit].end())
                        {
                            pair < int , int > mytemppair = make_pair(articleid, 1 );
                            mymap[ * strit].push_back(mytemppair);
                        }

                    }

            }


        }


        pRst -> MoveNext();
        wordcollection.clear();
    }
    pRst -> Close();
    pConn -> Close();
    pRst.Release();
    pConn.Release();
    CoUninitialize();
    delete[] select;
     return 0 ;

}

未完，待续。。。。。