作者:finallyliuyu 出处:博客园
最近在做关于文本分类算法的验证。汉语新闻分类的语料库采用的是我自己爬取的新闻。英文分类语料库考虑采用reuters,因此需要处理reuters21578文本分类语料库。
下面给出处理reuters21578的代码,主要功能就是从文本中提取新闻标题、内容、类别,并存储到mssql2000中。
把代码拷贝下来,留在这里做份备忘,主要是因为里面涉及了些boost::regex的使用,以及宽窄字符集转换。
尤其是boost::regex的使用,有很多注意事项,比如C#中的\s+,boost要用“\\s+”等。比如boost::regex中的dotmatchnewline 模式是mod_s。这些细节问题,想要全部记住是件很困难的事情,况且也没有必要记住这些东西。用到的时候,想下当时的关键字,在博客里面搜一下就出来了。
![](https://i-blog.csdnimg.cn/blog_migrate/8f900a89c6347c561fdf2122f13be562.gif)
![ExpandedBlockStart.gif](https://i-blog.csdnimg.cn/blog_migrate/961ddebeb323a10fe0623af514929fc1.gif)
vector
<
ARTICLE
>
FindArticles(
string
rawtext)
{
vector < ARTICLE > articleCollection;
boost::regex regdoc( " <REUTERS\\s+TOPICS=\ " YES\ " \\s+LEWISSPLIT=\ " TEST\ " .*?>(.*?)</REUTERS> " ,boost::regbase::icase | boost::regbase::mod_s); // 获得doc标签内的内容
boost::regex regtitle( " <TITLE>(.*?)</TITLE> " ,boost::regbase::icase | boost::regbase::mod_s); // 获得url标签内的内容
boost::regex reglabel( " <TOPICS><D>(.*?)</D>.*?</TOPICS> " ,boost::regbase::icase | boost::regbase::mod_s); // 获得标题
boost::regex regcontent( " <BODY>(.*?)</BODY> " ,boost::regbase::icase | boost::regbase::mod_s); // 获得内容
ARTICLE article;
boost::smatch mDOC;
boost::smatch mLabel;
boost::smatch mTitle;
boost::smatch mContent;
// rawtext=ProcessSingleline(rawtext); // 预处理去掉文本中所有的回车和换行。
string ::const_iterator it = rawtext.begin();
string ::const_iterator end = rawtext.end();
while (boost::regex_search(it,end,mDOC,regdoc))
{
string doc = mDOC[ 0 ];
string label = "" ;
string title = "" ;
string content = "" ;
if (boost ::regex_search(doc,mLabel,reglabel))
{
label = mLabel[ 1 ];
}
if (boost::regex_search(doc,mTitle,regtitle))
{
title = mTitle[ 1 ];
}
if (boost::regex_search(doc,mContent,regcontent))
{
content = mContent[ 1 ];
}
if (content != "" && title != "" && label != "" )
{
article.ArticleText = content;
article.ArticleTitle = title;
article.Categorization = label;
articleCollection.push_back(article);
}
it = mDOC[ 0 ].second;
}
return articleCollection;
}
{
vector < ARTICLE > articleCollection;
boost::regex regdoc( " <REUTERS\\s+TOPICS=\ " YES\ " \\s+LEWISSPLIT=\ " TEST\ " .*?>(.*?)</REUTERS> " ,boost::regbase::icase | boost::regbase::mod_s); // 获得doc标签内的内容
boost::regex regtitle( " <TITLE>(.*?)</TITLE> " ,boost::regbase::icase | boost::regbase::mod_s); // 获得url标签内的内容
boost::regex reglabel( " <TOPICS><D>(.*?)</D>.*?</TOPICS> " ,boost::regbase::icase | boost::regbase::mod_s); // 获得标题
boost::regex regcontent( " <BODY>(.*?)</BODY> " ,boost::regbase::icase | boost::regbase::mod_s); // 获得内容
ARTICLE article;
boost::smatch mDOC;
boost::smatch mLabel;
boost::smatch mTitle;
boost::smatch mContent;
// rawtext=ProcessSingleline(rawtext); // 预处理去掉文本中所有的回车和换行。
string ::const_iterator it = rawtext.begin();
string ::const_iterator end = rawtext.end();
while (boost::regex_search(it,end,mDOC,regdoc))
{
string doc = mDOC[ 0 ];
string label = "" ;
string title = "" ;
string content = "" ;
if (boost ::regex_search(doc,mLabel,reglabel))
{
label = mLabel[ 1 ];
}
if (boost::regex_search(doc,mTitle,regtitle))
{
title = mTitle[ 1 ];
}
if (boost::regex_search(doc,mContent,regcontent))
{
content = mContent[ 1 ];
}
if (content != "" && title != "" && label != "" )
{
article.ArticleText = content;
article.ArticleTitle = title;
article.Categorization = label;
articleCollection.push_back(article);
}
it = mDOC[ 0 ].second;
}
return articleCollection;
}
![](https://i-blog.csdnimg.cn/blog_migrate/8f900a89c6347c561fdf2122f13be562.gif)
![ExpandedBlockStart.gif](https://i-blog.csdnimg.cn/blog_migrate/961ddebeb323a10fe0623af514929fc1.gif)
void
FindFile(wchar_t
*
pFilePath)
{
WIN32_FIND_DATA FindFileData;
HANDLE hFind = INVALID_HANDLE_VALUE;
wchar_t DirSpec[MAX_PATH + 1 ]; // 指定路径
DWORD dwError;
wcsncpy (DirSpec, pFilePath, wcslen(pFilePath) + 1 );
wcsncat (DirSpec, L " \\\* " , 3 );
hFind = FindFirstFile(DirSpec, & FindFileData);
if (hFind == INVALID_HANDLE_VALUE) {
wprintf(L " Invalid file handle. Error is %u " , GetLastError());
return ;
}
bool bFinish = false ;
while ( ! bFinish)
{
if (FindFileData.dwFileAttributes != FILE_ATTRIBUTE_DIRECTORY )
{
wchar_t temp[ 3000 ];
memset(temp, 0 , 3000 * sizeof (wchar_t));
wcscpy(temp,pFilePath);
wcscat(temp,L " \\ " );
wcscat(temp,FindFileData.cFileName);
string rawtext = "" ;
string line;
ifstream infile;
infile.open(temp);
if (infile)
{
while (getline(infile,line))
{
rawtext += line;
}
}
infile.clear();
infile.close();
InsertArticlesToDataBase(rawtext);
wstring path(temp);
string spath = myWideCharToMultibyte(path);
cout << " finishprocess " << spath << endl;
}
bFinish = (FindNextFile(hFind, & FindFileData) == false );
}
}
{
WIN32_FIND_DATA FindFileData;
HANDLE hFind = INVALID_HANDLE_VALUE;
wchar_t DirSpec[MAX_PATH + 1 ]; // 指定路径
DWORD dwError;
wcsncpy (DirSpec, pFilePath, wcslen(pFilePath) + 1 );
wcsncat (DirSpec, L " \\\* " , 3 );
hFind = FindFirstFile(DirSpec, & FindFileData);
if (hFind == INVALID_HANDLE_VALUE) {
wprintf(L " Invalid file handle. Error is %u " , GetLastError());
return ;
}
bool bFinish = false ;
while ( ! bFinish)
{
if (FindFileData.dwFileAttributes != FILE_ATTRIBUTE_DIRECTORY )
{
wchar_t temp[ 3000 ];
memset(temp, 0 , 3000 * sizeof (wchar_t));
wcscpy(temp,pFilePath);
wcscat(temp,L " \\ " );
wcscat(temp,FindFileData.cFileName);
string rawtext = "" ;
string line;
ifstream infile;
infile.open(temp);
if (infile)
{
while (getline(infile,line))
{
rawtext += line;
}
}
infile.clear();
infile.close();
InsertArticlesToDataBase(rawtext);
wstring path(temp);
string spath = myWideCharToMultibyte(path);
cout << " finishprocess " << spath << endl;
}
bFinish = (FindNextFile(hFind, & FindFileData) == false );
}
}
![](https://i-blog.csdnimg.cn/blog_migrate/8f900a89c6347c561fdf2122f13be562.gif)
![ExpandedBlockStart.gif](https://i-blog.csdnimg.cn/blog_migrate/961ddebeb323a10fe0623af514929fc1.gif)
/**
 * Makes a string safe to embed inside a single-quoted SQL literal by
 * replacing every single quote (') with a double quote (").
 *
 * The substitution is lossy by design: apostrophes in the text are stored
 * as double quotes.
 *
 * @param src  Text to sanitise.
 * @return Copy of src in which each ' has been replaced by ".
 */
std::string ProcessforMSSQL(std::string src)
{
    // size_type (not int) so the comparison with npos is exact for any length.
    for (std::string::size_type pos = src.find('\'');
         pos != std::string::npos;
         pos = src.find('\'', pos + 1))
    {
        src[pos] = '"';
    }
    return src;
}
/**
 * Program entry point: imports every file of the Reuters-21578 corpus
 * directory through FindFile, prints "finish", then waits for console input
 * so the window stays open.
 *
 * @param argc  Unused.
 * @param argv  Unused.
 * @return 0 once the import has finished and input has been read.
 */
int _tmain(int argc, _TCHAR* argv[])
{
    // FindFile takes a non-const wchar_t*, so hand it a writable array
    // rather than a string literal.
    wchar_t corpusDirectory[] = L"E:\\新闻语料\\reuters21578";

    // DictionaryToDataBase();
    FindFile(corpusDirectory);
    std::cout << "finish" << std::endl;

    // Block until the user types something, keeping the console visible.
    int pause = 0;
    std::cin >> pause;
    return 0;
}
![](https://i-blog.csdnimg.cn/blog_migrate/8f900a89c6347c561fdf2122f13be562.gif)
![ExpandedBlockStart.gif](https://i-blog.csdnimg.cn/blog_migrate/961ddebeb323a10fe0623af514929fc1.gif)
void
InsertArticlesToDataBase(
string
rawtext)
{
vector < ARTICLE > articleCollection = FindArticles(rawtext);
CoInitialize(NULL);
_ConnectionPtr pConn(__uuidof(Connection));
// _RecordsetPtr pRst(__uuidof(Recordset));
pConn -> ConnectionString = " Provider=SQLOLEDB.1;Password=xxxx;Persist Security Info=True; User ID=sa;Initial Catalog=NewsInfo " ;
pConn -> Open( "" , "" , "" ,adConnectUnspecified);
char * sqlInsert = new char [ 1000000 ];
for (vector < ARTICLE > ::iterator it = articleCollection.begin();it != articleCollection.end(); ++ it)
{
_variant_t RecordsAffected;
memset(sqlInsert, 0 , 1000000 );
// 将其中的带引号换为双引号
string url = ProcessforMSSQL(( * it).Categorization);
string title = ProcessforMSSQL(( * it).ArticleTitle);
string text = ProcessforMSSQL(( * it).ArticleText);
sprintf_s(sqlInsert, 1000000 , " insert into ReuteursTest(ArticleTitle,ArticleText,Categorization) values('%s','%s','%s') " ,title.c_str(),text.c_str(),url.c_str());
pConn -> Execute(sqlInsert, & RecordsAffected, - 1 );
cout << title << " 添加完毕 " << endl;
}
delete sqlInsert;
pConn -> Close();
pConn.Release();
CoUninitialize();
}
{
vector < ARTICLE > articleCollection = FindArticles(rawtext);
CoInitialize(NULL);
_ConnectionPtr pConn(__uuidof(Connection));
// _RecordsetPtr pRst(__uuidof(Recordset));
pConn -> ConnectionString = " Provider=SQLOLEDB.1;Password=xxxx;Persist Security Info=True; User ID=sa;Initial Catalog=NewsInfo " ;
pConn -> Open( "" , "" , "" ,adConnectUnspecified);
char * sqlInsert = new char [ 1000000 ];
for (vector < ARTICLE > ::iterator it = articleCollection.begin();it != articleCollection.end(); ++ it)
{
_variant_t RecordsAffected;
memset(sqlInsert, 0 , 1000000 );
// 将其中的带引号换为双引号
string url = ProcessforMSSQL(( * it).Categorization);
string title = ProcessforMSSQL(( * it).ArticleTitle);
string text = ProcessforMSSQL(( * it).ArticleText);
sprintf_s(sqlInsert, 1000000 , " insert into ReuteursTest(ArticleTitle,ArticleText,Categorization) values('%s','%s','%s') " ,title.c_str(),text.c_str(),url.c_str());
pConn -> Execute(sqlInsert, & RecordsAffected, - 1 );
cout << title << " 添加完毕 " << endl;
}
delete sqlInsert;
pConn -> Close();
pConn.Release();
CoUninitialize();
}