文本分类二_文本二分类字典-CSDN博客

本文链接：https://blog.csdn.net/scuxc/article/details/20741623

上一篇中将语料进行了简单的整理。接下来就是：

1、形成字典

2、特征选择

3、分类训练

4、测试

现在只做到了形成字典和特征选取。不过效果感觉不理想，选出来的特征有一些不好的项。下周有机会去问下老师~~

下面是形成字典的代码：

字典的结构是：map<string,map<int,int> > myDic

例子：

中国

0 5 1 23 2 34 5 9 7 2 9 68

第一组是 0 5，其中 0 是固定的标志位（代码中的 flag，恒为 0），5 表示 map<int,int> 的 size，也就是后面 “键 值” 对的个数

以后分别是 i DF 表示 term（中国）在类别i中的文档频率

最后一组是 9 DF，其中键 9 是代码里固定用来表示“全部文档”的键，DF 表示该词在全部文档中的文档频率

然后将结果保存起来。

这是初次写这种东西，C++也不怎么写过。以后有时间再整理一下！

/******************************************************************************* 
 由工程test1修改而来
1、将其中不用的部分——导入用户词典删去
2、尽量写得易懂一些，当然这还不是最好版本
3、尝试努力写一个可以生成字典的程序
*******************************************************************************/
#ifndef OS_LINUX
#include <Windows.h>
#pragma comment(lib, "ICTCLAS50.lib") //ICTCLAS50.lib库加入到工程中
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>

#include <string>
#include <io.h>
#include <fstream>
#include<sstream>
#include <fcntl.h>
#include <vector>
#include <map>
#include <set>
#include "ICTCLAS50.h"

using namespace std;

// POS_TAGGER_TEST is defined unconditionally here, so the #ifdef below always
// takes the first branch and g_bPOSTagged starts out as true.
// NOTE(review): g_bPOSTagged is not read anywhere in the visible code - confirm
// whether it is still needed.
#define POS_TAGGER_TEST
#ifdef POS_TAGGER_TEST
bool g_bPOSTagged=true;
#else
bool g_bPOSTagged=false;
#endif	
// Global dictionary: term -> (key -> document frequency).
// The inner key is the category id passed as folderId when the file was
// processed; the key 9 is used for the frequency over all documents.
map<string,map<int,int> >myDic;
set<string> stopDic;// stop-word dictionary; words found here are never added to myDic
// Walks folderPath, segments every file in it and updates myDic (defined after main).
void testICTCLAS_ParagraphProcess(string folderPath,int folderId);

int main()
{
	if(!ICTCLAS_Init()) //初始化分词组件。
	{
		printf("Init fails\n");  
		return 0;
	}
	else
	{
		printf("Init ok\n");
	}

	ifstream stopDicFile("E:\\stopwords.txt");
	string stopWord;
	while(stopDicFile>>stopWord)
		stopDic.insert(stopWord);
   //设置词性标注集(0 计算所二级标注集，1 计算所一级标注集，2 北大二级标注集，3 北大一级标注集)
	ICTCLAS_SetPOSmap(2);

    testICTCLAS_ParagraphProcess("E:\\traintext\\1_train",0);//分词用例
	
	ICTCLAS_Exit();	//释放资源退出
	
	ofstream ofile("E:\\dic.txt");
	map<string,map<int,int> >::iterator map_it2;
	map<int,int>::iterator map_it3;
	int dic_num = 0;
	int flag = 0;
	for(map_it2 = myDic.begin(); map_it2 != myDic.end(); map_it2++)
	{
		ofile<<map_it2->first<<endl;
		ofile<<flag<<" "<<map_it2->second.size();
		dic_num++;
		for(map_it3 = map_it2->second.begin(); map_it3 != map_it2->second.end(); map_it3++)
		{
			ofile<<" "<<map_it3->first<<" "<<map_it3->second;
		}
		ofile<<endl;
	}
	printf("\n共出现词：%d\n",dic_num);
	//getchar();
	return 0;
}
/**
 * Recursively walks a directory, segments every regular file in it with
 * ICTCLAS and accumulates document frequencies into the global myDic.
 *
 * For every file, each distinct token that is not in stopDic increments
 *   myDic[token][folderId]  (documents of this category containing the token)
 *   myDic[token][9]         (documents of any category containing the token)
 * exactly once, no matter how often the token occurs in that file.
 *
 * Sub-directories are processed recursively; the first one found gets
 * category id folderId + 1, the second folderId + 2, and so on.
 *
 * @param folderPath directory to scan, without a trailing backslash
 * @param folderId   category id credited to files directly inside folderPath
 *
 * Calls exit(-1) when the directory cannot be enumerated.
 *
 * NOTE(review): key 9 doubles as the "all documents" counter, so a category
 * id of 9 would be counted twice; with nested sub-directories a grandchild
 * can also receive the same id as a later sibling. Both limitations are
 * inherited from the original numbering scheme.
 */
void testICTCLAS_ParagraphProcess(string folderPath,int folderId)  // folderPath: directory to start from
{
	// Key under which the document frequency over all categories is stored.
	const int kTotalDfKey = 9;

	struct _finddata_t fileInfo;
	const string pattern = folderPath + "\\*";

	// _findfirst returns intptr_t; keeping it in a long truncates the handle
	// on 64-bit Windows.
	const intptr_t handle = _findfirst(pattern.c_str(),&fileInfo);
	if(handle == -1)
	{
		printf("没有找到匹配的项目");
		exit(-1);
	}

	// Ids handed to sub-directories are tracked separately so that files in
	// this directory keep folderId even when they are listed after a
	// sub-directory.
	int nextChildId = folderId;

	do{
		const string entryPath = folderPath + "\\" + fileInfo.name;

		if(fileInfo.attrib & _A_SUBDIR)
		{
			// Skip the "." and ".." pseudo entries, otherwise the recursion
			// never terminates.
			if(strcmp(fileInfo.name,".") == 0 || strcmp(fileInfo.name,"..") == 0)
				continue;

			cout<<"目录名"<<entryPath<<endl;
			testICTCLAS_ParagraphProcess(entryPath,++nextChildId);
			continue;
		}

		// Read the file character by character. operator>> skips all
		// whitespace, so line breaks (and blanks) are dropped from the text.
		// A growable buffer replaces malloc(FileInfo.size), which had no room
		// for the terminating NUL and was never freed.
		ifstream input(entryPath.c_str());
		vector<char> sentence;
		char ch;
		while(input>>ch)
		{
			if(ch == '\n' || ch == '\r')
				continue;
			sentence.push_back(ch);
		}
		sentence.push_back('\0');

		const size_t textLen = strlen(&sentence[0]); // length of the text to segment
		if(textLen == 0)
			continue; // empty or unreadable file: nothing to segment

		// The caller owns the result buffer; as in the original code it is
		// six times the input length. It is zero-filled with one spare byte
		// so the result is NUL-terminated even if the library does not
		// terminate it itself.
		vector<char> result(textLen * 6 + 1,'\0');
		ICTCLAS_ParagraphProcess(&sentence[0],static_cast<int>(textLen),&result[0],CODE_TYPE_UNKNOWN,0);

		// Collect the tokens of this document; seenInDoc makes sure every
		// token is counted once per document (document frequency).
		istringstream tokens(&result[0]);
		set<string> seenInDoc;
		string word;
		while(tokens>>word)
		{
			if(stopDic.count(word))
				continue;
			if(!seenInDoc.insert(word).second)
				continue; // already counted for this document

			++myDic[word][kTotalDfKey];
			++myDic[word][folderId];
		}
	}while(_findnext(handle,&fileInfo) == 0);

	_findclose(handle);
}