CSegGraph class 之自我解析（一）

最新推荐文章于 2021-02-16 20:42:46 发布

Cindyzhj

最新推荐文章于 2021-02-16 20:42:46 发布

阅读量346

点赞数

分类专栏：ICTCLAS 学习　文章标签：class、byte

本文链接：https://blog.csdn.net/cindyzhj/article/details/7618720

版权

ICTCLAS 学习专栏收录该内容

3 篇文章 0 订阅

订阅专栏

此类的主要成员函数如下：

bool GenerateWordNet(char *sSentence,CDictionary &dictCore,bool bOriginalFreq=false);

参数依次为：传入的句子、核心词典，以及是否使用原始频率。

在其执行过程中，首先调用了函数 bool AtomSegment(char *sSentence);

即对原始句子进行原子切割。

成员变量，

char m_sAtom[MAX_SENTENCE_LEN][WORD_MAXLENGTH]，用来记录切割好的词；

int m_nAtomLength[MAX_SENTENCE_LEN]，用来记录每个原子词的词长度；

int m_nAtomPOS[MAX_SENTENCE_LEN]，记录每个原子词的类型（代码中保存的是 charType 返回的字符类型或 CT_SENTENCE_BEGIN / CT_SENTENCE_END，而非语法意义上的词性）；

unsigned int m_nAtomCount，保存原子词的总个数；

CDynamicArray m_segGraph，保存分割后二维词图。

先看 AtomSegment 函数，

/**
 * Split a sentence into atoms and record each atom's text, byte length and
 * type code.
 *
 * Results are written to the members m_sAtom, m_nAtomLength, m_nAtomPOS and
 * m_nAtomCount. An atom is one of:
 *   - the SENTENCE_BEGIN marker (only when the sentence starts with it),
 *   - the SENTENCE_END marker,
 *   - a single character whose type is CT_CHINESE, CT_INDEX, CT_DELIMITER
 *     or CT_OTHER,
 *   - a maximal run of consecutive characters for which charType() reports
 *     the same type (numbers, letters, single-byte characters).
 * A '.' that is immediately followed by a digit is typed CT_NUM so that it
 * can act as a decimal point.
 *
 * @param sSentence NUL-terminated input; a byte with the high bit set is
 *                  treated as the lead byte of a two-byte character.
 * @return true when the whole sentence was segmented; false when sSentence
 *         is null or when the atom table filled up before the end of the
 *         sentence (m_nAtomCount then holds the atoms produced so far).
 */
bool CSegGraph::AtomSegment(char *sSentence)
{
	// Capacities are derived from the member array itself (declared as
	// char m_sAtom[MAX_SENTENCE_LEN][WORD_MAXLENGTH]) so they cannot drift.
	const unsigned int nMaxAtoms=(unsigned int)(sizeof(m_sAtom)/sizeof(m_sAtom[0]));
	const unsigned int nMaxAtomLen=(unsigned int)(sizeof(m_sAtom[0])-1);//keep room for the NUL

	unsigned int i=0;//byte offset into sSentence
	unsigned int j=0;//index of the atom currently being built
	unsigned int nCurType,nNextType;
	char sChar[3];//one character: 1 or 2 bytes plus terminator
	sChar[2]=0;

	m_sAtom[j][0]=0;//start with an empty first atom
	m_nAtomLength[j]=0;

	if(!sSentence)
	{//nothing to segment
		m_nAtomCount=0;
		return false;
	}

	//Measure once instead of on every loop iteration
	const unsigned int nSentenceLen=(unsigned int)strlen(sSentence);
	const unsigned int nBeginLen=(unsigned int)strlen(SENTENCE_BEGIN);
	const unsigned int nEndLen=(unsigned int)strlen(SENTENCE_END);

	if(strncmp(sSentence,SENTENCE_BEGIN,nBeginLen)==0)
	{//The sentence starts with the begin marker: it becomes the first atom
		strcpy(m_sAtom[j],SENTENCE_BEGIN);
		m_nAtomLength[j]=(int)nBeginLen;
		m_nAtomPOS[j]=CT_SENTENCE_BEGIN;
		i+=nBeginLen;
		j+=1;
		if(j>=nMaxAtoms)
		{//no slot left for another atom
			m_nAtomCount=j;
			return i>=nSentenceLen;
		}
		m_sAtom[j][0]=0;//prepare the next atom
		m_nAtomLength[j]=0;
	}

	while(i<nSentenceLen)
	{
		bool bAtomDone=false;//set when atom j is complete after this step

		if(strncmp(sSentence+i,SENTENCE_END,nEndLen)==0)
		{//End marker: stored as an atom of its own
			strcpy(m_sAtom[j],SENTENCE_END);
			m_nAtomPOS[j]=CT_SENTENCE_END;
			i+=nEndLen;
			bAtomDone=true;
		}
		else
		{
			//A byte with the high bit set starts a two-byte character. Test the
			//bit explicitly: comparing a plain char with 0 only works where char
			//is signed. A lead byte with no trail byte left is kept as one byte
			//so that i never moves past the terminating NUL.
			unsigned int nCharLen=1;
			if(((unsigned char)sSentence[i]&0x80)!=0 && i+1<nSentenceLen)
				nCharLen=2;

			if(m_sAtom[j][0]!=0 && strlen(m_sAtom[j])+nCharLen>nMaxAtomLen)
			{//The run no longer fits in one row: close it here without consuming
			 //the character, which then starts a new atom on the next iteration
				bAtomDone=true;
			}
			else
			{
				sChar[0]=sSentence[i];
				sChar[1]=(nCharLen==2)?sSentence[i+1]:0;
				i+=nCharLen;

				strcat(m_sAtom[j],sChar);//append the character to atom j
				nCurType=charType((unsigned char *)sChar);
				//A '.' directly followed by a digit is a decimal point
				if(sChar[0]=='.' && (charType((unsigned char *)sSentence+i)==CT_NUM || (*(sSentence+i)>='0' && *(sSentence+i)<='9')))
					nCurType=CT_NUM;
				m_nAtomPOS[j]=nCurType;//remembered for the next iteration and for callers

				if(nCurType==CT_CHINESE || nCurType==CT_INDEX || nCurType==CT_DELIMITER || nCurType==CT_OTHER)
				{//These types always form a one-character atom
					bAtomDone=true;
				}
				else
				{//Number, single-byte char, letter: extend while the type repeats
					nNextType=255;//sentinel used when there is no next character
					if(i<nSentenceLen)
						nNextType=charType((unsigned char *)(sSentence+i));
					bAtomDone=(nNextType!=nCurType || i>=nSentenceLen);
				}
			}
		}

		if(bAtomDone)
		{
			m_nAtomLength[j]=(int)strlen(m_sAtom[j]);//save its length
			j+=1;
			if(j>=nMaxAtoms)
			{//Atom table is full: succeed only if the input is exhausted as well
				m_nAtomCount=j;
				return i>=nSentenceLen;
			}
			m_sAtom[j][0]=0;//prepare the next atom
			m_nAtomLength[j]=0;
		}
	}

	m_nAtomCount=j;//number of atoms produced
	return true;
}

此函数的主要用途是：对传入的句子进行原子切分，并得到每个原子词的类型（字符类型）及长度等基本信息。逻辑较为简单。