CSegGraph class 之自我解析(一)

此类的主要成员函数如下:

bool GenerateWordNet(char *sSentence,CDictionary &dictCore,bool bOriginalFreq=false);

参数为,传入的句子,核心字典,以及是否使用原始频率。

在其执行过程中,首先调用了函数 bool AtomSegment(char *sSentence);

即对原始句子进行原子切割。

成员变量,

char m_sAtom[MAX_SENTENCE_LEN][WORD_MAXLENGTH],用来记录切割好的词;

int m_nAtomLength[MAX_SENTENCE_LEN],用来记录每个原子词的词长度;

int m_nAtomPOS[MAX_SENTENCE_LEN],记录每个原子词的类型(在 AtomSegment 中存入的是 CT_SENTENCE_BEGIN、CT_SENTENCE_END 或 charType 返回的字符类型值);

unsigned int m_nAtomCount,保存原子词的总个数;

CDynamicArray m_segGraph,保存分割后二维词图。

 

先看 AtomSegment 函数,

 

bool CSegGraph::AtomSegment(char *sSentence)
{
    unsigned int i=0,j=0,nCurType,nNextType;	
	//i is the pointer of sentence string
	//j is the pointer of pAtoms
	char sChar[3];
	sChar[2]=0;//Set the char ending
	m_sAtom[j][0]=0;//Set the first word as null
	m_nAtomLength[j]=0;
	
	//先判别句子是否有起始标志
	if(strncmp(sSentence,SENTENCE_BEGIN,strlen(SENTENCE_BEGIN))==0)
	{//如果有起始标志
		strcpy(m_sAtom[j],SENTENCE_BEGIN);//Set the first word as sentence begining
		m_nAtomLength[j] = strlen(SENTENCE_BEGIN);
		m_nAtomPOS[j] = CT_SENTENCE_BEGIN;
		i += m_nAtomLength[j];//i增一个原子词的长度
		j += 1;//j增1
		//对下一个原子进行初始化
		m_sAtom[j][0] = 0;
		m_nAtomLength[j] = 0;
	}

	while(i<strlen(sSentence))
	{
		if(strncmp(sSentence+i,SENTENCE_END,strlen(SENTENCE_END))==0)
		{//如果是结束标志
			strcpy(m_sAtom[j],SENTENCE_END);//将原子词设为结束标志
			m_nAtomLength[j] = strlen(SENTENCE_END);
			m_nAtomPOS[j] = CT_SENTENCE_END;
			i += m_nAtomLength[j];
			j += 1;
			m_sAtom[j][0]=0;
			m_nAtomLength[j]=0;
			continue;//直接跳到while判别
		}
		sChar[0] = *(sSentence + i);//记录第i个位置的字符
		sChar[1] = 0;
		i += 1;
		/*
		ASCII值最大是 0111 1111 =127 汉字编码为了避免与ASCII冲突  编码要加0xA0 也即1010 
		所以汉字最高位就变成了 1 在char中代表负值 
		*/
		if(sChar[0]<0)//Two byte char,即为汉字
		{
			sChar[1] = *(sSentence+i);//Get the char with second byte
			i += 1;//i increased by 1
		}
		strcat(m_sAtom[j],sChar);//记录第j个原子
		nCurType = charType((unsigned char *)sChar);//记录字符类型
		//如果第一个字符为'.'并且其后跟的字符为数字
		if(sChar[0]=='.' && (charType((unsigned char *)sSentence+i)==CT_NUM || (*(sSentence+i)>='0' && *(sSentence+i)<='9')))
			nCurType = CT_NUM;//Digit after . indicate . as a point in the numeric
		m_nAtomPOS[j] = nCurType;
		//Record its property, just convience for continuous processing
		
		if(nCurType==CT_CHINESE || nCurType==CT_INDEX || nCurType==CT_DELIMITER || nCurType==CT_OTHER)
		{//Chinese char, index number,delimiter and other is treated as atom
			m_nAtomLength[j] = strlen(m_sAtom[j]);//Save its length
			j += 1;//Skip to next atom
			m_sAtom[j][0] = 0;//init
		}
		else 
		{//Number,single char,letter
			nNextType = 255;
			if(i<strlen(sSentence))
				nNextType = charType((unsigned char *)(sSentence+i));
			if(nNextType!=nCurType || i==strlen(sSentence))
			//Reaching end or next char type is different from current char
			{
				m_nAtomLength[j] = strlen(m_sAtom[j]);//Save its length	
				j += 1;
				m_sAtom[j][0] = 0;//init
			}
		}
	}
	m_nAtomCount = j;//The count of segmentation atoms
	return true;
}


 此函数的主要用途是对传入的句子进行原子切分,并得出每个原子的类型、长度等基本信息,逻辑较为简单。

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值