此类的成员函数如下:
bool GenerateWordNet(char *sSentence,CDictionary &dictCore,bool bOriginalFreq=false);
参数依次为:传入的句子、核心字典,以及是否使用原始频率。
在其执行过程中,首先调用了函数 bool AtomSegment(char *sSentence);
即对原始句子进行原子切割。
成员变量,
char m_sAtom[MAX_SENTENCE_LEN][WORD_MAXLENGTH],用来记录切割好的词;
int m_nAtomLength[MAX_SENTENCE_LEN],用来记录每个原子词的词长度;
int m_nAtomPOS[MAX_SENTENCE_LEN],记录每个原子的字符类型(即 charType 返回的 CT_* 值,或句首/句尾标志对应的 CT_SENTENCE_BEGIN / CT_SENTENCE_END,并非语法意义上的词性);
unsigned int m_nAtomCount,保存原子词的总个数;
CDynamicArray m_segGraph,保存分割后二维词图。
先看 AtomSegment 函数,
bool CSegGraph::AtomSegment(char *sSentence)
{
unsigned int i=0,j=0,nCurType,nNextType;
//i is the pointer of sentence string
//j is the pointer of pAtoms
char sChar[3];
sChar[2]=0;//Set the char ending
m_sAtom[j][0]=0;//Set the first word as null
m_nAtomLength[j]=0;
//先判别句子是否有起始标志
if(strncmp(sSentence,SENTENCE_BEGIN,strlen(SENTENCE_BEGIN))==0)
{//如果有起始标志
strcpy(m_sAtom[j],SENTENCE_BEGIN);//Set the first word as sentence begining
m_nAtomLength[j] = strlen(SENTENCE_BEGIN);
m_nAtomPOS[j] = CT_SENTENCE_BEGIN;
i += m_nAtomLength[j];//i增一个原子词的长度
j += 1;//j增1
//对下一个原子进行初始化
m_sAtom[j][0] = 0;
m_nAtomLength[j] = 0;
}
while(i<strlen(sSentence))
{
if(strncmp(sSentence+i,SENTENCE_END,strlen(SENTENCE_END))==0)
{//如果是结束标志
strcpy(m_sAtom[j],SENTENCE_END);//将原子词设为结束标志
m_nAtomLength[j] = strlen(SENTENCE_END);
m_nAtomPOS[j] = CT_SENTENCE_END;
i += m_nAtomLength[j];
j += 1;
m_sAtom[j][0]=0;
m_nAtomLength[j]=0;
continue;//直接跳到while判别
}
sChar[0] = *(sSentence + i);//记录第i个位置的字符
sChar[1] = 0;
i += 1;
/*
ASCII值最大是 0111 1111 =127 汉字编码为了避免与ASCII冲突 编码要加0xA0 也即1010
所以汉字最高位就变成了 1 在char中代表负值
*/
if(sChar[0]<0)//Two byte char,即为汉字
{
sChar[1] = *(sSentence+i);//Get the char with second byte
i += 1;//i increased by 1
}
strcat(m_sAtom[j],sChar);//记录第j个原子
nCurType = charType((unsigned char *)sChar);//记录字符类型
//如果第一个字符为'.'并且其后跟的字符为数字
if(sChar[0]=='.' && (charType((unsigned char *)sSentence+i)==CT_NUM || (*(sSentence+i)>='0' && *(sSentence+i)<='9')))
nCurType = CT_NUM;//Digit after . indicate . as a point in the numeric
m_nAtomPOS[j] = nCurType;
//Record its property, just convience for continuous processing
if(nCurType==CT_CHINESE || nCurType==CT_INDEX || nCurType==CT_DELIMITER || nCurType==CT_OTHER)
{//Chinese char, index number,delimiter and other is treated as atom
m_nAtomLength[j] = strlen(m_sAtom[j]);//Save its length
j += 1;//Skip to next atom
m_sAtom[j][0] = 0;//init
}
else
{//Number,single char,letter
nNextType = 255;
if(i<strlen(sSentence))
nNextType = charType((unsigned char *)(sSentence+i));
if(nNextType!=nCurType || i==strlen(sSentence))
//Reaching end or next char type is different from current char
{
m_nAtomLength[j] = strlen(m_sAtom[j]);//Save its length
j += 1;
m_sAtom[j][0] = 0;//init
}
}
}
m_nAtomCount = j;//The count of segmentation atoms
return true;
}
此函数的主要用途是:将传入的句子切分为原子,并记录每个原子的字符类型(存于 m_nAtomPOS)及长度等基本信息,逻辑较为简单。