在研究(六)中,我们经过种种努力,终于得到了梦寐以求的分词结果,我得意的笑得意的笑。。。别急,好戏还在后头呢。我们冷静想一想,前面初分的结果主要都是基于词典库的词条得到的,象人名、地名之类的未登录词(即指该词条不在词典库中)该如何识别呢?
典型的像人名,全国上下、古今中外得有多少人名呀,不可能全部做到词库中,必须依照一定的规则和算法对其进行识别。大家可以参考张华平、刘群的论文《基于角色标注的中国人名自动识别研究》和DanceFire的分析文章http://blog.csdn.net/DanceFire/archive/2007/05/13/1606603.aspx,我就不多做赘述了。
下面以人名的自动识别为例,做个简单的说明。FreeICTCLAS源程序中对人名的识别主要有两步:一、对初分结果进行词性标记;二、按照人名识别的十几种模式规则进行套用,从而识别出句子中的人名。
对照源代码进行分析:
/**
 * Tags a first-pass segmentation with unknown-word roles and offers every
 * unknown-word candidate found to the optimum segmentation graph.
 *
 * @param pWordSegResult first-pass segmentation result; handed to
 *                       m_roleTag.POSTagging() for role tagging.
 * @param graphOptimum   graph that receives the candidate unknown-word elements.
 * @param graphSeg       atom-level graph; only m_nAtomCount and m_nAtomLength
 *                       are read here.
 * @param dictCore       core dictionary, forwarded to the role tagger.
 * @return always true.
 */
bool CUnknowWord::Recognition(PWORD_RESULT pWordSegResult, CDynamicArray &graphOptimum, CSegGraph &graphSeg, CDictionary &dictCore)
{
    int nStartPos = 0, j = 0, nAtomStart, nAtomEnd, nPOSOriginal;
    ELEMENT_TYPE dValue;

    // Role-tag the first-pass result with the core dictionary and the
    // unknown-word dictionary (m_dict). The loop below reads the candidates
    // back from m_roleTag (m_nUnknownIndex / m_nUnknownWords / m_dWordsPossibility).
    m_roleTag.POSTagging(pWordSegResult, dictCore, m_dict);

    // NOTE(review): j and nStartPos are deliberately NOT reset per candidate,
    // so this presumes the candidates are stored in ascending position order.
    for (int i = 0; i < m_roleTag.m_nUnknownIndex; i++)
    {
        // Advance through the atoms until the accumulated atom length reaches
        // the candidate's start offset: that atom index is the start node.
        while (static_cast<unsigned int>(j) < graphSeg.m_nAtomCount && nStartPos < m_roleTag.m_nUnknownWords[i][0])
        {
            nStartPos += graphSeg.m_nAtomLength[j++];
        }
        nAtomStart = j;

        // Keep advancing until the candidate's end offset is reached.
        while (static_cast<unsigned int>(j) < graphSeg.m_nAtomCount && nStartPos < m_roleTag.m_nUnknownWords[i][1])
        {
            nStartPos += graphSeg.m_nAtomLength[j++];
        }
        nAtomEnd = j;

        if (nAtomStart < nAtomEnd)
        {
            // Install the candidate only when its value is lower than what the
            // graph currently holds for this span. Per the original note this
            // also covers "element does not exist yet" - presumably a missing
            // element reads back as a very large value; confirm against
            // CDynamicArray::GetElement.
            graphOptimum.GetElement(nAtomStart, nAtomEnd, &dValue, &nPOSOriginal);
            if (dValue > m_roleTag.m_dWordsPossibility[i]) // Set the element with less frequency
                graphOptimum.SetElement(nAtomStart, nAtomEnd, m_roleTag.m_dWordsPossibility[i], m_nPOS, m_sUnknownFlags);
        }
    }
    return true;
}
//POS tagging with Hidden Markov Model
/**
 * Tags the items of a segmentation result span by span and dispatches on
 * m_tagType: normal POS tagging writes the best tag back into the items,
 * the other modes run person / place recognition.
 *
 * @param pWordItems  item array terminated by an item whose sWord is empty.
 * @param dictCore    core dictionary.
 * @param dictUnknown unknown-word recognition dictionary.
 * @return always true.
 */
bool CSpan::POSTagging(PWORD_RESULT pWordItems, CDictionary &dictCore, CDictionary &dictUnknown)
{
    int i = 0, j, nStartPos;
    Reset(false);
    while (i > -1 && pWordItems[i].sWord[0] != 0)
    {
        nStartPos = i; // Start position of this span

        // Per the original notes: GetFrom() cuts the sentence at the first word
        // that is missing from the unknown-word dictionary and collects, for
        // every word of the span, all candidate tags with their frequencies.
        // Its return value is where the next span starts.
        i = GetFrom(pWordItems, nStartPos, dictCore, dictUnknown);

        // Per the original notes: scores each word's tags against the tags of
        // the previous word, keeps the minimum, and so selects the best tag
        // per word (results are read from m_nBestTag below).
        GetBestPOS();

        switch (m_tagType)
        {
        case TT_NORMAL: // normal POS tagging
            j = 1;
            // Bounds check comes first so m_nBestTag[j] is never read at
            // j == m_nCurLength.
            while (j < m_nCurLength && m_nBestTag[j] != -1)
            {
                // Tag slot j belongs to item (nStartPos + j - 1).
                const int nItem = j + nStartPos - 1;

                // Store the best POS tag.
                pWordItems[nItem].nHandle = m_nBestTag[j];

                // Original note: "Let 。be 0" - presumably items whose dValue is
                // 0 are meant to stay untouched; confirm.
                // If the word exists, replace its value with its frequency
                // under the chosen tag.
                if (pWordItems[nItem].dValue > 0 && dictCore.IsExist(pWordItems[nItem].sWord, -1))
                    pWordItems[nItem].dValue = dictCore.GetFrequency(pWordItems[nItem].sWord, m_nBestTag[j]);
                j += 1;
            }
            break;
        case TT_PERSON: // Person recognition
            // Match the person-name role patterns and record the position of
            // every word sequence that forms a name.
            PersonRecognize(dictUnknown);
            break;
        case TT_PLACE:        // Place name recognition
        case TT_TRANS_PERSON: // Transliterated person name
            PlaceRecognize(dictCore, dictUnknown);
            break;
        default:
            break;
        }
        Reset();
    }

    // Debug dump of all tag / frequency pairs.
    // NOTE(review): both loops stop only on a negative sentinel in m_nTags and
    // run after the final Reset(); confirm the sentinel is always present so
    // the scan cannot leave the arrays.
    for (int m = 0; m_nTags[m][0] >= 0; m++)
    {
        for (int n = 0; m_nTags[m][n] >= 0; n++)
        {
            int pos = m_nTags[m][n];
            double value = m_dFrequency[m][n];
            TRACE("%s %d %5d %f ", "word:", m, pos, value);
        }
    }
    return true;
}
在FreeICTCLAS中,对初分结果进行词性标记及后续处理时用到了一个循环,即把初分结果按照一定的条件进行分隔,进行多次处理。这个条件就是:当初分结果中的词在unknownDict中没有对应的词性时,从此处断开。我个人认为没有太大必要,在ictclas4j的处理中我舍弃了这个循环,直接对所有初分结果进行词性标记,以减少代码的复杂度。
/**
 * Scans the best role-tag sequence of the current span for person-name
 * patterns and records every match as an unknown-word candidate
 * (m_nUnknownWords / m_dWordsPossibility / m_nUnknownIndex).
 *
 * Each tag m_nBestTag[i] is mapped to the letter ('A' + tag) so that the
 * patterns can be matched with plain string comparison.
 *
 * Pattern legend (role letters):
 *   BBCD: surname + surname + given-name char 1 + given-name char 2
 *   BBE : surname + surname + single-char given name
 *   BBZ : surname + surname + two-char given name that forms a word
 *   BCD : surname + given-name char 1 + given-name char 2
 *   BE  : surname + single-char given name
 *   BEE : surname + single-char name + single-char name (e.g. 韩磊磊)
 *   BG  : surname + suffix
 *   BXD : surname + (surname and first given-name char form a word) + last given-name char
 *   BZ  : surname + two-char given name that forms a word
 *   B   : surname (listed in the original notes; not in the pattern table)
 *   CD  : given-name char 1 + given-name char 2
 *   EE  : single-char name + single-char name
 *   FB  : prefix + surname
 *   XD  : (surname and first given-name char form a word) + last given-name char
 *   Y   : surname and single-char given name form a word
 *
 * @param personDict person-name role dictionary.
 * @return always true.
 */
bool CSpan::PersonRecognize(CDictionary &personDict)
{
    // sPOS[0] is a placeholder ('z'); role letters start at index 1.
    char sPOS[MAX_WORDS_PER_SENTENCE] = "z", sPersonName[100];

    // Pattern table, terminated by the empty pattern.
    //                 0       1      2      3      4      5      6     7
    static const char sPatterns[][5] = {"BBCD", "BBC", "BBE", "BBZ", "BCD", "BEE", "BE", "BG",
                                        "BXD", "BZ", "CDCD", "CD", "EE", "FB", "Y", "XD", ""};

    // Prior weight of each pattern, parallel to sPatterns.
    static const double dFactor[] = {
        // BBCD    BBC       BBE       BBZ       BCD       BEE       BE        BG
        0.003606, 0.000021, 0.001314, 0.000315, 0.656624, 0.000021, 0.146116, 0.009136,
        // BXD     BZ        CDCD CD        EE        FB        Y         XD
        0.000042, 0.038971, 0, 0.090367, 0.000273, 0.009157, 0.034324, 0.009735, 0};

    // About the parameters (pattern, count, factor) as given in the original:
    /*
        BBCD 343   0.003606
        BBC  2     0.000021
        BBE  125   0.001314
        BBZ  30    0.000315
        BCD  62460 0.656624
        BEE  0     0.000000
        BE   13899 0.146116
        BG   869   0.009136
        BXD  4     0.000042
        BZ   3707  0.038971
        CD   8596  0.090367
        EE   26    0.000273
        FB   871   0.009157
        Y    3265  0.034324
        XD   926   0.009735
    */
    // NOTE(review): the table lists BEE as 0.000000 while dFactor uses 0.000021.
    // CDCD has factor 0, but it never reaches the -log() below because the
    // CDCD branch always continues.

    // Length of each pattern, parallel to sPatterns; 0 terminates the table.
    static const int nPatternLen[] = {4, 3, 3, 3, 3, 3, 2, 2, 3, 2, 4, 2, 2, 2, 1, 2, 0};

    // Convert the tag sequence to a string. i is declared outside the loop
    // because it is needed afterwards (standard for-scope rules), and the loop
    // is bounded so the terminator always fits into sPOS.
    int i;
    for (i = 1; i < MAX_WORDS_PER_SENTENCE - 1 && m_nBestTag[i] > -1; i++)
        sPOS[i] = static_cast<char>(m_nBestTag[i] + 'A');
    sPOS[i] = 0;

    int j = 1, k, nPos;   // Find the proper pattern starting from the first tag
    int nLittleFreqCount; // Counter for name roles with little frequency
                          // (only consumed by a rule that is disabled below)
    bool bMatched = false;
    while (j < i)
    {
        bMatched = false;
        for (k = 0; !bMatched && nPatternLen[k] > 0; k++)
        {
            // A pattern matching at j is a possible name only if neither the
            // word before it nor the word after it is a middle dot.
            if (strncmp(sPatterns[k], sPOS + j, nPatternLen[k]) == 0 && strcmp(m_sWords[j - 1], "·") != 0 && strcmp(m_sWords[j + nPatternLen[k]], "·") != 0)
            { // Found the proper pattern k

                // Exclusion rule 1: prefix + surname followed by a given-name
                // char or a suffix -> the (prefix + surname) rule does not apply.
                if (strcmp(sPatterns[k], "FB") == 0 && (sPOS[j + 2] == 'E' || sPOS[j + 2] == 'C' || sPOS[j + 2] == 'G'))
                {
                    continue;
                }

                /* Disabled in the original source:
                if((strcmp(sPatterns[k],"BEE")==0||strcmp(sPatterns[k],"EE")==0)&&strcmp(m_sWords[j+nPatternLen[k]-1],m_sWords[j+nPatternLen[k]-2])!=0)
                {//Exclusion rule 2: surname + single + single / single + single:
                 //if the two E characters differ the rule does not apply (e.g. 韩磊磊)
                    continue;
                }

                if(strcmp(sPatterns[k],"B")==0&&m_nBestTag[j+1]!=12)
                {//Exclusion rule 3: a surname not followed by a suffix does not
                 //apply (e.g. 江主席, 刘大娘)
                    continue;
                }
                */

                // Build the possible name.
                nPos = j; // Position of the name inside the tag sequence
                sPersonName[0] = 0;
                nLittleFreqCount = 0; // Number of roles with little frequency
                while (nPos < j + nPatternLen[k])
                {
                    if (m_nBestTag[nPos] < 4 && personDict.GetFrequency(m_sWords[nPos], m_nBestTag[nPos]) < LITTLE_FREQUENCY)
                        nLittleFreqCount++;
                    // Bounded append: never write past the end of sPersonName.
                    strncat(sPersonName, m_sWords[nPos], sizeof(sPersonName) - strlen(sPersonName) - 1);
                    nPos += 1;
                }

                /* Disabled in the original source:
                if(IsAllForeign(sPersonName)&&personDict.GetFrequency(m_sWords[j],1)<LITTLE_FREQUENCY)
                {//Exclude foreign names
                 //Exclusion rule 2: if all characters are foreign-name characters,
                 //the (name1 + name2) rule does not apply
                    j+=nPatternLen[k]-1;
                    continue;
                }
                */

                if (strcmp(sPatterns[k], "CDCD") == 0)
                {
                    // (name1 + name2 + name1 + name2) is itself an exclusion
                    // rule, e.g. "女高音歌唱家迪里拜尔演唱".
                    // It takes effect (skips ahead) only when the text contains
                    // foreign-name characters; otherwise nothing is skipped,
                    // e.g. "黑妞白妞姐俩拔了头筹。".
                    if (GetForeignCharCount(sPersonName) > 0)
                        j += nPatternLen[k] - 1;
                    // Unconditional: CDCD never records a candidate itself; the
                    // remaining patterns are then tried at the (possibly
                    // advanced) position j.
                    continue;
                }

                /* Disabled in the original source:
                if(strcmp(sPatterns[k],"CD")==0&&IsAllForeign(sPersonName))
                {//
                    j+=nPatternLen[k]-1;
                    continue;
                }
                if(nLittleFreqCount==nPatternLen[k]||nLittleFreqCount==3)
                //e.g. 马哈蒂尔; 小扎耶德与他的中国阿姨胡彩玲受华黎明大使之邀,
                //All roles appear with low frequency, so ignore them
                    continue;
                */

                // Record the candidate: start/end positions and its score.
                m_nUnknownWords[m_nUnknownIndex][0] = m_nWordPosition[j];
                m_nUnknownWords[m_nUnknownIndex][1] = m_nWordPosition[j + nPatternLen[k]];
                // -log of the pattern factor plus the value of ComputePossibility().
                m_dWordsPossibility[m_nUnknownIndex] = -log(dFactor[k]) + ComputePossibility(j, nPatternLen[k], personDict);
                m_nUnknownIndex += 1;
                j += nPatternLen[k];
                bMatched = true;
            }
        }
        if (!bMatched) // Not matched, move on by one tag
            j += 1;
    }
    return true;
}
举例说明,比如例句“张华平说的确实在理”:
初次生成的分词结果:
| 序号 | 分词结果 |
| --- | --- |
| 0 | 张/ 华/ 平/ 说/ 的/ 确实/ 在/ 理/ |
作为人名的“张华平”还并没有被识别出来,需要按照人名的模式进行标记,结果为BCDAAAAA,其中BCD符合人名模式:姓+名1+名2,因此我们可以把前面三个初分结果进行合并,实际上是在原来的初分结果中插入一个代表“张华平”这样组合的节点,如下图(1,4)节点所示:
经过人名、地名识别后的分词图表:
发表于 @ 2007年06月04日 13:58:00|评论(loading...)|编辑