// Inputs to the smoothing formula (excerpted statements; declarations are not shown here)
// Smoothing parameter
dSmoothingPara = 0.1
// Frequency of the current node; when its part of speech is known, the stored frequency is used directly
dCurFreqency
// Constant used as the denominator of dTemp below
static int MAX_FREQUENCE = 2079997;
// dTemp is 1 / MAX_FREQUENCE; the cast forces floating-point division
dTemp = (double) 1 / MAX_FREQUENCE;
// Frequency of the two-word pair, looked up in DictBinary with handle 3
// NOTE(review): the original author was unsure whether this is a pair frequency or an association degree -- confirm
nTwoWordsFreq = DictBinary.GetFrequency(sTwoWords, 3);
// Frequency of the current word
if (pCur.p.nPOS >= 0) {
// Not an unknown word: use the value stored on the node
dCurFreqency = pCur.p.value;
} else {
// Unknown word
// Part of speech is unknown: look the word up in the core dictionary with handle 2
dCurFreqency = DictCore.GetFrequency(pCur.p.sWord, 2);
}
/**
 * Returns the frequency recorded for a specific word and part of speech.
 *
 * The word is first normalised by PreProcessing, then looked up with
 * FindInOriginalTable; if that lookup fails, FindInModifyTable is tried.
 *
 * @param sWord
 *            the word to look up
 * @param nHandle
 *            the part of speech
 * @return the stored frequency, or 0 when pre-processing fails or the
 *         word/part-of-speech pair is found by neither lookup
 */
public int GetFrequency(char[] sWord, int nHandle) {
    char[] sWordFind = new char[WORD_MAXLENGTH - 2];
    Pint pnPos = new Pint();
    if (!PreProcessing(sWord, pnPos, sWordFind)) {
        return 0;
    }
    int nPos = pnPos.value;

    // First lookup: on success pnIndex holds the position of the matching item.
    Pint pnIndex = new Pint();
    if (FindInOriginalTable(nPos, sWordFind, nHandle, pnIndex)) {
        int nIndex = pnIndex.value;
        return m_IndexTable[nPos].pWordItemHead[nIndex].p.nFrequency;
    }

    // Second lookup: on success ppFound refers to the matching chain node.
    PPWORD_CHAIN ppFound = new PPWORD_CHAIN(new PWORD_CHAIN(
            new WORD_CHAIN()));
    if (FindInModifyTable(nPos, sWordFind, nHandle, ppFound)) {
        return ppFound.p.p.data.nFrequency;
    }
    return 0;
}
// dValue is the negative natural log of a weighted sum of two terms:
//   weight dSmoothingPara       : (1 + dCurFreqency) / (MAX_FREQUENCE + 80000)
//   weight (1 - dSmoothingPara) : (1 - dTemp) * nTwoWordsFreq / (1 + dCurFreqency) + dTemp
// The "1 +" in the denominator keeps the division defined when dCurFreqency is 0.
// NOTE(review): the meaning of the 80000 offset is not shown in this excerpt -- confirm against the ICTCLAS source.
dValue = -Math
.log(dSmoothingPara * (1 + dCurFreqency) / (MAX_FREQUENCE + 80000)+ (1 - dSmoothingPara)* ((1 - dTemp) * nTwoWordsFreq/ (1 + dCurFreqency) + dTemp));
一些分词中用到的公式-参考ictclas
最新推荐文章于 2022-08-16 20:49:58 发布