#if !defined HownetWord_H
#define HownetWord_H
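///
/// Representation of a HowNet word. Only the four main description parts are considered;
/// concepts (i.e. concrete words) are ignored. Unlike Liu Qun's algorithm, multiple
/// occurrences of the same word are merged, so a word may have more than one
/// so-called "first sememe".
///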
/// One dictionary entry: a word together with the sememe strings parsed from the
/// definition field of a dictionary line. Both members are public and are filled in by
/// the constructor.
///
/// NOTE(review): this class looks damaged in transit -- the delimiter literals passed to
/// strtok() are split across two lines, `map` has lost its template arguments, and several
/// comment texts sit on a line of their own without a comment marker. Restore those from
/// the original file before compiling; they are left untouched here.
class HownetWord
{
public:
/// Parses one dictionary line.
///
/// The line is tokenized with strtok(): the first token becomes `word`, the second token
/// is skipped, and the third token is split into individual descriptions, each of which
/// may contribute one key to `relationPrimitives`.
///
/// NOTE(review): strtok() writes terminators into the buffer returned by
/// strLine.c_str(), reached through a cast that drops const; that buffer must not be
/// modified. strtok() also keeps hidden global state, so this constructor must not be
/// interleaved with other strtok() users.
HownetWord(string strLine):word("")
{
char* token;
token=strtok((char*)strLine.c_str(),"
t");
// First token: the word itself. Then advance to the second token.
if(token!=NULL)
{
word=string(token);
token=strtok(NULL,"
t");
}
// The second token is not used; replace it with the third one.
if(token!=NULL)
{
token=strtok(NULL,"
t");
}
// Third token: the comma-separated sememe descriptions.
if(token!=NULL)
{
// This starts a new strtok() scan over the third token only.
char
*ttok=strtok(token,",");
while(ttok!=NULL)
{
char
firstC=ttok[0];
// strP receives the sememe text to store; it stays empty when the description is ignored.
string
strP="";
string
ps(ttok);
string::size_type
pos=ps.find(" ");
// Relational sememe: keep the text that follows the first " " found in the description.
关系义原
if(pos!=string::npos)
{
strP
= ps.substr(pos+1,ps.length()-pos-1);
}
// Basic sememe: a description that starts with an ASCII letter is kept whole.
基本义原
else
if(firstC>='a'&&firstC<='z'||firstC>='A'&&firstC<='Z')
{
strP=ps;
}
// Relation-symbol description: any other description that does not start with '('
// is stored without its first character.
关系符号描述
else
if(firstC != '(')
{
strP=ps.substr(1,ps.length()-1);
}
// Store each non-empty sememe once; the mapped value is always NULL.
if(strP.length()>0&&relationPrimitives.find(strP)==relationPrimitives.end())
{
relationPrimitives.insert(map::value_type(strP,NULL));
}
// Concrete words are ignored: a description starting with '(' leaves strP empty above.
// NOTE(review): the delimiter set below differs from the "," used for the first
// description; presumably it was ",\n" before the backslash was lost -- confirm.
忽略具体词
ttok=strtok(NULL,",n");
}
}
} ///
/// Sememe descriptions collected for this word; only the keys are ever given a meaningful value.
///
map relationPrimitives ;
///
/// The word (first token of the dictionary line; empty if the line had no token).
///
string word ;
};
#endif
// WordRelvanceCalculator.cpp: implementation of the CWordRelvanceCalculator class.
//
//
#include "WordRelvanceCalculator.h"
//
// Construction/Destruction
//
/// Builds the calculator and loads both dictionaries from the Windows system directory.
///
/// Alpha is fixed at 1.60 and Gama at 0.20. If the system directory cannot be obtained,
/// nothing is loaded and both dictionaries stay empty, which is the same state a missing
/// dictionary file produces.
CWordRelvanceCalculator::CWordRelvanceCalculator():Alpha(1.60),Gama(0.20)
{
    // The ANSI entry point is called explicitly because the result is stored in a narrow
    // string; the TCHAR variant would hand wchar_t data to it in a UNICODE build.
    char szSystemDir[MAX_PATH] = {0};
    const UINT nLength = ::GetSystemDirectoryA(szSystemDir, MAX_PATH);

    // 0 signals failure; a value that does not fit the buffer means the path was not
    // written. Either way there is no usable directory to build file names from.
    if (nLength == 0 || nLength >= MAX_PATH)
    {
        return;
    }

    // Let the string own the text so that appending the separator can neither overrun
    // the buffer nor leave it unterminated.
    string strDir(szSystemDir, nLength);
    if (strDir[strDir.length() - 1] != '\\')
    {
        strDir += '\\';
    }

    Initialize(strDir);
}
/// Destructor. The body is empty: no explicit clean-up is performed here.
CWordRelvanceCalculator::~CWordRelvanceCalculator()
{
}
/// Loads the sememe dictionary and then the word dictionary from one directory.
///
/// strDictDir is used as a plain prefix: "WHOLE.DAT" and "glossary.dat" are appended to it
/// as-is, so it has to end with a path separator already.
void CWordRelvanceCalculator::Initialize(string &strDictDir)
{
    // The loaders take a non-const reference, which a temporary cannot bind to in standard
    // C++, so each path gets a named variable first.
    string strPrimitivePath = strDictDir + "WHOLE.DAT";
    LoadPrimitiveDict(strPrimitivePath);

    string strWordPath = strDictDir + "glossary.dat";
    LoadWordDict(strWordPath);
}
// Reads the sememe (primitive) dictionary.
//
// Every line of the file at strDictPath is handed to the HownetPrimitive constructor. The
// result is registered by name in m_oNamePrimitiveMap, and its id -> parent link is
// registered in m_oIdParentMap. When a name or an id occurs more than once, only the first
// occurrence is kept. If the file cannot be opened, the maps are left untouched.
void CWordRelvanceCalculator::LoadPrimitiveDict(string &strDictPath)
{
    FILE* fhandle = fopen(strDictPath.c_str(), "r");
    if (!fhandle)
    {
        // Best effort: there is no error channel, a missing file simply loads nothing.
        return;
    }

    // Lines longer than the buffer are delivered by fgets() in several pieces.
    char in_line[1024];

    // Loop on the result of fgets() instead of feof(): this also processes a last line that
    // has no trailing newline and stops on a read error instead of spinning forever.
    while (fgets(in_line, sizeof(in_line), fhandle) != NULL)
    {
        string line(in_line);
        HownetPrimitive hp(line);

        // insert() never overwrites an existing key, so the first occurrence wins.
        m_oNamePrimitiveMap.insert(std::make_pair(hp.name, hp));
        m_oIdParentMap.insert(std::make_pair(hp.id, hp.parent));
    }

    fclose(fhandle);
}
// Reads the word dictionary.
//
// Every line of the file at strDictPath is handed to the HownetWord constructor and the
// result is stored in m_oWordMap under its word. When a word occurs on several lines, the
// sememes of the later lines are merged into the entry that is already stored. If the file
// cannot be opened, the map is left untouched.
void CWordRelvanceCalculator::LoadWordDict(string &strDictPath)
{
    FILE* fhandle = fopen(strDictPath.c_str(), "r");
    if (!fhandle)
    {
        // Best effort: there is no error channel, a missing file simply loads nothing.
        return;
    }

    // Lines longer than the buffer are delivered by fgets() in several pieces.
    char in_line[1024];

    // Loop on the result of fgets() instead of feof(): this also processes a last line that
    // has no trailing newline and stops on a read error instead of spinning forever.
    while (fgets(in_line, sizeof(in_line), fhandle) != NULL)
    {
        string line(in_line);
        HownetWord hw(line);

        map<string, HownetWord>::iterator existing = m_oWordMap.find(hw.word);
        if (existing != m_oWordMap.end())
        {
            // Merge into the stored entry itself, not into a copy of it. The range insert
            // adds exactly the sememes that are not present yet and keeps the others.
            existing->second.relationPrimitives.insert(hw.relationPrimitives.begin(),
                                                       hw.relationPrimitives.end());
        }
        else
        {
            m_oWordMap.insert(std::make_pair(hw.word, hw));
        }
    }

    fclose(fhandle);
}
/*
 * Computes the similarity of two sememes from their distance in the sememe hierarchy.
 *
 * Returns 1.0 when both names are equal, Gama when either name is not in
 * m_oNamePrimitiveMap, and Alpha / (distance + Alpha) otherwise. The distance is the
 * number of parent links that have to be followed until both ids meet. It is fixed at 15
 * when a root, or an id without a parent entry, is reached before they meet.
 */
double CWordRelvanceCalculator::ComputePrimitiveSimilarity(string &strPrimitive1, string &strPrimitive2)
{
    if (strPrimitive1 == strPrimitive2)
    {
        return 1.0;
    }

    // Both names have to be known sememes, otherwise the default value is returned.
    map<string, HownetPrimitive>::iterator firstIt = m_oNamePrimitiveMap.find(strPrimitive1);
    if (firstIt == m_oNamePrimitiveMap.end())
    {
        return Gama;
    }
    map<string, HownetPrimitive>::iterator secondIt = m_oNamePrimitiveMap.find(strPrimitive2);
    if (secondIt == m_oNamePrimitiveMap.end())
    {
        return Gama;
    }

    // Distance used when the two sememes turn out to have no common ancestor.
    const int kUnrelatedDistance = 15;

    int firstID = firstIt->second.id;
    int secondID = secondIt->second.id;
    int distance = 0;

    while (firstID != secondID)
    {
        // Always move the larger id one level up; keep it in secondID.
        if (firstID > secondID)
        {
            const int tmpID = firstID;
            firstID = secondID;
            secondID = tmpID;
        }

        // Look the parent up without operator[], which would add an "id -> 0" entry to the
        // map for every unknown id and then continue climbing from id 0.
        map<int, int>::iterator parentIt = m_oIdParentMap.find(secondID);
        if (parentIt == m_oIdParentMap.end() || parentIt->second == secondID)
        {
            // No parent entry, or an id that is its own parent: nothing left to climb.
            distance = kUnrelatedDistance;
            break;
        }

        secondID = parentIt->second;
        distance++;
    }

    return Alpha / (distance + Alpha);
}
/*
* 计算两个词语的相关度
*/
double CWordRelvanceCalculator::ComputeRelevance(const string
&strCnWord1, const string &strCnWord2)
{
if(strCnWord1==strCnWord2)
{
return
1.0;
}
map::iterator
myite= m_oWordMap.find(strCnWord1);
if(myite==m_oWordMap.end()) return
Gama;
HownetWord
&hw1=myite->second;
myite=
m_oWordMap.find(strCnWord2);
if(myite==m_oWordMap.end()) return
Gama;
HownetWord &hw2 =
myite->second;
// 相似度
double fSim = 0;
// 矩阵的长边和短边
int length =
hw1.relationPrimitives.size();
int width =
hw2.relationPrimitives.size();
if(length < width)
{
length =
hw2.relationPrimitives.size();
width =
hw1.relationPrimitives.size();
}
if(length == 0 || width ==
0)
{
return
Gama;
}
//
矩阵用以存两两之间的相似度
vector
> simMatrix;
for(int i=0; i <
hw1.relationPrimitives.size()+1; i++)
{
vector
tv;
simMatrix.push_back(tv);
vector
& tv2 = simMatrix[i];
tv2.resize(hw2.relationPrimitives.size()+1);
//vector
& tv3 = simMatrix[i];
//cout
<< tv3.size() << "t";
//simMatrix[i]
= new double[hw2.relationPrimitives.size()];
}
//
计算两两义原之间的相似度
map::iterator
ite=hw1.relationPrimitives.begin();
int row = 0;
for(;ite!=hw1.relationPrimitives.end();ite++)
{
//
第一个词的第i个义原
string
strP1=ite->first;
int
col=0;
map::iterator
ite2=hw1.relationPrimitives.begin();
for(;ite2!=hw1.relationPrimitives.end();ite2++)
{
string
strP2=ite2->first;
simMatrix[row][col]
= ComputePrimitiveSimilarity(strP1, strP2);
col++;
}
row++;
}
//
从矩阵中找出最大值,然后把该值所在行列清零
double fSimSum = 0;
int MaxRow = -1;
int MaxCol = -1;
double MaxSim = 0;
for(i=0; i < width;
i++)
{
//
找出当前对大的值
for(int m=0;
m < hw1.relationPrimitives.size(); m++)
{
for(int
n=0; n < hw2.relationPrimitives.size(); n++)
{
if(simMatrix[m][n]
> MaxSim)
{
MaxRow
= m;
MaxCol
= n;
MaxSim
= simMatrix[m][n];
}
}
}
fSimSum +=
MaxSim;
MaxSim =
0;
//
将最大值所在行列全部清零
for(int j=0;
j < hw2.relationPrimitives.size(); j++)
{
simMatrix[MaxRow][j]
= 0;
}
for(j=0; j
< hw1.relationPrimitives.size(); j++)
{
simMatrix[j][MaxCol]
= 0;
}
}
//
计算最终相似度,公式推导过程:
//
如果length和width越接近,fSimSum越接近width,则最终的相似度应该越大;
//
所以我们假设那些没有对齐的部分的默认相似度不应该是固定的,我们用(width
/ length) * (fSimSum / width) = fSimSum / length表示
//
所以总共需要补充到fSimSum的数值为: (length - width) * SimSum /
length
// 最后的相似度还要除以
length
// 最后得到下面的计算公式
fSim = fSimSum * (2.0 / length
- width * 1.0 / (length * length));
return fSim;
}
// WordRelvanceCalculator.h: interface for the CWordRelvanceCalculator class.
//
//
#if
!defined(AFX_WORDRELVANCECALCULATOR_H__C5D16340_1E8E_450C_A65F_7101DA8D638C__INCLUDED_)
#define
AFX_WORDRELVANCECALCULATOR_H__C5D16340_1E8E_450C_A65F_7101DA8D638C__INCLUDED_
#if _MSC_VER > 1000
#pragma once
#endif // _MSC_VER > 1000
#pragma warning(disable:4786)
#include "MyIME.h"
#include "HownetPrimitive.h"
#include "HownetWord.h"
/*
* 利用hownet进行语意相似度计算
*
包括相关度,相似度两个方面 */
class CWordRelvanceCalculator {
public:
CWordRelvanceCalculator();
virtual ~CWordRelvanceCalculator();
private:
void Initialize(string &strDir);
///
/// 载入义原词典
///
///
name="strDictPath">
void LoadPrimitiveDict(string
&strDictPath);
///
/// 载入词典
///
///
name="strDictPath">
void LoadWordDict(string &strDictPath);
///
/// 计算两个义原的相似度
///
///
name="strPrimitive1">
///
name="strPrimitive2">
///
double ComputePrimitiveSimilarity(string
&strPrimitive1, string &strPrimitive2);
public:
///
/// 计算两个词的相关度
///
///
name="strCnWord1">
///
name="strCnWord2">
///
double ComputeRelevance(const string
&strCnWord1, const string &strCnWord2);
private:
///
/// 存储每个词及其相关的义原描述
///
map m_oWordMap ;
///
///
存储每个义原的名称与该义原的对应关系,用于按义原名称检索义原
///
map
m_oNamePrimitiveMap ;
///
///
存储每个义原的ID与其父义原的ID的对应关系,用于从低一级义原往上层回溯
///
map m_oIdParentMap ;
///
/// 计算义原相似度的参数
///
const double Alpha ;
///
///
当一个义原没有对应义原的时候,赋予一个默认值
///
const double Gama ;
};
#endif //
!defined(AFX_WORDRELVANCECALCULATOR_H__C5D16340_1E8E_450C_A65F_7101DA8D638C__INCLUDED_)