实验中,我们采用两种方法计算概率。一:以字符为单位计算概率;二:以汉语词为单位计算概率。在第二种情况下,我们采用Jeasy分词组件进行分词处理,该分词组件为基于前向最大匹配的分词方法,分词结果在绝大多数情况下是正确的。
/**
 * One text unit (the original note describes it as a single character)
 * together with the probability value attached to it.
 *
 * @author liuyu
 */
public class Entity
{
    /** The character (or token) this entry stands for; starts out empty. */
    String word = "";

    /** The probability value stored for {@link #word}; starts out at zero. */
    float pValue = 0;

    /** Creates an entry with an empty word and a probability value of zero. */
    public Entity()
    {
    }
}
/**
 * One text unit (the original note describes it as a single character)
 * together with the probability value attached to it.
 *
 * NOTE(review): this declaration repeats an identical Entity class found
 * earlier in the same text; it looks like a duplicated paste, so confirm
 * and keep only one copy. The comment opener was missing here, which left
 * the comment lines to be parsed as code.
 *
 * @author liuyu
 */
public class Entity
{
    /** The character (or token) this entry stands for. */
    String word;

    /** The probability value stored for {@link #word}. */
    float pValue;

    /** Creates an entry with an empty word and a probability value of zero. */
    public Entity()
    {
        pValue = 0;
        word = "";
    }
}
2.读取文件模块
![](https://i-blog.csdnimg.cn/blog_migrate/81178cc93a2a3bb5048d90d76e7ec935.gif)
/**
 * Reads a whole text file as UTF-8 and returns its content as one string.
 * Line terminators are dropped and every line (including the last one) is
 * followed by a single blank.
 *
 * @param path location of the file to read
 * @return the concatenated lines, or an empty string for an empty file
 * @throws FileNotFoundException if the file cannot be opened
 * @throws IOException if reading fails
 */
public static String GetFileText(String path)
        throws FileNotFoundException, IOException
{
    // The charset name must be exact: a name padded with blanks is rejected
    // by InputStreamReader with an UnsupportedEncodingException.
    BufferedReader bufReader = new BufferedReader(
            new InputStreamReader(new FileInputStream(path), "UTF-8"));
    try
    {
        StringBuilder sb = new StringBuilder();
        String line;
        while ((line = bufReader.readLine()) != null)
        {
            sb.append(line).append(" ");
        }
        return sb.toString();
    }
    finally
    {
        // Closing the outermost reader releases the whole chain, also when
        // readLine() fails half-way through the file.
        bufReader.close();
    }
}
// NOTE(review): the statements below repeat the body of the GetFileText
// method that precedes them, but carry no method signature of their own.
// This looks like a duplicated paste - confirm and remove it rather than
// maintaining two copies.
{
// NOTE(review): the charset name literal is padded with blanks - verify that
// it resolves to a charset at run time.
InputStreamReader inStreamReader = new InputStreamReader( new FileInputStream(path), " UTF-8 " );
// String strFile1=
BufferedReader bufReader = new BufferedReader(inStreamReader);
String line;
StringBuilder sb = new StringBuilder();
// Collect every line; the line terminator is replaced by the appended literal.
while ((line = bufReader.readLine()) != null )
{
sb.append(line + " " );
}
// Both readers are closed only when the loop finishes without an exception.
inStreamReader.close();
bufReader.close();
String strFile = sb.toString();
return strFile;
}
3.分割字符
(1)分词
![](https://i-blog.csdnimg.cn/blog_migrate/81178cc93a2a3bb5048d90d76e7ec935.gif)
/**
 * Reads the file at {@code path} and splits its text into words with
 * {@code MMAnalyzer}, writing {@code " | "} after each word as delimiter.
 *
 * A failure of the segmenter is reported to the caller as an
 * {@link IOException}. The method already declares that exception for the
 * file read, and returning {@code null} after printing a stack trace only
 * postpones the failure to the first caller that touches the result.
 *
 * @param path location of the text file to segment
 * @return the segmented text as produced by {@code MMAnalyzer.segment}
 * @throws FileNotFoundException if the file cannot be opened
 * @throws IOException if reading the file or segmenting its text fails
 */
public static String CutText(String path)
        throws FileNotFoundException, IOException
{
    String fileText = GetFileText(path);
    MMAnalyzer analyzer = new MMAnalyzer();
    String spliter = " | ";
    return analyzer.segment(fileText, spliter);
}
// NOTE(review): the statements below repeat the body of the CutText method
// that precedes them, but carry no method signature of their own. This looks
// like a duplicated paste - confirm and remove it.
{
String fileText = GetFileText(path);
MMAnalyzer analyzer = new MMAnalyzer();
String result = null ;
// Delimiter handed to the segmenter.
String spliter = " | " ;
try
{
result = analyzer.segment(fileText, spliter);
}
catch (IOException e)
{
// The failure is only printed; result stays null and is returned as such.
e.printStackTrace();
}
// System.out.print(result);
return result;
}
(2)分出单个字符
![](https://i-blog.csdnimg.cn/blog_migrate/81178cc93a2a3bb5048d90d76e7ec935.gif)
/**
 * Reads the file at {@code path} and extracts every character in the ranges
 * U+4E00-U+9FA5 and U+F900-U+FA2D, in order of appearance. All other
 * characters are discarded.
 *
 * @param path location of the text file to process
 * @return the matched characters, each followed by {@code " | "}; an empty
 *         string when the file contains none
 * @throws FileNotFoundException if the file cannot be opened
 * @throws IOException if reading the file fails
 */
public static String CutTextSingleCharacter(String path)
        throws FileNotFoundException, IOException
{
    String text = GetFileText(path);
    // The character class must stand alone: with blanks around it inside the
    // pattern, only characters that are themselves surrounded by spaces in
    // the text would match.
    Pattern pattern = Pattern.compile("[\\u4E00-\\u9FA5\\uF900-\\uFA2D]");
    Matcher m = pattern.matcher(text);
    StringBuilder sb = new StringBuilder();
    while (m.find())
    {
        sb.append(m.group()).append(" | ");
    }
    return sb.toString();
}
// NOTE(review): the statements below repeat the body of the
// CutTextSingleCharacter method that precedes them, but carry no method
// signature of their own. This looks like a duplicated paste - confirm and
// remove it.
{ String text = GetFileText(path);
String proText = null ;
// NOTE(review): the pattern literal has a blank before and after the
// character class, so a match needs those blanks in the text too - verify
// that this is intended.
Pattern pattern = Pattern.compile( " [\\u4E00-\\u9FA5\\uF900-\\uFA2D] " );
Matcher m = pattern.matcher(text);
StringBuffer sb = new StringBuffer();
Boolean flag = m.find();
// Append every match followed by the delimiter literal.
while (flag)
{
int start = m.start();
int end = m.end();
sb.append(text.substring(start, end) + " | " );
// System.out.println(text.substring(start,end));
flag = m.find();
}
proText = sb.toString();
return proText;
}
4.计算字符的概率
![](https://i-blog.csdnimg.cn/blog_migrate/81178cc93a2a3bb5048d90d76e7ec935.gif)
/**
 * Computes the relative frequency of every distinct token in a text file.
 *
 * The file is tokenised per character with
 * {@code CutTextSingleCharacter(path)}. To work on segmented words instead,
 * obtain the token string from {@code CutText(path)}.
 *
 * Tokens that are empty after trimming are ignored. They are neither
 * reported as a word nor counted in the total, so an input without any
 * token yields an empty list.
 *
 * @param path location of the text file to analyse
 * @return one {@code Entity} per distinct token, in order of first
 *         appearance, whose {@code pValue} is the token's occurrence count
 *         divided by the number of tokens
 * @throws IOException if the file cannot be read
 */
public static ArrayList<Entity> CalcuP(String path)
        throws IOException
{
    String result = CutTextSingleCharacter(path);
    String[] words = result.split(" \\| ");

    // Insertion-ordered map: counts each token in one pass while keeping
    // the first-appearance order in the returned list.
    java.util.Map<String, Entity> byWord = new java.util.LinkedHashMap<String, Entity>();
    float total = 0;
    for (String w : words)
    {
        w = w.trim();
        if (w.isEmpty())
        {
            continue;
        }
        total++;
        Entity en = byWord.get(w);
        if (en == null)
        {
            en = new Entity();
            en.word = w;
            en.pValue = 1;
            byWord.put(w, en);
        }
        else
        {
            en.pValue++;
        }
    }

    ArrayList<Entity> enList = new ArrayList<Entity>(byWord.values());
    // Turn the raw counts into relative frequencies.
    for (Entity en : enList)
    {
        en.pValue = en.pValue / total;
    }
    return enList;
}
// NOTE(review): the statements below repeat the body of the CalcuP method
// that precedes them, but carry no method signature of their own. This looks
// like a duplicated paste - confirm and remove it.
{ // Word-based relative entropy (disabled alternative on the next line)
// String result=CutText(path);
// Character-based relative entropy
String result = CutTextSingleCharacter(path);
String []words = result.split( " \\| " );
ArrayList < Entity > enList = new ArrayList();
// One entry per token, each starting with a count of 1.
for (String w: words)
{ w = w.trim();
Entity en = new Entity();
en.word = w;
en.pValue = 1 ;
enList.add(en);
// System.out.println(w);
}
// Number of tokens, taken before duplicates are merged.
float total = enList.size();
// Fold every later duplicate into its first occurrence: the first entry's
// count grows, the duplicate is blanked and its count set to 0.
for ( int i = 0 ;i < enList.size() - 1 ;i ++ )
{
if ( ! enList.get(i).word.isEmpty())
{
for ( int j = i + 1 ;j < enList.size();j ++ )
{
if (enList.get(i).word.equals(enList.get(j).word))
{
enList.get(i).pValue ++ ;
enList.get(j).pValue = 0 ;
enList.get(j).word = "" ;
}
}
}
}
// Drop the zeroed duplicates; iterating backwards keeps the remaining
// indices valid while removing.
for ( int i = enList.size() - 1 ;i >= 0 ;i -- )
{
if (enList.get(i).pValue < 1.0 )
enList.remove(i);
}
// Convert counts into relative frequencies.
for ( int i = 0 ;i < enList.size();i ++ )
{
enList.get(i).pValue = enList.get(i).pValue / total;
}
return enList;
}
5.计算相对熵
![](https://i-blog.csdnimg.cn/blog_migrate/81178cc93a2a3bb5048d90d76e7ec935.gif)
/**
 * Computes the relative entropy (Kullback-Leibler divergence) of two texts
 * given as word/probability lists: the sum over every entry of {@code p} of
 * {@code p.pValue * ln(p.pValue / q.pValue)}, where the {@code q} value is
 * taken from the entry of {@code q} with the same word.
 *
 * A word of {@code p} that has no counterpart in {@code q} contributes a
 * fixed penalty of 10000000 that stands in for "infinity". This also holds
 * when {@code q} is empty, so comparing against an empty text no longer
 * reports a divergence of zero.
 *
 * @param p entries of the reference text
 * @param q entries of the text compared against; it is not modified
 * @return the accumulated divergence, narrowed to {@code float}
 */
public static float CalKL(ArrayList<Entity> p, ArrayList<Entity> q)
{
    final double infinity = 10000000; // stand-in for an infinite term

    // Index q by word once. A later entry overrides an earlier one with the
    // same word, which is the entry a scan from the end of q would find.
    java.util.Map<String, Float> qByWord = new java.util.HashMap<String, Float>();
    for (Entity e : q)
    {
        qByWord.put(e.word, e.pValue);
    }

    // Accumulate in double: a float sum drops the ordinary terms as soon as
    // a single 1e7 penalty has been added.
    double kl = 0;
    for (Entity e : p)
    {
        Float qValue = qByWord.get(e.word);
        if (qValue == null)
        {
            kl += infinity;
        }
        else
        {
            kl += e.pValue * Math.log(e.pValue / qValue);
        }
    }
    return (float) kl;
}
}
/**
 * Computes the relative entropy (Kullback-Leibler divergence) of two texts
 * given as word/probability lists: the sum over every entry of {@code p} of
 * {@code p.pValue * ln(p.pValue / q.pValue)}, where the {@code q} value is
 * taken from the entry of {@code q} with the same word.
 *
 * A word of {@code p} that has no counterpart in {@code q} contributes a
 * fixed penalty of 10000000 that stands in for "infinity". This also holds
 * when {@code q} is empty, so comparing against an empty text does not
 * report a divergence of zero.
 *
 * NOTE(review): a method with this exact signature appears earlier in the
 * same text; this looks like a duplicated paste - confirm and keep one copy.
 *
 * @param p entries of the reference text
 * @param q entries of the text compared against; it is not modified
 * @return the accumulated divergence, narrowed to {@code float}
 */
public static float CalKL(ArrayList<Entity> p, ArrayList<Entity> q)
{
    final double infinity = 10000000; // stand-in for an infinite term

    // Index q by word once. A later entry overrides an earlier one with the
    // same word, which is the entry a scan from the end of q would find.
    java.util.Map<String, Float> qByWord = new java.util.HashMap<String, Float>();
    for (Entity e : q)
    {
        qByWord.put(e.word, e.pValue);
    }

    // Accumulate in double: a float sum drops the ordinary terms as soon as
    // a single 1e7 penalty has been added.
    double kl = 0;
    for (Entity e : p)
    {
        Float qValue = qByWord.get(e.word);
        if (qValue == null)
        {
            kl += infinity;
        }
        else
        {
            kl += e.pValue * Math.log(e.pValue / qValue);
        }
    }
    return (float) kl;
}
}