中文分词算法

最新推荐文章于 2020-12-05 22:07:32 发布

KerryMo

最新推荐文章于 2020-12-05 22:07:32 发布

阅读量638

点赞数

分类专栏： Java、中文分词、自然语言处理　文章标签：算法 dictionary string import 搜索引擎 class

Java 同时被 3 个专栏收录

28 篇文章 0 订阅

订阅专栏

自然语言处理

26 篇文章 0 订阅

订阅专栏

中文分词

21 篇文章 0 订阅

订阅专栏

中文分词一向是搜索引擎中的难点。这里总结了一个简单的正向最大匹配（FMM）算法，由此可以类推出逆向最大匹配；此外还有最大概率匹配等方法。
import java.lang.*;
import java.io.*;
import java.util.*;
/**
 * Forward maximum matching (FMM) word segmenter.
 *
 * <p>Starting at the left edge of a sentence, the segmenter repeatedly takes
 * the longest candidate substring (at most {@link #MAX_WORD_LENGTH} chars)
 * that {@code dic.Find} accepts, emits it, and continues right after it.
 * When no candidate is accepted, a single character is emitted instead.
 *
 * <p>Segmentation results are written to {@code System.out}, one
 * space-terminated token per word and one output line per input sentence.
 */
public class FMMSegment
{
/** Upper bound, in UTF-16 chars, on the length of a candidate word. */
private static final int MAX_WORD_LENGTH = 12;

/**
 * Dictionary consulted for every candidate word. Remains {@code null} when
 * the no-argument constructor is used and nothing assigns it afterwards.
 */
Dictionary dic;

/**
 * Creates a segmenter without a dictionary. {@link #dic} must be assigned
 * before a non-empty sentence is segmented, otherwise the lookup throws a
 * {@link NullPointerException}.
 */
public FMMSegment()
{
}

/**
 * Creates a segmenter that looks words up in the given dictionary.
 *
 * @param newDic dictionary used for all lookups
 */
public FMMSegment(Dictionary newDic)
{
  dic = newDic;
}

/**
 * Segments one sentence and prints the words to {@code System.out}, each
 * followed by a single space, then terminates the line.
 *
 * @param Sentence text to segment; must not be {@code null}
 * @return always {@code 1}
 * @throws NullPointerException if {@code Sentence} is {@code null}, or if
 *         {@link #dic} is {@code null} and the sentence is not empty
 */
public int wordSegment(String Sentence)
{
  int senLen = Sentence.length();
  int i = 0;

  while (i < senLen)
  {
   int end = longestMatchEnd(Sentence, i);
   if (end < 0)
   {
    // Nothing in the dictionary starts here: emit exactly one character.
    // Step by code point so a surrogate pair is never split in half.
    end = i + Character.charCount(Sentence.codePointAt(i));
   }
   System.out.print(Sentence.substring(i, end) + " ");
   i = end;
  }
  System.out.println();
  return 1;
}

/**
 * Finds the longest candidate starting at {@code start} that the dictionary
 * accepts. Candidates are tried from longest to shortest; the window is
 * {@link #MAX_WORD_LENGTH} chars everywhere, clipped at the sentence end.
 *
 * @return exclusive end index of the match, or {@code -1} when none matched
 */
private int longestMatchEnd(String sentence, int start)
{
  int limit = Math.min(start + MAX_WORD_LENGTH, sentence.length());
  for (int end = limit; end > start; end--)
  {
   if (dic.Find(sentence.substring(start, end)))
   {
    return end;
   }
  }
  return -1;
}

/**
 * Segments a text file line by line via {@link #wordSegment(String)}.
 *
 * <p>I/O failures are reported on {@code System.out} and are not rethrown.
 * The reader is always closed, including when reading fails part-way.
 *
 * <p>NOTE(review): {@link FileReader} decodes with the platform default
 * charset; confirm that matches the encoding of the input files.
 *
 * @param fileName path of the file to segment
 */
public void fileSegment(String fileName)
{
  try (BufferedReader in = new BufferedReader(new FileReader(fileName)))
  {
   String s;
   while ((s = in.readLine()) != null)
   {
    wordSegment(s);
   }
  }
  catch (IOException e)
  {
   System.out.println(e);
  }
}
}