中文分词一向是搜索引擎中的难点。这里总结了一个简单的正向最大匹配(FMM)算法,由此可以推出逆向最大匹配算法,当然还有最大概率匹配算法。
import java.lang.*;
import java.io.*;
import java.util.*;
/**
 * Forward maximum matching (FMM) word segmenter.
 *
 * Scans a sentence from left to right and, at each position, emits the
 * longest substring for which {@code dic.Find} returns true. When no
 * candidate is accepted, the single character at that position is emitted.
 * Segments are written to standard output separated by a single space.
 */
public class FMMSegment
{
    /** Maximum length, in chars, of a candidate word tried at each position. */
    private static final int MAX_WORD_LEN = 12;

    /** Dictionary consulted for every candidate; must be set before segmenting. */
    Dictionary dic;

    /** Creates a segmenter without a dictionary; {@code dic} stays null until assigned. */
    public FMMSegment()
    {
    }

    /**
     * Creates a segmenter backed by the given dictionary.
     *
     * @param newDic dictionary used to test candidate words
     */
    public FMMSegment(Dictionary newDic)
    {
        dic = newDic;
    }

    /**
     * Segments one sentence and prints the segments on a single line of
     * standard output, each followed by a space, then a line terminator.
     *
     * @param Sentence text to segment; must not be null
     * @return always 1
     */
    public int wordSegment(String Sentence)
    {
        final int senLen = Sentence.length();
        int i = 0;
        while (i < senLen)
        {
            // The longest candidate spans MAX_WORD_LEN chars, clipped to the
            // end of the sentence, so the limit is the same at every position.
            int end = Math.min(i + MAX_WORD_LEN, senLen);

            // Default to a single character when no candidate is accepted.
            int matchEnd = i + 1;
            for (int j = end; j > i; j--)
            {
                if (dic.Find(Sentence.substring(i, j)))
                {
                    matchEnd = j;
                    break;
                }
            }

            System.out.print(Sentence.substring(i, matchEnd) + " ");
            i = matchEnd;
        }
        System.out.println();
        return 1;
    }

    /**
     * Segments a text file line by line, passing each line to
     * {@link #wordSegment(String)}. I/O errors are printed to standard
     * output and are not propagated to the caller.
     *
     * NOTE(review): FileReader decodes with the JVM's default charset;
     * confirm that this matches the encoding of the input files.
     *
     * @param fileName path of the file to read
     */
    public void fileSegment(String fileName)
    {
        BufferedReader in = null;
        try
        {
            in = new BufferedReader(new FileReader(fileName));
            String s;
            while ((s = in.readLine()) != null)
            {
                wordSegment(s);
            }
        }
        catch (IOException e)
        {
            System.out.println(e);
        }
        finally
        {
            // Release the file handle on both normal and exceptional exit.
            if (in != null)
            {
                try
                {
                    in.close();
                }
                catch (IOException e)
                {
                    System.out.println(e);
                }
            }
        }
    }
}
import java.lang.*;
import java.io.*;
import java.util.*;
/**
 * Forward maximum matching (FMM) word segmenter.
 *
 * Scans a sentence from left to right and, at each position, emits the
 * longest substring for which {@code dic.Find} returns true. When no
 * candidate is accepted, the single character at that position is emitted.
 * Segments are written to standard output separated by a single space.
 */
public class FMMSegment
{
    /** Maximum length, in chars, of a candidate word tried at each position. */
    private static final int MAX_WORD_LEN = 12;

    /** Dictionary consulted for every candidate; must be set before segmenting. */
    Dictionary dic;

    /** Creates a segmenter without a dictionary; {@code dic} stays null until assigned. */
    public FMMSegment()
    {
    }

    /**
     * Creates a segmenter backed by the given dictionary.
     *
     * @param newDic dictionary used to test candidate words
     */
    public FMMSegment(Dictionary newDic)
    {
        dic = newDic;
    }

    /**
     * Segments one sentence and prints the segments on a single line of
     * standard output, each followed by a space, then a line terminator.
     *
     * @param Sentence text to segment; must not be null
     * @return always 1
     */
    public int wordSegment(String Sentence)
    {
        final int senLen = Sentence.length();
        int i = 0;
        while (i < senLen)
        {
            // The longest candidate spans MAX_WORD_LEN chars, clipped to the
            // end of the sentence, so the limit is the same at every position.
            int end = Math.min(i + MAX_WORD_LEN, senLen);

            // Default to a single character when no candidate is accepted.
            int matchEnd = i + 1;
            for (int j = end; j > i; j--)
            {
                if (dic.Find(Sentence.substring(i, j)))
                {
                    matchEnd = j;
                    break;
                }
            }

            System.out.print(Sentence.substring(i, matchEnd) + " ");
            i = matchEnd;
        }
        System.out.println();
        return 1;
    }

    /**
     * Segments a text file line by line, passing each line to
     * {@link #wordSegment(String)}. I/O errors are printed to standard
     * output and are not propagated to the caller.
     *
     * NOTE(review): FileReader decodes with the JVM's default charset;
     * confirm that this matches the encoding of the input files.
     *
     * @param fileName path of the file to read
     */
    public void fileSegment(String fileName)
    {
        BufferedReader in = null;
        try
        {
            in = new BufferedReader(new FileReader(fileName));
            String s;
            while ((s = in.readLine()) != null)
            {
                wordSegment(s);
            }
        }
        catch (IOException e)
        {
            System.out.println(e);
        }
        finally
        {
            // Release the file handle on both normal and exceptional exit.
            if (in != null)
            {
                try
                {
                    in.close();
                }
                catch (IOException e)
                {
                    System.out.println(e);
                }
            }
        }
    }
}