利用 ansj 分词器的自定义歧义词典进行分词,可以避免自定义词分词不准确的问题。
AnsjWord.java工具类
/**
 * Rebuilds the custom ambiguity dictionary file from the given entries.
 *
 * <p>The file is resolved from the classpath resource {@code /library/ambiguity.dic}. Any
 * previous content is discarded, then each entry is written on its own line as the entry text,
 * a tab, the literal tag {@code abc}, and CRLF.
 *
 * @param list dictionary entries to write, one per output line
 * @throws IOException if the dictionary resource is not on the classpath, or the file cannot be
 *     opened or written
 */
public void CreateDictionary(List<String> list) throws IOException {
    URL url = this.getClass().getResource("/library/ambiguity.dic");
    if (url == null) {
        // getResource returns null for a missing resource; report that instead of a bare NPE.
        throw new IOException("Dictionary resource /library/ambiguity.dic not found on the classpath");
    }
    System.out.println(url.getFile());
    File file = new File(url.getFile()); // dictionary path

    // Opening with append=false creates the file if needed and truncates it otherwise. This
    // replaces the unchecked delete()/createNewFile() pair, which silently appended to stale
    // content whenever the delete failed.
    // The original author notes that the dictionary is only recognized when written as UTF-8.
    // try-with-resources flushes and closes the writer even when a write throws.
    try (BufferedWriter writer = new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(file, false), "UTF-8"))) {
        for (String str : list) {
            writer.write(str + "\tabc\r\n");
        }
    }
}
/**
 * Segments the given text after registering the custom ambiguity dictionary.
 *
 * <p>The dictionary file is resolved from the classpath resource {@code /library/ambiguity.dic}
 * and registered through {@code MyStaticValue.putLibrary} under {@code AmbiguityLibrary.DEFAULT}
 * before {@code DicAnalysis.parse} is called. Each resulting term is printed as
 * {@code name:nature}.
 *
 * <p>The returned map is keyed by the term's nature string with the term's name as value, so a
 * later term replaces an earlier one that has the same nature. Any exception raised while
 * registering the dictionary or parsing is printed, and whatever was collected so far is
 * returned.
 *
 * @param str the text to segment
 * @return map from nature string to term name
 */
public Map<String,String> ansj(String str){
    URL dictionaryUrl = this.getClass().getResource("/library/ambiguity.dic");
    System.out.println(dictionaryUrl.getFile());

    Map<String,String> wordsByNature = new HashMap<String,String>();
    try {
        MyStaticValue.putLibrary(AmbiguityLibrary.DEFAULT, dictionaryUrl.getFile(), AmbiguityLibrary.get());
        for (Term term : DicAnalysis.parse(str).getTerms()) {
            String nature = term.getNatureStr();
            String word = term.getName();
            wordsByNature.put(nature, word);
            System.out.println(word + ":" + nature);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return wordsByNature;
}
/**
 * Demo entry point: writes three sample entries to the ambiguity dictionary, then segments a
 * sample sentence with it.
 *
 * @param args unused
 * @throws IOException if writing the dictionary fails
 */
public static void main(String[] args) throws IOException {
    // Sample entries; the first is placeholder text saying that real words can be added by hand
    // or loaded from a database.
    List<String> dictionaryWords = new ArrayList<String>();
    dictionaryWords.add("这里添加词典中的词,可手动添加,也可从数据库查询获取");
    dictionaryWords.add("这里是");
    dictionaryWords.add("分词的词");

    // Build the dictionary file.
    AnsjWord dictionaryBuilder = new AnsjWord();
    dictionaryBuilder.CreateDictionary(dictionaryWords);

    // Segment a sentence using the freshly written dictionary.
    AnsjWord segmenter = new AnsjWord();
    segmenter.ansj("这里是要分词的词");
}
library.properties 配置文件
ambiguityLibrary=target/classes/library/ambiguity.dic
各文件的对应路径:ambiguity.dic 词典文件就是由 CreateDictionary 方法自动生成的歧义词典,位于 classpath 的 /library 目录下(即 target/classes/library/ambiguity.dic)。