要让 wvtool 支持中文分词,需要实现 WVTTokenizer 和 TokenEnumeration 这两个接口。
写道
package ICTCLAS.vsm;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import ICTCLAS.util.ICTCLASUtil;
import edu.udo.cs.wvtool.generic.tokenizer.WVTTokenizer;
import edu.udo.cs.wvtool.main.WVTDocumentInfo;
import edu.udo.cs.wvtool.util.TokenEnumeration;
import edu.udo.cs.wvtool.util.WVToolException;
/**
 * WVTool tokenizer adapter for Chinese text.
 *
 * <p>Wraps another {@link WVTTokenizer}: every token produced by the wrapped
 * tokenizer is passed to {@code ICTCLASUtil.ContentProcess} (the author's
 * ICTCLAS-based segmentation routine) and the resulting words are handed out
 * one at a time through the {@link TokenEnumeration} interface.
 *
 * <p>The instance is its own enumeration and keeps per-document state, so it
 * must not be shared between documents that are tokenized concurrently.
 *
 * @date 2011-3-21
 *
 * @author Jing Yang
 *
 */
public class ChineseTokenizer implements WVTTokenizer, TokenEnumeration {

    /** Segmented words that have been produced but not yet returned by {@link #nextToken()}. */
    private final List<String> currentTokens;

    /** Token stream of the wrapped tokenizer for the current document; {@code null} before the first {@code tokenize} call. */
    private TokenEnumeration input;

    /** Wrapped tokenizer that delivers the raw text chunks to be segmented. */
    private final WVTTokenizer tokenizer;

    /**
     * Creates a Chinese tokenizer on top of the given tokenizer.
     *
     * @param tokenizer the tokenizer whose output is segmented further
     */
    public ChineseTokenizer(WVTTokenizer tokenizer) {
        this.currentTokens = new ArrayList<String>();
        this.input = null;
        this.tokenizer = tokenizer;
    }

    /**
     * Starts tokenizing a new document.
     *
     * @param source the character stream to tokenize
     * @param d      document meta information, forwarded to the wrapped tokenizer
     * @return this object, acting as the enumeration of segmented words, or
     *         {@code null} if {@code source} is {@code null}
     * @throws WVToolException if the wrapped tokenizer fails
     */
    public TokenEnumeration tokenize(Reader source, WVTDocumentInfo d)
            throws WVToolException {
        if (source == null) {
            return null;
        }
        // Drop words left over from a previous, not fully consumed document so
        // they cannot leak into this one.
        currentTokens.clear();
        input = tokenizer.tokenize(source, d);
        readNextToken();
        return this;
    }

    /**
     * Refills the word buffer from the wrapped token stream.
     *
     * <p>Reads at least one upstream token (if any is available) and keeps
     * reading while the buffer is still empty, so that an empty upstream token
     * or one that segments to nothing does not end the enumeration while more
     * input is available. Does nothing if no document has been started yet.
     *
     * @throws WVToolException if the wrapped token stream fails
     */
    public void readNextToken() throws WVToolException {
        if (input == null) {
            return;
        }
        while (input.hasMoreTokens()) {
            String token = input.nextToken();
            if (token != null && token.length() > 0) {
                currentTokens.addAll(ICTCLASUtil.ContentProcess(token));
            }
            if (!currentTokens.isEmpty()) {
                break;
            }
        }
    }

    /**
     * @return {@code true} if a document has been started and at least one
     *         segmented word is buffered
     */
    public boolean hasMoreTokens() {
        return input != null && !currentTokens.isEmpty();
    }

    /**
     * Returns the next segmented word and refills the buffer when it runs empty.
     *
     * @return the next word, or {@code null} if none is left
     * @throws WVToolException if refilling from the wrapped token stream fails
     */
    public String nextToken() throws WVToolException {
        if (currentTokens.isEmpty()) {
            return null;
        }
        String result = currentTokens.remove(0);
        if (currentTokens.isEmpty()) {
            // Look ahead now so hasMoreTokens() stays accurate.
            readNextToken();
        }
        return result;
    }
}
我再来贴一下别人的代码做个对比
写道
package com.xh;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.wltea.analyzer.IKSegmentation;
import org.wltea.analyzer.Lexeme;
import edu.udo.cs.wvtool.generic.tokenizer.SimpleTokenizer;
import edu.udo.cs.wvtool.generic.tokenizer.WVTTokenizer;
import edu.udo.cs.wvtool.main.WVTDocumentInfo;
import edu.udo.cs.wvtool.util.TokenEnumeration;
import edu.udo.cs.wvtool.util.WVToolException;
/**
 * WVTool tokenizer that segments Chinese text with IKAnalyzer.
 *
 * <p>Original author's note: this was written by a beginner combining WVTool
 * with IKAnalyzer; it runs somewhat slowly and is offered only as a reference.
 *
 * <p>Original author's observation (unverified here): WVTool is usually given
 * a {@code SimpleTokenizer}, which appears to hand back raw chunks of text
 * (seemingly one line at a time) without real word segmentation. This class
 * therefore segments each chunk delivered by the wrapped tokenizer; the actual
 * segmentation happens in {@link #readNextTokenizer()}.
 *
 * <p>The instance is its own enumeration and keeps per-document state, so it
 * must not be shared between documents that are tokenized concurrently.
 */
public class IKAnalyzerTokenizer implements WVTTokenizer, TokenEnumeration {

    /** Segmented words that have been produced but not yet returned by {@link #nextToken()}. */
    private final List<String> currentToken;

    /** Token stream of the wrapped tokenizer for the current document; {@code null} before the first {@code tokenize} call. */
    private TokenEnumeration enumeration;

    /** Wrapped tokenizer that delivers the raw text chunks to be segmented. */
    private final WVTTokenizer tokenizer;

    /**
     * Creates an IKAnalyzer-based tokenizer on top of the given tokenizer.
     *
     * @param tokenizer the tokenizer whose output is segmented further
     */
    public IKAnalyzerTokenizer(WVTTokenizer tokenizer) {
        this.tokenizer = tokenizer;
        currentToken = new ArrayList<String>();
        enumeration = null;
    }

    /**
     * Starts tokenizing a new document.
     *
     * @param source the character stream to tokenize
     * @param info   document meta information, forwarded to the wrapped tokenizer
     * @return this object, acting as the enumeration of segmented words, or
     *         {@code null} if {@code source} is {@code null}
     * @throws WVToolException if the wrapped tokenizer fails
     */
    @Override
    public TokenEnumeration tokenize(Reader source, WVTDocumentInfo info)
            throws WVToolException {
        if (source == null) {
            return null;
        }
        // Drop words left over from a previous, not fully consumed document so
        // they cannot leak into this one.
        currentToken.clear();
        // Not recursion: this delegates to the tokenizer passed to the
        // constructor, not to this method.
        enumeration = tokenizer.tokenize(source, info);
        readNextTokenizer();
        return this;
    }

    /**
     * @return {@code true} if a document has been started and at least one
     *         segmented word is buffered
     */
    @Override
    public boolean hasMoreTokens() {
        return enumeration != null && !currentToken.isEmpty();
    }

    /**
     * Returns the next segmented word and refills the buffer when it runs empty.
     *
     * @return the next word, or {@code null} if none is left
     * @throws WVToolException if refilling from the wrapped token stream fails
     */
    @Override
    public String nextToken() throws WVToolException {
        if (currentToken.isEmpty()) {
            return null;
        }
        String result = currentToken.remove(0);
        if (currentToken.isEmpty()) {
            // Look ahead now so hasMoreTokens() stays accurate.
            readNextTokenizer();
        }
        return result;
    }

    /**
     * Refills the word buffer by segmenting upstream text with IKAnalyzer.
     *
     * <p>Keeps reading upstream chunks until at least one word has been
     * produced or the upstream is exhausted, so a chunk that yields no lexemes
     * does not end the enumeration while more input is available.
     *
     * @throws WVToolException       if the wrapped token stream fails
     * @throws IllegalStateException if IKAnalyzer reports an I/O error while
     *                               reading the in-memory chunk (not expected,
     *                               since the text comes from a {@link StringReader})
     */
    private void readNextTokenizer() throws WVToolException {
        while (currentToken.isEmpty() && enumeration.hasMoreTokens()) {
            // One chunk of raw text from the wrapped tokenizer.
            String text = enumeration.nextToken();
            if (text == null || text.length() == 0) {
                continue;
            }
            // Word segmentation is done here.
            IKSegmentation seg = new IKSegmentation(new StringReader(text));
            try {
                Lexeme lex;
                while ((lex = seg.next()) != null) {
                    currentToken.add(lex.getLexemeText());
                }
            } catch (IOException e) {
                // Do not swallow: surface the failure with its cause instead of
                // silently returning a truncated token stream.
                throw new IllegalStateException(
                        "I/O error while segmenting in-memory text with IKSegmentation", e);
            }
        }
    }

    /**
     * Small manual test: tokenizes a two-line sample and prints the words
     * separated by {@code |}.
     */
    public static void main(String[] args) throws WVToolException {
        IKAnalyzerTokenizer toker = new IKAnalyzerTokenizer(new SimpleTokenizer());
        String string = "雅虎新闻雅虎新闻并校十年难言成败\n雅虎新闻雅虎新闻并校十年难言成败";
        StringReader reader = new StringReader(string);
        WVTDocumentInfo info = new WVTDocumentInfo("text.html", "html", "utf-8", "chinese");
        TokenEnumeration enumeration = toker.tokenize(reader, info);
        while (enumeration.hasMoreTokens()) {
            System.out.print(enumeration.nextToken() + "|");
        }
        // Output reported by the original author:
        // 雅虎|新闻|雅虎|新闻|并|校|十年|十|年|难言|成败|雅虎|新闻|雅虎|新闻|并|校|十年|十|年|难言|成败|
    }
}
几乎一样啊,呵呵。