Stanford CoreNLP 的中文切词有时不尽如人意,这时我们就需要实现一个自定义切词类,以满足自己的定制需求(例如加入各种词典进行干预)。本文介绍如何把 IKAnalyzer 作为 CoreNLP 的切词工具。
《stanford corenlp的TokensRegex》一文提到了 CoreNLP 的配置文件 CoreNLP-chinese.properties,其中的 customAnnotatorClass.segment 用于指定切词类。因此,我们只需要模仿 ChineseSegmenterAnnotator 实现一个自己的 Annotator,并把它的类名配置到该属性上即可。
/**
 * Chinese segmenter annotator that delegates word segmentation to IK Analyzer
 * instead of the segmenter used by {@link ChineseSegmenterAnnotator}.
 *
 * <p>All constructors simply forward to the matching superclass constructor;
 * only {@link #runSegmentation(CoreMap)} is replaced.
 */
public class IKSegmenterAnnotator extends ChineseSegmenterAnnotator {

    /** Forwards to the no-argument superclass constructor. */
    public IKSegmenterAnnotator() {
        super();
    }

    /**
     * Forwards to the superclass constructor.
     *
     * @param verbose passed through unchanged to the superclass
     */
    public IKSegmenterAnnotator(boolean verbose) {
        super(verbose);
    }

    /**
     * Forwards to the superclass constructor.
     *
     * @param segLoc  passed through unchanged to the superclass
     * @param verbose passed through unchanged to the superclass
     */
    public IKSegmenterAnnotator(String segLoc, boolean verbose) {
        super(segLoc, verbose);
    }

    /**
     * Forwards to the superclass constructor.
     *
     * @param segLoc            passed through unchanged to the superclass
     * @param verbose           passed through unchanged to the superclass
     * @param serDictionary     passed through unchanged to the superclass
     * @param sighanCorporaDict passed through unchanged to the superclass
     */
    public IKSegmenterAnnotator(String segLoc, boolean verbose, String serDictionary,
                                String sighanCorporaDict) {
        super(segLoc, verbose, serDictionary, sighanCorporaDict);
    }

    /**
     * Forwards to the superclass constructor.
     *
     * @param name  annotator name, passed through unchanged to the superclass
     * @param props configuration properties, passed through unchanged to the superclass
     */
    public IKSegmenterAnnotator(String name, Properties props) {
        super(name, props);
    }

    /**
     * Builds a token carrying a word and its character span.
     *
     * @param word  token text
     * @param begin offset of the first character of the token in the text
     * @param end   offset one past the last character of the token in the text
     * @return a new label with the word and both offset annotations set
     */
    private static CoreLabel newToken(String word, int begin, int end) {
        CoreLabel token = new CoreLabel();
        token.setWord(word);
        token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
        token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
        return token;
    }

    /**
     * Segments {@code text} with IK Analyzer (smart mode) and returns one token
     * per non-empty lexeme.
     *
     * <p>Offsets are taken from the positions reported by IK for each lexeme
     * rather than by summing word lengths: IK can skip characters it does not
     * emit as lexemes (punctuation, for example), and summing lengths would
     * then shift the offsets of every following token.
     *
     * <p>If IK fails with an {@link IOException}, any partial result is
     * discarded and the whole text is returned as a single token.
     *
     * @param text text to segment; {@code null} or empty yields no tokens
     * @return the tokens in the order IK produced them
     */
    private List<CoreLabel> segment(String text) {
        List<CoreLabel> tokens = new ArrayList<CoreLabel>();
        if (text == null || text.length() == 0) {
            return tokens;
        }
        try {
            IKSegmenter ik = new IKSegmenter(new StringReader(text), true);
            Lexeme lex;
            while ((lex = ik.next()) != null) {
                String word = lex.getLexemeText();
                if (word == null || word.length() == 0) {
                    continue;
                }
                tokens.add(newToken(word, lex.getBeginPosition(), lex.getEndPosition()));
            }
        } catch (IOException e) {
            // Best effort: report on stderr, like the other diagnostics of this
            // class, and fall back to a single token spanning the whole text.
            System.err.println(e);
            tokens.clear();
            tokens.add(newToken(text, 0, text.length()));
        }
        return tokens;
    }

    /**
     * Flags, for every token, the character label that starts at the token's
     * begin offset as a segment start ({@code ChineseSegAnnotation = "1"}).
     *
     * <p>Character labels are matched by their begin offset instead of by list
     * index, so the character list does not need one entry per text character.
     * Assumes both lists are ordered by ascending offset (NOTE(review): IK
     * smart mode is expected to emit non-overlapping lexemes in text order;
     * confirm for the IK version in use).
     *
     * @param sentChars per-character labels of the text
     * @param tokens    tokens with their begin offsets set
     */
    private static void markWordStarts(List<CoreLabel> sentChars, List<CoreLabel> tokens) {
        int cursor = 0;
        for (CoreLabel token : tokens) {
            int begin = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
            while (cursor < sentChars.size()) {
                CoreLabel ch = sentChars.get(cursor);
                Integer charBegin = ch.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
                if (charBegin != null && charBegin.intValue() >= begin) {
                    if (charBegin.intValue() == begin) {
                        ch.set(CoreAnnotations.ChineseSegAnnotation.class, "1");
                    }
                    break;
                }
                cursor++;
            }
        }
    }

    /**
     * Segments the text of {@code annotation} with IK Analyzer and stores the
     * resulting tokens under {@code TokensAnnotation}.
     *
     * <p>Reads {@code TextAnnotation} and {@code CharactersAnnotation} from the
     * annotation. Each token gets its word and its begin/end character offsets,
     * and the character label at each token start is flagged with
     * {@code ChineseSegAnnotation = "1"}. The text and the resulting word list
     * are always printed to stderr.
     *
     * @param annotation the annotation to segment; modified in place
     */
    @Override
    public void runSegmentation(CoreMap annotation) {
        String text = annotation.get(CoreAnnotations.TextAnnotation.class);
        List<CoreLabel> sentChars =
                annotation.get(ChineseCoreAnnotations.CharactersAnnotation.class);

        List<CoreLabel> tokens = new ArrayList<CoreLabel>();
        annotation.set(CoreAnnotations.TokensAnnotation.class, tokens);
        tokens.addAll(segment(text));

        List<String> words = new ArrayList<String>(tokens.size());
        for (CoreLabel token : tokens) {
            words.add(token.word());
        }
        System.err.println(text);
        System.err.println("--->");
        System.err.println(words);

        if (sentChars != null) {
            markWordStarts(sentChars, tokens);
        }
    }
}