2021SC@SDUSC
此次分析的主要为源码中的simple文件夹下对中文处理的部分,代码如下
首先是ChineseSentence类,它继承了Sentence类。
这个类的主要作用在于把每句句子中的词拆出来,对于长文则一句句处理
public class ChineseSentence extends Sentence {
static Properties SINGLE_SENTENCE_DOCUMENT = new Properties()
创建properties类对象,从一个句子中创建文件{{
try (InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem("edu/stanford/nlp/pipeline/StanfordCoreNLP-chinese.properties")){
load(is);
此处load函数应是读取is文档,即上面路径中的文件
} catch (IOException e) {
throw new RuntimeIOException(e);
}
setProperty("language", "chinese");
setProperty("annotators", "");
setProperty("ssplit.isOneSentence", "true");
setProperty("tokenize.class", "PTBTokenizer");
setProperty("tokenize.language", "zh");
}};
与上同理,但此处处理的是已完成分词(tokenized)、以空格分隔的句子
// Properties for annotating a single, ALREADY-TOKENIZED Chinese sentence
// (tokens joined by whitespace), as opposed to SINGLE_SENTENCE_DOCUMENT above
// which tokenizes raw text itself.
private static Properties SINGLE_SENTENCE_TOKENIZED_DOCUMENT = new Properties() {{
// Load the stock Chinese pipeline configuration bundled on the classpath.
try (InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem("edu/stanford/nlp/pipeline/StanfordCoreNLP-chinese.properties")){
load(is);  // Properties.load: read the key/value pairs from the stream
} catch (IOException e) {
throw new RuntimeIOException(e);  // wrap as unchecked, preserving the cause
}
setProperty("language", "chinese");
setProperty("annotators", "");                          // start with no annotators configured
setProperty("ssplit.isOneSentence", "true");            // the entire input is exactly one sentence
setProperty("tokenize.class", "WhitespaceTokenizer");   // input is pre-tokenized; split on whitespace only
setProperty("tokenize.language", "zh");
setProperty("tokenize.whitespace", "true"); // redundant? — appears to overlap with tokenize.class above; TODO confirm
}};
// Creates a sentence from raw Chinese text: wraps the text in a
// ChineseDocument and annotates it with the raw-sentence properties.
public ChineseSentence(String text) {
super(new ChineseDocument(text), SINGLE_SENTENCE_DOCUMENT);
}
此处放上sentence类中的部分代码,即Chinesesentence父类的代码
// (Excerpt from the parent class Sentence, reproduced for analysis.)
// Builds a single-sentence view over {@code doc}, forcing one-sentence
// splitting if the caller did not already request it.
// FIX: in the pasted code the "} else {" of the second if-statement had been
// fused into the preceding trailing comment, deleting the else branch; restored.
protected Sentence(Document doc, Properties props) {
  this.document = doc;
  if (props.containsKey("ssplit.isOneSentence")) {
    this.impl = this.document.sentence(0, props).impl;
  } else {
    // Copy the properties and force one-sentence splitting so that
    // document.sentence(0) is guaranteed to span the whole text.
    // NOTE(review): new Properties(props) makes 'props' the DEFAULTS table,
    // not a flat copy — defaults are visible via getProperty but not via keySet.
    Properties modProps = new Properties(props);
    modProps.setProperty("ssplit.isOneSentence", "true");
    this.impl = this.document.sentence(0, modProps).impl;
  }
  // Set tokens: share the token builders (and proto impl) with the document's
  // first — and only — sentence.
  this.tokensBuilders = document.sentence(0).tokensBuilders;
  assert (this.document.sentence(0).impl == this.impl);
  assert (this.document.sentence(0).tokensBuilders == this.tokensBuilders);
  if (props == SINGLE_SENTENCE_TOKENIZED_DOCUMENT) {
    this.defaultProps = SINGLE_SENTENCE_DOCUMENT; // no longer care about the tokenized-input flag
  } else {
    this.defaultProps = props;
  }
  this.docFn = Document::new;
}
// Creates a sentence from pre-tokenized words: annotated with the
// whitespace-tokenizer properties so the token boundaries are kept as given.
public ChineseSentence(List<String> tokens) {
super(ChineseDocument::new, tokens, SINGLE_SENTENCE_TOKENIZED_DOCUMENT);
}
// Reconstructs a sentence from its protobuf serialization.
public ChineseSentence(CoreNLPProtos.Sentence proto) {
super(ChineseDocument::new, proto, SINGLE_SENTENCE_DOCUMENT);
}
}
接下来是ChineseDocument类。
这个类主要是在sentence类处理完之后,对document进行处理的类
public class ChineseDocument extends Document {
继承document类
// Chinese word segmenter (CTB model), wrapped in Lazy so the model files are
// only loaded on first use — model loading is expensive.
private static final Lazy<Annotator> chineseSegmenter = Lazy.of(() -> new ChineseSegmenterAnnotator("segment", new Properties() {{
setProperty("segment.model", "edu/stanford/nlp/models/segmenter/chinese/ctb.gz");                        // CTB segmentation model
setProperty("segment.sighanCorporaDict", "edu/stanford/nlp/models/segmenter/chinese");
setProperty("segment.serDictionary", "edu/stanford/nlp/models/segmenter/chinese/dict-chris6.ser.gz");     // serialized dictionary
setProperty("segment.sighanPostProcessing", "true");
}}));
即从处理过的语段中创建document
附上部分document类代码
// (Excerpt from the parent class Document, reproduced for analysis.)
// Creates a Document wrapper from an already-annotated Annotation: serializes
// the annotation to a proto builder and wraps each annotated sentence.
public Document(Properties props, Annotation ann) {
this.defaultProps = props;
StanfordCoreNLP.getDefaultAnnotatorPool(props, new AnnotatorImplementations()); // cache the annotator pool
this.impl = new ProtobufAnnotationSerializer(false).toProtoBuilder(ann);
List<CoreMap> sentences = ann.get(CoreAnnotations.SentencesAnnotation.class);
this.sentences = new ArrayList<>(sentences.size());  // presized to the sentence count
// Wrap each CoreMap sentence as a simple-API Sentence sharing this document.
for (CoreMap sentence : sentences) {
this.sentences.add(new Sentence(this, this.serializer.toProtoBuilder(sentence), sentence.get(CoreAnnotations.TextAnnotation.class), this.defaultProps));
}
}
// Default properties shared by all ChineseDocument constructors: the stock
// Chinese pipeline configuration plus binarized parse-tree output.
static final Properties EMPTY_PROPS = new Properties() {{
// Load the stock Chinese pipeline configuration bundled on the classpath.
try (InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem("edu/stanford/nlp/pipeline/StanfordCoreNLP-chinese.properties")){
load(is);  // Properties.load: read the key/value pairs from the stream
} catch (IOException e) {
throw new RuntimeIOException(e);  // wrap as unchecked, preserving the cause
}
setProperty("language", "chinese");
setProperty("annotators", "");              // start with no annotators configured
setProperty("parse.binaryTrees", "true");   // request binarized trees from the parser
}};
同样地,加载配置文件时捕捉IO异常,并包装为RuntimeIOException重新抛出
// Creates a document from raw Chinese text using the default properties.
public ChineseDocument(String text) {
super(ChineseDocument.EMPTY_PROPS, text);
}
// Creates a document from an already-annotated Annotation.
@SuppressWarnings("Convert2streamapi")
public ChineseDocument(Annotation ann) {
super(ChineseDocument.EMPTY_PROPS, ann);
}
// Reconstructs a document from its protobuf serialization.
public ChineseDocument(CoreNLPProtos.Document proto) {
super(ChineseDocument.EMPTY_PROPS, proto);
}
受保护的构造函数,供子类传入自定义properties,直接调用父类的对应构造函数
// Protected constructor for subclasses that supply their own properties
// (e.g. ChineseSentence's single-sentence configurations).
protected ChineseDocument(Properties props, String text) {
super(props, text);
}
同上
// Routes sentence extraction through the lazily-loaded Chinese segmenter,
// overriding the parent's default tokenization.
@Override
public List<Sentence> sentences(Properties props) {
return this.sentences(props, chineseSegmenter.get());
}
// Delegates lemmatization to mockLemma — presumably a stand-in since no
// Chinese lemmatizer is available; TODO confirm mockLemma's behavior.
@Override
protected Document runLemma(Properties props) {
return mockLemma(props);
}
// Sentiment is unsupported for Chinese: fail fast rather than return garbage.
@Override
protected Document runSentiment(Properties props) {
throw new IllegalArgumentException("Sentiment analysis is not implemented for Chinese");
}
// Falls back to the constituency parser for dependencies until a Chinese
// neural dependency-parser model exists (per the in-code TODO).
@Override // TODO(danqi; from Gabor): remove this method when we have a trained NNDep model
Document runDepparse(Properties props) {
return runParse(props);
}
}