Based on what I have learned so far, Stanford NLP can handle both Chinese word segmentation and sentence splitting. Below are examples using several different APIs; feel free to try them out.
package com.example.utils;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.util.List;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.WordToSentenceProcessor;
import edu.stanford.nlp.util.CoreMap;
import opennlp.tools.sentdetect.SentenceDetectorEvaluator;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.sentdetect.SentenceSampleStream;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.TrainingParameters;
/**
 * Filename: NlpSBD.java
 * Description: sentence boundary disambiguation (SBD)
 * Copyright: Copyright (c) 2019 All Rights Reserved.
 * @author: wangk
 * @version: 1.0
 * Create at: 2019-05-07 09:26:36
 *
 * Modification History:
 * Date         Author   Version   Description
 * ------------------------------------------------------------------
 * 2019-05-07   wangk    1.0       1.0 Version
 *
 */
public class NlpSBD {
    // Besides the NLP APIs, plain Java offers two ways to do simple sentence splitting:
    // 1. regular expressions, 2. the java.text.BreakIterator class (see the plainJavaSplit sketch right below).
    static String paragraph = "A simple approach to create a class to hold and remove stopwords. Let's IBM. this is a cat.";
    static String chineseLanguage = "第一个括号子表达式捕获 Web 地址的协议部分。 该子表达式匹配在冒号和两个正斜杠前面的任何单词。";
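    /**
     * A minimal sketch (a hypothetical helper, not in the original post) of the two
     * plain-Java approaches mentioned above. Both are rough heuristics for English
     * text: the regex splits on sentence-final punctuation followed by whitespace,
     * and BreakIterator uses the JDK's locale-aware sentence rules. Neither handles
     * abbreviations like "IBM." well.
     */
    public static void plainJavaSplit(String text) {
        // Approach 1: regex. Assumes sentences end with ., ! or ? followed by whitespace.
        for (String s : text.split("(?<=[.!?])\\s+")) {
            System.out.println("[regex] " + s);
        }
        // Approach 2: java.text.BreakIterator with English sentence rules.
        java.text.BreakIterator it = java.text.BreakIterator.getSentenceInstance(java.util.Locale.US);
        it.setText(text);
        int start = it.first();
        for (int end = it.next(); end != java.text.BreakIterator.DONE; start = end, end = it.next()) {
            System.out.println("[BreakIterator] " + text.substring(start, end).trim());
        }
    }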
public static void main(String[] args) {
        NlpSBD ns = new NlpSBD();
        //ns.sentDetect(paragraph);
        //ns.trainText();
        //System.out.println("Memory info: " + toMemoryInfo());
        ns.stanfordCoreNLP(chineseLanguage);
}
    /**
     * Returns memory usage information for the current JVM.
     *
     * @return free/total heap memory in MB
     */
    public static String toMemoryInfo() {
        Runtime currRuntime = Runtime.getRuntime();
        int nFreeMemory = (int) (currRuntime.freeMemory() / 1024 / 1024);
        int nTotalMemory = (int) (currRuntime.totalMemory() / 1024 / 1024);
        return nFreeMemory + "M/" + nTotalMemory + "M(free/total)";
    }
    /**
     * @Description: OpenNLP SentenceDetectorME.sentDetect demo
     * @author wangk
     * @param text
     * @date: 2019-05-07 09:49:00
     */
public void sentDetect(String text) {
        try (InputStream is = new FileInputStream(
                new File(this.getClass().getResource("").getPath() + "/nlpbin", "en-sent.bin"))) {
            SentenceModel model = new SentenceModel(is);
            SentenceDetectorME detector = new SentenceDetectorME(model);
            String[] sentences = detector.sentDetect(text);
            for (String sentence : sentences) {
                System.out.println(sentence);
            }
            // getSentenceProbabilities returns the confidence for each sentence from the last sentDetect call
            double[] probabilities = detector.getSentenceProbabilities();
            for (double p : probabilities) {
                System.out.println(p);
            }
} catch (IOException e) {
e.printStackTrace();
}
}
    /* Sample output:
    A simple approach to create a class to hold and remove stopwords.
    Let's IBM.this is a cat.
    0.996295644468833
    0.974478290117916
    */
    /**
     * @Description: OpenNLP SentenceDetectorME.sentPosDetect demo (sentence spans)
     * @author wangk
     * @param text
     * @date: 2019-05-07 10:22:09
     */
    public void sentPosDetect(String text) {
        try (InputStream is = new FileInputStream(
                new File(this.getClass().getResource("").getPath() + "/nlpbin", "en-sent.bin"))) {
            SentenceModel model = new SentenceModel(is);
            SentenceDetectorME detector = new SentenceDetectorME(model);
            Span[] spans = detector.sentPosDetect(text);
            for (Span span : spans) {
                System.out.println(span + "[" + text.substring(span.getStart(), span.getEnd()) + "]");
            }
            // getSentenceProbabilities returns the confidence for each detected sentence
            double[] probabilities = detector.getSentenceProbabilities();
            for (double p : probabilities) {
                System.out.println(p);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        /* Sample output:
        [0..65)[A simple approach to create a class to hold and remove stopwords.]
        [66..90)[Let's IBM.this is a cat.]
        0.996295644468833
        0.974478290117916 */
}
//https://stanfordnlp.github.io/CoreNLP/
    /**
     * @Description: train a sentence-splitting model on your own corpus
     * @author wangk
     * @date: 2019-05-07 15:49:26
     */
    public void trainText() {
        try {
            File file = new File(this.getClass().getResource("").getPath() + "/exam", "sentence.train");
            Charset charset = Charset.forName("UTF-8");
            ObjectStream<String> lineStream =
                    new PlainTextByLineStream(new MarkableFileInputStreamFactory(file), charset);
            ObjectStream<SentenceSample> ss = new SentenceSampleStream(lineStream);
            SentenceModel model = SentenceDetectorME.train("en", ss, true, null, TrainingParameters.defaultParams());
            try (OutputStream modelStream = new BufferedOutputStream(new FileOutputStream("modelFile"))) {
                model.serialize(modelStream);
            }
            // Evaluate the model with SentenceDetectorEvaluator (reports an F-measure).
            // Training consumed the sample stream, so rewind it first; note that evaluating
            // on the training data itself only measures fit, not real accuracy.
            ss.reset();
            SentenceDetectorME detector = new SentenceDetectorME(model);
            SentenceDetectorEvaluator sde = new SentenceDetectorEvaluator(detector, null);
            sde.evaluate(ss);
            System.out.println(sde.getFMeasure());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
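    /* For reference (per the OpenNLP manual, not shown in the original post): the
       sentence-detector training format is plain text with one sentence per line,
       and an empty line marking a document boundary. A tiny sentence.train might look like:

       This is the first sentence.
       This is the second sentence.

       A new document starts here.
    */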
    // Stanford API
    /**
     * @Description: sentence splitting with Stanford's PTBTokenizer
     * @author wangk
     * @param text
     * @date: 2019-05-09 14:45:50
     */
    public void pTBTokenizer(String text) {
        PTBTokenizer<CoreLabel> ptb = new PTBTokenizer<>(new StringReader(text), new CoreLabelTokenFactory(), null);
        // WordToSentenceProcessor.process groups the token stream produced by PTBTokenizer into sentences
        WordToSentenceProcessor<CoreLabel> wtsp = new WordToSentenceProcessor<>();
        List<List<CoreLabel>> sents = wtsp.process(ptb.tokenize());
        for (List<CoreLabel> sent : sents) {
            System.out.println(sent);
        }
        /* English result:
        [A, simple, approach, to, create, a, class, to, hold, and, remove, stopwords, .]
        [Let, 's, IBM, .]
        [this, is, a, cat, .]*/
    }
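    /**
     * A minimal sketch (not in the original post): Stanford's DocumentPreprocessor
     * wraps the tokenize-then-group steps above into a single iterable, which is the
     * usual shortcut when you only need English sentence splitting.
     */
    public void documentPreprocessor(String text) {
        edu.stanford.nlp.process.DocumentPreprocessor dp =
                new edu.stanford.nlp.process.DocumentPreprocessor(new StringReader(text));
        // Each iteration yields one sentence as a list of tokens.
        for (List<edu.stanford.nlp.ling.HasWord> sentence : dp) {
            System.out.println(sentence);
        }
    }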
    /**
     * @Description: processing Chinese with StanfordCoreNLP
     * @author wangk
     * @date: 2019-05-13 15:38
     */
    public void stanfordCoreNLP(String text) {
        // Loads the bundled Chinese pipeline configuration (requires the Chinese models jar on the classpath)
        String props = "StanfordCoreNLP-chinese.properties";
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
Annotation document = pipeline.process(text);
List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
StringBuilder result = new StringBuilder();
for (CoreMap sentence : sentences) {
for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
String word = token.get(CoreAnnotations.TextAnnotation.class);
result.append(word).append(" ");
}
}
System.out.println(result.toString());
        // Chinese result: 第一 个 括号 子 表达式 捕获 Web 地址 的 协议 部分 。 该 子 表达式 匹配 在 冒号 和 两 个 正 斜杠 前面 的 任何 单词 。
Annotation ano = new Annotation(text);
pipeline.annotate(ano);
try {
            pipeline.xmlPrint(ano, System.out); // prints the full annotation as XML
} catch (IOException e) {
e.printStackTrace();
}
        /* Sample output:
        第一 个 括号 子 表达式 捕获 Web 地址 的 协议 部分 。 该 子 表达式 匹配 在 冒号 和 两 个 正 斜杠 前面 的 任何 单词 。
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet href="CoreNLP-to-HTML.xsl" type="text/xsl"?>
<root>
<document>
<sentences>
<sentence id="1">
<tokens>
<token id="1">
<word>第一</word>
<lemma>第一</lemma>
<CharacterOffsetBegin>0</CharacterOffsetBegin>
<CharacterOffsetEnd>2</CharacterOffsetEnd>
<POS>OD</POS>
<NER>ORDINAL</NER>
<NormalizedNER>1</NormalizedNER>
<Speaker>PER0</Speaker>
</token>
<token id="2">
<word>个</word>
<lemma>个</lemma>
<CharacterOffsetBegin>2</CharacterOffsetBegin>
<CharacterOffsetEnd>3</CharacterOffsetEnd>
<POS>M</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="3">
<word>括号</word>
<lemma>括号</lemma>
<CharacterOffsetBegin>3</CharacterOffsetBegin>
<CharacterOffsetEnd>5</CharacterOffsetEnd>
<POS>NN</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="4">
<word>子</word>
<lemma>子</lemma>
<CharacterOffsetBegin>5</CharacterOffsetBegin>
<CharacterOffsetEnd>6</CharacterOffsetEnd>
<POS>NN</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="5">
<word>表达式</word>
<lemma>表达式</lemma>
<CharacterOffsetBegin>6</CharacterOffsetBegin>
<CharacterOffsetEnd>9</CharacterOffsetEnd>
<POS>AD</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="6">
<word>捕获</word>
<lemma>捕获</lemma>
<CharacterOffsetBegin>9</CharacterOffsetBegin>
<CharacterOffsetEnd>11</CharacterOffsetEnd>
<POS>VV</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="7">
<word>Web</word>
<lemma>web</lemma>
<CharacterOffsetBegin>12</CharacterOffsetBegin>
<CharacterOffsetEnd>15</CharacterOffsetEnd>
<POS>NN</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="8">
<word>地址</word>
<lemma>地址</lemma>
<CharacterOffsetBegin>16</CharacterOffsetBegin>
<CharacterOffsetEnd>18</CharacterOffsetEnd>
<POS>NN</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="9">
<word>的</word>
<lemma>的</lemma>
<CharacterOffsetBegin>18</CharacterOffsetBegin>
<CharacterOffsetEnd>19</CharacterOffsetEnd>
<POS>DEG</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="10">
<word>协议</word>
<lemma>协议</lemma>
<CharacterOffsetBegin>19</CharacterOffsetBegin>
<CharacterOffsetEnd>21</CharacterOffsetEnd>
<POS>NN</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="11">
<word>部分</word>
<lemma>部分</lemma>
<CharacterOffsetBegin>21</CharacterOffsetBegin>
<CharacterOffsetEnd>23</CharacterOffsetEnd>
<POS>NN</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
<token id="12">
<word>。</word>
<lemma>。</lemma>
<CharacterOffsetBegin>23</CharacterOffsetBegin>
<CharacterOffsetEnd>24</CharacterOffsetEnd>
<POS>PU</POS>
<NER>O</NER>
<Speaker>PER0</Speaker>
</token>
</tokens>
          ...
</sentence>
...
</sentences>
<coreference>
<coreference>
<mention representative="true">
<sentence>1</sentence>
<start>1</start>
<end>5</end>
<head>4</head>
<text>第一 个 括号 子</text>
</mention>
<mention>
<sentence>2</sentence>
<start>1</start>
<end>3</end>
<head>2</head>
<text>该 子</text>
</mention>
</coreference>
</coreference>
</document>
</root>*/
}
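    /**
     * A minimal sketch (not in the original post): if you only need Chinese
     * segmentation and sentence splitting, you can load the bundled Chinese defaults
     * and trim the annotator list to the first two stages, avoiding the cost of
     * POS/NER/parse/coref. Assumes the stanford-chinese-corenlp models jar is on the
     * classpath so the properties file resolves.
     */
    public void chineseSsplitOnly(String text) {
        java.util.Properties props = new java.util.Properties();
        try (InputStream in = StanfordCoreNLP.class.getClassLoader()
                .getResourceAsStream("StanfordCoreNLP-chinese.properties")) {
            if (in != null) {
                props.load(in);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Keep only the first two annotators (segmentation + sentence split).
        String[] stages = props.getProperty("annotators", "tokenize, ssplit").split(",\\s*");
        props.setProperty("annotators", stages[0] + ", " + stages[1]);
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation doc = pipeline.process(text);
        for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
            System.out.println(sentence.get(CoreAnnotations.TextAnnotation.class));
        }
    }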
}