如何进行name提取?
name提取是指从一段短文本中提取人名,即词性为nr的词。我们采用HanLP进行NLP分词;如果短文本中没有词性为nr的词,则用词性为n(名词)的词进行填充。
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.NShort.NShortSegment;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.Viterbi.ViterbiSegment;
import com.hankcs.hanlp.seg.common.Term;
import java.util.ArrayList;
import java.util.List;
/**
 * Extracts name-like words from a comma-separated short text.
 *
 * <p>The input is split on the ASCII comma. Each piece shorter than three
 * characters is kept verbatim. Every other piece is segmented and searched
 * in two passes:
 * <ol>
 *   <li>Collect every term whose nature tag contains {@code "nr"}, trying the
 *       default segmenter first, then the N-shortest-path segmenter, then the
 *       Viterbi segmenter. The first segmenter that yields any match wins.</li>
 *   <li>If none of the three yields a match, lower the criterion to nature
 *       tags containing {@code "n"} and keep only the first matching term,
 *       trying the same three segmentations in the same order.</li>
 * </ol>
 *
 * @param str comma-separated text; {@code null} is treated as empty
 * @return the collected words joined by {@code ", "}, or an empty string when
 *         nothing was collected
 */
public static String nameextract(String str) {
    if (str == null) {
        return "";
    }

    // HanLP.Config.IOAdapter = new HadoopFileIoAdapter();
    Segment[] segmenters = {
        HanLP.newSegment()
            .enableNameRecognize(true)
            .enableCustomDictionary(true),
        new NShortSegment()
            .enableCustomDictionary(true)
            .enablePlaceRecognize(true)
            .enableOrganizationRecognize(true),
        new ViterbiSegment()
            .enableCustomDictionary(true)
            .enablePlaceRecognize(true)
            .enableOrganizationRecognize(true)
    };

    List<String> names = new ArrayList<>();
    for (String piece : str.split(",")) {
        if (piece.length() < 3) {
            // Too short to segment; keep the piece as it is.
            names.add(piece);
            continue;
        }

        // Segment lazily: a later segmenter only runs when the earlier ones
        // found no "nr" term. Results are cached for the noun fallback.
        List<List<Term>> segmentations = new ArrayList<>(segmenters.length);
        boolean found = false;
        for (Segment segmenter : segmenters) {
            List<Term> terms = segmenter.seg(piece);
            segmentations.add(terms);
            if (collectByNature(terms, "nr", false, names)) {
                found = true;
                break;
            }
        }

        if (!found) {
            // No "nr" term anywhere, so all three segmentations are cached.
            // Fall back to the first term whose nature tag contains "n".
            for (List<Term> terms : segmentations) {
                if (collectByNature(terms, "n", true, names)) {
                    break;
                }
            }
        }
    }

    // Join explicitly instead of stripping brackets from List.toString(),
    // which would also delete '[' and ']' occurring inside the words.
    return String.join(", ", names);
}

/**
 * Appends to {@code out} the words of the terms whose nature tag contains
 * {@code tag}.
 *
 * @param terms     segmented terms to scan
 * @param tag       substring looked for in each term's nature tag
 * @param firstOnly when {@code true}, stop after the first matching term
 * @param out       list receiving the matching words
 * @return {@code true} if at least one word was appended
 */
private static boolean collectByNature(List<Term> terms, String tag, boolean firstOnly, List<String> out) {
    boolean matched = false;
    for (Term term : terms) {
        if (term.nature.toString().contains(tag)) {
            out.add(term.word);
            matched = true;
            if (firstOnly) {
                break;
            }
        }
    }
    return matched;
}
public static void main(String[] args){
Sysout.print.out(nameextract("小红第一次写文章"))
}