Java 分词器
本文通过代码示例介绍两种 Java 中文分词器的用法:word 分词器和 mmseg4j 分词器(推荐)。
一、代码示例
1.word分词器
代码如下:
import com.alibaba.fastjson.JSON;
import com.chenlb.mmseg4j.ComplexSeg;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MMSeg;
import com.chenlb.mmseg4j.Seg;
import com.google.common.collect.Lists;
import org.apdplat.word.WordSegmenter;
import org.apdplat.word.segmentation.SegmentationAlgorithm;
import org.apdplat.word.segmentation.Word;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * Demonstrates Chinese word segmentation with two libraries: the
 * {@code org.apdplat.word} segmenter and mmseg4j.
 *
 * <p>Every method is static; the class holds no state of its own.
 */
public class WordFilter {

    /** Delimiter placed between tokens by {@link #MMSegDemoToString(String)}. */
    private static final String TOKEN_DELIMITER = ",";

    /**
     * Segments {@code title} by calling {@code WordSegmenter.seg(title)}.
     *
     * <p>According to the original author's note, this call segments with stop
     * words removed, whereas {@code WordSegmenter.segWithStopWords(title)}
     * keeps them.
     *
     * @param title the text to segment
     * @return the words produced by the segmenter
     */
    public static List<Word> automaticSelection(String title) {
        return WordSegmenter.seg(title);
    }

    /**
     * Segments {@code text} once with every {@link SegmentationAlgorithm} constant.
     *
     * @param text the text to segment
     * @return a map from each algorithm's description ({@code getDes()}) to the
     *         segmented text, where every word is followed by a single space
     */
    public static Map<String, String> segMore(String text) {
        Map<String, String> map = new HashMap<>();
        for (SegmentationAlgorithm algorithm : SegmentationAlgorithm.values()) {
            map.put(algorithm.getDes(), seg(text, algorithm));
        }
        return map;
    }

    /**
     * Segments {@code text} with the given algorithm, keeping stop words, and
     * concatenates the word texts.
     *
     * @param text the text to segment
     * @param segmentationAlgorithm the algorithm passed to the segmenter
     * @return the word texts, each followed by one space (so a non-empty result
     *         ends with a trailing space)
     */
    private static String seg(String text, SegmentationAlgorithm segmentationAlgorithm) {
        StringBuilder result = new StringBuilder();
        for (Word word : WordSegmenter.segWithStopWords(text, segmentationAlgorithm)) {
            result.append(word.getText()).append(" ");
        }
        return result.toString();
    }

    /**
     * Runs the mmseg4j demo on a sample sentence and prints the comma-joined tokens.
     * {@link #automaticSelection(String)} and {@link #segMore(String)} can be
     * exercised the same way.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println(WordFilter.MMSegDemoToString("在唐朝有一名李太白诗人"));
    }

    /**
     * Segments {@code txt} with mmseg4j using {@link ComplexSeg}.
     *
     * @param txt the text to segment
     * @return the tokens in the order mmseg4j produced them; if an
     *         {@link IOException} occurs its stack trace is printed and the
     *         tokens collected up to that point are returned
     */
    public static List<String> MMSegDemo(String txt) {
        List<String> wordList = Lists.newArrayList();
        try {
            collectTokens(txt, wordList);
        } catch (IOException e) {
            // Best-effort demo behavior: report the failure and return what was collected.
            e.printStackTrace();
        }
        return wordList;
    }

    /**
     * Segments {@code txt} with mmseg4j using {@link ComplexSeg} and joins the
     * tokens with commas.
     *
     * @param txt the text to segment
     * @return the tokens joined by {@code ","}; the empty string if an
     *         {@link IOException} occurs (its stack trace is printed)
     */
    public static String MMSegDemoToString(String txt) {
        List<String> wordList = Lists.newArrayList();
        try {
            collectTokens(txt, wordList);
            return String.join(TOKEN_DELIMITER, wordList);
        } catch (IOException e) {
            // Best-effort demo behavior: report the failure and fall back to "".
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Reads every token mmseg4j yields for {@code txt} and appends its string
     * form to {@code sink}. Shared by both public mmseg4j entry points.
     *
     * @param txt the text to segment
     * @param sink receives the tokens in order; tokens appended before a failure
     *        remain in it
     * @throws IOException if {@code MMSeg.next()} fails
     */
    private static void collectTokens(String txt, List<String> sink) throws IOException {
        // try-with-resources closes the reader on both the normal and the failure path.
        try (StringReader input = new StringReader(txt)) {
            Dictionary dic = Dictionary.getInstance();
            // ComplexSeg is the segmentation strategy used here; the original author
            // noted SimpleSeg as the alternative.
            Seg seg = new ComplexSeg(dic);
            MMSeg mmSeg = new MMSeg(input, seg);
            com.chenlb.mmseg4j.Word word;
            while ((word = mmSeg.next()) != null) {
                sink.add(word.getString());
            }
        }
    }
}
2.mmseg4j分词器(推荐)
代码如下(示例):
/**
 * Segments {@code txt} with mmseg4j using {@code ComplexSeg}.
 *
 * @param txt the text to segment
 * @return the tokens in the order mmseg4j produced them; if an
 *         {@code IOException} occurs its stack trace is printed and the
 *         tokens collected up to that point are returned
 */
public static List<String> MMSegDemo(String txt){
    List<String> wordList = Lists.newArrayList();
    // try-with-resources closes the reader on both the normal and the failure path.
    try (StringReader input = new StringReader(txt)) {
        Dictionary dic = Dictionary.getInstance();
        // ComplexSeg is the segmentation strategy used here; the original author
        // noted SimpleSeg as the alternative.
        Seg seg = new ComplexSeg(dic);
        MMSeg mmSeg = new MMSeg(input, seg);
        com.chenlb.mmseg4j.Word word;
        while ((word = mmSeg.next()) != null) {
            // Each word is one segmented token; gather them all for the caller.
            wordList.add(word.getString());
        }
    } catch (IOException e) {
        // Best-effort demo behavior: report the failure and return what was collected.
        e.printStackTrace();
    }
    return wordList;
}
/**
 * Segments {@code txt} with mmseg4j using {@code ComplexSeg} and joins the
 * tokens with commas.
 *
 * @param txt the text to segment
 * @return the tokens joined by {@code ","}; the empty string if an
 *         {@code IOException} occurs (its stack trace is printed)
 */
public static String MMSegDemoToString(String txt){
    List<String> wordList = Lists.newArrayList();
    // try-with-resources closes the reader on both the normal and the failure path.
    try (StringReader input = new StringReader(txt)) {
        Dictionary dic = Dictionary.getInstance();
        // ComplexSeg is the segmentation strategy used here; the original author
        // noted SimpleSeg as the alternative.
        Seg seg = new ComplexSeg(dic);
        MMSeg mmSeg = new MMSeg(input, seg);
        com.chenlb.mmseg4j.Word word;
        while ((word = mmSeg.next()) != null) {
            // Each word is one segmented token; gather them before joining.
            wordList.add(word.getString());
        }
        // Join only after every token has been read successfully.
        return String.join(",", wordList);
    } catch (IOException e) {
        // Best-effort demo behavior: report the failure and fall back to "".
        e.printStackTrace();
        return "";
    }
}