Java 分词器


示例:分别使用 word 分词器和 mmseg4j 分词器对中文文本进行分词。

一、代码示例

1.word分词器

代码如下:


import com.alibaba.fastjson.JSON;
import com.chenlb.mmseg4j.ComplexSeg;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MMSeg;
import com.chenlb.mmseg4j.Seg;

import com.google.common.collect.Lists;
import org.apdplat.word.WordSegmenter;
import org.apdplat.word.segmentation.SegmentationAlgorithm;
import org.apdplat.word.segmentation.Word;

import java.io.IOException;
import java.io.StringReader;
import java.io.UncheckedIOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;


/**
 * Demonstrates two Chinese word-segmentation libraries: the apdplat "word"
 * segmenter and mmseg4j.
 *
 * <p>Every method is static and the class keeps no state of its own.
 */
public class WordFilter {

    /**
     * Segments a title with the word library's default entry point.
     *
     * <p>Delegates to {@link WordSegmenter#seg(String)}, the variant the
     * original author used to segment with stop words removed;
     * {@code WordSegmenter.segWithStopWords(title)} is the counterpart that
     * keeps them.
     *
     * @param title the text to segment
     * @return the words produced by the segmenter
     */
    public static List<Word> automaticSelection(String title) {
        return WordSegmenter.seg(title);
    }

    /**
     * Segments the same text once with every {@link SegmentationAlgorithm}.
     *
     * @param text the text to segment
     * @return a map from each algorithm's description ({@code getDes()}) to
     *         its segmentation of {@code text}, formatted by
     *         {@link #seg(String, SegmentationAlgorithm)}
     */
    public static Map<String, String> segMore(String text) {
        Map<String, String> map = new HashMap<>();
        for (SegmentationAlgorithm segmentationAlgorithm : SegmentationAlgorithm.values()) {
            map.put(segmentationAlgorithm.getDes(), seg(text, segmentationAlgorithm));
        }
        return map;
    }

    /**
     * Segments text with one algorithm, keeping stop words, and renders the
     * result as a single string.
     *
     * @param text the text to segment
     * @param segmentationAlgorithm the algorithm to apply
     * @return the words in order, each followed by one space (so a non-empty
     *         result ends with a trailing space)
     */
    private static String seg(String text, SegmentationAlgorithm segmentationAlgorithm) {
        StringBuilder result = new StringBuilder();
        for (Word word : WordSegmenter.segWithStopWords(text, segmentationAlgorithm)) {
            result.append(word.getText()).append(" ");
        }
        return result.toString();
    }

    /**
     * Prints the comma-joined mmseg4j segmentation of a sample sentence.
     *
     * <p>{@link #automaticSelection(String)} and {@link #segMore(String)} can
     * be called the same way to try the word library instead.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println(WordFilter.MMSegDemoToString("在唐朝有一名李太白诗人"));
    }

    /**
     * Segments text with mmseg4j's {@link ComplexSeg} algorithm.
     *
     * @param txt the text to segment
     * @return the words in the order the segmenter produced them; empty when
     *         the segmenter yields no words
     * @throws UncheckedIOException if mmseg4j fails while reading the text
     */
    public static List<String> MMSegDemo(String txt) {
        List<String> wordList = Lists.newArrayList();
        // try-with-resources closes the reader on every exit path.
        try (StringReader input = new StringReader(txt)) {
            Dictionary dic = Dictionary.getInstance();
            // ComplexSeg is used here; the original author noted SimpleSeg as the alternative.
            Seg seg = new ComplexSeg(dic);
            MMSeg mmSeg = new MMSeg(input, seg);
            com.chenlb.mmseg4j.Word word;
            // next() returns null once the input is exhausted.
            while ((word = mmSeg.next()) != null) {
                wordList.add(word.getString());
            }
        } catch (IOException e) {
            // Surface the failure instead of printing it and returning a partial list.
            throw new UncheckedIOException("mmseg4j failed to segment the text", e);
        }
        return wordList;
    }

    /**
     * Segments text with mmseg4j and joins the words with commas.
     *
     * <p>Delegates to {@link #MMSegDemo(String)} so the tokenization logic
     * lives in one place.
     *
     * @param txt the text to segment
     * @return the words joined by {@code ","}; an empty string when the
     *         segmenter yields no words
     * @throws UncheckedIOException if mmseg4j fails while reading the text
     */
    public static String MMSegDemoToString(String txt) {
        return String.join(",", MMSegDemo(txt));
    }

}

2.mmseg4j分词器(推荐)

代码如下(示例):

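    /**
     * Segments text into words with mmseg4j's ComplexSeg algorithm.
     *
     * @param txt the text to segment
     * @return the segmented words, in the order the segmenter produced them
     */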
    public static List<String> MMSegDemo(String txt) {
        // Tokenizes txt with mmseg4j and collects the words in the order they are produced.
        List<String> wordList = Lists.newArrayList();
        // try-with-resources closes the reader on every exit path.
        try (StringReader input = new StringReader(txt)) {
            Dictionary dic = Dictionary.getInstance();
            // ComplexSeg is used here; the original author noted SimpleSeg as the alternative.
            Seg seg = new ComplexSeg(dic);
            MMSeg mmSeg = new MMSeg(input, seg);
            com.chenlb.mmseg4j.Word word;
            // next() returns null once the input is exhausted.
            while ((word = mmSeg.next()) != null) {
                wordList.add(word.getString());
            }
        } catch (IOException e) {
            // Surface the failure (as an unchecked exception, so the signature is unchanged)
            // instead of printing it and returning a partial list.
            throw new UncheckedIOException("mmseg4j failed to segment the text", e);
        }
        return wordList;
    }


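    /**
     * Segments text with mmseg4j's ComplexSeg algorithm and joins the words with commas.
     *
     * @param txt the text to segment
     * @return the segmented words joined by ","; an empty string when no words are produced
     */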
    public static String MMSegDemoToString(String txt) {
        // Reuse MMSegDemo for the tokenization so the mmseg4j loop exists in one place,
        // then join the words with commas (an empty word list yields an empty string).
        return String.join(",", MMSegDemo(txt));
    }

  • 0
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值