拆字工具类



import com.google.gson.Gson;
import lombok.extern.slf4j.Slf4j;
import net.go2global.common.core.bean.dto.StringSplitDTO;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

import java.io.BufferedReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * 拆词
 * @Author zyh
 * @Date 2020/10/23 17:04
 */
@Slf4j
public class StringSplitUtils {

    public static void main(String[] args) {

        //String input="Günaydın Patron 좋은 아침 Reduce 1 hour of the a-b a_b a:b www.163.com can't remaining building duration你好我就随便测测";
        String input="I love I Beijing Tiananmen Square, the sun rises on Tiananmen Square!";
        List<StringSplitDTO> list = getSplit(input);
        log.info(new Gson().toJson(list));
    }


    public static List<StringSplitDTO> getSplit(String input){

        List<StringSplitDTO> returnList = new ArrayList<>();

        try {
            Analyzer analyzer = new StandardAnalyzer();

            BufferedReader fileReader = null;
            fileReader = new BufferedReader(new StringReader(input));
            List<String> result = new ArrayList<String>();
            TokenStream ts = analyzer.tokenStream(null, fileReader);
            OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
            CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);

            ts.reset();//必须的
            while( ts.incrementToken() ){
                int startOffset = offsetAttribute.startOffset();
                int endOffset = offsetAttribute.endOffset();
                String term = charTermAttribute.toString();

                StringSplitDTO stringSplitDTO=new StringSplitDTO();
                stringSplitDTO.setString(term);
                stringSplitDTO.setStartIndex(startOffset);
                stringSplitDTO.setEndIndex(endOffset);

                returnList.add(stringSplitDTO);
                //System.out.println(term + " ["+startOffset+","+endOffset + "]");
            }
            //System.out.println(result.size());
            ts.end();
            ts.close();
        } catch (Exception e) {
            log.error("拆词:"+e);
        }

        return returnList;
    }

    public static Map<String,StringSplitDTO> getSplitMap(String input){

        Map<String,StringSplitDTO> map = new HashMap<>();

        try {
            Analyzer analyzer = new StandardAnalyzer();

            BufferedReader fileReader = null;
            fileReader = new BufferedReader(new StringReader(input));
            List<String> result = new ArrayList<String>();
            TokenStream ts = analyzer.tokenStream(null, fileReader);
            OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
            CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);

            ts.reset();//必须的
            while( ts.incrementToken() ){
                int startOffset = offsetAttribute.startOffset();
                int endOffset = offsetAttribute.endOffset();
                String term = charTermAttribute.toString();

                StringSplitDTO stringSplitDTO=new StringSplitDTO();
                stringSplitDTO.setString(term);
                stringSplitDTO.setStartIndex(startOffset);
                stringSplitDTO.setEndIndex(endOffset);

                map.put(term,stringSplitDTO);
                //System.out.println(term + " ["+startOffset+","+endOffset + "]");
            }
            //System.out.println(result.size());
            ts.end();
            ts.close();
        } catch (Exception e) {
            log.error("拆词:"+e);
        }

        return map;
    }








}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值