import com.google.gson.Gson;
import java.io.BufferedReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.BiConsumer;
import lombok.extern.slf4j.Slf4j;
import net.go2global.common.core.bean.dto.StringSplitDTO;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
/**
* 拆词
* @Author zyh
* @Date 2020/10/23 17:04
*/
@Slf4j
public class StringSplitUtils {
public static void main(String[] args) {
//String input="Günaydın Patron 좋은 아침 Reduce 1 hour of the a-b a_b a:b www.163.com can't remaining building duration你好我就随便测测";
String input="I love I Beijing Tiananmen Square, the sun rises on Tiananmen Square!";
List<StringSplitDTO> list = getSplit(input);
log.info(new Gson().toJson(list));
}
public static List<StringSplitDTO> getSplit(String input){
List<StringSplitDTO> returnList = new ArrayList<>();
try {
Analyzer analyzer = new StandardAnalyzer();
BufferedReader fileReader = null;
fileReader = new BufferedReader(new StringReader(input));
List<String> result = new ArrayList<String>();
TokenStream ts = analyzer.tokenStream(null, fileReader);
OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
ts.reset();//必须的
while( ts.incrementToken() ){
int startOffset = offsetAttribute.startOffset();
int endOffset = offsetAttribute.endOffset();
String term = charTermAttribute.toString();
StringSplitDTO stringSplitDTO=new StringSplitDTO();
stringSplitDTO.setString(term);
stringSplitDTO.setStartIndex(startOffset);
stringSplitDTO.setEndIndex(endOffset);
returnList.add(stringSplitDTO);
//System.out.println(term + " ["+startOffset+","+endOffset + "]");
}
//System.out.println(result.size());
ts.end();
ts.close();
} catch (Exception e) {
log.error("拆词:"+e);
}
return returnList;
}
public static Map<String,StringSplitDTO> getSplitMap(String input){
Map<String,StringSplitDTO> map = new HashMap<>();
try {
Analyzer analyzer = new StandardAnalyzer();
BufferedReader fileReader = null;
fileReader = new BufferedReader(new StringReader(input));
List<String> result = new ArrayList<String>();
TokenStream ts = analyzer.tokenStream(null, fileReader);
OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
ts.reset();//必须的
while( ts.incrementToken() ){
int startOffset = offsetAttribute.startOffset();
int endOffset = offsetAttribute.endOffset();
String term = charTermAttribute.toString();
StringSplitDTO stringSplitDTO=new StringSplitDTO();
stringSplitDTO.setString(term);
stringSplitDTO.setStartIndex(startOffset);
stringSplitDTO.setEndIndex(endOffset);
map.put(term,stringSplitDTO);
//System.out.println(term + " ["+startOffset+","+endOffset + "]");
}
//System.out.println(result.size());
ts.end();
ts.close();
} catch (Exception e) {
log.error("拆词:"+e);
}
return map;
}
}