import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
public class lianxi {
    /**
     * @param args
     * @throws IOException
     */
    public static void main(String[] args) throws IOException {
        PaodingAnalyzer analyzer = new PaodingAnalyzer();
        // Keep only CJK characters and Chinese punctuation; everything else is dropped before segmentation.
        String pattern = "[\u4e00-\u9fa5]|[,]|[。]|[!]|[?]";
        Pattern p = Pattern.compile(pattern);
        Matcher m;
        // Reading the file: plain boilerplate code.
        BufferedReader br = new BufferedReader(
                new InputStreamReader(
                        new FileInputStream("E:\\sj.txt"), "UTF-8"));
        // Each segmented word is used as a key; the value is its frequency.
        Map<String, Integer> map = new TreeMap<String, Integer>();
        StringBuffer sb;
        String line;
        TokenStream token;
        Token t;
        while ((line = br.readLine()) != null) {
            // br.readLine() reads the next line and assigns it to line. If line is not null there is
            // another line to process; if it is null the whole file has been read and the loop ends.
            //System.out.println(line);
            m = p.matcher(line);
            sb = new StringBuffer();
            while (m.find()) {
                sb.append(m.group());
            }
            // This single line does the segmentation; it is the standard way to obtain a TokenStream.
            // The first argument is the Lucene field name (unused here), the second is the text to segment.
            token = analyzer.tokenStream(sb.toString(), new StringReader(sb.toString()));
            // Iterate over the tokens (old Lucene 2.x API) and count only terms longer than one character.
            while ((t = token.next()) != null) {
                if (t.termText().trim().length() > 1) {
                    if (map.containsKey(t.termText())) {
                        map.put(t.termText(), map.get(t.termText()) + 1);
                    } else {
                        map.put(t.termText(), 1);
                    }
                }
            }
        }
        br.close();
        System.out.println("===============");
        // Sort the entries by frequency (ascending) and print the most frequent words from the end of the list.
        List<Entry<String, Integer>> list = sort(map);
        // The i >= 0 guard avoids an IndexOutOfBoundsException when there are fewer than 200 distinct words.
        for (int i = list.size() - 1; i >= 0 && i > list.size() - 200; i--) {
            Entry<String, Integer> kv = list.get(i);
            System.out.println(kv.getKey() + ":" + kv.getValue());
        }
        // map now holds the word frequencies, restricted to terms longer than one character.
        // Writing output would look like the commented-out lines below:
        // OutputStreamWriter ow = new OutputStreamWriter(new FileOutputStream("E:\\sj.txt"), "utf-8");
        // ow.write("hahah" + "\n");
        // ow.close();
        // Overall flow: iterate over every line of the file, segment each line, then aggregate the counts
        // of the segmented words. A fleshed-out sketch of the write step follows below.
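        // A minimal sketch (not part of the original post) of the write step above: dump the most frequent
        // words to a file. The output path "E:\\sj_out.txt" is a hypothetical example, chosen so the input
        // file is not overwritten; fully-qualified names are used to avoid adding imports.
        java.io.Writer ow = new java.io.OutputStreamWriter(
                new java.io.FileOutputStream("E:\\sj_out.txt"), "UTF-8");
        for (int i = list.size() - 1; i >= 0 && i > list.size() - 200; i--) {
            Entry<String, Integer> kv = list.get(i);
            ow.write(kv.getKey() + ":" + kv.getValue() + "\n");
        }
        ow.close();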
    }
    // Sorting via Java's collection-sort API: copy the map entries into a list and sort them by value
    // (ascending). A descending variant is sketched right after this method.
    public static List<Entry<String, Integer>> sort(Map<String, Integer> m) {
        List<Map.Entry<String, Integer>> mappingList =
                new ArrayList<Map.Entry<String, Integer>>(m.entrySet());
        Collections.sort(mappingList, new Comparator<Map.Entry<String, Integer>>() {
            public int compare(Map.Entry<String, Integer> mapping1, Map.Entry<String, Integer> mapping2) {
                return mapping1.getValue().compareTo(mapping2.getValue());
            }
        });
        return mappingList;
    }
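    // A minimal alternative sketch (not from the original post): sort descending by frequency up front,
    // so callers can iterate forward from index 0 instead of walking the list backwards.
    public static List<Entry<String, Integer>> sortDescending(Map<String, Integer> m) {
        List<Map.Entry<String, Integer>> mappingList =
                new ArrayList<Map.Entry<String, Integer>>(m.entrySet());
        Collections.sort(mappingList, new Comparator<Map.Entry<String, Integer>>() {
            public int compare(Map.Entry<String, Integer> a, Map.Entry<String, Integer> b) {
                // Reverse of the ascending comparison used in sort() above.
                return b.getValue().compareTo(a.getValue());
            }
        });
        return mappingList;
    }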
}