目标:将一段文字做分词处理,并统计分词中出现频度最高的五组词。
代码:
ToAnalysis.java
package com.test;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.recognition.arrimpl.AsianPersonRecognition;
import org.ansj.recognition.arrimpl.ForeignPersonRecognition;
import org.ansj.recognition.arrimpl.NumRecognition;
import org.ansj.recognition.arrimpl.UserDefineRecognition;
import org.ansj.splitWord.Analysis;
import org.ansj.util.AnsjReader;
import org.ansj.util.Graph;
import org.ansj.util.NameFix;
import org.ansj.util.TermUtil.InsertTermType;
import org.nlpcn.commons.lang.tire.domain.Forest;
/**
* 标准分词
*
* @author ansj
*
*/
public class ToAnalysis extends Analysis {

    /**
     * Builds the final term list for a sentence graph: performs the initial
     * Viterbi walk, runs the optional number / person-name recognizers,
     * applies the user-defined dictionaries, then collects the surviving terms.
     *
     * @param graph the term graph produced for one sentence
     * @return the terms left on the best path, in sentence order
     */
    @Override
    protected List<Term> getResult(final Graph graph) {
        Merger merger = new Merger() {
            @Override
            public List<Term> merger() {
                // Initial shortest-path walk over the raw graph.
                graph.walkPath();
                // Number recognition (only when enabled and digits are present).
                if (isNumRecognition && graph.hasNum) {
                    new NumRecognition().recognition(graph.terms);
                }
                // Person-name recognition.
                if (graph.hasPerson && isNameRecognition) {
                    // Asian (CJK) person names.
                    new AsianPersonRecognition().recognition(graph.terms);
                    graph.walkPathByScore();
                    NameFix.nameAmbiguity(graph.terms);
                    // Foreign (non-CJK) person names.
                    new ForeignPersonRecognition().recognition(graph.terms);
                    graph.walkPathByScore();
                }
                // User-defined dictionary recognition runs last.
                userDefineRecognition(graph, forests);
                return getResult();
            }

            // Applies the user dictionaries (SKIP insert mode), prunes weak
            // paths and re-walks the graph by score.
            private void userDefineRecognition(final Graph graph, Forest... forests) {
                new UserDefineRecognition(InsertTermType.SKIP, forests).recognition(graph.terms);
                graph.rmLittlePath();
                graph.walkPathByScore();
            }

            // Collects the non-null terms left in the graph into a list and
            // restores their original surface text via setRealName.
            // NOTE(review): iteration stops at length - 1, skipping the final
            // slot — presumably an end-of-sentence marker; confirm in Graph.
            private List<Term> getResult() {
                List<Term> result = new ArrayList<Term>();
                int length = graph.terms.length - 1;
                for (int i = 0; i < length; i++) {
                    if (graph.terms[i] != null) {
                        result.add(graph.terms[i]);
                    }
                }
                setRealName(graph, result);
                return result;
            }
        };
        return merger.merger();
    }

    public ToAnalysis() {
        super();
    }

    /** Creates an analyzer that reads its input from the given reader. */
    public ToAnalysis(Reader reader) {
        super.resetContent(new AnsjReader(reader));
    }

    /** Segments the string using the default dictionaries. */
    public static Result parse(String str) {
        return new ToAnalysis().parseStr(str);
    }

    /** Segments the string, additionally consulting the given user forests. */
    public static Result parse(String str, Forest... forests) {
        return new ToAnalysis().setForests(forests).parseStr(str);
    }
}
SameStringCount.java
package com.wordcount;
import java.util.HashMap;
public class SameStringCount {

    // Word -> occurrence count. Parameterized map replaces the original raw
    // HashMap, which forced an unchecked cast on every lookup.
    private final HashMap<String, Integer> map;

    public SameStringCount() {
        map = new HashMap<String, Integer>();
    }

    /**
     * Records one occurrence of the given word: inserts it with count 1 the
     * first time, otherwise increments the stored count.
     *
     * @param string the word to count (used as the map key)
     */
    public void hashInsert(String string) {
        // merge() replaces the containsKey/get/put sequence (and the
        // now-removed instance-level counter field) with a single call.
        map.merge(string, 1, Integer::sum);
    }

    /**
     * Exposes the underlying word-count map.
     * NOTE: returns the internal map itself — callers can mutate it. The raw
     * return type is kept for source compatibility with existing callers.
     *
     * @return the word -> frequency map
     */
    public HashMap getHashMap() {
        return map;
    }
}
Result.java
package org.ansj.domain;
import java.util.Iterator;
import java.util.List;
import org.ansj.recognition.Recognition;
import org.nlpcn.commons.lang.util.StringUtil;
/**
* 分词结果的一个封装
*
* @author Ansj
*
*/
public class Result implements Iterable<Term> {

    // The ordered list of terms produced by the segmentation.
    private List<Term> terms = null;

    public Result(List<Term> terms) {
        this.terms = terms;
    }

    public List<Term> getTerms() {
        return terms;
    }

    public void setTerms(List<Term> terms) {
        this.terms = terms;
    }

    @Override
    public Iterator<Term> iterator() {
        return terms.iterator();
    }

    public int size() {
        return terms.size();
    }

    public Term get(int index) {
        return terms.get(index);
    }

    /**
     * Runs a recognition engine over this result.
     *
     * @param re the recognizer to apply
     * @return this result, for chaining
     */
    public Result recognition(Recognition re) {
        re.recognition(this);
        return this;
    }

    @Override
    public String toString() {
        return toString(",");
    }

    /** Joins the terms (with POS tags) using the given separator. */
    public String toString(String split) {
        return StringUtil.joiner(this.terms, split);
    }

    /**
     * Returns the segmentation without POS tags, comma-separated.
     *
     * @return the joined real names, or "" when there are no terms
     */
    public String toStringWithOutNature() {
        return toStringWithOutNature(",");
    }

    /**
     * Returns the segmentation without POS tags, joined by the given separator.
     *
     * @param split separator placed between consecutive words
     * @return the joined real names, or "" when there are no terms
     */
    public String toStringWithOutNature(String split) {
        if (terms == null || terms.isEmpty()) {
            return "";
        }
        StringBuilder joined = new StringBuilder();
        boolean first = true;
        for (Term term : terms) {
            if (!first) {
                joined.append(split);
            }
            joined.append(term.getRealName());
            first = false;
        }
        return joined.toString();
    }
}
Term.java
package org.ansj.domain;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
import org.ansj.util.MathUtil;
import org.nlpcn.commons.lang.util.StringUtil;
public class Term implements Serializable{
    /**
     * Serialization version id.
     */
    private static final long serialVersionUID = 1L;
    // The current word (normalized form used for dictionary matching).
    private String name;
    // The word as it appeared in the original text (null until set).
    private String realName;
    // Start offset of the word in the source text.
    private int offe;
    // Candidate part-of-speech list for this word.
    private TermNatures termNatures = TermNatures.NULL;
    // Backing dictionary item.
    private AnsjItem item = AnsjItem.NULL;
    // Next term in the same row (terms starting at the same position).
    private Term next;
    // Accumulated path score.
    private double score = 0;
    // The term's own score.
    private double selfScore = 1;
    // Predecessor term on the chosen path.
    private Term from;
    // Successor term on the chosen path.
    private Term to;
    // Part of speech of this term. Only set after POS recognition has run;
    // NULL by default.
    private Nature nature = Nature.NULL;
    // Whether this term is a newly discovered word.
    private boolean newWord ;
    // Synonyms of this word.
    private List<String> synonyms ;
    // Sub-terms when this term is a merged compound (null otherwise).
    private List<Term> subTerm = null;

    /** Builds a term from a dictionary item, inheriting its natures. */
    public Term(String name, int offe, AnsjItem item) {
        super();
        this.name = name;
        this.offe = offe;
        this.item = item;
        if (item.termNatures != null) {
            this.termNatures = item.termNatures;
            if (termNatures.nature != null) {
                this.nature = termNatures.nature;
            }
        }
    }

    /** Builds a term from an explicit nature list. */
    public Term(String name, int offe, TermNatures termNatures) {
        super();
        this.name = name;
        this.offe = offe;
        this.termNatures = termNatures;
        if (termNatures.nature != null) {
            this.nature = termNatures.nature;
        }
    }

    /** Builds a term from a single nature name and frequency. */
    public Term(String name, int offe, String natureStr, int natureFreq) {
        super();
        this.name = name;
        this.offe = offe;
        TermNature termNature = new TermNature(natureStr, natureFreq);
        this.nature = termNature.nature;
        this.termNatures = new TermNatures(termNature);
    }

    // The position this word can reach: offset just past its last character.
    public int toValue() {
        return offe + name.length();
    }

    public int getOffe() {
        return offe;
    }

    public void setOffe(int offe) {
        this.offe = offe;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    /**
     * Core best-path construction (Viterbi): adopts {@code from} as the
     * predecessor when no path exists yet, the current score is 0, or the
     * new score is at least as good (note: ties also replace the path).
     *
     * @param from        candidate predecessor term
     * @param relationMap bigram relation weights used by MathUtil.compuScore
     */
    public void setPathScore(Term from, Map<String, Double> relationMap) {
        // Viterbi best-path construction.
        double score = MathUtil.compuScore(from, this, relationMap);
        if (this.from == null || this.score == 0 || this.score >= score) {
            this.setFromAndScore(from, score);
        }
    }

    /**
     * Best-path update using raw self scores; smaller is better.
     *
     * @param from candidate predecessor term
     */
    public void setPathSelfScore(Term from) {
        double score = this.selfScore + from.score;
        // Viterbi best-path construction.
        if (this.from == null || this.score > score) {
            this.setFromAndScore(from, score);
        }
    }

    private void setFromAndScore(Term from, double score) {
        this.from = from;
        this.score = score;
    }

    /**
     * Merges the given term into this one: names are concatenated, and the
     * real names are concatenated only when both sides have a non-blank one.
     *
     * @param to the following term to absorb
     * @return this term, after merging
     */
    public Term merage(Term to) {
        this.name = this.name + to.getName();
        if (StringUtil.isNotBlank(this.realName) && StringUtil.isNotBlank(to.getRealName())) {
            this.realName = this.realName + to.getRealName();
        }
        this.setTo(to.to);
        return this;
    }

    /**
     * Merges the given term into this one, concatenating real names
     * unconditionally (so blank/whitespace text is preserved).
     *
     * @param to the following term to absorb
     * @return this term, after merging
     */
    public Term merageWithBlank(Term to) {
        this.name = this.name + to.getName();
        this.realName = this.realName + to.getRealName();
        this.setTo(to.to);
        return this;
    }

    /**
     * Shifts this term's offset by the given delta.
     *
     * @param offe the delta to add to the current offset
     */
    public void updateOffe(int offe) {
        this.offe += offe;
    }

    public Term next() {
        return next;
    }

    /**
     * Sets the next term in the same row.
     *
     * @param next the term to link as the next one
     * @return this term (not the argument), for chaining
     */
    public Term setNext(Term next) {
        this.next = next;
        return this;
    }

    public Term from() {
        return from;
    }

    public Term to() {
        return to;
    }

    public void setFrom(Term from) {
        this.from = from;
    }

    public void setTo(Term to) {
        this.to = to;
    }

    /**
     * Returns all candidate natures (parts of speech) of this term.
     */
    public TermNatures termNatures() {
        return termNatures;
    }

    public void setNature(Nature nature) {
        this.nature = nature;
    }

    /**
     * Returns the nature (part of speech) of this word; meaningful only after
     * POS recognition has run. NOTE: the misspelled name "natrue" is part of
     * the public API and is kept for compatibility.
     */
    public Nature natrue() {
        return nature;
    }

    public String getNatureStr() {
        return nature.natureStr;
    }

    @Override
    public String toString() {
        if ("null".equals(nature.natureStr)) {
            return this.getRealName();
        }
        //return this.getRealName() + "/" + nature.natureStr;
        return this.getRealName(); // local modification (lhy): POS suffix suppressed so parse output is words only
    }

    /**
     * Resets both scores of this term to 0.
     */
    public void clearScore() {
        this.score = 0;
        this.selfScore = 0;
    }

    public void setSubTerm(List<Term> subTerm) {
        this.subTerm = subTerm;
    }

    public List<Term> getSubTerm() {
        return subTerm;
    }

    /** Returns the original surface text, falling back to name when unset. */
    public String getRealName() {
        if (realName == null) {
            return name;
        }
        return realName;
    }

    public void setRealName(String realName) {
        this.realName = realName;
    }

    public double score() {
        return this.score;
    }

    public void score(double score) {
        this.score = score;
    }

    public double selfScore() {
        return this.selfScore;
    }

    public void selfScore(double selfScore) {
        this.selfScore = selfScore;
    }

    public AnsjItem item() {
        return this.item;
    }

    public boolean isNewWord() {
        return newWord;
    }

    public void setNewWord(boolean newWord) {
        this.newWord = newWord;
    }

    /** Replaces the nature list and takes its primary nature in one step. */
    public void updateTermNaturesAndNature(TermNatures termNatures) {
        this.termNatures = termNatures;
        this.nature = termNatures.nature ;
    }

    public List<String> getSynonyms() {
        return synonyms;
    }

    public void setSynonyms(List<String> synonyms) {
        this.synonyms = synonyms;
    }
}
WordCount.java
package com.wordcount;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.ansj.util.MyStaticValue;
/*
* 功能:分词频度最高的五组词,和对应的频度
* 作者:lhy
* 时间:2017年8月17日
*/
public class WordCount {

    // Matches words of two or more CJK ideographs. Compiled once instead of
    // once per word (the original code rebuilt the Pattern inside the loop).
    // The original expression "([\u4e00-\u9fa5]+){2,10}" accepted the same
    // strings but its nested quantifiers invited needless backtracking.
    private static final Pattern CHINESE_WORD = Pattern.compile("[\u4e00-\u9fa5]{2,}");

    /**
     * Segments the text and returns the five most frequent multi-character
     * Chinese words together with their frequencies.
     *
     * @param str the text to analyse
     * @return a map of at most five words to their occurrence counts
     *         (HashMap iteration order is unspecified)
     */
    public HashMap<String, Integer> getGarticiples(String str) {
        // Configure the segmenter: recognise numbers, ignore quantifiers.
        MyStaticValue.isNumRecognition = true;
        MyStaticValue.isQuantifierRecognition = false;
        // Segment, then split the comma-joined result into individual words.
        String[] words = ToAnalysis.parse(str).toString().split(",");
        SameStringCount counter = new SameStringCount();
        for (String word : words) {
            if (CHINESE_WORD.matcher(word).matches()) {
                counter.hashInsert(word); // count this word
            }
        }
        @SuppressWarnings("unchecked") // SameStringCount exposes a raw HashMap
        HashMap<String, Integer> map = counter.getHashMap();
        // Sort entries by descending frequency.
        List<Map.Entry<String, Integer>> list = new ArrayList<>(map.entrySet());
        Collections.sort(list, new ValueComparator());
        // Copy the first (at most) five entries into the result map.
        HashMap<String, Integer> news = new HashMap<String, Integer>();
        int num = 0;
        for (Map.Entry<String, Integer> entry : list) {
            if (num == 5) {
                break;
            }
            news.put(entry.getKey(), entry.getValue());
            num++;
        }
        return news; // the five most frequent words
    }

    // Orders map entries by descending value. Integer.compare avoids the
    // overflow risk of subtracting ints in a comparator.
    private static class ValueComparator implements Comparator<Map.Entry<String, Integer>> {
        @Override
        public int compare(Map.Entry<String, Integer> m, Map.Entry<String, Integer> n) {
            return Integer.compare(n.getValue(), m.getValue());
        }
    }
}
package com.wordcount;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.ansj.util.MyStaticValue;
/*
* 功能:分词频度最高的五组词,返回五组词,空格相连
* 作者:lhy
* 时间:2017年8月17日
*/
public class WordDegrees {

    // Matches words of two or more CJK ideographs. Compiled once instead of
    // once per word (the original code rebuilt the Pattern inside the loop).
    // The original expression "([\u4e00-\u9fa5]+){2,10}" accepted the same
    // strings but its nested quantifiers invited needless backtracking.
    private static final Pattern CHINESE_WORD = Pattern.compile("[\u4e00-\u9fa5]{2,}");

    /**
     * Segments the text and returns the five most frequent multi-character
     * Chinese words as a single string, each word followed by a space.
     *
     * @param str the text to analyse
     * @return up to five words, each followed by one space (trailing space
     *         included, matching the original output format)
     */
    public String WordProcessing(String str) {
        // Configure the segmenter: recognise numbers, ignore quantifiers.
        MyStaticValue.isNumRecognition = true;
        MyStaticValue.isQuantifierRecognition = false;
        // Segment, then split the comma-joined result into individual words.
        String[] words = ToAnalysis.parse(str).toString().split(",");
        SameStringCount counter = new SameStringCount();
        for (String word : words) {
            if (CHINESE_WORD.matcher(word).matches()) {
                counter.hashInsert(word); // count this word
            }
        }
        @SuppressWarnings("unchecked") // SameStringCount exposes a raw HashMap
        HashMap<String, Integer> map = counter.getHashMap();
        // Sort entries by descending frequency.
        List<Map.Entry<String, Integer>> list = new ArrayList<>(map.entrySet());
        Collections.sort(list, new ValueComparator());
        // Join the top five words; StringBuilder replaces the original
        // O(n^2) String concatenation in a loop.
        StringBuilder key = new StringBuilder();
        int num = 0;
        for (Map.Entry<String, Integer> entry : list) {
            if (num == 5) {
                break;
            }
            key.append(entry.getKey()).append(' ');
            num++;
        }
        return key.toString(); // the five most frequent words
    }

    // Orders map entries by descending value. Integer.compare avoids the
    // overflow risk of subtracting ints in a comparator.
    private static class ValueComparator implements Comparator<Map.Entry<String, Integer>> {
        @Override
        public int compare(Map.Entry<String, Integer> m, Map.Entry<String, Integer> n) {
            return Integer.compare(n.getValue(), m.getValue());
        }
    }
}
Test.java
package com.wordcount;
import java.util.HashMap;
public class Test {

    public static void main(String[] args) {
        // Sample passage fed to both top-five word-frequency helpers.
        String str = "《热爱生命》,可以说是汪国真的代表作之一,这首诗以四个肯定的回答表达出为何要热爱生命的哲理.四个段落,看似相似,却各有其趣.四个段落分别以“成功”、“爱情”、“奋斗历程”和“未来”为意象进行分析和回答.这四个意象可以说是包括汪国真、席慕容在内的一些清新哲理派诗人惯用的几个意象,不晦涩,不故弄玄虚,不生僻难解,可以说是完全区别于朦胧诗的特点,也是汪国真的诗歌取得成功之原因所在.";
        // Approach 1: word -> frequency map.
        WordCount wordCounter = new WordCount();
        HashMap<String, Integer> topWords = wordCounter.getGarticiples(str);
        System.out.println("第一种方式:");
        System.out.println(topWords);
        // Approach 2: the five words joined by spaces.
        System.out.println("第二种方式:");
        WordDegrees wordDegrees = new WordDegrees();
        String joined = wordDegrees.WordProcessing(str);
        System.out.println(joined);
    }
}
截图:
总结:主要借鉴了别人的开源项目(ansj 分词),并在其基础上自己添加了统计高频分词的功能。