Natural language word segmentation and word frequency statistics

Goal: segment a passage of text into words and report the five words that occur most frequently.
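Before the full classes, here is a minimal sketch of the overall flow (segment with Ansj, keep Chinese words of two or more characters, count them), assuming the ansj_seg library (org.ansj) and its dependencies are on the classpath; the QuickDemo class and its sample sentence are illustrative only:

import java.util.HashMap;
import java.util.Map;

import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.ToAnalysis;

public class QuickDemo {
	public static void main(String[] args) {
		Map<String, Integer> freq = new HashMap<String, Integer>();
		// ToAnalysis.parse returns a Result, which is Iterable<Term>
		for (Term term : ToAnalysis.parse("热爱生命,可以说是汪国真的代表作之一")) {
			String word = term.getRealName();
			if (word.matches("[\u4e00-\u9fa5]{2,}")) {    // words of two or more Chinese characters
				Integer n = freq.get(word);
				freq.put(word, n == null ? 1 : n + 1);
			}
		}
		// sorting by value and taking the first five entries gives the final answer (see WordCount below)
		System.out.println(freq);
	}
}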

Code:

ToAnalysis.java

package com.test;

import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.recognition.arrimpl.AsianPersonRecognition;
import org.ansj.recognition.arrimpl.ForeignPersonRecognition;
import org.ansj.recognition.arrimpl.NumRecognition;
import org.ansj.recognition.arrimpl.UserDefineRecognition;
import org.ansj.splitWord.Analysis;
import org.ansj.util.AnsjReader;
import org.ansj.util.Graph;
import org.ansj.util.NameFix;
import org.ansj.util.TermUtil.InsertTermType;
import org.nlpcn.commons.lang.tire.domain.Forest;

/**
 * Standard segmentation
 * 
 * @author ansj
 * 
 */
public class ToAnalysis extends Analysis {

	@Override
	protected List<Term> getResult(final Graph graph) {

		Merger merger = new Merger() {
			@Override
			public List<Term> merger() {

				graph.walkPath();

				// number recognition
				if (isNumRecognition && graph.hasNum) {
					new NumRecognition().recognition(graph.terms);
				}

				// person-name recognition
				if (graph.hasPerson && isNameRecognition) {
					// Asian person names
					new AsianPersonRecognition().recognition(graph.terms);
					graph.walkPathByScore();
					NameFix.nameAmbiguity(graph.terms);
					// foreign person names
					new ForeignPersonRecognition().recognition(graph.terms);
					graph.walkPathByScore();
				}

				// recognition against user-defined dictionaries
				userDefineRecognition(graph, forests);

				return getResult();
			}

			private void userDefineRecognition(final Graph graph, Forest... forests) {
				new UserDefineRecognition(InsertTermType.SKIP, forests).recognition(graph.terms);
				graph.rmLittlePath();
				graph.walkPathByScore();
			}

			private List<Term> getResult() {
				List<Term> result = new ArrayList<Term>();
				int length = graph.terms.length - 1;
				for (int i = 0; i < length; i++) {
					if (graph.terms[i] != null) {
						result.add(graph.terms[i]);
					}
				}
				setRealName(graph, result);
				return result;
			}
		};
		return merger.merger();
	}

	public ToAnalysis() {
		super();
	}

	public ToAnalysis(Reader reader) {
		super.resetContent(new AnsjReader(reader));
	}
	
	public static Result parse(String str) {
		return new ToAnalysis().parseStr(str);
	}

	public static Result parse(String str, Forest... forests) {
		return new ToAnalysis().setForests(forests).parseStr(str);
	}

}


SameStringCount.java

package com.wordcount;

import java.util.HashMap;

public class SameStringCount {
	private HashMap<String, Integer> map;

	public SameStringCount() {
		map = new HashMap<String, Integer>();
	}

	public void hashInsert(String string) {
		if (map.containsKey(string)) {      // does the key already exist?
			int counter = map.get(string);   // read the current count for this key
			map.put(string, ++counter);
		} else {
			map.put(string, 1);
		}
	}

	public HashMap<String, Integer> getHashMap() {
		return map;
	}
}
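
A quick usage sketch for this counter (the words are arbitrary examples; SameStringCountDemo is assumed to sit in the same com.wordcount package):

public class SameStringCountDemo {
	public static void main(String[] args) {
		SameStringCount counter = new SameStringCount();
		counter.hashInsert("生命");
		counter.hashInsert("生命");
		counter.hashInsert("热爱");
		// counts: 生命 -> 2, 热爱 -> 1
		System.out.println(counter.getHashMap());
	}
}
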
Result.java

package org.ansj.domain;

import java.util.Iterator;
import java.util.List;

import org.ansj.recognition.Recognition;
import org.nlpcn.commons.lang.util.StringUtil;

/**
 * A wrapper around the segmentation result
 * 
 * @author Ansj
 *
 */
public class Result implements Iterable<Term> {

	private List<Term> terms = null;

	public Result(List<Term> terms) {
		this.terms = terms;
	}

	public List<Term> getTerms() {
		return terms;
	}

	public void setTerms(List<Term> terms) {
		this.terms = terms;
	}

	@Override
	public Iterator<Term> iterator() {
		return terms.iterator();
	}

	public int size() {
		return terms.size();
	}

	public Term get(int index) {
		return terms.get(index);
	}

	/**
	 * Apply a recognition engine to this result
	 * 
	 * @return
	 */
	public Result recognition(Recognition re) {
		re.recognition(this);
		return this;
	}

	@Override
	public String toString() {
		return toString(",");
	}

	
	public String toString(String split) {
		return StringUtil.joiner(this.terms, split);
	}

	/**
	 * Return the segmentation result without part-of-speech tags
	 * @return
	 */
	public String toStringWithOutNature(){
		return  toStringWithOutNature(",");
	}
	
	/**
	 * Return the segmentation result without part-of-speech tags
	 * @return
	 */
	public String toStringWithOutNature(String split) {
		
		if(terms==null || terms.size()==0){
			return "" ;
		}
		
		Iterator<Term> iterator = terms.iterator() ;
		
		StringBuilder sb = new StringBuilder(iterator.next().getRealName()) ;
		
		while(iterator.hasNext()){
			sb.append(split);
			sb.append(iterator.next().getRealName()) ;
		}
		
		return sb.toString();
	}

}
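
Because Result implements Iterable<Term> and also offers toStringWithOutNature, the word list can be collected without calling toString() and splitting on "," (which is fragile if a token is itself punctuation). A small sketch, again assuming the ansj_seg library is on the classpath; ResultIterationDemo is an illustrative name:

import java.util.ArrayList;
import java.util.List;

import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.ToAnalysis;

public class ResultIterationDemo {
	public static void main(String[] args) {
		Result result = ToAnalysis.parse("热爱生命是汪国真的代表作之一");
		List<String> words = new ArrayList<String>();
		for (Term term : result) {             // Result is Iterable<Term>
			words.add(term.getRealName());      // surface form, without the nature tag
		}
		System.out.println(words);
		// the same surface forms joined with a chosen separator
		System.out.println(result.toStringWithOutNature(" "));
	}
}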

Term.java

package org.ansj.domain;

import java.io.Serializable;
import java.util.List;
import java.util.Map;

import org.ansj.util.MathUtil;
import org.nlpcn.commons.lang.util.StringUtil;

public class Term implements Serializable{
	/**
	 * 
	 */
	private static final long serialVersionUID = 1L;
	// the current word
	private String name;
	// the original (real) surface form of the word
	private String realName;
	// start offset of the word in the text
	private int offe;
	// list of part-of-speech natures
	private TermNatures termNatures = TermNatures.NULL;
	// the dictionary item for this word
	private AnsjItem item = AnsjItem.NULL;
	// next term in the same row (same start offset)
	private Term next;
	// path score
	private double score = 0;
	// the term's own score
	private double selfScore = 1;
	// the preceding term on the path
	private Term from;
	// the following term on the path
	private Term to;
	// the nature (part of speech) of this term; only set after POS recognition, empty by default
	private Nature nature = Nature.NULL;
	// whether this is a newly discovered word
	private boolean newWord;
	// synonyms
	private List<String> synonyms;
	

	private List<Term> subTerm = null;

	public Term(String name, int offe, AnsjItem item) {
		super();
		this.name = name;
		this.offe = offe;
		this.item = item;
		if (item.termNatures != null) {
			this.termNatures = item.termNatures;
			if (termNatures.nature != null) {
				this.nature = termNatures.nature;
			}
		}
	}

	public Term(String name, int offe, TermNatures termNatures) {
		super();
		this.name = name;
		this.offe = offe;
		this.termNatures = termNatures;
		if (termNatures.nature != null) {
			this.nature = termNatures.nature;
		}
	}

	public Term(String name, int offe, String natureStr, int natureFreq) {
		super();
		this.name = name;
		this.offe = offe;
		TermNature termNature = new TermNature(natureStr, natureFreq);
		this.nature = termNature.nature;
		this.termNatures = new TermNatures(termNature);
	}

	// the offset this term reaches (its end position)
	public int toValue() {
		return offe + name.length();
	}

	public int getOffe() {
		return offe;
	}

	public void setOffe(int offe) {
		this.offe = offe;
	}

	public String getName() {
		return name;
	}

	public void setName(String name) {
		this.name = name;
	}

	/**
	 * Core routine: build the optimal path
	 * 
	 * @param from
	 * @param relationMap
	 */
	public void setPathScore(Term from, Map<String, Double> relationMap) {
		// Viterbi construction of the optimal path
		double score = MathUtil.compuScore(from, this, relationMap);
		if (this.from == null || this.score == 0 || this.score >= score) {
			this.setFromAndScore(from, score);
		}
	}

	/**
	 * Optimal path by raw score; smaller is better
	 * 
	 * @param from
	 */
	public void setPathSelfScore(Term from) {
		double score = this.selfScore + from.score;
		// Viterbi construction of the optimal path
		if (this.from == null || this.score > score) {
			this.setFromAndScore(from, score);
		}
	}

	private void setFromAndScore(Term from, double score) {
		this.from = from;
		this.score = score;
	}

	/**
	 * Merge this term with the following term
	 * 
	 * @param to
	 */
	public Term merage(Term to) {
		this.name = this.name + to.getName();
		if (StringUtil.isNotBlank(this.realName) && StringUtil.isNotBlank(to.getRealName())) {
			this.realName = this.realName + to.getRealName();
		}
		this.setTo(to.to);
		return this;
	}

	/**
	 * Merge terms; unlike merage, this also merges whitespace characters
	 * 
	 * @param to
	 */
	public Term merageWithBlank(Term to) {
		this.name = this.name + to.getName();
		this.realName = this.realName + to.getRealName();
		this.setTo(to.to);
		return this;
	}
	
	/**
	 * Update (shift) the offset
	 * 
	 * @param offe
	 */
	public void updateOffe(int offe) {
		this.offe += offe;
	}

	public Term next() {
		return next;
	}

	/**
	 * Set the next term and return this term
	 * 
	 * @param next
	 *            the term that follows this one
	 * @return this
	 */
	public Term setNext(Term next) {
		this.next = next;
		return this;
	}

	public Term from() {
		return from;
	}

	public Term to() {
		return to;
	}

	public void setFrom(Term from) {
		this.from = from;
	}

	public void setTo(Term to) {
		this.to = to;
	}

	/**
	 * Get all the natures (parts of speech) of this term
	 * 
	 * @return
	 */
	public TermNatures termNatures() {
		return termNatures;
	}

	public void setNature(Nature nature) {
		this.nature = nature;
	}

	/**
	 * Get the nature (part of speech) of this word; only valid after nature computation
	 * 
	 * @return
	 */
	public Nature natrue() {
		return nature;
	}

	public String getNatureStr() {
		return nature.natureStr;
	}

	@Override
	public String toString() {
		if ("null".equals(nature.natureStr)) {
			return this.getRealName();
		}
		// return this.getRealName() + "/" + nature.natureStr;
		return this.getRealName();    // modified by the author (lhy): always omit the nature suffix
	}

	/**
	 * Reset all scores of this term to 0
	 */
	public void clearScore() {
		this.score = 0;
		this.selfScore = 0;
	}

	public void setSubTerm(List<Term> subTerm) {
		this.subTerm = subTerm;
	}

	public List<Term> getSubTerm() {
		return subTerm;
	}

	public String getRealName() {
		if (realName == null) {
			return name;
		}
		return realName;
	}

	public void setRealName(String realName) {
		this.realName = realName;
	}

	public double score() {
		return this.score;
	}

	public void score(double score) {
		this.score = score;
	}

	public double selfScore() {
		return this.selfScore;
	}

	public void selfScore(double selfScore) {
		this.selfScore = selfScore;
	}

	public AnsjItem item() {
		return this.item;
	}

	public boolean isNewWord() {
		return newWord;
	}

	public void setNewWord(boolean newWord) {
		this.newWord = newWord;
	}

	public void updateTermNaturesAndNature(TermNatures termNatures) {
		this.termNatures = termNatures;
		this.nature = termNatures.nature ;
	}

	public List<String> getSynonyms() {
		return synonyms;
	}

	public void setSynonyms(List<String> synonyms) {
		this.synonyms = synonyms;
	}
	
}

WordCount.java

package com.wordcount;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.ansj.util.MyStaticValue;

/*
 * Purpose: find the five most frequent words together with their frequencies
 * Author:  lhy
 * Date:    2017-08-17
 */
public class WordCount {
	public HashMap<String, Integer> getGarticiples(String str) {
		MyStaticValue.isNumRecognition = true;
		MyStaticValue.isQuantifierRecognition = false;
		String s = ToAnalysis.parse(str).toString();
		String[] ss = s.split(",");
		SameStringCount count = new SameStringCount();

		String regex = "([\u4e00-\u9fa5]+){2,10}";    // matches tokens of two or more Chinese characters
		for (int i = 0; i < ss.length; i++) {
			if (match(regex, ss[i])) {
				count.hashInsert(ss[i]);   // count this token
			}
		}

		HashMap<String, Integer> map = count.getHashMap();
		// LinkedHashMap keeps the five entries in descending frequency order
		HashMap<String, Integer> news = new LinkedHashMap<String, Integer>();
		List<Map.Entry<String, Integer>> list = new ArrayList<>();
		list.addAll(map.entrySet());
		Collections.sort(list, new ValueComparator());    // sort by frequency, highest first
		int num = 0;
		Iterator<Map.Entry<String, Integer>> it = list.iterator();
		while (it.hasNext()) {
			if (num == 5) {
				break;
			}
			Map.Entry<String, Integer> entry = it.next();
			news.put(entry.getKey(), entry.getValue());
			num++;
		}
		return news;    // the five most frequent words and their frequencies
	}

	// regex match helper
	private static boolean match(String regex, String str) {
		Pattern pattern = Pattern.compile(regex);
		Matcher matcher = pattern.matcher(str);
		return matcher.matches();
	}

	private static class ValueComparator implements Comparator<Map.Entry<String, Integer>> {
		public int compare(Map.Entry<String, Integer> m, Map.Entry<String, Integer> n) {
			return Integer.compare(n.getValue(), m.getValue());    // descending by count
		}
	}
}
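
For comparison, the same counting and top-five selection can be written with Java 8 streams; this is only a sketch under the assumption that Java 8 or later is available, and StreamWordCount is an illustrative class name, not part of the original project:

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;

import org.ansj.splitWord.analysis.ToAnalysis;

public class StreamWordCount {
	public static Map<String, Long> topFive(String text) {
		return Arrays.stream(ToAnalysis.parse(text).toString().split(","))
				.filter(w -> w.matches("[\u4e00-\u9fa5]{2,}"))                  // words of 2+ Chinese characters
				.collect(Collectors.groupingBy(Function.identity(), Collectors.counting()))
				.entrySet().stream()
				.sorted(Map.Entry.<String, Long>comparingByValue().reversed())  // highest frequency first
				.limit(5)
				.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue,
						(a, b) -> a, LinkedHashMap::new));                       // keep frequency order
	}
}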


WordDegrees.java

package com.wordcount;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.ansj.splitWord.analysis.ToAnalysis;
import org.ansj.util.MyStaticValue;

/*
 * Purpose: find the five most frequent words and return them as a single
 *          space-separated string
 * Author:  lhy
 * Date:    2017-08-17
 */
public class WordDegrees {
	public String WordProcessing(String str) {
		MyStaticValue.isNumRecognition = true;
		MyStaticValue.isQuantifierRecognition = false;
		String s = ToAnalysis.parse(str).toString();
		String[] ss = s.split(",");
		SameStringCount count = new SameStringCount();

		String regex = "([\u4e00-\u9fa5]+){2,10}";    // matches tokens of two or more Chinese characters
		for (int i = 0; i < ss.length; i++) {
			if (match(regex, ss[i])) {
				count.hashInsert(ss[i]);   // count this token
			}
		}

		HashMap<String, Integer> map = count.getHashMap();
		List<Map.Entry<String, Integer>> list = new ArrayList<>();
		list.addAll(map.entrySet());
		Collections.sort(list, new ValueComparator());    // sort by frequency, highest first
		int num = 0;
		String key = "";    // accumulates the five words
		Iterator<Map.Entry<String, Integer>> it = list.iterator();
		while (it.hasNext()) {
			if (num == 5) {
				break;
			}
			Map.Entry<String, Integer> entry = it.next();
			key += entry.getKey() + " ";
			num++;
		}
		return key;    // the five most frequent words, space-separated
	}

	// regex match helper
	private static boolean match(String regex, String str) {
		Pattern pattern = Pattern.compile(regex);
		Matcher matcher = pattern.matcher(str);
		return matcher.matches();
	}

	private static class ValueComparator implements Comparator<Map.Entry<String, Integer>> {
		public int compare(Map.Entry<String, Integer> m, Map.Entry<String, Integer> n) {
			return Integer.compare(n.getValue(), m.getValue());    // descending by count
		}
	}
}

Test.java

package com.wordcount;

import java.util.HashMap;

public class Test {

	public static void main(String[] args) {
		String str = "《热爱生命》,可以说是汪国真的代表作之一,这首诗以四个肯定的回答表达出为何要热爱生命的哲理.四个段落,看似相似,却各有其趣.四个段落分别以“成功”、“爱情”、“奋斗历程”和“未来”为意象进行分析和回答.这四个意象可以说是包括汪国真、席慕容在内的一些清新哲理派诗人惯用的几个意象,不晦涩,不故弄玄虚,不生僻难解,可以说是完全区别于朦胧诗的特点,也是汪国真的诗歌取得成功之原因所在.";

		// Approach 1: map of the top five words to their frequencies
		WordCount word = new WordCount();
		HashMap<String, Integer> hash = word.getGarticiples(str);
		System.out.println("Approach 1:");
		System.out.println(hash);

		// Approach 2: the top five words joined by spaces
		System.out.println("Approach 2:");
		WordDegrees degre = new WordDegrees();
		String result = degre.WordProcessing(str);
		System.out.println(result);
	}

}



Screenshot: (console output screenshot omitted)

Summary: this mainly builds on an existing open-source project (the Ansj word segmenter); the word-frequency counting on top of it is the feature I added myself.


