英文文章词频统计java

使用词表遍历并替换文章中的单词(词形还原、去除停用词),再进行词频统计。

要求

  1. 去除介词等无用单词(停用词)
  2. 将单词的单复数、动词的不同时态等合并为同一词形

代码

import java.awt.print.Printable;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.regex.Pattern;

/* 使用两重循环,分别遍历时态替换表 lemmas.txt 和介词表 disablewords.txt 替换文章重复时态和介词
 * 之后使用 treeMap 统计对应词的数量
 */

public class WordCount {

    public static void main(String[] args) {
        try {
            RandomAccessFile txtPoint = new RandomAccessFile("E:\\tt.txt", "r");
            RandomAccessFile lemmasPoint = new RandomAccessFile("E:\\lemmas.txt", "r");
            RandomAccessFile prePoint = new RandomAccessFile("E:\\disablewords.txt", "r");
            
            long txtLen = txtPoint.length();
            long lemmasLen = lemmasPoint.length();
            long preLen = prePoint.length();
            int lineLen;

            String regx = "[\\s\\pP\\d]+";
            String txt = null;
            String lemmas = null;
            String pre = null;
            String[] lemmasChange;
            
            while (txtPoint.getFilePointer() < txtLen) {
                txt = txt + txtPoint.readLine();
                
            }
            String txtChange[] = txt.toLowerCase().split(regx);
            System.out.println("txt结束");
            
            while (lemmasPoint.getFilePointer() < lemmasLen) {
            	lemmas = lemmasPoint.readLine();
            	lemmasChange = lemmas.split("\\s");
            	lineLen = lemmasChange.length;
            	for (int i=1;i<lineLen;i++) {            	
            		for (int j=0;j <txtChange.length;j++) {
            			if (lemmasChange[i].matches(txtChange[j])) {
            				txtChange[j] = lemmasChange[0];
            			}
            		}
            	}   	
            }
            System.out.println("lemmas结束");
            while (prePoint.getFilePointer() < preLen) {
            	pre = prePoint.readLine();            	
            		for (int j=0;j <txtChange.length;j++) {
            			if (pre.matches(txtChange[j])) {
            				txtChange[j] = "";
            			}
            		}
            }            
            System.out.println("pre结束");
            Map<String,Integer> map = new TreeMap<String,Integer>();
            for (String sss : txtChange) {
    			if (!sss.matches("")) {
    				 if(map.get(sss) != null) {
    		                int value = ((Integer)map.get(sss)).intValue();
    		                value++;
    		                map.put(sss, new Integer(value));
    		            } 
    		            else {
    		                map.put(sss, new Integer(1));
    		            }
    			}
            }

            List<Entry<String, Integer>> list = new ArrayList<Entry<String, Integer>>(map.entrySet());
            
            Collections.sort(list,new Comparator<Map.Entry<String,Integer>>() {
                //升序排序
                public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) {
                    return o2.getValue().compareTo(o1.getValue());
                }
            });
            
          
            int i = 0;
            for (Entry<String, Integer> e: list) {
            	i++;
            	if (i >= 10) {
            		break;
            	}
                System.out.println(e.getKey()+":"+e.getValue());
            }
            
        } catch (Exception e) {
        	System.out.println(e);
        }
    }
}
  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值