多个文本中找到出现频率最高的100个单词

解决思路:
1.使用newFixedThreadPool构建大小为5的线程池。让线程分别读取不同的文件。
2.使用ConcurrentHashMap对象hashmap作为共享变量,用于存放每个单词出现的频率。
3.采用大小为100的最小堆,在hashmap中找出出现频率最高的100个单词,并按频率从高到低打印。

解题步骤:
1、构建Words类,该类定义了单词的拼写和数量。

/**
 * A word paired with the number of times it has been seen.
 * Instances are ordered by {@code counts} only; the spelling does not take part in the comparison.
 */
class Words implements Comparable<Words> {
    String word;
    int counts;

    /** Creates an entry with the field defaults: {@code word} is null and {@code counts} is 0. */
    public Words() {
    }

    /**
     * Creates an entry for the given word.
     *
     * @param word   spelling of the word
     * @param counts number of occurrences
     */
    public Words(String word, int counts) {
        this.counts = counts;
        this.word = word;
    }

    /**
     * Compares two entries by occurrence count.
     *
     * @param w the entry to compare against
     * @return 1 if this entry has the larger count, -1 if it has the smaller count, 0 if equal
     */
    @Override
    public int compareTo(Words w) {
        if (this.counts == w.counts) {
            return 0;
        }
        return this.counts < w.counts ? -1 : 1;
    }

    /**
     * Returns a new, independent entry carrying the same word and count.
     *
     * @return a copy of this entry
     * @throws CloneNotSupportedException declared by the signature; this implementation does not throw it
     */
    @Override
    protected Words clone() throws CloneNotSupportedException {
        return new Words(this.word, this.counts);
    }
}

2、构建读取文件的线程类,并用最小堆排序选出出现频率最高的100个单词。

import java.io.*;
import java.util.Arrays;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

/**
 * Counts word occurrences across every file of a directory using a small thread pool,
 * then prints the most frequent words together with their counts, most frequent first.
 */
public class Top100 {
    /** Shared word -> occurrence count table; updated concurrently by the reader tasks. */
    public static Map<String, Integer> hashMap = new ConcurrentHashMap<String, Integer>();

    /** How many of the most frequent words are reported. */
    private static final int TOP_N = 100;

    /** Directory scanned when no directory is given on the command line. */
    private static final String DEFAULT_DIR = "E:\\TestData\\testsTop100";

    /**
     * Reads every regular file in the input directory on a pool of 5 threads, waits for all
     * readers to finish and prints the most frequent words.
     *
     * @param args optional; {@code args[0]} overrides the default input directory
     */
    public static void main(String[] args) {
        String path = args.length > 0 ? args[0] : DEFAULT_DIR;
        File[] files = new File(path).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            System.out.println("Cannot list directory: " + path);
            return;
        }

        ExecutorService service = Executors.newFixedThreadPool(5);
        try {
            for (File file : files) {
                if (!file.isFile()) {
                    continue; // sub-directories cannot be opened as text files
                }
                MyFileUtil f = new MyFileUtil();
                f.filrname = file.getPath();
                service.execute(f);
            }
            // Stop accepting new tasks, then block until the submitted ones are done.
            service.shutdown();
            while (!service.awaitTermination(1, TimeUnit.SECONDS)) {
                // keep waiting; the readers are still working
            }
            getTop100();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // preserve the interrupt for the caller
            System.out.println(e.toString());
        } finally {
            if (!service.isTerminated()) {
                service.shutdownNow();
            }
        }
    }

    /**
     * Prints up to {@code TOP_N} entries of {@code hashMap} as "word count" lines, highest count
     * first. Must only be called once all reader tasks have finished.
     */
    private static void getTop100() {
        int total = hashMap.size();

        // Few enough words: sort everything and print all of it in descending order.
        if (total <= TOP_N) {
            Words[] all = new Words[total];
            int next = 0;
            for (Entry<String, Integer> entry : hashMap.entrySet()) {
                all[next++] = new Words(entry.getKey(), entry.getValue());
            }
            Arrays.sort(all);
            // k >= 0 so that the least frequent word (index 0) is printed as well.
            for (int k = all.length - 1; k >= 0; k--) {
                System.out.println(all[k].word + " " + all[k].counts);
            }
            return;
        }

        // Otherwise keep the TOP_N largest entries in a 1-based min-heap (slot 0 is unused).
        Words[] heap = new Words[TOP_N + 1];
        int filled = 0;
        for (Entry<String, Integer> entry : hashMap.entrySet()) {
            if (filled < TOP_N) {
                heap[++filled] = new Words(entry.getKey(), entry.getValue());
                if (filled == TOP_N) {
                    buildMinHeap(heap);
                }
            } else if (entry.getValue() > heap[1].counts) {
                // The newcomer beats the smallest kept entry: replace the root and
                // restore the min-heap property.
                heap[1] = new Words(entry.getKey(), entry.getValue());
                heapFy(heap, 1, TOP_N);
            }
        }

        // Heap sort: repeatedly move the current minimum to the end of the shrinking heap,
        // which leaves slots 1..TOP_N in descending order.
        for (int end = TOP_N; end > 1; end--) {
            exchange(heap, 1, end);
            heapFy(heap, 1, end - 1);
        }
        for (int k = 1; k <= TOP_N; k++) {
            System.out.println(heap[k].word + " " + heap[k].counts);
        }
    }

    /**
     * Turns slots 1..length-1 of {@code words} into a min-heap ordered by count.
     *
     * @param words 1-based heap storage; slot 0 is ignored
     */
    private static void buildMinHeap(Words[] words) {
        int length = words.length - 1;
        for (int i = length / 2; i >= 1; i--) {
            heapFy(words, i, length);
        }
    }

    /**
     * Sifts the element at index {@code i} down until neither child is smaller than it.
     *
     * @param words  1-based heap storage
     * @param i      index of the element to sift down (1-based)
     * @param length index of the last slot that still belongs to the heap
     */
    private static void heapFy(Words[] words, int i, int length) {
        int current = i;
        while (current <= length) {
            int smallest = current;
            int left = left(current);
            int right = right(current);
            if (left <= length && words[left].compareTo(words[smallest]) < 0) {
                smallest = left;
            }
            if (right <= length && words[right].compareTo(words[smallest]) < 0) {
                smallest = right;
            }
            if (smallest == current) {
                return;
            }
            exchange(words, current, smallest);
            current = smallest;
        }
    }

    /** Swaps the elements at indices {@code i} and {@code j}. */
    private static void exchange(Words[] words, int i, int j) {
        Words temp = words[i];
        words[i] = words[j];
        words[j] = temp;
    }

    /** Returns the index of the left child of node {@code i} in a 1-based heap. */
    private static int left(int i) {
        return 2 * i;
    }

    /** Returns the index of the right child of node {@code i} in a 1-based heap. */
    private static int right(int i) {
        return 2 * i + 1;
    }

    /**
     * Task that reads one text file and adds the words it contains to {@code hashMap}.
     */
    static class MyFileUtil implements Runnable {
        /** Path of the file to read (field name kept as-is because callers assign it). */
        public String filrname;

        /**
         * Reads the file line by line, splits each line on single spaces, strips non-letters
         * from every token and increments the shared counter of each resulting word.
         * I/O failures are reported on stderr and end the task.
         */
        @Override
        public void run() {
            try (BufferedReader br = new BufferedReader(new FileReader(filrname))) {
                String line;
                while ((line = br.readLine()) != null) {
                    for (String token : line.split(" ")) {
                        String word = prePro(token);
                        if (word.isEmpty()) {
                            continue;
                        }
                        // A single atomic update; a separate containsKey/get/put sequence
                        // loses increments when several threads hit the same word.
                        hashMap.merge(word, 1, Integer::sum);
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        /**
         * Removes every character that is not an ASCII letter. The test files are English
         * novels containing a lot of punctuation.
         *
         * @param str raw token
         * @return the token reduced to its a-z / A-Z characters; may be empty
         */
        private String prePro(String str) {
            StringBuilder sb = new StringBuilder();
            for (char c : str.trim().toCharArray()) {
                if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
                    sb.append(c);
                }
            }
            return sb.toString();
        }
    }
}

测试文件:
哈利波特英文原著,7个TXT文件。
测试结果:(下面仅截取一部分)

the 94537
to 53227
and 50683
of 42893
a 39611
Harry 33407
was 30889
he 29620
said 28713
his 27301
in 24545
it 21347
I 20535
you 20235
had 20017
that 18936

关于堆排序算法可以看看我前面的博客,在排序算法中有实现。

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值