Multiple Threads: Word Frequency

Directory and File:

data3(dir)
    file1.txt(file)
    file2.txt(file)
    file3.txt(file)
filter(dir)
    filter.txt(file)

File contents:

file1.txt
Free Shipping  Mini Car Auto12v  Fresh Air Purifier Oxygen Bar
freeshipping freeshipping
file2.txt
freeshipping new  Electromagnetic parking sensor no holes need to be drilled
freeshipping
freeshipping
file3.txt
DC 12V 1 to 3 Car Cigarette Lighter Socket Power Adapter Splitter with 1 USB Port  free shipping  #9622 [aaa bbb] ccc{ ddd}
freeshipping

Filter contents:

filter.txt
sensor            
bbb            
lighter              
auto12v              
usb             
oxygen             
ddd             
parking              
cigarette             
port 
1
free

shipping
no
need

Java Code:

WordsAnalysis.java
package com.algorithms.multiple.threads.frequency.word;

/**
 * Created with IntelliJ IDEA.
 * User: 1O1O
 * Date: 2015-04-01
 * Time: 19:31 PM
 * :)~
 * MULTIPLE-THREADS-WORD-FREQUENCY:WORD-FREQUENCY
 */

import java.io.*;
import java.text.SimpleDateFormat;
import java.util.*;

/**
 * Entry point and helper routines for a two-worker word-frequency count.
 *
 * <p>{@link #main(String[])} loads a set of filter (stop) words, splits the files of a data
 * directory into two halves, hands each half to a {@code CountWordsThread}, and starts a
 * {@code MonitorThread} that prints the merged result once both workers have finished.
 *
 * <p>The filter-word set is filled by {@link #loadFilterWords()} before any worker thread is
 * started and is only read afterwards.
 */
public class WordsAnalysis {

    private static final String FILTER_WORDS_FILE_PATH = "//Users//robot//TEMP//testData//filter//filter.txt";
    private static Set<String> filterWordsSet = new HashSet<String>();

    /**
     * Runs the word-frequency count over the hard-coded data directory.
     *
     * @param args unused
     * @throws IOException declared for compatibility; file errors are reported by the read helpers
     */
    public static void main(String[] args) throws IOException {

        loadFilterWords();
        File f = new File("//Users//robot//TEMP//testData//data3");
        File[] fs = f.listFiles();
        if (fs == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            System.err.println("Cannot list files in: " + f.getPath());
            return;
        }

        // Split the file list into two halves, one per worker.
        int half = fs.length / 2;
        List<File> files1 = new ArrayList<File>(Arrays.asList(fs).subList(0, half));
        List<File> files2 = new ArrayList<File>(Arrays.asList(fs).subList(half, fs.length));

        // Shared result model. Both workers are registered BEFORE either one is started:
        // otherwise a fast first worker could decrement the count and have that decrement
        // overwritten by the second registration, so the monitor would never see zero.
        AllCountModel acm = new AllCountModel();
        acm.setThreadCount(2);
        CountWordsThread tt1 = new CountWordsThread(files1, acm);
        CountWordsThread tt2 = new CountWordsThread(files2, acm);

        // Worker 1
        System.out.println("Thread 1: start!");
        tt1.start();

        // Worker 2
        System.out.println("Thread 2: start!");
        tt2.start();

        // Monitor thread: prints the result when no workers are left.
        MonitorThread mt = new MonitorThread(acm);
        System.out.println("Thread Monitor: start!");
        mt.start();
    }

    /**
     * Counts the words of one file into {@code wordsMap}.
     *
     * <p>The text is lower-cased, punctuation is replaced by spaces, and the result is split on
     * whitespace. Empty tokens and words contained in the filter set are skipped.
     *
     * @param file     file to analyse
     * @param wordsMap accumulator mapping word to occurrence count; updated in place
     * @return the same {@code wordsMap} instance that was passed in
     * @throws IOException declared for interface compatibility with existing callers
     */
    public Map<String, Integer> countWords(File file, Map<String, Integer> wordsMap) throws IOException{
        // Locale.ROOT keeps lower-casing independent of the platform locale.
        String text = readFile(file).toLowerCase(Locale.ROOT);
        text = text.replaceAll("[`~!@#$%^&*()+=|{}':;',//\\[//\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。,、?]|\\s+|\t|\r", " ");
        String words[] = text.split("\\s+");
        for (String rawWord : words) {
            String word = rawWord.trim();
            // readFile() prefixes every line with a space, so split() yields a leading empty
            // token; skip it explicitly instead of relying on "" being in the filter set.
            if (word.length() == 0 || filterWordsSet.contains(word)) {
                continue;
            }
            Integer seen = wordsMap.get(word);
            wordsMap.put(word, seen == null ? 1 : seen + 1);
        }
        return wordsMap;
    }

    /**
     * Returns the entries of {@code dataHash} sorted by descending value.
     *
     * <p>Entries with equal values keep the iteration order of {@code dataHash}
     * ({@link Collections#sort} is stable); entries with a {@code null} value sort last.
     *
     * @param dataHash map of word to frequency
     * @return a new list of the map's entries, highest frequency first
     */
    public static List<Map.Entry<String, Integer>> hashSort(Map<String, Integer> dataHash) {
        List<Map.Entry<String, Integer>> list_Data = new ArrayList<Map.Entry<String, Integer>>(dataHash.entrySet());
        Collections.sort(list_Data, new Comparator<Map.Entry<String, Integer>>() {
            public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                Integer v1 = o1.getValue();
                Integer v2 = o2.getValue();
                if (v1 == null) {
                    return v2 == null ? 0 : 1;
                }
                if (v2 == null) {
                    return -1;
                }
                // Reversed operands give descending order; equal values compare as 0, which
                // the Comparator contract requires.
                return v2.compareTo(v1);
            }
        });
        return list_Data;
    }

    /**
     * Prints the number of remaining worker threads followed by the word/frequency table,
     * sorted by descending frequency.
     *
     * @param acm shared result model
     */
    public static void show(AllCountModel acm){
        System.out.println("Number of threads left: "+acm.getThreadCount());
        List<Map.Entry<String, Integer>> dataList = hashSort(acm.getDataHash());
        System.out.println("Start: write word and frequency");

        // Timestamp banner using the current system time.
        SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        System.out.println("========================"+df.format(new Date())+"=========================");

        System.out.println(String.format("%-15s%-30s%-15s", "word number", "word", "frequency"));
        int number = 1;
        for (Map.Entry<String, Integer> entry : dataList) {
            String word = entry.getKey();
            int frequency = entry.getValue();
            System.out.println(String.format("%-15d%-30s%-15d", number++, word, frequency));
        }
        System.out.println("End: write word and frequency");
    }

    /**
     * Loads the filter words from {@code FILTER_WORDS_FILE_PATH} into the filter set.
     *
     * <p>Words are lower-cased because {@link #countWords(File, Map)} lower-cases the text it
     * compares against. The printed count is the number of tokens produced by the split, which
     * includes the leading empty token caused by the space that the reader puts before each line.
     */
    public static void loadFilterWords() {
        String filterWordsText = readFileByPath(FILTER_WORDS_FILE_PATH);
        String words[] = filterWordsText.split("\\s+|\\t|\\r|\\n");
        System.out.println("Number of filter words: "+words.length);
        for(String word : words){
            filterWordsSet.add(word.toLowerCase(Locale.ROOT));
        }
    }

    /**
     * Reads the file at {@code filePath} and returns its content.
     *
     * @param filePath path of the file to read
     * @return the content as produced by {@link #readFile(File)}
     */
    public static String readFileByPath(String filePath) {
        return readFile(new File(filePath));
    }

    /**
     * Reads {@code file} line by line and returns the lines joined into one string, each line
     * preceded by a single space (line terminators are dropped).
     *
     * <p>I/O errors are printed to standard error and whatever was read so far is returned.
     *
     * @param file file to read
     * @return the concatenated lines, or an empty string if nothing could be read
     */
    public static String readFile(File file) {
        StringBuilder result = new StringBuilder();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            String line;
            while ((line = reader.readLine()) != null) {
                result.append(' ').append(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Single close point; covers both the normal and the error path.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a reader fails.
                }
            }
        }
        return result.toString();
    }
}
AllCountModel.java
package com.algorithms.multiple.threads.frequency.word;

/**
 * Created with IntelliJ IDEA.
 * User: 1O1O
 * Date: 2015-04-01
 * Time: 19:31 PM
 * :)~
 * MULTIPLE-THREADS-WORD-FREQUENCY:WORD-FREQUENCY
 */

import java.util.HashMap;
import java.util.Map;

public class AllCountModel {

    // 在运行的线程总数
    private int threadCount;

    //所有线程共有的结构:dataHash,用于存储最终的结果集
    private static Map<String, Integer> dataHash = new HashMap<String, Integer>();

    public int getThreadCount() {
        return threadCount;
    }
    public void setThreadCount(int threadCount) {
        this.threadCount = threadCount;
    }

    public Map<String, Integer> getDataHash(){
        return dataHash;
    }

    public void setDataHash(Map<String, Integer> wordsMap){
        for (String key : wordsMap.keySet()) {
            if ((this.dataHash.get(key) != null)) {
                // 将单个线程中生成的map数据映射到公共的dataHash:value对应单词出现的频率,单词已在dataHash中存在,则value相加
                int value = ((Integer) this.dataHash.get(key)).intValue()+((Integer) wordsMap.get(key)).intValue();
                this.dataHash.put(key, new Integer(value));
            } else if((this.dataHash.get(key) == null) ){
                // 将单个线程中生成的map数据映射到公共的dataHash:value对应单词出现的频率,单词未在dataHash中存在,则赋为该线程的value值
                this.dataHash.put(key, ((Integer) wordsMap.get(key)).intValue());
            }
        }
    }
}
CountWordsThread.java
package com.algorithms.multiple.threads.frequency.word;

/**
 * Created with IntelliJ IDEA.
 * User: 1O1O
 * Date: 2015-04-01
 * Time: 19:31 PM
 * :)~
 * MULTIPLE-THREADS-WORD-FREQUENCY:WORD-FREQUENCY
 */

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.HashMap;
import java.util.Map;

/**
 * Worker thread that counts the words of its own list of files and then merges the counts
 * into the shared {@code AllCountModel}.
 */
public class CountWordsThread extends Thread{

    // Each worker receives its own file list, so the list itself needs no synchronization.
    private final List<File> files;
    // Thread-local accumulator: word -> count for this worker's files only.
    private Map<String, Integer> wordsMap = new HashMap<String, Integer>();
    private final AllCountModel allCountModel;

    /**
     * @param files         files this worker analyses
     * @param allCountModel shared model that receives the result and tracks running workers
     */
    public CountWordsThread(List<File> files, AllCountModel allCountModel){
        this.files = files;
        this.allCountModel = allCountModel;
    }

    /**
     * Analyses every file, then publishes the counts and signs this worker off.
     *
     * <p>Publishing happens in a {@code finally} block so the running-worker count is always
     * decremented, even if analysis fails with an unchecked exception; otherwise a thread
     * waiting for the count to reach zero would wait forever.
     */
    @Override
    public void run() {
        try {
            WordsAnalysis wa = new WordsAnalysis();
            // Analyse all files handed to this worker.
            for (File file : files) {
                try {
                    wordsMap = wa.countWords(file, wordsMap);
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        } finally {
            // Lock the shared model: concurrent merges would corrupt the shared data.
            synchronized (allCountModel) {
                // Merge first, decrement second, so the count never reaches zero while
                // this worker's result is still missing from the shared map.
                allCountModel.setDataHash(wordsMap);
                allCountModel.setThreadCount(allCountModel.getThreadCount() - 1);
                System.out.println("Thread: stop!");
            }
        }
    }
}
MonitorThread.java
package com.algorithms.multiple.threads.frequency.word;

/**
 * Created with IntelliJ IDEA.
 * User: 1O1O
 * Date: 2015-04-01
 * Time: 19:31 PM
 * :)~
 * MULTIPLE-THREADS-WORD-FREQUENCY:WORD-FREQUENCY
 */

/**
 * Polls the shared model every 500 ms and prints the merged result once no worker threads
 * are left running.
 */
public class MonitorThread extends Thread{

    // Shared model written by the worker threads.
    private final AllCountModel acm;

    /**
     * @param acm shared model to watch
     */
    public MonitorThread(AllCountModel acm){
        this.acm = acm;
    }

    /**
     * Sleeps, checks the running-worker count, and prints the result when it is zero or less.
     * If the thread is interrupted it restores the interrupt flag and exits without printing.
     */
    @Override
    public void run() {
        while(true){
            try {
                // Check at a fixed interval.
                sleep(500);
            } catch (InterruptedException e) {
                // Treat interruption as a request to stop; continuing the loop would make
                // the thread impossible to cancel.
                Thread.currentThread().interrupt();
                return;
            }
            // Read the count under the same lock the workers hold while they merge their
            // result and decrement the count, so a zero seen here means every merge is
            // complete and visible to this thread.
            int remaining;
            synchronized (acm) {
                remaining = acm.getThreadCount();
            }
            // All workers have finished.
            if(0 >= remaining){
                // Print the result.
                WordsAnalysis.show(acm);
                System.out.println("Thread Monitor: end!");
                return;
            }
        }
    }
}

Outputs:

Number of filter words: 16
Thread 1: start!
Thread 2: start!
Thread Monitor: start!
Thread: stop!
Thread: stop!
Number of threads left: 0
Start: write word and frequency
========================2015-04-01 19:49:12=========================
word number    word                          frequency      
1              freeshipping                  6              
2              to                            2              
3              car                           2              
4              be                            1              
5              mini                          1              
6              holes                         1              
7              bar                           1              
8              9622                          1              
9              air                           1              
10             ccc                           1              
11             power                         1              
12             socket                        1              
13             drilled                       1              
14             fresh                         1              
15             electromagnetic               1              
16             new                           1              
17             3                             1              
18             splitter                      1              
19             purifier                      1              
20             adapter                       1              
21             12v                           1              
22             aaa                           1              
23             with                          1              
24             dc                            1              
End: write word and frequency
Thread Monitor: end!
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值