王鹏亮 的专栏

千里之行始于足下!

统计英文文章中出现的单词数量(基于字节实现)

本文的 Java 程序用于统计一篇英语文章中各个单词出现的次数,基于字节实现:整个统计过程直接对字节进行切分和计数,不需要把字节转换为实际字符(仅在最后打印结果时才转换为字符串):

package jaas;

import java.io.FileInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Counts how often each word occurs in an English text, operating directly on bytes.
 *
 * <p>The input stream is split into words at a fixed set of single-byte separators
 * (see {@link WordEndChecker}). Each word is kept as its raw byte sequence and used as a
 * map key; bytes are only decoded into a {@code String} when the result is printed.
 * Counting is case-sensitive because the bytes are compared as-is.
 *
 * <p>Counts accumulate in the instance: analyzing several streams with the same
 * {@code WordAnalyzer} adds to the same totals.
 */
public class WordAnalyzer {
    /** File analyzed by {@link #main(String[])} when no path is passed on the command line. */
    private static final String DEFAULT_INPUT_PATH = "/home/conquer/Downloads/zzzzzzzzzzzzzzzzz/a.txt";

    /** Occurrence counter per distinct word (keyed by the word's bytes). */
    private final Map<ByteArrayWrapper, AtomicInteger> resultMap = new ConcurrentHashMap<>();

    /**
     * Gives a {@code byte[]} value-based {@code equals}/{@code hashCode} so that it can be
     * used as a hash-map key (plain arrays only have identity semantics).
     *
     * <p>Kept as an inner (non-static) class so that existing
     * {@code analyzer.new ByteArrayWrapper(...)} call sites keep compiling.
     */
    public class ByteArrayWrapper {
        private final byte[] bytes;

        /**
         * Wraps a copy of the given bytes.
         *
         * @param bytes the word's bytes; copied so that later changes to the caller's array
         *              cannot alter this key after it has been put into a map
         * @throws NullPointerException if {@code bytes} is {@code null}
         */
        public ByteArrayWrapper(byte[] bytes) {
            this.bytes = bytes.clone();
        }

        /**
         * Wraps the unboxed values of the given boxed bytes.
         *
         * @param bytesVal the word's bytes in boxed form
         * @throws NullPointerException if {@code bytesVal} or any of its elements is {@code null}
         */
        public ByteArrayWrapper(Byte[] bytesVal) {
            bytes = new byte[bytesVal.length];
            for (int i = 0; i < bytes.length; i++) {
                bytes[i] = bytesVal[i];
            }
        }

        /** Content-based hash: same formula (seed 1, multiplier 31) as {@link Arrays#hashCode(byte[])}. */
        @Override
        public int hashCode() {
            return Arrays.hashCode(bytes);
        }

        /** Two wrappers are equal when they hold the same bytes in the same order. */
        @Override
        public boolean equals(Object obj) {
            if (!(obj instanceof ByteArrayWrapper)) {
                return false;
            }
            return Arrays.equals(bytes, ((ByteArrayWrapper) obj).bytes);
        }

        /** Returns the numeric byte values, e.g. {@code [116, 104, 101]}. */
        @Override
        public String toString() {
            return Arrays.toString(bytes);
        }
    }

    /**
     * Decides whether a byte terminates the current word.
     *
     * <p>Only English text is supported. The separator list should contain every character
     * that can delimit words. Full-width punctuation is NOT supported: those characters
     * occupy two or three bytes, while this program can only split on single bytes.
     */
    public class WordEndChecker {
        private final char[] wordEndChars = {'\r', '\n', ' ', '\t', '.', ',', '!', '?', '\''};
        private final byte[] wordEndBytes;

        {
            // Narrow each separator to one byte once, so isWordEnd compares bytes to bytes.
            wordEndBytes = new byte[wordEndChars.length];
            for (int i = 0; i < wordEndBytes.length; i++) {
                wordEndBytes[i] = (byte) wordEndChars[i];
            }
        }

        /**
         * @param b the byte to test
         * @return {@code true} if {@code b} is one of the separator bytes
         */
        boolean isWordEnd(byte b) {
            for (byte t : wordEndBytes) {
                if (b == t) {
                    return true;
                }
            }
            return false;
        }
    }

    /**
     * Analyzes a file and prints its word counts.
     *
     * @param args optional; {@code args[0]} is the path of the file to analyze. When absent,
     *             the built-in default path is used.
     * @throws Exception if the file cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;
        WordAnalyzer analyzer = new WordAnalyzer();
        try (InputStream ins = new FileInputStream(path)) {
            analyzer.analyzeInputStream(ins);
            analyzer.doResult();
        }
    }

    /**
     * Prints the number of distinct words, followed by one {@code word: count} line per
     * word, most frequent first. The relative order of words with equal counts is
     * unspecified.
     */
    public void doResult() {
        System.out.println("单词总数:" + resultMap.size());

        // Sort a snapshot of the entries by descending count.
        List<Map.Entry<ByteArrayWrapper, AtomicInteger>> list = new ArrayList<>(resultMap.entrySet());
        Collections.sort(list, new Comparator<Map.Entry<ByteArrayWrapper, AtomicInteger>>() {
            @Override
            public int compare(Map.Entry<ByteArrayWrapper, AtomicInteger> o1, Map.Entry<ByteArrayWrapper, AtomicInteger> o2) {
                return Integer.compare(o2.getValue().get(), o1.getValue().get());
            }
        });

        for (Map.Entry<ByteArrayWrapper, AtomicInteger> e : list) {
            // Decode with an explicit charset so the output does not depend on the platform
            // default. All separators are ASCII bytes, which never occur inside a UTF-8
            // multi-byte sequence, so a word is never cut in the middle of a character.
            System.out.println(new String(e.getKey().bytes, StandardCharsets.UTF_8) + ": " + e.getValue());
        }
    }

    /**
     * Reads the stream to its end and adds every word found to the running counts.
     * The stream is not closed by this method.
     *
     * @param fis the stream to read
     * @throws Exception if reading from the stream fails
     */
    public void analyzeInputStream(InputStream fis) throws Exception {
        final WordEndChecker wordEndChecker = new WordEndChecker();
        byte[] temp = new byte[1024]; // read buffer
        // Bytes of the word currently being assembled; it lives across buffer refills so a
        // word that straddles two reads is still counted as one word.
        List<Byte> listTemp = new ArrayList<>(20);
        for (int read; (read = fis.read(temp)) != -1; ) {
            for (int i = 0; i < read; i++) {
                byte cb = temp[i];
                if (wordEndChecker.isWordEnd(cb)) {
                    flushlistTempToResultMap(listTemp);
                } else {
                    listTemp.add(cb);
                }
            }
        }
        // The text may not end with a separator, so the last word has to be flushed by hand.
        flushlistTempToResultMap(listTemp);
    }

    /**
     * Counts the word currently held in {@code listTemp} and then empties the list.
     * Does nothing when the list is empty (e.g. between two consecutive separators).
     *
     * @param listTemp the bytes of one word; cleared on return when it was non-empty
     */
    public void flushlistTempToResultMap(List<Byte> listTemp) {
        if (listTemp.isEmpty()) {
            return;
        }
        Byte[] toArray = listTemp.toArray(new Byte[listTemp.size()]);
        ByteArrayWrapper byteArray = new ByteArrayWrapper(toArray);
        AtomicInteger count = resultMap.get(byteArray);
        if (count == null) {
            count = new AtomicInteger(0);
            resultMap.put(byteArray, count);
        }
        count.incrementAndGet();
        listTemp.clear();
    }
}


阅读更多
个人分类: 其它 小算法程序
想对作者说点什么? 我来说一句

没有更多推荐了,返回首页

加入CSDN,享受更精准的内容推荐,与500万程序员共同成长!
关闭
关闭