1、介绍
TreeMap中的key默认按自然顺序(对Character和String而言即Unicode码值,ASCII字符即按ASCII码)自动排序,但这种顺序对中文排序支持较差,可参考相关blog,借助java.text.Collator自定义比较器进行处理。
英文文本中可能有不同的符号{".", " ", "?", "!", "\"", ":", "'"},因此需先将这些符号统一替换为逗号,再按逗号切分出单词。
统计英文单词时需要统一为小写。
2、算法思路
(1)由于TreeMap集合的key不能重复,且可以按key的顺序(默认即ASCII/Unicode码值)自动排序,因此用于保存统计结果;
(2)将字符串转换成char或String数组;
(3)遍历数组,判断每个元素是否已经包含在集合中,如果是,则将其对应的value加1;如果否,则将该key放入集合,对应的value为1。
3、具体代码
package com.peter.algorithm.other;
import java.text.CollationKey;
import java.text.Collator;
import java.util.Comparator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import org.junit.Test;
/**
 * Counts how often each character or each English word occurs in a piece of text.
 *
 * <p>Results are collected in {@link TreeMap}s, so iterating a result yields its keys in
 * sorted order: characters are ordered by a locale-sensitive {@link Collator} (see
 * {@link CollatorComparator}), words by the natural {@link String} order.
 */
public class DataCount {

    /**
     * Regular expression matching one or more word separators. The separators are the
     * period, space, question mark, exclamation mark, double quote, colon, apostrophe
     * and comma. Every other character (digits, hyphens, semicolons, line breaks, ...)
     * is kept as part of a word.
     */
    private static final String WORD_SEPARATORS = "[. ?!\":',]+";

    /** Demo: prints the character counts of one sentence and the word counts of another. */
    @Test
    public void test() {
        String origin = "Enjoy Your Day with Gratitude.--生活需要怀抱一颗感恩之心";
        Map<Character, Integer> result = characterCount(origin);
        for (Map.Entry<Character, Integer> entry : result.entrySet()) {
            System.out.println("字符" + "\"" + entry.getKey() + "\"" + "的个数为: " + entry.getValue());
        }
        String text = "When you are about to give up think of that one reason why you wanted to start.";
        Map<String, Integer> wordsNum = wordCount(text);
        for (Map.Entry<String, Integer> entry : wordsNum.entrySet()) {
            System.out.println("单词" + "\"" + entry.getKey() + "\"" + "的个数为: " + entry.getValue());
        }
    }

    /**
     * Counts the occurrences of every {@code char} in {@code origin}.
     *
     * <p>Counting is done per UTF-16 code unit, so a supplementary character (one encoded
     * as a surrogate pair) contributes two separate keys. Upper and lower case letters are
     * counted separately.
     *
     * @param origin the text to analyse; may be {@code null}
     * @return a mutable map from character to its number of occurrences, sorted by
     *     {@link CollatorComparator}; empty (never {@code null}) when {@code origin} is
     *     {@code null} or empty
     */
    public static Map<Character, Integer> characterCount(String origin) {
        Map<Character, Integer> result = new TreeMap<Character, Integer>(new CollatorComparator());
        if (origin == null) {
            return result;
        }
        for (char current : origin.toCharArray()) {
            // A single lookup tells us both whether the key exists and its current count.
            Integer seen = result.get(current);
            result.put(current, seen == null ? 1 : seen + 1);
        }
        return result;
    }

    /**
     * Counts the occurrences of every word in {@code origin}, ignoring case.
     *
     * <p>The text is split on the separators described by {@link #WORD_SEPARATORS}; each
     * remaining piece is lower-cased and counted. Note that the apostrophe is a separator,
     * so {@code "don't"} is counted as the two words {@code "don"} and {@code "t"}.
     *
     * @param origin the text to analyse; may be {@code null}
     * @return a mutable map from lower-cased word to its number of occurrences, sorted by
     *     the natural {@link String} order; empty (never {@code null}) when {@code origin}
     *     is {@code null} or contains no words
     */
    public static Map<String, Integer> wordCount(String origin) {
        Map<String, Integer> result = new TreeMap<String, Integer>();
        if (origin == null) {
            return result;
        }
        for (String token : origin.split(WORD_SEPARATORS)) {
            // split() yields one empty leading token when the text starts with a separator
            // (and a single empty token for an empty text).
            if (token.isEmpty()) {
                continue;
            }
            // Locale.ROOT keeps case folding independent of the JVM's default locale
            // (e.g. under a Turkish default locale "I" would otherwise not become "i").
            String word = token.toLowerCase(Locale.ROOT);
            Integer seen = result.get(word);
            result.put(word, seen == null ? 1 : seen + 1);
        }
        return result;
    }

    /**
     * Orders objects by the locale-sensitive collation of their {@code toString()} text,
     * which lets a {@link TreeMap} sort Chinese keys more naturally than by raw code values.
     *
     * <p>The collator is the one for the JVM's default locale at construction time, so the
     * resulting order may differ between machines. Neither argument of
     * {@link #compare(Object, Object)} may be {@code null}.
     */
    public static class CollatorComparator implements Comparator<Object> {
        final Collator collator = Collator.getInstance();

        /**
         * Compares the string forms of the two elements.
         *
         * @param element1 the first element; must not be {@code null}
         * @param element2 the second element; must not be {@code null}
         * @return a negative, zero or positive value as the first element sorts before,
         *     equal to or after the second; zero only when both string forms are equal
         * @throws NullPointerException if either element is {@code null}
         */
        @Override
        public int compare(Object element1, Object element2) {
            String text1 = element1.toString();
            String text2 = element2.toString();
            int byCollation = collator.compare(text1, text2);
            if (byCollation != 0) {
                return byCollation;
            }
            // A collator can report distinct strings as equal (for instance characters it
            // treats as ignorable). Without a tie-break a TreeMap would merge such keys,
            // so fall back to the natural String order to keep distinct keys distinct.
            return text1.compareTo(text2);
        }
    }
}