题目
10GB,100GB,1TB文件,文件格式都是每一行一个字符串;一个字符串在三个文件中都出现,才计数(最小为3,在3个文件中分别出现一次),否则不计数;
我的解答
package com.bigdata.splitfile;
import com.bigdata.utils.FileUtil;
import com.bigdata.utils.HashCodeUtil;
import org.apache.commons.io.FileUtils;
import java.io.*;
import java.util.*;
import static java.util.Objects.hash;
/**
 * Problem: three files (A = 10GB, B = 100GB, C = 1TB), each containing one
 * string per line. A string is counted only if it appears in ALL three files
 * (minimum total count 3: once per file). With only 10GB of working memory,
 * compute the top-10 and last-10 strings by total occurrence count.
 *
 * Approach:
 *   1. splitFile: shard each big file into {@code nums} pieces via
 *      hash(line) % nums, so equal strings from all three files land in
 *      shards with the same index.
 *   2. getCommonSequence: per shard index, intersect the three shards and
 *      write "string=totalCount" lines to common_&lt;index&gt;.txt.
 *      (Shard sizes after splitting by 1000: A 10MB, B 100MB, C 1GB — all
 *      three fit comfortably in 10GB of memory.)
 *   3. Stream the common_* files through bounded heaps of size K to obtain
 *      the top-K (min-heap) and last-K (max-heap) strings.
 */
public class SplitThenMerger<E extends Comparable> {
    private final static String PRE_PATH = "/Users/xxx/Desktop/data/";

    /** Heap capacity, i.e. the K in top-K / last-K. */
    private static int K = 2;

    /**
     * Min-heap order for the top-K heap: smallest frequency at the head.
     * Integer.compare avoids the int-overflow risk of "b1.getV() - b2.getV()".
     */
    static Comparator<MapBeans> cmp = new Comparator<MapBeans>() {
        @Override
        public int compare(MapBeans b1, MapBeans b2) {
            return Integer.compare(b1.getV(), b2.getV());
        }
    };

    /** Max-heap order for the last-K heap: largest frequency at the head. */
    static Comparator<MapBeans> lastCmp = new Comparator<MapBeans>() {
        @Override
        public int compare(MapBeans b1, MapBeans b2) {
            return Integer.compare(b2.getV(), b1.getV());
        }
    };

    static PriorityQueue<MapBeans> queueTopN = new PriorityQueue<>(K, cmp);
    static PriorityQueue<MapBeans> queueLastN = new PriorityQueue<>(K, lastCmp);

    /** Current output file handle, shared by the write helpers. */
    static File targetFile = null;

    public static void main(String[] args) throws IOException {
        // Phase 1 (splitFile) and phase 2 (getCommonSequence) are run
        // separately; here we only merge the already-produced common_* files.
        List<String> pathList = new ArrayList<>();
        pathList.add("/Users/xxx/Desktop/data/common/common_0.txt");
        pathList.add("/Users/xxx/Desktop/data/common/common_1.txt");
        pathList.add("/Users/xxx/Desktop/data/common/common_2.txt");
        List<MapBeans> topN = readSmallFile(pathList, queueTopN, K);
        List<MapBeans> lastN = lastN(pathList, queueLastN, K);
    }

    /**
     * Streams every common file through the last-K max-heap, then drains it.
     *
     * @return up to k least-frequent strings, or null when pathList is empty
     *         (null kept for caller compatibility)
     */
    private static List<MapBeans> lastN(List<String> pathList, PriorityQueue<MapBeans> queueLastN, int k) throws IOException {
        if (pathList.isEmpty()) {
            return null;
        }
        for (String path : pathList) {
            mergerFileLastN(readCommonFile(path), k);
        }
        List<MapBeans> res = new ArrayList<>();
        // Guard against a heap holding fewer than k entries: the original
        // polled k times unconditionally and could add nulls to the result.
        while (res.size() < k && !queueLastN.isEmpty()) {
            res.add(queueLastN.poll());
        }
        return res;
    }

    /**
     * Pushes each entry into the bounded max-heap tracking the k LEAST
     * frequent strings; evicts the current largest when a smaller one arrives.
     */
    private static void mergerFileLastN(List<MapBeans> list, int k) {
        for (MapBeans bean : list) {
            if (queueLastN.size() < k) {
                queueLastN.add(bean);
            } else if (bean.getV() < queueLastN.peek().getV()) {
                queueLastN.poll();
                queueLastN.add(bean);
            }
        }
    }

    /**
     * Streams every common file through the top-K min-heap, then drains it.
     *
     * @return up to k most-frequent strings (least frequent first), or null
     *         when pathList is empty (null kept for caller compatibility)
     */
    private static List<MapBeans> readSmallFile(List<String> pathList, PriorityQueue<MapBeans> queueTopN, int k) throws IOException {
        if (pathList.isEmpty()) {
            return null;
        }
        for (String path : pathList) {
            mergerFileTopN(readCommonFile(path), k);
        }
        List<MapBeans> res = new ArrayList<>();
        // Same null-safety guard as lastN: never poll past an empty heap.
        while (res.size() < k && !queueTopN.isEmpty()) {
            res.add(queueTopN.poll());
        }
        return res;
    }

    /**
     * Loads one "string=count" common file into a list of beans.
     * The reader is closed via try-with-resources (the original leaked it).
     */
    private static ArrayList<MapBeans> readCommonFile(String commonFile) throws IOException {
        ArrayList<MapBeans> list = new ArrayList<>();
        try (BufferedReader br = new BufferedReader(new FileReader(commonFile))) {
            String line;
            while ((line = br.readLine()) != null) {
                // Split once instead of twice per line.
                String[] parts = line.split("=");
                list.add(new MapBeans(parts[0], Integer.valueOf(parts[1])));
            }
        }
        return list;
    }

    /**
     * Phase 1: shard a big input file into {@code nums} smaller files so that
     * equal lines always land in the shard with the same index.
     *
     * @param inpathname  source file, one string per line
     * @param outpathname target directory for the shards
     * @param nums        number of shards (e.g. 1000)
     * @param pre         shard filename prefix, e.g. "a_" produces a_0.txt .. a_&lt;nums-1&gt;.txt
     */
    public static void splitFile(String inpathname, String outpathname, int nums, String pre) throws IOException {
        try (BufferedReader br = new BufferedReader(new FileReader(inpathname))) {
            String line;
            while ((line = br.readLine()) != null) {
                // floorMod instead of Math.abs(hash) % nums:
                // Math.abs(Integer.MIN_VALUE) is still negative and would
                // yield a negative shard index.
                int outNum = Math.floorMod(hash(line), nums);
                targetFile = FileUtil.getFile(outpathname + pre + outNum + ".txt");
                FileUtils.writeStringToFile(targetFile, line + "\n", true);
            }
        }
    }

    /**
     * Phase 2: for one shard index, find the strings common to all three
     * files and write "string=totalCount" to common_&lt;index&gt;.txt, where
     * totalCount = countInA + countInB + countInC. (The original wrote
     * countInC * 3, which is only correct when the three counts are equal.)
     */
    public static void getCommonSequence(String pathA, String pathB, String pathC) throws IOException {
        // Shard index, e.g. "999" from "a_999.txt"; all three paths share it.
        String fileNameNum = getFileNameNum(pathA);
        ArrayList<String> strA = file2StringArray(pathA);
        ArrayList<String> strB = file2StringArray(pathB);
        ArrayList<String> strC = file2StringArray(pathC);
        // If any shard is empty there can be no common string.
        if (strA.isEmpty() || strB.isEmpty() || strC.isEmpty()) {
            return;
        }
        // Local aggregation: per-file frequency maps.
        Map<String, Integer> AFrequence = countFrequence(strA, new HashMap<>());
        Map<String, Integer> BFrequence = countFrequence(strB, new HashMap<>());
        Map<String, Integer> CFrequence = countFrequence(strC, new HashMap<>());
        // mapJoin keeps only shared keys and SUMS the counts, so commonABC
        // maps each common string to its total occurrences across A, B and C.
        Map<String, Integer> commonAB = mapJoin(AFrequence, BFrequence);
        Map<String, Integer> commonABC = mapJoin(commonAB, CFrequence);
        targetFile = FileUtil.getFile(PRE_PATH + "common/common_" + fileNameNum + ".txt");
        for (Map.Entry<String, Integer> entry : commonABC.entrySet()) {
            FileUtils.writeStringToFile(targetFile, entry.getKey() + "=" + entry.getValue() + "\n", true);
        }
    }

    /**
     * Intersects two frequency maps: keeps only keys present in BOTH and maps
     * each to the sum of the two counts. (The original kept only bFrequence's
     * count, losing aFrequence's contribution to the total.)
     */
    private static Map<String, Integer> mapJoin(Map<String, Integer> aFrequence, Map<String, Integer> bFrequence) {
        Map<String, Integer> commonMap = new HashMap<>();
        for (Map.Entry<String, Integer> entry : bFrequence.entrySet()) {
            Integer aCount = aFrequence.get(entry.getKey());
            if (aCount != null) {
                commonMap.put(entry.getKey(), aCount + entry.getValue());
            }
        }
        return commonMap;
    }

    /**
     * Counts the occurrences of each string into the supplied map.
     *
     * @return the same map instance, for chaining
     */
    private static Map<String, Integer> countFrequence(ArrayList<String> strA, Map<String, Integer> mapA) {
        for (String key : strA) {
            mapA.merge(key, 1, Integer::sum);
        }
        return mapA;
    }

    /**
     * Extracts the trailing shard number from a file name,
     * e.g. "a_999.txt" -&gt; "999".
     */
    public static String getFileNameNum(String pathA) {
        return pathA.split("_")[1].split("\\.")[0];
    }

    /** Reads a file into a list of its lines (reader closed on exit). */
    public static ArrayList<String> file2StringArray(String pathA) throws IOException {
        ArrayList<String> res = new ArrayList<>();
        try (BufferedReader br = new BufferedReader(new FileReader(pathA))) {
            String line;
            while ((line = br.readLine()) != null) {
                res.add(line);
            }
        }
        return res;
    }

    /**
     * Pushes each entry into the bounded min-heap tracking the k MOST
     * frequent strings; evicts the current smallest when a larger one arrives.
     */
    public static void mergerFileTopN(List<MapBeans> list, int k) {
        for (MapBeans bean : list) {
            if (queueTopN.size() < k) {
                queueTopN.add(bean);
            } else if (bean.getV() > queueTopN.peek().getV()) {
                queueTopN.poll();
                queueTopN.add(bean);
            }
        }
    }
}
/**
 * Key/frequency pair: a string {@code k} and its occurrence count {@code v},
 * as serialized in the "common_*" files ("k=v" per line).
 *
 * Fix over the original: a value class used for comparison and aggregation
 * had neither equals/hashCode nor toString; all three are added (constructors
 * and accessors unchanged, so existing callers are unaffected).
 */
class MapBeans {
    private String k;  // the string itself
    private Integer v; // its occurrence count

    public MapBeans() {
    }

    public MapBeans(String k, Integer v) {
        this.k = k;
        this.v = v;
    }

    public String getK() {
        return k;
    }

    public void setK(String k) {
        this.k = k;
    }

    public Integer getV() {
        return v;
    }

    public void setV(Integer v) {
        this.v = v;
    }

    /** Value-based equality so instances behave correctly in sets/maps. */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof MapBeans)) {
            return false;
        }
        MapBeans other = (MapBeans) o;
        return Objects.equals(k, other.k) && Objects.equals(v, other.v);
    }

    @Override
    public int hashCode() {
        return Objects.hash(k, v);
    }

    /** Matches the on-disk line format "k=v". */
    @Override
    public String toString() {
        return k + "=" + v;
    }
}