题目
10GB,100GB,1TB文件,文件格式都是每一行一个字符串;一个字符串在三个文件中都出现,才计数(最小为3,在3个文件中分别出现一次),否则不计数;
我的解答
package com.bigdata.splitfile;
import com.bigdata.utils.FileUtil;
import com.bigdata.utils.HashCodeUtil;
import org.apache.commons.io.FileUtils;
import java.io.*;
import java.util.*;
import static java.util.Objects.hash;
/**
 * Problem: three files (A = 10GB, B = 100GB, C = 1TB), each containing one
 * string per line. A string is counted only if it appears in ALL three files
 * (minimum total count 3: once per file). With only 10GB of working memory,
 * compute the top-10 and last-10 strings by total occurrence count.
 *
 * Approach:
 *   1. splitFile: shard each big file into {@code nums} pieces via
 *      hash(line) % nums, so equal strings from all three files land in
 *      shards with the same index.
 *   2. getCommonSequence: per shard index, intersect the three shards and
 *      write "string=totalCount" lines to common_&lt;index&gt;.txt.
 *      (Shard sizes after splitting by 1000: A 10MB, B 100MB, C 1GB — all
 *      three fit comfortably in 10GB of memory.)
 *   3. Stream the common_* files through bounded heaps of size K to obtain
 *      the top-K (min-heap) and last-K (max-heap) strings.
 */
public class SplitThenMerger<E extends Comparable> {
    private final static String PRE_PATH = "/Users/xxx/Desktop/data/";

    /** Heap capacity, i.e. the K in top-K / last-K. */
    private static int K = 2;

    /**
     * Min-heap order for the top-K heap: smallest frequency at the head.
     * Integer.compare avoids the int-overflow risk of "b1.getV() - b2.getV()".
     */
    static Comparator<MapBeans> cmp = new Comparator<MapBeans>() {
        @Override
        public int compare(MapBeans b1, MapBeans b2) {
            return Integer.compare(b1.getV(), b2.getV());
        }
    };

    /** Max-heap order for the last-K heap: largest frequency at the head. */
    static Comparator<MapBeans> lastCmp = new Comparator<MapBeans>() {
        @Override
        public int compare(MapBeans b1, MapBeans b2) {
            return Integer.compare(b2.getV(), b1.getV());
        }
    };

    static PriorityQueue<MapBeans> queueTopN = new PriorityQueue<>(K, cmp);
    static PriorityQueue<MapBeans> queueLastN = new PriorityQueue<>(K, lastCmp);

    /** Current output file handle, shared by the write helpers. */
    static File targetFile = null;

    public static void main(String[] args) throws IOException {
        // Phase 1 (splitFile) and phase 2 (getCommonSequence) are run
        // separately; here we only merge the already-produced common_* files.
        List<String> pathList = new ArrayList<>();
        pathList.add("/Users/xxx/Desktop/data/common/common_0.txt");
        pathList.add("/Users/xxx/Desktop/data/common/common_1.txt");
        pathList.add("/Users/xxx/Desktop/data/common/common_2.txt");
        List<MapBeans> topN = readSmallFile(pathList, queueTopN, K);
        List<MapBeans> lastN = lastN(pathList, queueLastN, K);
    }

    /**
     * Streams every common file through the last-K max-heap, then drains it.
     *
     * @return up to k least-frequent strings, or null when pathList is empty
     *         (null kept for caller compatibility)
     */
    private static List<MapBeans> lastN(List<String> pathList, PriorityQueue<MapBeans> queueLastN, int k) throws IOException {
        if (pathList.isEmpty()) {
            return null;
        }
        for (String path : pathList) {
            mergerFileLastN(readCommonFile(path), k);
        }
        List<MapBeans> res = new ArrayList<>();
        // Guard against a heap holding fewer than k entries: the original
        // polled k times unconditionally and could add nulls to the result.
        while (res.size() < k && !queueLastN.isEmpty()) {
            res.add(queueLastN.poll());
        }
        return res;
    }

    /**
     * Pushes each entry into the bounded max-heap tracking the k LEAST
     * frequent strings; evicts the current largest when a smaller one arrives.
     */
    private static void mergerFileLastN(List<MapBeans> list, int k) {
        for (MapBeans bean : list) {
            if (queueLastN.size() < k) {
                queueLastN.add(bean);
            } else if (bean.getV() < queueLastN.peek().getV()) {
                queueLastN.poll();
                queueLastN.add(bean);
            }
        }
    }

    /**
     * Streams every common file through the top-K min-heap, then drains it.
     *
     * @return up to k most-frequent strings (least frequent first), or null
     *         when pathList is empty (null kept for caller compatibility)
     */
    private static List<MapBeans> readSmallFile(List<String> pathList, PriorityQueue<MapBeans> queueTopN, int k) throws IOException {
        if (pathList.isEmpty()) {
            return null;
        }
        for (String path : pathList) {
            mergerFileTopN(readCommonFile(path), k);
        }
        List<MapBeans> res = new ArrayList<>();
        // Same null-safety guard as lastN: never poll past an empty heap.
        while (res.size() < k && !queueTopN.isEmpty()) {
            res.add(queueTopN.poll());
        }
        return res;
    }

    /**
     * Loads one "string=count" common file into a list of beans.
     * The reader is closed via try-with-resources (the original leaked it).
     */
    private static ArrayList<MapBeans> readCommonFile(String commonFile) throws IOException {
        ArrayList<MapBeans> list = new ArrayList<>();
        try (BufferedReader br = new BufferedReader(new FileReader(commonFile))) {
            String line;
            while ((line = br.readLine()) != null) {
                // Split once instead of twice per line.
                String[] parts = line.split("=");
                list.add(new MapBeans(parts[0], Integer.valueOf(parts[1])));
            }
        }
        return list;
    }

    /**
     * Phase 1: shard a big input file into {@code nums} smaller files so that
     * equal lines always land in the shard with the same index.
     *
     * @param inpathname  source file, one string per line
     * @param outpathname target directory for the shards
     * @param nums        number of shards (e.g. 1000)
     * @param pre         shard filename prefix, e.g. "a_" produces a_0.txt .. a_&lt;nums-1&gt;.txt
     */
    public static void splitFile(String inpathname, String outpathname, int nums, String pre) throws IOException {
        try (BufferedReader br = new BufferedReader(new FileReader(inpathname))) {
            String line;
            while ((line = br.readLine()) != null) {
                // floorMod instead of Math.abs(hash) % nums:
                // Math.abs(Integer.MIN_VALUE) is still negative and would
                // yield a negative shard index.
                int outNum = Math.floorMod(hash(line), nums);
                targetFile = FileUtil.getFile(outpathname + pre + outNum + ".txt");
                FileUtils.writeStringToFile(targetFile, line + "\n", true);
            }
        }
    }

    /**
     * Phase 2: for one shard index, find the strings common to all three
     * files and write "string=totalCount" to common_&lt;index&gt;.txt, where
     * totalCount = countInA + countInB + countInC. (The original wrote
     * countInC * 3, which is only correct when the three counts are equal.)
     */
    public static void getCommonSequence(String pathA, String pathB, String pathC) throws IOException {
        // Shard index, e.g. "999" from "a_999.txt"; all three paths share it.
        String fileNameNum = getFileNameNum(pathA);
        ArrayList<String> strA = file2StringArray(pathA);
        ArrayList<String> strB = file2StringArray(pathB);
        ArrayList<String> strC = file2StringArray(pathC);
        // If any shard is empty there can be no common string.
        if (strA.isEmpty() || strB.isEmpty() || strC.isEmpty()) {
            return;
        }
        // Local aggregation: per-file frequency maps.
        Map<String, Integer> AFrequence = countFrequence(strA, new HashMap<>());
        Map<String, Integer> BFrequence = countFrequence(strB, new HashMap<>());
        Map<String, Integer> CFrequence = countFrequence(strC, new HashMap<>());
        // mapJoin keeps only shared keys and SUMS the counts, so commonABC
        // maps each common string to its total occurrences across A, B and C.
        Map<String, Integer> commonAB = mapJoin(AFrequence, BFrequence);
        Map<String, Integer> commonABC = mapJoin(commonAB, CFrequence);
        targetFile = FileUtil.getFile(PRE_PATH + "common/common_" + fileNameNum + ".txt");
        for (Map.Entry<String, Integer> entry : commonABC.entrySet()) {
            FileUtils.writeStringToFile(targetFile, entry.getKey() + "=" + entry.getValue() + "\n", true);
        }
    }

    /**
     * Intersects two frequency maps: keeps only keys present in BOTH and maps
     * each to the sum of the two counts. (The original kept only bFrequence's
     * count, losing aFrequence's contribution to the total.)
     */
    private static Map<String, Integer> mapJoin(Map<String, Integer> aFrequence, Map<String, Integer> bFrequence) {
        Map<String, Integer> commonMap = new HashMap<>();
        for (Map.Entry<String, Integer> entry : bFrequence.entrySet()) {
            Integer aCount = aFrequence.get(entry.getKey());
            if (aCount != null) {
                commonMap.put(entry.getKey(), aCount + entry.getValue());
            }
        }
        return commonMap;
    }

    /**
     * Counts the occurrences of each string into the supplied map.
     *
     * @return the same map instance, for chaining
     */
    private static Map<String, Integer> countFrequence(ArrayList<String> strA, Map<String, Integer> mapA) {
        for (String key : strA) {
            mapA.merge(key, 1, Integer::sum);
        }
        return mapA;
    }

    /**
     * Extracts the trailing shard number from a file name,
     * e.g. "a_999.txt" -&gt; "999".
     */
    public static String getFileNameNum(String pathA) {
        return pathA.split("_")[1].split("\\.")[0];
    }

    /** Reads a file into a list of its lines (reader closed on exit). */
    public static ArrayList<String> file2StringArray(String pathA) throws IOException {
        ArrayList<String> res = new ArrayList<>();
        try (BufferedReader br = new BufferedReader(new FileReader(pathA))) {
            String line;
            while ((line = br.readLine()) != null) {
                res.add(line);
            }
        }
        return res;
    }

    /**
     * Pushes each entry into the bounded min-heap tracking the k MOST
     * frequent strings; evicts the current smallest when a larger one arrives.
     */
    public static void mergerFileTopN(List<MapBeans> list, int k) {
        for (MapBeans bean : list) {
            if (queueTopN.size() < k) {
                queueTopN.add(bean);
            } else if (bean.getV() > queueTopN.peek().getV()) {
                queueTopN.poll();
                queueTopN.add(bean);
            }
        }
    }
}
/**
 * Key/frequency pair: a string {@code k} and its occurrence count {@code v},
 * as serialized in the "common_*" files ("k=v" per line).
 *
 * Fix over the original: a value class used for comparison and aggregation
 * had neither equals/hashCode nor toString; all three are added (constructors
 * and accessors unchanged, so existing callers are unaffected).
 */
class MapBeans {
    private String k;  // the string itself
    private Integer v; // its occurrence count

    public MapBeans() {
    }

    public MapBeans(String k, Integer v) {
        this.k = k;
        this.v = v;
    }

    public String getK() {
        return k;
    }

    public void setK(String k) {
        this.k = k;
    }

    public Integer getV() {
        return v;
    }

    public void setV(Integer v) {
        this.v = v;
    }

    /** Value-based equality so instances behave correctly in sets/maps. */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof MapBeans)) {
            return false;
        }
        MapBeans other = (MapBeans) o;
        return Objects.equals(k, other.k) && Objects.equals(v, other.v);
    }

    @Override
    public int hashCode() {
        return Objects.hash(k, v);
    }

    /** Matches the on-disk line format "k=v". */
    @Override
    public String toString() {
        return k + "=" + v;
    }
}