并行处理大量业务数据情况

KEY WORDS:CompletableFuture函数、List分片

backbone:区间合并问题(日期格式)

最简单的处理方法是串行处理,也就是leetcode上常见的答案版本(以下是模拟业务逻辑的代码,不是真实项目代码):

import cn.hutool.core.collection.ListUtil;
import com.google.common.collect.Lists;
import org.apache.commons.collections4.ListUtils;

import java.time.LocalDate;
import java.util.*;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.stream.Collectors;
import java.util.List;

public class completableFutureTest {

    //随机生成实例数据
    public static List<Map<String, LocalDate>> generateRandomInstances(int count) {
        List<Map<String, LocalDate>> instanceList = new ArrayList<>();
        Random random = new Random();

        for (int i = 0; i < count; i++) {
            LocalDate startDate, endDate;
            do {
                startDate = generateRandomDate(LocalDate.of(1949, 1, 1), LocalDate.of(2025, 12, 31), random);
                endDate = generateRandomDate(startDate.plusDays(1), LocalDate.of(2050, 12, 31), random);
            } while (endDate.isBefore(startDate)); // 如果结束日期在开始日期之前,则重新生成

            Map<String, LocalDate> instance = new HashMap<>();
            instance.put("startDate", startDate);
            instance.put("endDate", endDate);

            instanceList.add(instance);
        }

        return instanceList;
    }

    //随机生成时间
    private static LocalDate generateRandomDate(LocalDate start, LocalDate end, Random random) {
        long startEpochDay = start.toEpochDay();
        long endEpochDay = end.toEpochDay();

        if (endEpochDay <= startEpochDay) {
            // 如果 endEpochDay 小于等于 startEpochDay,直接返回 start 的副本
            return LocalDate.of(start.getYear(), start.getMonth(), start.getDayOfMonth());
        }

        // 修正后的生成随机数的方法,确保 bound 是正数
        long randomEpochDay = startEpochDay + random.nextInt((int) (endEpochDay - startEpochDay));
        return LocalDate.ofEpochDay(randomEpochDay);
    }

    //合并区间
    public static List<Map<String, LocalDate>> mergeDays(List<Map<String, LocalDate>> instanceList){
        //结果
        List<Map<String, LocalDate>> mergedDateList = new ArrayList<>();

        //获取第一个区间
        Map<String, LocalDate> currentMap = instanceList.get(0);

        //循环合并
        for(Map<String, LocalDate> instanceDate : instanceList){
            if (currentMap.get("endDate").plusDays(1).isBefore(instanceDate.get("startDate"))){ //无重叠
                mergedDateList.add(currentMap);
                currentMap = new HashMap<>(instanceDate);
            } else {
                if(instanceDate.get("endDate").isAfter(currentMap.get("endDate"))){ //非完全包含
                    currentMap.put("endDate", instanceDate.get("endDate"));
                }
            }
        }
        mergedDateList.add(currentMap); //循环结束,追加

        return mergedDateList;
    }

    public static void main(String[] args) throws ExecutionException, InterruptedException{
        int DateSum = 600000;
        int subListLength = 200000; // 分片数量
        List<Map<String, LocalDate>> instanceList = generateRandomInstances(DateSum); // 随机生成大量数据
        instanceList.sort(Comparator.comparing(map -> map.get("startDate"))); // 排序

        System.out.println("总数据量:" + DateSum + "/每片数据量:" + subListLength);
        
        /**
         * 普通(非多线程)合并方法
         */
        long startTimeNormal = System.currentTimeMillis(); // 开始时间

        List<Map<String, LocalDate>> resultListNormal = mergeDays(instanceList);

        long endTimeNormal = System.currentTimeMillis(); // 结束时间
        long durationNormal = endTimeNormal - startTimeNormal; // 计算运行时间

        System.out.println("【不切片串行合并】:");
        System.out.println("运行时间: " + durationNormal + " 毫秒");

        System.out.println(resultListNormal);

        long totalDaysNormal = resultListNormal.stream()
                .mapToLong(map -> map.get("endDate").toEpochDay() - map.get("startDate").toEpochDay() + 1)
                .sum();

        System.out.println("总计天数:" + totalDaysNormal);

    }
}

问题是当需要处理的业务数据量很大,也就是需要处理的区间数量过多(万+)时,单纯的串行处理会耗费很长时间,所以需要多线程并行处理,考虑CompletableFuture函数。

这个是普通切片方法来做多线程处理:

        /*
         * Multi-threaded merge using plain index-based slicing.
         *
         * The sorted list is cut into consecutive slices of partitionSize elements
         * (the last slice may be shorter). Each slice is merged by its own
         * asynchronous task, the partial results are concatenated in slice order,
         * and the concatenation is merged once more because an interval can reach
         * across a slice boundary.
         */
        long startTime = System.currentTimeMillis(); // start of the timed section

        int partitionSize = subListLength; // elements per slice; must be positive
        int totalSize = instanceList.size();
        // Ceiling division: a trailing partial slice is handled by the same loop
        // instead of a duplicated "remainder" branch.
        int numPartitions = totalSize / partitionSize + (totalSize % partitionSize == 0 ? 0 : 1);

        // One pending result per slice, kept in slice order.
        List<CompletableFuture<List<Map<String, LocalDate>>>> futures = new ArrayList<>();

        // Submit every slice before waiting on any of them; blocking on each
        // future right after creating it would serialize the work.
        for (int i = 0; i < numPartitions; i++) {
            int fromIndex = i * partitionSize;
            int toIndex = Math.min(fromIndex + partitionSize, totalSize); // clamps the last slice
            List<Map<String, LocalDate>> subList = instanceList.subList(fromIndex, toIndex);

            futures.add(CompletableFuture.supplyAsync(() -> mergeDays(subList)));
        }

        // Completes once every slice task has completed.
        CompletableFuture<Void> allOf = CompletableFuture.allOf(
                futures.toArray(new CompletableFuture[0])
        );

        // Gather the per-slice results; join() does not block here because all
        // futures are already complete when this stage runs.
        List<Map<String, LocalDate>> mergedDateList = allOf.thenApply(v ->
                futures.stream()
                        .map(CompletableFuture::join)
                        .flatMap(List::stream)
                        .collect(Collectors.toList())
        ).get();

        // Second pass: fuse intervals that touch or overlap across slice boundaries.
        List<Map<String, LocalDate>> resultList = mergeDays(mergedDateList);

        long endTime = System.currentTimeMillis(); // end of the timed section
        long duration = endTime - startTime;

        System.out.println("【普通分片方法 + 多线程合并】:");
        System.out.println("运行时间: " + duration + " 毫秒");

        System.out.println(resultList);

        // Total covered days; +1 because both the start and the end date are included.
        long totalDays = resultList.stream()
                .mapToLong(map -> map.get("endDate").toEpochDay() - map.get("startDate").toEpochDay() + 1)
                .sum();

        System.out.println("总计天数:"+ totalDays);

需要注意一下【等待所有completablefuture完成】【收集所有分区结果】是必须的,不能直接在每个completablefuture里调用get,不然就没有多线程的意义了。

如果每启动一个CompletableFuture就立即调用future.get(),主线程会阻塞等待该任务完成后才去启动下一个任务,而不是让所有任务同时执行,因为future.get()是阻塞调用。这导致所有任务实际上还是串行(一个接一个)执行,不能充分利用多线程的并行优势。因此,虽然用CompletableFuture.supplyAsync启动了异步任务,但代码在每个任务完成前都会停下来等待,这种行为抵消了并行处理的优势。正确的做法是先把所有任务全部提交,再统一等待并收集结果。

解决了多线程的问题之后再考虑如何合理给List分片以缩短时间的问题,因为这个业务无非就这两个步骤,分片然后合并。

除了普通的循环遍历分割之外,主要尝试三种方法:Google 的 Guava 框架、Apache 的 commons 框架、Hutool 框架。

  • Google Guava
<!-- google guava 工具类 -->
<!-- https://mvnrepository.com/artifact/com.google.guava/guava -->
<dependency>
  <groupId>com.google.guava</groupId>
  <artifactId>guava</artifactId>
  <version>31.0.1-jre</version>
</dependency>
        /*
         * Multi-threaded merge; the list is chunked with Google Guava.
         */
        final long startTime1 = System.currentTimeMillis(); // start of the timed section

        // Consecutive chunks of at most subListLength elements each.
        List<List<Map<String, LocalDate>>> instanceListChunk1 = Lists.partition(instanceList, subListLength);

        // One asynchronous merge task per chunk, submitted in chunk order.
        List<CompletableFuture<List<Map<String, LocalDate>>>> futures1 = instanceListChunk1.stream()
                .map(chunk -> CompletableFuture.supplyAsync(() -> mergeDays(chunk)))
                .collect(Collectors.toList());

        // Wait for every task, then concatenate the partial results in chunk order.
        List<Map<String, LocalDate>> mergedDateList1 = CompletableFuture
                .allOf(futures1.toArray(new CompletableFuture[0]))
                .thenApply(ignored -> {
                    List<Map<String, LocalDate>> flattened = new ArrayList<>();
                    for (CompletableFuture<List<Map<String, LocalDate>>> finished : futures1) {
                        flattened.addAll(finished.join());
                    }
                    return flattened;
                })
                .get();

        // Final pass over the concatenated partial results.
        List<Map<String, LocalDate>> resultList1 = mergeDays(mergedDateList1);

        long duration1 = System.currentTimeMillis() - startTime1;
        System.out.println("【Guava框架分片 + 多线程合并】:");
        System.out.println("运行时间: " + duration1 + " 毫秒");

        // Sum of covered days; both the start and the end date count.
        long totalDays1 = 0L;
        for (Map<String, LocalDate> range : resultList1) {
            totalDays1 += range.get("endDate").toEpochDay() - range.get("startDate").toEpochDay() + 1;
        }
        System.out.println(resultList1);
        System.out.println("总计天数:" + totalDays1);
  • Apache commons
<!-- apache 集合工具类 -->
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-collections4 -->
<dependency>
  <groupId>org.apache.commons</groupId>
  <artifactId>commons-collections4</artifactId>
  <version>4.4</version>
</dependency>
        /*
         * Multi-threaded merge; the list is chunked with Apache Commons Collections.
         */
        final long startTime2 = System.currentTimeMillis(); // start of the timed section

        // Consecutive chunks of at most subListLength elements each.
        List<List<Map<String, LocalDate>>> instanceListChunk2 = ListUtils.partition(instanceList, subListLength);

        // One asynchronous merge task per chunk, submitted in chunk order.
        List<CompletableFuture<List<Map<String, LocalDate>>>> futures2 = instanceListChunk2.stream()
                .map(chunk -> CompletableFuture.supplyAsync(() -> mergeDays(chunk)))
                .collect(Collectors.toList());

        // Wait for every task, then concatenate the partial results in chunk order.
        List<Map<String, LocalDate>> mergedDateList2 = CompletableFuture
                .allOf(futures2.toArray(new CompletableFuture[0]))
                .thenApply(ignored -> {
                    List<Map<String, LocalDate>> flattened = new ArrayList<>();
                    for (CompletableFuture<List<Map<String, LocalDate>>> finished : futures2) {
                        flattened.addAll(finished.join());
                    }
                    return flattened;
                })
                .get();

        // Final pass over the concatenated partial results.
        List<Map<String, LocalDate>> resultList2 = mergeDays(mergedDateList2);

        long duration2 = System.currentTimeMillis() - startTime2;
        System.out.println("【commons框架分片 + 多线程合并】:");
        System.out.println("运行时间: " + duration2 + " 毫秒");

        // Sum of covered days; both the start and the end date count.
        long totalDays2 = 0L;
        for (Map<String, LocalDate> range : resultList2) {
            totalDays2 += range.get("endDate").toEpochDay() - range.get("startDate").toEpochDay() + 1;
        }
        System.out.println(resultList2);
        System.out.println("总计天数:"+ totalDays2);
  • Hutool
<!-- 工具类 hutool -->
<!-- https://mvnrepository.com/artifact/cn.hutool/hutool-all -->
<dependency>
  <groupId>cn.hutool</groupId>
  <artifactId>hutool-all</artifactId>
  <version>5.7.14</version>
</dependency>
        /*
         * Multi-threaded merge; the list is chunked with Hutool.
         */
        final long startTime3 = System.currentTimeMillis(); // start of the timed section

        // Consecutive chunks of at most subListLength elements each.
        List<List<Map<String, LocalDate>>> instanceListChunk3 = ListUtil.partition(instanceList, subListLength);

        // One asynchronous merge task per chunk, submitted in chunk order.
        List<CompletableFuture<List<Map<String, LocalDate>>>> futures3 = instanceListChunk3.stream()
                .map(chunk -> CompletableFuture.supplyAsync(() -> mergeDays(chunk)))
                .collect(Collectors.toList());

        // Wait for every task, then concatenate the partial results in chunk order.
        List<Map<String, LocalDate>> mergedDateList3 = CompletableFuture
                .allOf(futures3.toArray(new CompletableFuture[0]))
                .thenApply(ignored -> {
                    List<Map<String, LocalDate>> flattened = new ArrayList<>();
                    for (CompletableFuture<List<Map<String, LocalDate>>> finished : futures3) {
                        flattened.addAll(finished.join());
                    }
                    return flattened;
                })
                .get();

        // Final pass over the concatenated partial results.
        List<Map<String, LocalDate>> resultList3 = mergeDays(mergedDateList3);

        long duration3 = System.currentTimeMillis() - startTime3;
        System.out.println("【Hutool框架分片 + 多线程合并】:");
        System.out.println("运行时间: " + duration3 + " 毫秒");

        // Sum of covered days; both the start and the end date count.
        long totalDays3 = 0L;
        for (Map<String, LocalDate> range : resultList3) {
            totalDays3 += range.get("endDate").toEpochDay() - range.get("startDate").toEpochDay() + 1;
        }
        System.out.println(resultList3);
        System.out.println("总计天数:"+ totalDays3);

另外,不推荐用stream的方式来分片(不是因为我没写出来),而是因为写出来之后发现时间开销很大。

上面这三种的时间开销都还可以,60w条数据的时候能缩短一半左右。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值