// EnhanceIoUtil.java
package work.linruchang.lrcutilsweb.util;
import cn.hutool.core.collection.LineIter;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.io.IoUtil;
import cn.hutool.core.lang.Assert;
import lombok.SneakyThrows;
import java.io.File;
import java.io.FileReader;
/**
* 增强的IO工具类
* @author LinRuChang
* @version 1.0
* @date 2022/11/26
* @since 1.8
**/
public class EnhanceIoUtil extends IoUtil {

    /**
     * Returns a line iterator over the given file path.
     *
     * @param file path of the file to iterate over
     * @return iterator over the file's lines
     */
    @SneakyThrows
    public static LineIter lineIter(String file) {
        return lineIter(FileUtil.file(file));
    }

    /**
     * Returns a line iterator over the given file, decoding it as UTF-8.
     *
     * @param file the file to iterate over; must exist
     * @return iterator over the file's lines
     * @throws IllegalArgumentException if the file does not exist
     */
    @SneakyThrows
    public static LineIter lineIter(File file) {
        Assert.isTrue(file.exists(), "文件不存在");
        // Use an explicit UTF-8 reader. The previous "new FileReader(file)"
        // decoded with the platform default charset (on Java 8), which
        // mis-reads UTF-8 content and disagrees with the UTF-8 writes done
        // elsewhere in this project.
        return lineIter(FileUtil.getUtf8Reader(file));
    }
}
// EnhanceFileUtil.java
package work.linruchang.lrcutilsweb.util;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.collection.LineIter;
import cn.hutool.core.convert.Convert;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.lang.UUID;
import cn.hutool.core.stream.StreamUtil;
import cn.hutool.core.thread.ThreadUtil;
import cn.hutool.core.util.StrUtil;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;

import java.io.File;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Stream;
/**
* 增强的文件工具类
*
* @author LinRuChang
* @version 1.0
* @date 2022/09/27
* @since 1.8
**/
@Slf4j
@Slf4j
public class EnhanceFileUtil extends FileUtil {

    /**
     * Generates a new random file name, keeping the source file's extension.
     *
     * @param sourceFileName source file name
     * @return name in the form {@code <uuid>.<sourceExtension>}
     */
    public static String newRandomFileName(String sourceFileName) {
        return StrUtil.join(StrUtil.DOT, UUID.randomUUID().toString(true), extName(sourceFileName));
    }

    /**
     * Counts the total number of lines in a text file.
     * NOTE: counting lines of a very large file (e.g. 30 GB) is extremely
     * slow and strongly discouraged.
     *
     * @param sourceTextFile the text file
     * @return the line count, or 0 when the file is null or does not exist
     */
    public static Long textFileTotalRow(File sourceTextFile) {
        boolean fileExistFlag = sourceTextFile != null && FileUtil.exists(sourceTextFile.toPath(), true);
        if (fileExistFlag) {
            // StreamUtil.of(File) opens a reader under the hood; close the
            // stream to avoid a file-handle leak (the original never did).
            try (Stream<String> lines = StreamUtil.of(sourceTextFile)) {
                return lines.count();
            }
        }
        return 0L;
    }

    /**
     * Splits a text file into chunk files of at most {@code singleFileRowNum}
     * lines each (multi-threaded). The total line count is read up front, so
     * this is unsuitable for very large files (e.g. 30 GB).
     *
     * @param sourceTextFile   source file
     * @param singleFileRowNum maximum number of lines per chunk file
     * @return the chunk files that were created
     */
    @SneakyThrows
    public static List<File> textFileCutting(File sourceTextFile, Long singleFileRowNum) {
        Long totalRow = textFileTotalRow(sourceTextFile);
        log.info("文件【{}】总行数:{}", FileUtil.getName(sourceTextFile), totalRow);
        List<File> cuttingFiles = Collections.synchronizedList(CollUtil.newArrayList());
        ThreadPoolExecutor executorService = (ThreadPoolExecutor) ThreadUtil.newExecutor(32);
        executorService.setKeepAliveTime(1, TimeUnit.SECONDS);
        executorService.allowCoreThreadTimeOut(true);
        if (totalRow > 0) {
            final File currentSourceTextFileDir = FileUtil.getParent(sourceTextFile, 1);
            // Ceiling division. The original computed the exact-multiple branch
            // as "totalRow % singleFileRowNum" — always 0 in that branch — so
            // when totalRow was a multiple of singleFileRowNum no chunks were
            // scheduled and the method dead-locked awaiting the row latch.
            long cuttingFileCount = (totalRow + singleFileRowNum - 1) / singleFileRowNum;
            log.info("文件【{}】切割信息:总行数【{}】、每个切割文件的最大行数【{}】、共切割成【{}】个文件", FileUtil.getName(sourceTextFile), totalRow, singleFileRowNum, cuttingFileCount);
            CountDownLatch cuttingFileCountDownLatch = new CountDownLatch(Convert.toInt(cuttingFileCount));
            for (int cutFileIndex = 0; cutFileIndex < cuttingFileCount; cutFileIndex++) {
                // Effectively-final copy so the lambda can capture the index.
                final long chunkIndex = cutFileIndex;
                String cutFileIndexStr = StrUtil.padPre(Convert.toStr(cutFileIndex), 5, "0");
                executorService.execute(() -> {
                    try {
                        File cutFile = FileUtil.file(currentSourceTextFileDir, StrUtil.format("{}_cut{}.{}", FileUtil.mainName(sourceTextFile), cutFileIndexStr, StrUtil.blankToDefault(FileUtil.extName(sourceTextFile), "txt")));
                        // First and last source line (1-based, inclusive) belonging to this chunk.
                        long startRow = chunkIndex * singleFileRowNum + 1;
                        long endRow = (chunkIndex + 1) * singleFileRowNum;
                        LineIter lineIter = EnhanceIoUtil.lineIter(sourceTextFile);
                        try {
                            long currentRow = 0;
                            while (lineIter.hasNext() && ++currentRow <= endRow) {
                                String lineContent = lineIter.next();
                                if (currentRow >= startRow) {
                                    FileUtil.writeLines(Arrays.asList(lineContent), cutFile, StandardCharsets.UTF_8, true);
                                }
                            }
                        } finally {
                            // Release the underlying reader (the original leaked one per task).
                            EnhanceIoUtil.close(lineIter);
                        }
                        cuttingFiles.add(cutFile);
                    } finally {
                        // Count down even if the task fails, so the caller never hangs.
                        cuttingFileCountDownLatch.countDown();
                    }
                });
            }
            cuttingFileCountDownLatch.await();
        }
        // Let the worker threads terminate instead of leaking the pool.
        executorService.shutdown();
        return cuttingFiles;
    }

    /**
     * Splits a text file into chunk files of at most {@code singleFileRowNum}
     * lines each (single-threaded, streaming) — suitable for very large files.
     *
     * @param sourceTextFile   source file
     * @param singleFileRowNum maximum number of lines per chunk file
     * @return the chunk files that were created
     */
    @SneakyThrows
    public static List<File> textFileCutting2(File sourceTextFile, Long singleFileRowNum) {
        Set<File> cuttingFiles = Collections.synchronizedSet(CollUtil.newHashSet());
        boolean fileFlag = FileUtil.isFile(sourceTextFile);
        AtomicLong row = new AtomicLong();
        if (fileFlag) {
            final File currentSourceTextFileDir = FileUtil.getParent(sourceTextFile, 1);
            // Close the line stream to avoid leaking the file handle
            // (the original left it open).
            try (Stream<String> lines = StreamUtil.of(sourceTextFile)) {
                lines.forEach(line -> {
                    long currentRow = row.incrementAndGet();
                    // 0-based index of the chunk the current line belongs to.
                    long cutFileIndex = (currentRow % singleFileRowNum == 0)
                            ? currentRow / singleFileRowNum - 1
                            : currentRow / singleFileRowNum;
                    String cutFileIndexStr = StrUtil.padPre(Convert.toStr(cutFileIndex), 5, "0");
                    File cutFile = FileUtil.file(currentSourceTextFileDir, StrUtil.format("{}_cut{}.{}", FileUtil.mainName(sourceTextFile), cutFileIndexStr, StrUtil.blankToDefault(FileUtil.extName(sourceTextFile), "txt")));
                    FileUtil.writeLines(Arrays.asList(line), cutFile, StandardCharsets.UTF_8, true);
                    cuttingFiles.add(cutFile);
                });
            }
        }
        log.info("文件【{}】切割信息:总行数【{}】、每个切割文件的最大行数【{}】、共切割成【{}】个文件", FileUtil.getName(sourceTextFile), row, singleFileRowNum, cuttingFiles.size());
        return CollUtil.newArrayList(cuttingFiles);
    }
}