通过NIO的FileChannel实现文件的切割。
指定大小
package com.ityj.nio;
import lombok.extern.slf4j.Slf4j;
import org.springframework.util.StopWatch;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.List;
@Slf4j
public class FileSplitBySizeExample {
    /**
     * Splits a large file into sequentially numbered parts ("output_0", "output_1", ...)
     * of at most {@code maxFileSize} bytes each, using NIO FileChannels.
     *
     * Fixes over the previous version:
     * - the output channel stays open until its part is full, instead of being
     *   re-opened and re-closed for every 1 MB chunk;
     * - each write is capped at the part's remaining quota, so no part can
     *   overshoot maxFileSize by up to a whole buffer;
     * - parts are truncated on creation (the old CREATE+APPEND silently grew
     *   stale files when the program was re-run);
     * - the read loop tests for -1 (EOF) explicitly; a 0-byte read no longer
     *   terminates the copy early.
     */
    public static void main(String[] args) throws IOException {
        StopWatch stopWatch = new StopWatch();
        stopWatch.start();
        String inputFilePath = "D:\\迅雷云盘\\国蒙双语.4K.H265.AAC-YYDS.mkv.xltd";
        String outputDirPath = "D:\\XmpCache";
        long maxFileSize = 1024 * 1024 * 1024L; // maximum size of each output part (1 GB)
        Path inputFile = Paths.get(inputFilePath);
        ByteBuffer buffer = ByteBuffer.allocate(1024 * 1024); // 1 MB transfer buffer
        try (FileChannel inputFileChannel = FileChannel.open(inputFile, StandardOpenOption.READ)) {
            List<Path> outputFiles = new ArrayList<>();
            FileChannel outputFileChannel = null;
            long outputFileSize = 0;
            int fileCount = 0;
            try {
                while (inputFileChannel.read(buffer) != -1) {
                    buffer.flip();
                    while (buffer.hasRemaining()) {
                        // Start a new part when none is open yet or the current one is full.
                        if (outputFileChannel == null || outputFileSize >= maxFileSize) {
                            if (outputFileChannel != null) {
                                outputFileChannel.close();
                            }
                            Path outputFile = Paths.get(outputDirPath, "output_" + fileCount++);
                            outputFiles.add(outputFile);
                            outputFileChannel = FileChannel.open(outputFile,
                                    StandardOpenOption.CREATE, StandardOpenOption.WRITE,
                                    StandardOpenOption.TRUNCATE_EXISTING);
                            outputFileSize = 0;
                        }
                        // Cap this write at the part's remaining quota so the part never
                        // exceeds maxFileSize; restore the limit for the next round.
                        int chunk = (int) Math.min(buffer.remaining(), maxFileSize - outputFileSize);
                        int savedLimit = buffer.limit();
                        buffer.limit(buffer.position() + chunk);
                        outputFileSize += outputFileChannel.write(buffer);
                        buffer.limit(savedLimit);
                    }
                    buffer.clear();
                }
            } finally {
                if (outputFileChannel != null) {
                    outputFileChannel.close();
                }
            }
        }
        stopWatch.stop();
        System.out.println("Split file completed.");
        log.info("Time cost:{}", stopWatch.getTotalTimeSeconds());
    }
}
指定行数 (split by line count)
// TODO(bug): decoding fixed 1024-byte chunks can split a multi-byte UTF-8 character
// mid-sequence, and a line spanning two reads is emitted as two separate lines;
// only "\r\n" separators are recognized (bare "\n" files are not split at all).
package com.ityj.nio;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
public class FileSplitByLinesExample {
    /**
     * Splits a text file into parts of at most {@code maxLines} lines each,
     * named "output_0", "output_1", ... in the output directory.
     *
     * Fixes the marked "todo error" of the previous chunk-decoding version:
     * decoding fixed 1024-byte chunks could split a multi-byte UTF-8 character
     * (decode failure/garbage) or split one line across two chunks (emitted as
     * two lines), and only "\r\n" separators were handled. A BufferedReader
     * decodes and assembles complete lines, handling \n, \r\n and \r uniformly,
     * and the input is now closed via try-with-resources.
     */
    public static void main(String[] args) throws IOException {
        Path inputFilePath = Paths.get("D:\\XmpCache\\aa.txt");
        Path outputDirPath = Paths.get("D:\\XmpCache");
        int maxLines = 1000000; // maximum number of lines per output file
        Charset charset = Charset.forName("UTF-8");
        List<String> lines = new ArrayList<>();
        int fileCount = 0;
        try (java.io.BufferedReader reader = Files.newBufferedReader(inputFilePath, charset)) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Flush a full batch before adding the next line.
                if (lines.size() >= maxLines) {
                    writeOutputFile(outputDirPath, fileCount++, lines);
                    lines.clear();
                }
                lines.add(line);
            }
        }
        // Write the final, possibly partial batch.
        if (!lines.isEmpty()) {
            writeOutputFile(outputDirPath, fileCount, lines);
        }
        System.out.println("Split file completed.");
    }

    // Writes one batch of lines to "<outputDir>/output_<fileCount>" in UTF-8.
    private static void writeOutputFile(Path outputDirPath, int fileCount, List<String> lines) throws IOException {
        Path outputFilePath = outputDirPath.resolve("output_" + fileCount);
        Files.write(outputFilePath, lines, Charset.forName("UTF-8"));
    }
}
适配gzip
package com.ityj.utils;
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
public class FileSplitter {
    private static final String GZIP_EXTENSION = ".gz";

    /**
     * Splits a gzip-compressed text file into gzip-compressed parts of at most
     * {@code linesPerFile} lines each, named "<inputName>.<n>.gz" (n starting
     * at 1) in the input file's directory.
     *
     * Fix: the previous version buffered line bytes and wrote them to a
     * FileChannel obtained from a FileInputStream on the INPUT file — a
     * read-only channel, so the write throws NonWritableChannelException —
     * while the GZIPOutputStream parts were created but never received any
     * data. Lines are now written to the current gzip part, and the part is
     * always closed (finished) even if reading fails.
     *
     * NOTE(review): lines are decoded and re-encoded with the platform default
     * charset, matching the original reader — confirm that is acceptable.
     *
     * @param inputFile    gzip-compressed input file
     * @param linesPerFile maximum number of lines per output part; must be > 0
     * @throws IOException if reading the input or writing a part fails
     */
    public static void splitFileByLineCount(File inputFile, int linesPerFile) throws IOException {
        if (linesPerFile <= 0) {
            throw new IllegalArgumentException("linesPerFile must be positive: " + linesPerFile);
        }
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new GZIPInputStream(new FileInputStream(inputFile))))) {
            GZIPOutputStream gzipOut = null;
            try {
                String line;
                int lineCount = 0;
                int fileNumber = 0;
                while ((line = reader.readLine()) != null) {
                    // Roll over to a new part every linesPerFile lines.
                    if (lineCount % linesPerFile == 0) {
                        if (gzipOut != null) {
                            gzipOut.finish();
                            gzipOut.close();
                        }
                        fileNumber++;
                        // File(parent, child) also copes with a null parent
                        // (plain relative input names), unlike getParent() + "/".
                        File outputFile = new File(inputFile.getParentFile(),
                                inputFile.getName() + "." + fileNumber + GZIP_EXTENSION);
                        gzipOut = new GZIPOutputStream(new FileOutputStream(outputFile));
                    }
                    gzipOut.write((line + "\n").getBytes());
                    lineCount++;
                }
            } finally {
                if (gzipOut != null) {
                    gzipOut.close(); // close() finishes the gzip stream
                }
            }
        }
    }

    public static void main(String[] args) throws IOException {
        File inputFile = new File("inputFile.gz");
        int linesPerFile = 100000;
        splitFileByLineCount(inputFile, linesPerFile);
    }
}
简单按行读文件
/**
 * Reads a text file line by line with NIO and returns the lines (without
 * line terminators).
 *
 * Fixes over the previous version:
 * - bytes are accumulated and decoded as UTF-8 per line instead of casting
 *   each byte to char, which corrupted every multi-byte character;
 * - a final line with no trailing '\n' is no longer silently dropped;
 * - a '\r' preceding the '\n' (CRLF files) is stripped from the line.
 */
public static List<String> readFileWithNIO(String filePath) {
    List<String> lines = new ArrayList<>();
    try (FileChannel fileChannel = FileChannel.open(Path.of(filePath), StandardOpenOption.READ)) {
        ByteBuffer byteBuffer = ByteBuffer.allocate(1024); // transfer buffer
        // Accumulates the raw bytes of the current (possibly incomplete) line.
        java.io.ByteArrayOutputStream lineBytes = new java.io.ByteArrayOutputStream();
        while (fileChannel.read(byteBuffer) != -1) {
            byteBuffer.flip(); // switch buffer to read mode
            while (byteBuffer.hasRemaining()) {
                byte b = byteBuffer.get();
                if (b == '\n') {
                    lines.add(decodeLine(lineBytes));
                } else {
                    lineBytes.write(b);
                }
            }
            byteBuffer.clear(); // ready for the next read
        }
        // Flush a trailing line that has no terminator.
        if (lineBytes.size() > 0) {
            lines.add(decodeLine(lineBytes));
        }
    } catch (IOException e) {
        e.printStackTrace(); // kept from the original: errors yield a partial result
    }
    return lines;
}

// Decodes the accumulated bytes as one UTF-8 line (dropping a trailing '\r')
// and resets the accumulator for the next line.
private static String decodeLine(java.io.ByteArrayOutputStream lineBytes) {
    String line = new String(lineBytes.toByteArray(), java.nio.charset.StandardCharsets.UTF_8);
    lineBytes.reset();
    return line.endsWith("\r") ? line.substring(0, line.length() - 1) : line;
}
#!/bin/bash
# Input and output file names
input_file="input.txt"
output_file="output.txt"

# Remove the output file if it already exists
if [ -f "$output_file" ]; then
    rm "$output_file"
fi

# Filter in a single awk pass instead of spawning one awk process per line.
# The old while-read loop also ran the line through an unquoted `echo $line`,
# which collapsed whitespace and expanded glob characters; awk reads the file
# directly and preserves each line verbatim.
# Keep every line whose third '|'-separated field is not "A999".
awk -F '|' '$3 != "A999"' "$input_file" >> "$output_file"
#!/bin/bash
# Input file name
input_file="input.txt"
# Output file name
output_file="output.txt"

# Truncate the output file so repeated runs do not append duplicate results.
: > "$output_file"

# Loop through each line of the input file
while IFS= read -r line
do
    # Split on '|' with read -a: unlike fields=($(echo "$line" | tr '|' '\n')),
    # this does not word-split fields containing spaces and does not
    # glob-expand characters like '*' inside a field.
    IFS='|' read -r -a fields <<< "$line"
    # Copy the line unless the third field is exactly "A999"
    if [ "${fields[2]}" != "A999" ]; then
        echo "$line" >> "$output_file"
    fi
done < "$input_file"
压缩一个大文件(gz)
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.zip.GZIPOutputStream;
public class GzipFileCompressor {
    /**
     * Streams a (potentially large) file through an NIO channel into a gzip
     * output stream, 1 KB at a time.
     */
    public static void main(String[] args) {
        String sourceFilePath = "path/to/your/largefile.txt";
        String compressedFilePath = "path/to/your/compressedfile.gz";
        try (
            FileInputStream sourceStream = new FileInputStream(sourceFilePath);
            FileChannel sourceChannel = sourceStream.getChannel();
            GZIPOutputStream gzipSink = new GZIPOutputStream(new FileOutputStream(compressedFilePath))
        ) {
            ByteBuffer chunk = ByteBuffer.allocateDirect(1024);
            // Read until the channel reports end-of-file (-1).
            for (int n = sourceChannel.read(chunk); n != -1; n = sourceChannel.read(chunk)) {
                chunk.flip();
                // Direct buffers have no backing array, so copy the readable
                // bytes out before handing them to the gzip stream.
                byte[] bytes = new byte[chunk.remaining()];
                chunk.get(bytes);
                gzipSink.write(bytes);
                chunk.clear();
            }
            System.out.println("File compressed successfully.");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.zip.GZIPOutputStream;
public class NIOGzipFileCompressor {
    /**
     * Reads a file through an NIO channel and compresses it to gzip.
     *
     * Fix: the previous version wrote the buffer to {@code outChannel} — the
     * raw FileOutputStream's channel — which bypassed the GZIPOutputStream
     * entirely. The result was a gzip header (written when the stream was
     * constructed) followed by uncompressed bytes: a corrupt .gz file. All
     * data now flows through the gzip stream.
     */
    public static void main(String[] args) {
        String sourceFilePath = "path/to/your/largefile.txt";
        String compressedFilePath = "path/to/your/compressedfile.gz";
        try (
            FileInputStream fileInputStream = new FileInputStream(sourceFilePath);
            GZIPOutputStream gzipOutputStream = new GZIPOutputStream(new FileOutputStream(compressedFilePath));
            FileChannel inChannel = fileInputStream.getChannel()
        ) {
            // Heap buffer: allocate() guarantees a backing array for write(byte[], off, len).
            ByteBuffer buffer = ByteBuffer.allocate(1024);
            while (inChannel.read(buffer) != -1) {
                buffer.flip(); // switch to read mode
                gzipOutputStream.write(buffer.array(),
                        buffer.arrayOffset() + buffer.position(), buffer.remaining());
                buffer.clear(); // switch back to write mode
            }
            System.out.println("File compressed successfully.");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
// Fragment (depends on an `inputChannel` FileChannel and a `gzipOut` gzip stream
// declared elsewhere): pump the channel's contents into the gzip stream.
ByteBuffer buffer = ByteBuffer.allocate(8192); // 8 KB buffer size
// Loop while a read produced data OR the buffer still holds unflushed bytes.
// NOTE(review): `read(...) > 0` treats both 0 and -1 (EOF) as "no data"; for a
// plain file channel a 0-byte read is rare, but confirm the actual channel type.
while (inputChannel.read(buffer) > 0 || buffer.position() > 0) {
buffer.flip();
// buffer.array() is valid here because allocate() returns a heap buffer with a
// backing array; after flip(), limit() is the count of readable bytes from index 0.
// NOTE(review): assumes arrayOffset() == 0, which holds for allocate() buffers.
gzipOut.write(buffer.array(), 0, buffer.limit());
buffer.clear();
}
// Call the rules search endpoint with HTTP Basic authentication built from the
// API token (presumably a SonarQube server — confirm against the deployment).
String url = "http://192.168.142.129:9000/api/rules/search";
String token = "8eb31bf43a5bc196cb9eed880be4a46651fbc8c8:";
String basicAuth = "Basic " + new String(Base64.getEncoder().encode(token.getBytes("UTF-8")));
// Attach the Authorization header to an otherwise empty GET request.
HttpHeaders requestHeaders = new HttpHeaders();
requestHeaders.add("Authorization", basicAuth);
HttpEntity<String> requestEntity = new HttpEntity<String>(null, requestHeaders);
// Execute the request and return the parsed JSON body.
ResponseEntity<JSONObject> response = restTemplate.exchange(url, HttpMethod.GET, requestEntity, JSONObject.class);
return response.getBody();
#!/bin/bash
# Sample `zing-ps` every 60 seconds, 250 times, appending each timestamped
# snapshot to the log file.
log_file="/path/to/your/directory/output.log"
count=0
while [ "$count" -lt 250 ]; do
    timestamp=$(date +"%Y-%m-%d %H:%M:%S")
    echo "[$timestamp] $(zing-ps)" >> "$log_file"
    sleep 60
    count=$((count + 1))
done
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
public class GenericSplitCollectionExample {
    public static void main(String[] args) {
        // Sample list to partition.
        List<String> originalList = List.of("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t");
        // Partition the list into batches.
        int batchSize = 5; // size of each batch
        Map<Integer, List<String>> splitCollections = splitCollection(originalList, batchSize);
        // Print the result.
        splitCollections.forEach((key, value) -> System.out.println("Collection " + key + ": " + value));
    }

    /**
     * Partitions {@code list} into consecutive batches of at most
     * {@code batchSize} elements, keyed by 0-based batch index.
     *
     * Fix: the previous version grouped by {@code list.indexOf(element)},
     * which is O(n) per element (O(n²) overall) and returns the index of the
     * FIRST occurrence — so duplicate elements all landed in the first
     * duplicate's batch. Grouping by position handles duplicates correctly.
     * (Made public: it is a reusable utility, not main-only plumbing.)
     *
     * @param list      the list to partition (not modified)
     * @param batchSize maximum batch size; must be > 0
     * @return mutable map of batch index to batch contents, in list order
     * @throws IllegalArgumentException if {@code batchSize <= 0}
     */
    public static <T> Map<Integer, List<T>> splitCollection(List<T> list, int batchSize) {
        if (batchSize <= 0) {
            throw new IllegalArgumentException("batchSize must be positive: " + batchSize);
        }
        return java.util.stream.IntStream.range(0, list.size())
                .boxed()
                .collect(Collectors.groupingBy(i -> i / batchSize,
                        Collectors.mapping(list::get, Collectors.toList())));
    }
}
package com.ityj.algorithm.gz;
import java.io.*;
import java.util.zip.GZIPOutputStream;
public class CompressFileToGz {
    /**
     * Compresses an input file to gzip by streaming it into a GZIPOutputStream.
     */
    public static void main(String[] args) {
        // Source file to compress.
        String inputFilePath = "data.dat";
        // Destination .gz file.
        String outputFilePath = "data.dat.gz";
        // try-with-resources closes (and thereby finishes) both streams.
        try (FileInputStream fis = new FileInputStream(inputFilePath);
             GZIPOutputStream gzipOS = new GZIPOutputStream(new FileOutputStream(outputFilePath))) {
            // transferTo performs the same buffered read/write copy the old
            // manual loop did, producing identical compressed output.
            fis.transferTo(gzipOS);
            System.out.println("文件已成功压缩成 " + outputFilePath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
package com.ityj.algorithm.gz;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
public class WriteCollectionToDatFile {
    /**
     * Writes a small in-memory collection to a text file, one element per line.
     */
    public static void main(String[] args) {
        // Sample data to persist.
        ArrayList<String> dataList = new ArrayList<>();
        dataList.add("Item 1");
        dataList.add("Item 2");
        dataList.add("Item 3");
        // Destination file path.
        String filePath = "data.dat";
        // try-with-resources flushes and closes the writer.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(filePath))) {
            // One element per line, terminated with the platform line separator.
            for (int i = 0; i < dataList.size(); i++) {
                writer.write(dataList.get(i));
                writer.newLine();
            }
            System.out.println("集合数据已成功写入到 " + filePath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
package com.ityj.algorithm.gz;
import java.io.*;
import java.util.ArrayList;
import java.util.zip.GZIPOutputStream;
public class WriteCollectionToGzInMemory {
    /**
     * Gzip-compresses a collection in memory (one element per line), then
     * writes the compressed bytes to a .gz file in a single shot.
     */
    public static void main(String[] args) {
        // Sample data to compress.
        ArrayList<String> dataList = new ArrayList<>();
        dataList.add("Item 1");
        dataList.add("Item 2");
        dataList.add("Item 3");
        try (ByteArrayOutputStream compressedBuffer = new ByteArrayOutputStream();
             GZIPOutputStream gzipStream = new GZIPOutputStream(compressedBuffer)) {
            // Closing the PrintWriter finishes the gzip stream so the in-memory
            // buffer holds a complete gzip member before it is read below.
            try (PrintWriter lineWriter = new PrintWriter(new OutputStreamWriter(gzipStream))) {
                dataList.forEach(lineWriter::println);
            }
            // Snapshot of the compressed bytes.
            byte[] compressedData = compressedBuffer.toByteArray();
            // Persist the compressed payload.
            String gzFilePath = "data.dat.gz";
            try (FileOutputStream fileOut = new FileOutputStream(gzFilePath)) {
                fileOut.write(compressedData);
            }
            System.out.println("集合数据已成功写入到 " + gzFilePath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.stream.IntStream;
public class WriteLargeDataFileExample {
    /**
     * Demo: writes one billion rows of 10 tab-terminated fields to a text file.
     */
    public static void main(String[] args) {
        Path filePath = Path.of("large_data_file.txt");
        // TRUNCATE_EXISTING: CREATE+WRITE alone leaves stale trailing bytes if a
        // larger file already existed at this path.
        try (BufferedWriter writer = Files.newBufferedWriter(filePath, StandardOpenOption.CREATE,
                StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)) {
            writeRows(writer, 1000000000L, 10);
            System.out.println("大数据文件写入成功!");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes {@code rowCount} rows of {@code fieldCount} tab-terminated fields
     * ("Field1\t...\tFieldN\t" + platform line separator).
     *
     * Fix: the previous version caught and printStackTrace'd IOExceptions
     * inside IntStream.forEach and kept looping on a broken writer; a plain
     * loop lets the exception propagate to the caller's catch block.
     */
    static void writeRows(BufferedWriter writer, long rowCount, int fieldCount) throws IOException {
        for (long i = 0; i < rowCount; i++) {
            StringBuilder line = new StringBuilder();
            for (int j = 0; j < fieldCount; j++) {
                line.append("Field").append(j + 1).append('\t');
            }
            line.append(System.lineSeparator());
            writer.write(line.toString());
        }
    }
}
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.stream.IntStream;
public class WriteBillionDataWithNIO {
    /**
     * Demo: writes one billion rows of 10 comma-terminated fields via NIO.
     */
    public static void main(String[] args) {
        Path filePath = Path.of("billion_data_nio.txt");
        // TRUNCATE_EXISTING: without it a shorter re-run leaves stale bytes from
        // a previous, larger file after the newly written data.
        try (FileChannel channel = FileChannel.open(filePath, StandardOpenOption.CREATE,
                StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)) {
            writeRows(channel, 1000000000L, 10);
            System.out.println("十亿条数据使用NIO写入成功!");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes {@code rowCount} rows of {@code fieldCount} comma-terminated
     * fields ("Field1,...,FieldN," + platform line separator) to the channel.
     *
     * Improvements over the previous inline loop: each row (fields plus
     * separator) is encoded into a single ByteBuffer — one channel write
     * instead of two — and the write loops until the buffer is drained, since
     * FileChannel.write may write fewer bytes than requested.
     */
    static void writeRows(FileChannel channel, long rowCount, int fieldCount) throws IOException {
        String separator = System.lineSeparator();
        for (long i = 0; i < rowCount; i++) {
            StringBuilder row = new StringBuilder();
            for (int j = 0; j < fieldCount; j++) {
                row.append("Field").append(j + 1).append(',');
            }
            row.append(separator);
            ByteBuffer buffer = ByteBuffer.wrap(row.toString().getBytes());
            while (buffer.hasRemaining()) {
                channel.write(buffer);
            }
        }
    }
}