分割千万级 csv

剑豪_打手

已于 2023-06-28 18:49:04 修改

阅读量174

点赞数

文章标签： java

于 2023-06-28 18:31:53 首次发布

本文链接：https://blog.csdn.net/qq_41278559/article/details/131442890

版权

依赖

  <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>2.9.0</version>
  </dependency>

package org.example;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
public class CSVUtil {

    /** Default maximum number of lines (header line included) written to each output file. */
    private static final int DEFAULT_LINES_PER_FILE = 1000000;

    /**
     * Demo entry point: splits a hard-coded sample file using UTF-8 and a comma separator.
     *
     * @param args ignored
     * @throws Exception if reading the source or writing an output file fails
     */
    public static void main(String[] args) throws Exception {
        String path = "D:\\CSVDir\\202301-202305.csv";
        new CSVUtil().splitBigFile(path, "utf-8", ",");
    }

    /**
     * Cleaned fields of the first non-empty line (treated as the header row) of the file most
     * recently processed by {@code splitBigFile}; {@code null} until a non-empty line was read.
     */
    public List<String> index;

    /**
     * Splits a large CSV file into numbered part files of at most 1,000,000 lines each.
     * See {@link #splitBigFile(String, String, String, int)} for the full contract.
     *
     * @param path     path of the CSV file to split
     * @param ENCODE   charset name used to read the source and write the parts
     * @param splitStr field separator of the source, interpreted as a regular expression
     * @throws Exception if reading the source or writing an output file fails
     */
    public void splitBigFile(String path, String ENCODE, String splitStr) throws Exception {
        splitBigFile(path, ENCODE, splitStr, DEFAULT_LINES_PER_FILE);
    }

    /**
     * Splits a large CSV file into part files named {@code <name>_0.csv}, {@code <name>_1.csv},
     * ... placed next to the source, where {@code <name>} is the source file name with
     * {@code ".csv"} removed.
     *
     * <p>The source is streamed line by line, so memory use does not grow with file size.
     * Empty lines are skipped. The first non-empty line is treated as the header: it is stored
     * in {@link #index} and repeated as the first line of every part file. Each part holds at
     * most {@code linesPerFile} lines, header included.
     *
     * <p>Every field has {@code "^"} removed and a field equal to {@code "--"} is written as an
     * empty field. Output rows always use {@code ","} as separator (also after the last field)
     * and end with {@code "\n"}, regardless of {@code splitStr}.
     *
     * <p>Part files are opened in append mode: if a part file already exists, the new rows are
     * added after its current content. A missing or zero-length source is silently ignored.
     *
     * @param path         path of the CSV file to split
     * @param ENCODE       charset name used to read the source and write the parts
     * @param splitStr     field separator of the source; passed to {@link String#split(String, int)},
     *                     so it is interpreted as a regular expression and quoted fields are
     *                     not treated specially
     * @param linesPerFile maximum number of lines per part file, header included; at least 2
     * @throws IllegalArgumentException if {@code linesPerFile} is smaller than 2
     * @throws Exception                if reading the source or writing an output file fails
     */
    public void splitBigFile(String path, String ENCODE, String splitStr, int linesPerFile)
            throws Exception {
        if (linesPerFile < 2) {
            throw new IllegalArgumentException(
                    "linesPerFile must be at least 2 (header plus one row) but was " + linesPerFile);
        }
        File source = new File(path);
        if (!source.exists() || source.length() == 0) {
            return;
        }
        String baseName = source.getName().replace(".csv", "");
        // May be null for a bare relative file name; File(File, String) then resolves the
        // part files against the working directory instead of building a "null/..." path.
        File targetDir = source.getParentFile();

        List<String> header = null;
        int partNo = 0;
        int linesInPart = 0;
        Writer out = null;
        try (InputStream in = new FileInputStream(source);
             BufferedReader reader = new BufferedReader(new InputStreamReader(in, ENCODE))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.isEmpty()) {
                    continue;
                }
                List<String> fields = cleanFields(line, splitStr);
                if (header == null) {
                    header = fields;
                    index = fields;
                }
                if (out == null) {
                    // Parts are opened lazily so no empty trailing file is created when the
                    // input ends exactly on a part boundary.
                    out = openPart(new File(targetDir, baseName + "_" + partNo + ".csv"), ENCODE);
                    if (partNo > 0) {
                        // The first part already starts with the header read from the source.
                        out.write(replaceContent(header));
                        linesInPart = 1;
                    }
                }
                out.write(replaceContent(fields));
                linesInPart++;
                if (linesInPart == linesPerFile) {
                    Writer finished = out;
                    out = null;
                    finished.close();
                    partNo++;
                    linesInPart = 0;
                }
            }
        } finally {
            if (out != null) {
                out.close();
            }
        }
    }

    /** Splits one source line into fields and strips the {@code "^"} character from each. */
    private static List<String> cleanFields(String line, String splitStr) {
        // Limit -1 keeps trailing empty fields so the column count is preserved.
        String[] raw = line.split(splitStr, -1);
        List<String> fields = new ArrayList<>(raw.length);
        for (String field : raw) {
            fields.add(field.replace("^", ""));
        }
        return fields;
    }

    /** Opens a buffered, appending writer for one part file using the given charset name. */
    private static Writer openPart(File target, String encoding) throws IOException {
        OutputStream raw = new FileOutputStream(target, true);
        try {
            return new BufferedWriter(new OutputStreamWriter(raw, encoding));
        } catch (IOException | RuntimeException e) {
            raw.close();
            throw e;
        }
    }

    /**
     * Formats one row for output: every field is followed by {@code ","} and the row ends with
     * {@code "\n"}. A field equal to {@code "--"} marks invalid data and is written as empty.
     */
    private static String replaceContent(List<String> outArr) {
        StringBuilder row = new StringBuilder();
        for (String txt : outArr) {
            row.append("--".equals(txt) ? "" : txt).append(",");
        }
        return row.append("\n").toString();
    }

    /**
     * Appends {@code content}, encoded with the given charset, to the end of a file, creating
     * the file if it does not exist.
     *
     * <p>I/O failures (including an unsupported charset name) are not propagated: the stack
     * trace is printed to standard error and the method returns normally.
     *
     * @param fileName path of the file to append to
     * @param content  text to append
     * @param ENCODE   charset name used to encode {@code content}
     */
    public static void reWriteFile(String fileName, String content, String ENCODE) {
        try (OutputStream out = new FileOutputStream(fileName, true)) {
            out.write(content.getBytes(ENCODE));
        } catch (IOException e) {
            // Kept best-effort so existing callers never see an exception from this method.
            e.printStackTrace();
        }
    }

}