文本分割器TXTSpliter

当下载的日志文件(文本文件)有几十 MB 大小时,直接用文本编辑器(如 Notepad++)打开容易卡死。于是写了一个按字节数均分的文本分割工具 TXTSpliterEqualBytes.java,把文本文件分割成 N 份(N 在运行时输入;比如 N=10 时,50 MB 的原文件会被分割成 10 个约 5 MB 的子文件)。
但执行 TXTSpliterEqualBytes 时可能会遇到一个问题:从第 N 份子文件开始统统显示为乱码。原因是按字节均分时,分割点恰好落在某个多字节字符(占用超过 1 个字节)的中间,把这个字符拆到了前后两个子文件里。于是又写了一个按字符数均分的文本分割工具 TXTSpliterEqualChars.java(比如原文件有 1 千万个字符,分成 10 份后每个子文件约 1 百万个字符)。

下载地址:https://download.csdn.net/download/shushanke/86923522
--------------------------------分割线--------------------------------

TXTSpliterEqualBytes 


import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.CharacterCodingException;
import java.text.DecimalFormat;
/**
 * Command-line tool that splits a text file into N parts of (almost) equal byte size.
 *
 * <p>Build and run:
 * <pre>
 * javac -d . -encoding UTF-8 TXTSpliterEqualBytes.java
 * java TXTSpliterEqualBytes
 * </pre>
 *
 * <p>The input is the first text file (by suffix) found in the working directory, and the
 * parts are written next to it as {@code <name>_<NN><suffix>}.
 *
 * <p>Cut points are chosen purely by byte offset. If a character that occupies more than one
 * byte straddles a cut point, its bytes end up in two different parts and those parts may be
 * displayed as garbled text. Concatenating the parts in order always reproduces the input.
 */
public class TXTSpliterEqualBytes {
    /** Directory that is scanned for the input file and that receives the parts. */
    private static final String dirPath = ".";

    /** Number of parts to produce; replaced by the value typed in {@link #main(String...)}. */
    private static int NUMBER_OF_FILES = 10;

    /** Absolute form of {@link #dirPath} ending with the file separator; empty until resolved. */
    private static String absoluteDirPath = "";

    /** Name of the file to split; empty until {@link #findTXTFile()} has located one. */
    private static String originalFileName = "";

    /** File-name suffixes (matched case-sensitively) that identify a text file. */
    private static final java.util.Set<String> suffixSetOfTXTFile =
            new java.util.LinkedHashSet<String>();

    static {
        // Only in-memory setup happens here. Directory scanning is deferred to findTXTFile()
        // so that a missing input file cannot make class initialization itself fail.
        suffixSetOfTXTFile.add(".log");
        suffixSetOfTXTFile.add(".LOG");
        suffixSetOfTXTFile.add(".txt");
        suffixSetOfTXTFile.add(".TXT");
        suffixSetOfTXTFile.add(".text");
        suffixSetOfTXTFile.add(".TEXT");
    }

    /**
     * Resolves {@link #dirPath} to a normalized absolute path (cached after the first call).
     *
     * @return the absolute directory path, always terminated by {@link File#separator}
     */
    private static String getabsoluteDirPath() {
        if ("".equals(absoluteDirPath)) {
            String resolved = new File(dirPath).toPath().toAbsolutePath().normalize().toString();
            absoluteDirPath = resolved.endsWith(File.separator)
                    ? resolved
                    : resolved + File.separator;
        }
        return absoluteDirPath;
    }

    /**
     * Locates the file to split: the first regular file in the working directory, in sorted
     * order, whose suffix is in {@link #suffixSetOfTXTFile}. The result is cached, so a retry
     * after a failed split keeps using the same input instead of picking up a generated part.
     *
     * @return the name (without directory) of the file to split
     * @throws IllegalStateException if the directory cannot be listed or holds no text file
     */
    private static String findTXTFile() {
        if (!"".equals(originalFileName)) {
            return originalFileName;
        }
        // listFiles() returns null when the directory cannot be read.
        File[] candidates = new File(getabsoluteDirPath()).listFiles();
        if (candidates != null) {
            // The listing order is unspecified; sorting makes the choice repeatable.
            java.util.Arrays.sort(candidates);
            for (File candidate : candidates) {
                if (!candidate.isFile()) {
                    continue;
                }
                String name = candidate.getName();
                int dot = name.lastIndexOf('.');
                if (dot < 1) {
                    continue; // no suffix, or a dot-file such as ".log"
                }
                if (suffixSetOfTXTFile.contains(name.substring(dot))) {
                    originalFileName = name;
                    return originalFileName;
                }
            }
        }
        String tipMsg = "ERROR:请将待分割的文本文件" + suffixSetOfTXTFile.toString() + "放到当前目录下!";
        System.out.println(tipMsg);
        throw new IllegalStateException(tipMsg);
    }

    /**
     * Closes a resource, printing (not propagating) any failure.
     *
     * @param closeable the resource to close; {@code null} is ignored
     */
    public static void closeCloseable(Closeable closeable) {
        try {
            if (closeable != null) {
                closeable.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Computes the byte length of every part: {@code total / parts} each, with the remainder
     * added to the last part.
     *
     * @param total total number of bytes to distribute
     * @param parts number of parts, at least 1
     * @return an array of {@code parts} lengths whose sum is {@code total}
     */
    static long[] chunkSizes(long total, int parts) {
        long[] sizes = new long[parts];
        java.util.Arrays.fill(sizes, total / parts);
        sizes[parts - 1] += total % parts;
        return sizes;
    }

    /**
     * Builds the file name of one part, e.g. {@code app_007.log}.
     *
     * <p>The part number is zero-padded to the number of digits of {@code totalParts}, so the
     * names sort in part order for any count. The width is never below two digits, which keeps
     * the names for fewer than 100 parts in the familiar {@code _01}, {@code _02}, ... form.
     *
     * @param baseName   original file name without suffix
     * @param suffix     original suffix including the leading dot
     * @param partNumber 1-based number of the part
     * @param totalParts total number of parts being produced
     * @return the part's file name (without directory)
     */
    static String partFileName(String baseName, String suffix, int partNumber, int totalParts) {
        int width = Math.max(2, Integer.toString(totalParts).length());
        String digits = Integer.toString(partNumber);
        StringBuilder name = new StringBuilder(baseName).append('_');
        for (int pad = digits.length(); pad < width; pad++) {
            name.append('0');
        }
        return name.append(digits).append(suffix).toString();
    }

    /**
     * Copies {@code count} bytes starting at {@code position} from one channel to another.
     * {@code transferTo} may move fewer bytes than requested, hence the loop.
     *
     * @throws IOException if the copy fails or the source ends before {@code count} bytes
     */
    private static void copyRange(FileChannel source, long position, long count,
            FileChannel target) throws IOException {
        long from = position;
        long remaining = count;
        while (remaining > 0) {
            long moved = source.transferTo(from, remaining, target);
            if (moved <= 0) {
                throw new IOException("Source file ended unexpectedly at byte " + from);
            }
            from += moved;
            remaining -= moved;
        }
    }

    /**
     * Splits the located text file into {@link #NUMBER_OF_FILES} parts by byte count.
     *
     * @return {@code true} if every part was written; {@code false} if the part count is below
     *         2 or an I/O error occurred (the stack trace is printed)
     * @throws IllegalStateException if no text file is present in the working directory
     */
    public static boolean split() {
        if (NUMBER_OF_FILES < 2) {
            System.out.println("分割后的文件个数不能小于2!");
            return false;
        }

        String directory = getabsoluteDirPath();
        String sourceName = findTXTFile();
        File originalFile = new File(directory + sourceName);
        long[] sizes = chunkSizes(originalFile.length(), NUMBER_OF_FILES);

        int dot = sourceName.lastIndexOf('.');
        String baseName = sourceName.substring(0, dot);
        String suffix = sourceName.substring(dot);

        try (FileInputStream in = new FileInputStream(originalFile);
                FileChannel inChannel = in.getChannel()) {
            long offset = 0;
            for (int i = 0; i < sizes.length; i++) {
                String partPath = directory + partFileName(baseName, suffix, i + 1, sizes.length);
                // Each part gets its own try-with-resources so its stream is closed before the
                // next one is opened, including when the copy fails half-way.
                try (FileOutputStream out = new FileOutputStream(partPath);
                        FileChannel outChannel = out.getChannel()) {
                    copyRange(inChannel, offset, sizes[i], outChannel);
                }
                offset += sizes[i];
            }
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Interactive entry point. Reads lines from standard input until {@code exit} is entered,
     * input ends, or a split succeeds; every integer greater than 1 triggers a split attempt.
     *
     * @param args ignored
     * @throws Exception if no text file is present in the working directory
     */
    public static void main(String... args)throws Exception {
        // Fail before prompting when there is nothing to split.
        findTXTFile();

        System.out.println("①输入exit并敲回车,结束程序。");
        System.out.println("②输入大于1的整数(N)并敲回车,将文本分割成N分。");
        try (BufferedReader bufReader = new BufferedReader(new InputStreamReader(System.in))) {
            String line;
            while ((line = bufReader.readLine()) != null) {
                System.out.println("本次输入的内容是:" + line);
                if (line.equalsIgnoreCase("exit")) {
                    break;
                }

                int count;
                try {
                    count = Integer.parseInt(line);
                } catch (NumberFormatException e) {
                    System.out.println("请输入大于1的整数:");
                    continue;
                }
                if (count < 2) {
                    System.out.println("请输入大于1的整数:");
                    continue;
                }

                NUMBER_OF_FILES = count;
                System.out.println("文本将分割成" + NUMBER_OF_FILES + "份");
                long start = System.currentTimeMillis();
                boolean success = split();
                long end = System.currentTimeMillis();
                if (success) {
                    System.out.println("文本分割已完成,耗时(ms)=" + (end - start));
                    break;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}


--------------------------------分割线--------------------------------

TXTSpliterEqualChars 


import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.text.DecimalFormat;
/**
 * Command-line tool that splits a UTF-8 text file into N parts of (almost) equal character
 * count, so that no character is ever torn across two parts.
 *
 * <p>Build and run:
 * <pre>
 * javac -d . -encoding UTF-8 TXTSpliterEqualChars.java
 * java TXTSpliterEqualChars
 * </pre>
 *
 * <p>The input is the first text file (by suffix) found in the working directory, and the
 * parts are written next to it as {@code <name>_<NN><suffix>}. The whole file is decoded in
 * memory, so it must be valid UTF-8 and smaller than 2 GB.
 *
 * <p>"Character" here means a Java {@code char} (UTF-16 code unit); cut points are adjusted so
 * that the two halves of a surrogate pair always stay in the same part.
 */
public class TXTSpliterEqualChars {
    /** Directory that is scanned for the input file and that receives the parts. */
    private static final String dirPath = ".";

    /** Number of parts to produce; replaced by the value typed in {@link #main(String...)}. */
    private static int NUMBER_OF_FILES = 10;

    /** Absolute form of {@link #dirPath} ending with the file separator; empty until resolved. */
    private static String absoluteDirPath = "";

    /** UTF-8 charset, used to decode the input and encode the parts. */
    public static Charset CHARSET_UTF8 = Charset.forName("UTF-8");

    /** GBK charset; not used by {@link #split()}. */
    public static Charset CHARSET_GBK = Charset.forName("GBK");

    /** Name of the file to split; empty until {@link #findTXTFile()} has located one. */
    private static String originalFileName = "";

    /** File-name suffixes (matched case-sensitively) that identify a text file. */
    private static final java.util.Set<String> suffixSetOfTXTFile =
            new java.util.LinkedHashSet<String>();

    static {
        // Only in-memory setup happens here. Directory scanning is deferred to findTXTFile()
        // so that a missing input file cannot make class initialization itself fail.
        suffixSetOfTXTFile.add(".log");
        suffixSetOfTXTFile.add(".LOG");
        suffixSetOfTXTFile.add(".txt");
        suffixSetOfTXTFile.add(".TXT");
        suffixSetOfTXTFile.add(".text");
        suffixSetOfTXTFile.add(".TEXT");
    }

    /**
     * Resolves {@link #dirPath} to a normalized absolute path (cached after the first call).
     *
     * @return the absolute directory path, always terminated by {@link File#separator}
     */
    private static String getabsoluteDirPath() {
        if ("".equals(absoluteDirPath)) {
            String resolved = new File(dirPath).toPath().toAbsolutePath().normalize().toString();
            absoluteDirPath = resolved.endsWith(File.separator)
                    ? resolved
                    : resolved + File.separator;
        }
        return absoluteDirPath;
    }

    /**
     * Locates the file to split: the first regular file in the working directory, in sorted
     * order, whose suffix is in {@link #suffixSetOfTXTFile}. The result is cached, so a retry
     * after a failed split keeps using the same input instead of picking up a generated part.
     *
     * @return the name (without directory) of the file to split
     * @throws IllegalStateException if the directory cannot be listed or holds no text file
     */
    private static String findTXTFile() {
        if (!"".equals(originalFileName)) {
            return originalFileName;
        }
        // listFiles() returns null when the directory cannot be read.
        File[] candidates = new File(getabsoluteDirPath()).listFiles();
        if (candidates != null) {
            // The listing order is unspecified; sorting makes the choice repeatable.
            java.util.Arrays.sort(candidates);
            for (File candidate : candidates) {
                if (!candidate.isFile()) {
                    continue;
                }
                String name = candidate.getName();
                int dot = name.lastIndexOf('.');
                if (dot < 1) {
                    continue; // no suffix, or a dot-file such as ".log"
                }
                if (suffixSetOfTXTFile.contains(name.substring(dot))) {
                    originalFileName = name;
                    return originalFileName;
                }
            }
        }
        String tipMsg = "ERROR:请将待分割的文本文件" + suffixSetOfTXTFile.toString() + "放到当前目录下!";
        System.out.println(tipMsg);
        throw new IllegalStateException(tipMsg);
    }

    /**
     * Closes a resource, printing (not propagating) any failure.
     *
     * @param closeable the resource to close; {@code null} is ignored
     */
    public static void closeCloseable(Closeable closeable) {
        try {
            if (closeable != null) {
                closeable.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Computes where to cut {@code text} into {@code parts} pieces of equal {@code char} count
     * (the remainder goes to the last piece).
     *
     * <p>A cut that would land between a high and a low surrogate is moved one position
     * forward, because a lone surrogate cannot be encoded as UTF-8.
     *
     * @param text  the decoded text
     * @param parts number of pieces, at least 1
     * @return {@code parts + 1} non-decreasing offsets; piece {@code i} is
     *         {@code [offsets[i], offsets[i + 1])}, the first offset is 0 and the last is
     *         {@code text.length()}
     */
    static int[] splitOffsets(CharSequence text, int parts) {
        int total = text.length();
        int each = total / parts;
        int[] offsets = new int[parts + 1];
        for (int i = 1; i < parts; i++) {
            int cut = i * each;
            if (cut > 0 && cut < total
                    && Character.isHighSurrogate(text.charAt(cut - 1))
                    && Character.isLowSurrogate(text.charAt(cut))) {
                cut++;
            }
            offsets[i] = cut;
        }
        offsets[parts] = total;
        return offsets;
    }

    /**
     * Builds the file name of one part, e.g. {@code app_007.log}.
     *
     * <p>The part number is zero-padded to the number of digits of {@code totalParts}, so the
     * names sort in part order for any count. The width is never below two digits, which keeps
     * the names for fewer than 100 parts in the familiar {@code _01}, {@code _02}, ... form.
     *
     * @param baseName   original file name without suffix
     * @param suffix     original suffix including the leading dot
     * @param partNumber 1-based number of the part
     * @param totalParts total number of parts being produced
     * @return the part's file name (without directory)
     */
    static String partFileName(String baseName, String suffix, int partNumber, int totalParts) {
        int width = Math.max(2, Integer.toString(totalParts).length());
        String digits = Integer.toString(partNumber);
        StringBuilder name = new StringBuilder(baseName).append('_');
        for (int pad = digits.length(); pad < width; pad++) {
            name.append('0');
        }
        return name.append(digits).append(suffix).toString();
    }

    /**
     * Splits the located text file into {@link #NUMBER_OF_FILES} parts by character count.
     *
     * @return {@code true} if every part was written; {@code false} if the part count is below
     *         2, the file is too large to decode in memory, the file is not valid UTF-8, or an
     *         I/O error occurred (the stack trace is printed)
     * @throws IllegalStateException if no text file is present in the working directory
     */
    public static boolean split() {
        if (NUMBER_OF_FILES < 2) {
            System.out.println("分割后的文件个数不能小于2!");
            return false;
        }

        String directory = getabsoluteDirPath();
        String sourceName = findTXTFile();
        File originalFile = new File(directory + sourceName);
        long sizeTotal = originalFile.length();
        if (sizeTotal > Integer.MAX_VALUE) {
            // A single mapping cannot exceed Integer.MAX_VALUE bytes.
            System.out.println("ERROR:文件超过2GB,无法按字符数分割!");
            return false;
        }

        int dot = sourceName.lastIndexOf('.');
        String baseName = sourceName.substring(0, dot);
        String suffix = sourceName.substring(dot);

        try (FileInputStream in = new FileInputStream(originalFile);
                FileChannel inChannel = in.getChannel()) {
            MappedByteBuffer rawBytes = inChannel.map(FileChannel.MapMode.READ_ONLY, 0, sizeTotal);
            // decode() reports malformed input as CharacterCodingException (an IOException).
            CharBuffer text = CHARSET_UTF8.newDecoder().decode(rawBytes);
            CharsetEncoder encoder = CHARSET_UTF8.newEncoder();

            int charNumTotal = text.length();
            int[] offsets = splitOffsets(text, NUMBER_OF_FILES);
            System.out.println("byteNumTotal=" + sizeTotal);
            System.out.println("charNumTotal=" + charNumTotal
                    + ", charNumEach=" + (charNumTotal / NUMBER_OF_FILES)
                    + ", charRemainder=" + (charNumTotal % NUMBER_OF_FILES));

            int parts = offsets.length - 1;
            for (int i = 0; i < parts; i++) {
                int start = offsets[i];
                int end = offsets[i + 1];
                System.out.println("from " + start + " to " + end + ", charNum=" + (end - start));

                // View of [start, end) that shares the decoded text; duplicate() keeps the
                // position and limit of the full buffer untouched.
                CharBuffer part = text.duplicate();
                part.limit(end);
                part.position(start);
                ByteBuffer encoded = encoder.encode(part);

                String partPath = directory + partFileName(baseName, suffix, i + 1, parts);
                // Each part gets its own try-with-resources so its stream is closed before the
                // next one is opened, including when the write fails half-way.
                try (FileOutputStream out = new FileOutputStream(partPath);
                        FileChannel outChannel = out.getChannel()) {
                    // write() may consume only part of the buffer per call.
                    while (encoded.hasRemaining()) {
                        outChannel.write(encoded);
                    }
                }
            }
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Interactive entry point. Reads lines from standard input until {@code exit} is entered,
     * input ends, or a split succeeds; every integer greater than 1 triggers a split attempt.
     *
     * @param args ignored
     * @throws Exception if no text file is present in the working directory
     */
    public static void main(String... args)throws Exception {
        // Fail before prompting when there is nothing to split.
        findTXTFile();

        System.out.println("①输入exit并敲回车,结束程序。");
        System.out.println("②输入大于1的整数(N)并敲回车,将文本分割成N分。");
        try (BufferedReader bufReader = new BufferedReader(new InputStreamReader(System.in))) {
            String line;
            while ((line = bufReader.readLine()) != null) {
                System.out.println("本次输入的内容是:" + line);
                if (line.equalsIgnoreCase("exit")) {
                    break;
                }

                int count;
                try {
                    count = Integer.parseInt(line);
                } catch (NumberFormatException e) {
                    System.out.println("请输入大于1的整数:");
                    continue;
                }
                if (count < 2) {
                    System.out.println("请输入大于1的整数:");
                    continue;
                }

                NUMBER_OF_FILES = count;
                System.out.println("文本将分割成" + NUMBER_OF_FILES + "份");
                long start = System.currentTimeMillis();
                boolean success = split();
                long end = System.currentTimeMillis();
                if (success) {
                    System.out.println("文本分割已完成,耗时(ms)=" + (end - start));
                    break;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

--------------------------------分割线--------------------------------

运行环境:JDK 1.7、1.8

windows可执行文件(*.bat)

TXTSpliterEqualChars.bat,内容如下:
javac -d . -encoding UTF-8 TXTSpliterEqualChars.java

java TXTSpliterEqualChars

:pause

TXTSpliterEqualBytes.bat,内容如下:
javac -d . -encoding UTF-8 TXTSpliterEqualBytes.java

java TXTSpliterEqualBytes

:pause

在同目录下放入待分割的文本文件,然后双击对应的 .bat 文件即可运行。

--------------------------------分割线--------------------------------

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值