用 Java 代码进行文本内容编码格式的转换，并将文本分割成若干个小文本

最新推荐文章于 2023-12-14 15:39:04 发布

高山之巅心有羽翼

最新推荐文章于 2023-12-14 15:39:04 发布

阅读量274

点赞数

分类专栏：编码格式转换文本分割文章标签：用java代码进行文本内容编码格式的转换及文本分割称若干个小

本文链接：https://blog.csdn.net/waitingandhope/article/details/101070059

版权

编码格式转换同时被 2 个专栏收录

1 篇文章 0 订阅

订阅专栏

文本分割

1 篇文章 0 订阅

订阅专栏

项目以maven工程为例

1.相关依赖

            <dependencies>
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>4.8.2</version>
		</dependency>
		<dependency>
			<groupId>commons-logging</groupId>
			<artifactId>commons-logging</artifactId>
			<version>1.1.1</version>
		</dependency>	
    	        <dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-lang3</artifactId>
			<version>3.7</version>
		</dependency>
		<dependency>
			<groupId>commons-io</groupId>
			<artifactId>commons-io</artifactId>
			<version>2.4</version>
		</dependency>
		<!-- Charset detection library; provides the info.monitorenter.cpdetector.io classes imported by SplitEnum -->
		<dependency>
			<groupId>net.sourceforge.cpdetector</groupId>
			<artifactId>cpdetector</artifactId>
			<version>1.0.7</version>
		</dependency>
		<!-- Third-party dependencies required for the encoding detection/conversion -->
		<!-- https://mvnrepository.com/artifact/antlr/antlr -->
		<dependency>
			<groupId>antlr</groupId>
			<artifactId>antlr</artifactId>
			<version>2.7.7</version>
		</dependency>
		<!-- https://mvnrepository.com/artifact/org.mozilla.intl/chardet -->
		<dependency>
			<groupId>org.mozilla.intl</groupId>
			<artifactId>chardet</artifactId>
			<version>1.0</version>
		</dependency>
	</dependencies>

2.启动类

        public class FuctionTest {

	public static void main(String[] args) throws Exception {
		/*
		 * 	0-----为非按行分割,1-----为按行分割
		 * */
		SplitEnum.split_01("0");
		
	}

3.方法主体

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.FilenameFilter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Arrays;
import java.util.HashMap;
import org.apache.commons.lang3.StringUtils;
import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;
import info.monitorenter.cpdetector.io.UnicodeDetector;

/**
 * Split presets plus the static helpers that convert the files of a directory to UTF-8 and cut
 * each of them into numbered part files.
 *
 * <p>Every constant bundles a mode flag, a source directory, a target directory and a part size
 * in bytes. {@link #split_01(String)} is the entry point; it looks the preset up by its flag.
 *
 * <p>The code deliberately sticks to Java 5 level syntax (explicit try/finally, no
 * {@code @Override} on interface implementations), matching the rest of the file.
 */
public enum SplitEnum {

	/** Flag "0": cut each file into parts purely by byte count. */
	F01("0", "E:\\Temp", "E:\\Temp\\tempData\\dirData", 1024 * 1024 * 100),

	/** Flag "1": cut each file into parts on line boundaries. */
	F02("1", "E:\\Temp", "E:\\Temp\\tempData\\dirData", 1024 * 1024 * 100)

	;

	/** Size of the scratch buffer used when copying bytes or characters. */
	private static final int COPY_BUFFER_SIZE = 8192;

	/** Accepts every entry whose name contains a dot, i.e. looks like it has an extension. */
	private static final FilenameFilter DOTTED_NAME_FILTER = new FilenameFilter() {
		public boolean accept(File dir, String name) {
			return name.contains(".");
		}
	};

	// Mode flag of this preset ("0" or "1" for the built-in constants)
	private String splitFlag;
	// Directory holding the files to split
	private String srcFile;
	// Directory receiving the part files
	private String dirPath;
	// Target size of one part, in bytes
	private Integer eachSize;

	private SplitEnum(String splitFlag, String srcFile, String dirPath, Integer eachSize) {
		this.splitFlag = splitFlag;
		this.srcFile = srcFile;
		this.dirPath = dirPath;
		this.eachSize = eachSize;
	}

	/**
	 * Presets keyed by the flag they were constructed with. Filled once by the static
	 * initializer below; later {@link #setSplitFlag(String)} calls do not re-key it.
	 */
	static HashMap<String, SplitEnum> map;

	public String getSplitFlag() {
		return splitFlag;
	}

	public void setSplitFlag(String splitFlag) {
		this.splitFlag = splitFlag;
	}

	public String getSrcFile() {
		return srcFile;
	}

	public void setSrcFile(String srcFile) {
		this.srcFile = srcFile;
	}

	public String getDirPath() {
		return dirPath;
	}

	public void setDirPath(String dirPath) {
		this.dirPath = dirPath;
	}

	public Integer getEachSize() {
		return eachSize;
	}

	public void setEachSize(Integer eachSize) {
		this.eachSize = eachSize;
	}

	/*
	 * Index all constants by their flag.
	 */
	static {
		map = new HashMap<String, SplitEnum>();
		for (SplitEnum splitEnum : SplitEnum.values()) {
			map.put(splitEnum.getSplitFlag(), splitEnum);
		}
	}

	/**
	 * Entry point of the split pipeline; delegates to {@link #getFiles(String)}.
	 *
	 * @param flag mode flag selecting the preset
	 * @throws Exception if converting or splitting a file fails
	 */
	public static void split_01(String flag) throws Exception {
		getFiles(flag);
	}

	/**
	 * Looks up the preset registered for {@code flag} and runs {@link #splitFile} with its
	 * settings. A blank flag is ignored silently; a flag without a preset prints a notice.
	 *
	 * @param flag mode flag selecting the preset
	 * @throws Exception if converting or splitting a file fails
	 */
	public static void getFiles(String flag) throws Exception {
		if (StringUtils.isBlank(flag)) {
			return;
		}

		SplitEnum preset = map.get(flag);
		if (preset == null) {
			System.out.println("无有效信息");
			return;
		}

		splitFile(new File(preset.getSrcFile()), new File(preset.getDirPath()), preset.getEachSize(),
				preset.getSplitFlag());
	}

	/**
	 * Converts and splits every regular file directly inside {@code srcFile} whose name
	 * contains a dot.
	 *
	 * <p>The target directory is created when missing. When the source directory does not
	 * exist it is created as well and the method returns, since there is nothing to split yet.
	 *
	 * @param srcFile  directory holding the files to split
	 * @param dirPath  directory receiving the part files
	 * @param eachSize target size of one part in bytes
	 * @param flag     "0" for byte-wise splitting, anything else for line-wise splitting
	 * @throws Exception if converting or splitting a file fails
	 */
	public static void splitFile(File srcFile, File dirPath, int eachSize, String flag) throws Exception {
		if (!dirPath.isDirectory()) {
			System.out.println("不存在，需创建");
			dirPath.mkdirs();
		}

		if (!srcFile.isDirectory()) {
			srcFile.mkdirs();
			return;
		}

		File[] listFiles = srcFile.listFiles(DOTTED_NAME_FILTER);
		// listFiles() yields null on an I/O error, so test for null before touching length.
		if (listFiles == null || listFiles.length == 0) {
			System.out.println("没有文件");
			return;
		}

		for (File file : listFiles) {
			// A directory such as "backup.old" passes the name filter but cannot be split.
			if (!file.isFile()) {
				continue;
			}
			convertFiles(file);
			splitSameSizes(file, srcFile, dirPath, eachSize, flag);
		}
	}

	/**
	 * Splits one file into parts named {@code <base>_<n><extension>} inside {@code dirPath},
	 * where the extension is everything from the last dot of the source name.
	 *
	 * <p>The number of parts is the source size divided by {@code eachSize}, rounded up. With
	 * flag "0" the bytes are copied verbatim, {@code eachSize} bytes per part. With any other
	 * flag whole lines are copied; a part is closed once it has grown beyond {@code eachSize}
	 * bytes and the last part takes all remaining lines.
	 *
	 * @param file     file to split
	 * @param srcFile  source directory; not used by this method, kept for callers
	 * @param dirPath  directory receiving the part files
	 * @param eachSize target size of one part in bytes, must be positive
	 * @param flag     "0" for byte-wise splitting, anything else for line-wise splitting
	 * @throws IllegalArgumentException if {@code eachSize} is not positive
	 * @throws Exception                if reading the source or writing a part fails
	 */
	public static void splitSameSizes(File file, File srcFile, File dirPath, int eachSize, String flag)
			throws Exception {
		if (eachSize <= 0) {
			throw new IllegalArgumentException("eachSize must be positive: " + eachSize);
		}

		// Ceiling division in long arithmetic, so large sources do not overflow an int.
		long fileNumber = (file.length() + eachSize - 1) / eachSize;

		// Cut at the LAST dot so "a.b.txt" keeps its full base name and "a." does not fail.
		String sourceName = file.getName();
		int lastDot = sourceName.lastIndexOf('.');
		String baseName = lastDot < 0 ? sourceName : sourceName.substring(0, lastDot);
		String extension = lastDot < 0 ? "" : sourceName.substring(lastDot);

		if ("0".equals(flag)) {
			splitByBytes(file, dirPath, eachSize, fileNumber, baseName, extension);
		} else {
			splitByLines(file, dirPath, eachSize, fileNumber, baseName, extension);
		}
	}

	/**
	 * Streams the source into consecutive parts of {@code eachSize} bytes (the last one may be
	 * shorter). Bytes are copied as-is, so a multi-byte character may straddle two parts.
	 */
	private static void splitByBytes(File file, File dirPath, int eachSize, long fileNumber, String baseName,
			String extension) throws Exception {
		byte[] buffer = new byte[Math.min(eachSize, COPY_BUFFER_SIZE)];
		InputStream source = new FileInputStream(file);
		try {
			for (long i = 0; i < fileNumber; i++) {
				String fileName = baseName + "_" + (i + 1) + extension;
				System.out.println(fileName);
				File fi = new File(dirPath, fileName);

				long written = 0;
				OutputStream part = new FileOutputStream(fi);
				try {
					while (written < eachSize) {
						int wanted = (int) Math.min((long) buffer.length, eachSize - written);
						int read = source.read(buffer, 0, wanted);
						if (read < 0) {
							break;
						}
						part.write(buffer, 0, read);
						written += read;
					}
				} finally {
					part.close();
				}

				String fileEncode = getFileEncode(fi.getAbsolutePath());
				System.out.println(fileEncode);
				System.out.printf("输出子文件 %s,其大小是 %d,每个的大小是%d\n", fi.getAbsoluteFile(), fi.length(),
						written);
			}
		} finally {
			source.close();
		}
	}

	/**
	 * Copies whole lines into consecutive parts. The size of a part is tracked by counting the
	 * encoded bytes handed to the writer, because the length of the file on disk lags behind
	 * while the writer is buffering. No part file is created once the source has run out of
	 * lines.
	 */
	private static void splitByLines(File file, File dirPath, int eachSize, long fileNumber, String baseName,
			String extension) throws Exception {
		System.out.println("***************文本按行分割开始**************************");

		Charset charset = detectCharsetOrDefault(file);
		String charsetName = charset.name();
		int lineBreakBytes = System.getProperty("line.separator").getBytes(charsetName).length;

		BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset));
		try {
			String line = reader.readLine();
			for (long i = 0; i < fileNumber && line != null; i++) {
				String fileName = baseName + "_" + (i + 1) + extension;
				System.out.println(fileName);
				File fi = new File(dirPath, fileName);
				boolean lastPart = i == fileNumber - 1;

				long written = 0;
				BufferedWriter writer = new BufferedWriter(
						new OutputStreamWriter(new FileOutputStream(fi), charset));
				try {
					while (line != null) {
						writer.write(line);
						writer.newLine();
						written += line.getBytes(charsetName).length + lineBreakBytes;
						line = reader.readLine();
						if (written > eachSize && !lastPart) {
							break;
						}
					}
				} finally {
					// Closing flushes the buffer; this must happen for the last part as well.
					writer.close();
				}
				System.out.println("拆分的子文本:" + fi.getAbsolutePath());
			}
		} finally {
			reader.close();
		}

		System.out.println("***************文本按行分割结束**************************");
	}

	/**
	 * Returns the charset detected for {@code file}, or the platform default when detection
	 * fails or names a charset this JVM does not support.
	 */
	private static Charset detectCharsetOrDefault(File file) {
		String detected = getFileEncode(file.getAbsolutePath());
		if (detected != null && Charset.isSupported(detected)) {
			return Charset.forName(detected);
		}
		return Charset.defaultCharset();
	}

	/**
	 * Rewrites {@code fi} in place as UTF-8 when its detected charset is something else.
	 *
	 * <p>Nothing happens when the charset cannot be detected, is already UTF-8, or is not
	 * supported by this JVM.
	 *
	 * @param fi file to convert
	 * @throws Exception if reading or rewriting the file fails
	 */
	public static void convertFiles(File fi) throws Exception {
		String charset = getFileEncode(fi.getAbsolutePath());
		// getFileEncode returns null when detection fails; there is nothing to convert then.
		if (charset == null || charset.equals("UTF-8")) {
			return;
		}
		if (Charset.isSupported(charset)) {
			convert(fi.getAbsolutePath(), charset, "UTF-8", DOTTED_NAME_FILTER);
		}
	}

	/**
	 * Holds the configured detector. Registration happens once, on first use, because
	 * {@code CodepageDetectorProxy.getInstance()} hands out a shared instance and registering
	 * detectors on every call would repeat the same registrations against it.
	 */
	private static final class DetectorHolder {

		static final CodepageDetectorProxy DETECTOR = createDetector();

		private DetectorHolder() {
		}

		private static CodepageDetectorProxy createDetector() {
			CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
			// Detectors are consulted in registration order.
			// ParsingDetector: for HTML/XML style content; "false" turns off verbose output.
			detector.add(new ParsingDetector(false));
			// JChardetFacade: wraps Mozilla's jchardet (needs antlr.jar and chardet.jar).
			detector.add(JChardetFacade.getInstance());
			// ASCIIDetector: detection of ASCII content.
			detector.add(ASCIIDetector.getInstance());
			// UnicodeDetector: detection of the Unicode family of encodings.
			detector.add(UnicodeDetector.getInstance());
			return detector;
		}
	}

	/**
	 * Detects the charset of a file with the third-party cpdetector library.
	 *
	 * <p>Detection is heuristic, so the result is not guaranteed to be correct. A failure is
	 * reported on standard error and signalled by a {@code null} return value.
	 *
	 * @param path path of the file whose charset is to be detected
	 * @return the name of the detected charset, or {@code null} if detection failed
	 * @author huanglei
	 * @version 2012-7-12 14:05
	 */
	public static String getFileEncode(String path) {
		java.nio.charset.Charset charset = null;
		File f = new File(path);
		try {
			charset = DetectorHolder.DETECTOR.detectCodepage(f.toURI().toURL());
		} catch (Exception ex) {
			ex.printStackTrace();
		}
		return charset != null ? charset.name() : null;
	}

	/**
	 * Converts the given file, or every file below the given directory, to another charset.
	 *
	 * @param fileName        path of the file or directory to convert
	 * @param fromCharsetName charset the source is currently encoded in
	 * @param toCharsetName   charset to convert to
	 * @throws Exception if a charset is unsupported or reading/writing fails
	 */
	public static void convert(String fileName, String fromCharsetName, String toCharsetName) throws Exception {
		convert(new File(fileName), fromCharsetName, toCharsetName, null);
	}

	/**
	 * Converts the given file, or every file below the given directory, to another charset.
	 *
	 * @param file            file or directory to convert
	 * @param fromCharsetName charset the source is currently encoded in
	 * @param toCharsetName   charset to convert to
	 * @throws Exception if a charset is unsupported or reading/writing fails
	 */
	public static void convert(File file, String fromCharsetName, String toCharsetName) throws Exception {
		convert(file, fromCharsetName, toCharsetName, null);
	}

	/**
	 * Converts the given file, or the accepted files below the given directory, to another
	 * charset.
	 *
	 * @param fileName        path of the file or directory to convert
	 * @param fromCharsetName charset the source is currently encoded in
	 * @param toCharsetName   charset to convert to
	 * @param filter          name filter selecting the entries to process; {@code null}
	 *                        accepts everything
	 * @throws Exception if a charset is unsupported or reading/writing fails
	 */
	public static void convert(String fileName, String fromCharsetName, String toCharsetName, FilenameFilter filter)
			throws Exception {
		convert(new File(fileName), fromCharsetName, toCharsetName, filter);
	}

	/**
	 * Converts the given file, or the accepted files below the given directory, to another
	 * charset. Files are rewritten in place.
	 *
	 * <p>For a directory the filter is applied to its listing, so sub-directories are only
	 * descended into when their own name is accepted.
	 *
	 * @param file            file or directory to convert
	 * @param fromCharsetName charset the source is currently encoded in
	 * @param toCharsetName   charset to convert to
	 * @param filter          name filter selecting the entries to process; {@code null}
	 *                        accepts everything
	 * @throws Exception if a charset is unsupported or reading/writing fails
	 */
	public static void convert(File file, String fromCharsetName, String toCharsetName, FilenameFilter filter)
			throws Exception {
		if (file.isDirectory()) {
			File[] fileList = filter == null ? file.listFiles() : file.listFiles(filter);
			// listFiles() yields null when the directory cannot be read.
			if (fileList == null) {
				return;
			}
			for (File f : fileList) {
				convert(f, fromCharsetName, toCharsetName, filter);
			}
			return;
		}

		if (filter == null || filter.accept(file.getParentFile(), file.getName())) {
			String fileContent = getFileContentFromCharset(file, fromCharsetName);
			saveFile2Charset(file, toCharsetName, fileContent);
		}
	}

	/**
	 * Reads a whole file, decoding it with the given charset.
	 *
	 * <p>The content is returned exactly as decoded, including leading and trailing
	 * whitespace, so that a conversion does not alter the text.
	 *
	 * @param file            file to read
	 * @param fromCharsetName charset the file is encoded in
	 * @return the decoded content of the file
	 * @throws UnsupportedCharsetException if the charset is not supported by this JVM
	 * @throws Exception                   if reading fails
	 */
	public static String getFileContentFromCharset(File file, String fromCharsetName) throws Exception {
		if (!Charset.isSupported(fromCharsetName)) {
			throw new UnsupportedCharsetException(fromCharsetName);
		}

		StringBuilder content = new StringBuilder();
		InputStream inputStream = new FileInputStream(file);
		try {
			InputStreamReader reader = new InputStreamReader(inputStream, fromCharsetName);
			char[] buffer = new char[COPY_BUFFER_SIZE];
			int read;
			// A single read() may deliver fewer chars than requested, so loop until EOF.
			while ((read = reader.read(buffer)) != -1) {
				content.append(buffer, 0, read);
			}
		} finally {
			inputStream.close();
		}
		return content.toString();
	}

	/**
	 * Writes text to a file in the given charset, replacing any existing content.
	 *
	 * @param file          file to write
	 * @param toCharsetName charset to encode the text in
	 * @param content       text to write
	 * @throws UnsupportedCharsetException if the charset is not supported by this JVM
	 * @throws Exception                   if writing fails
	 */
	public static void saveFile2Charset(File file, String toCharsetName, String content) throws Exception {
		if (!Charset.isSupported(toCharsetName)) {
			throw new UnsupportedCharsetException(toCharsetName);
		}

		// The charset was validated above, so the writer constructor cannot reject it and
		// leave the stream open.
		OutputStreamWriter outWrite = new OutputStreamWriter(new FileOutputStream(file), toCharsetName);
		try {
			outWrite.write(content);
		} finally {
			// close() (not just flush()) lets the encoder emit any trailing bytes.
			outWrite.close();
		}
	}
}

高山之巅心有羽翼

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
用 Java 代码进行文本内容编码格式的转换，并将文本分割成若干个小文本

项目以maven工程为例相关依赖 <dependencies> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <version>4.8.2</version>...
复制链接

扫一扫

专栏目录