Java 实现获取文本文件(TXT 以及 ZIP 压缩包内文件)的字符编码,解决 ZIP 文件读取乱码

2 篇文章 0 订阅

获取file文件字符编码

package com.file;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;

/**
 * Reads a text file after guessing its character encoding.
 *
 * <p>The guess is based on a byte order mark (BOM) at the start of the file and,
 * when there is none, on whether the whole file is valid UTF-8 containing at
 * least one non-ASCII character. Everything else is treated as GBK.
 */
public class CharsetCodeTest {

	/** Charset name returned when nothing else is recognised (GBK/GB2312, i.e. "ANSI"). */
	private static final String DEFAULT_CHARSET_NAME = "GBK";

	/** The character a byte order mark decodes to (U+FEFF). */
	private static final char BOM_CHAR = (char) 0xFEFF;

	public static void main(String[] args) throws Exception {

		String filePath = "D:\\1.txt";
		String content = readTxt(filePath);
		System.out.println(content);
	}

	/**
	 * Reads the whole file as text using the charset reported by
	 * {@link #getFileCharsetName(String)}.
	 *
	 * <p>Lines are joined with {@link System#lineSeparator()}; a line terminator
	 * after the last line is not reproduced. A leading BOM character is dropped so
	 * that it does not end up in the returned text.
	 *
	 * @param path path of the file to read
	 * @return the file content; on failure the error is printed and whatever was
	 *         read up to that point (possibly the empty string) is returned
	 */
	public static String readTxt(String path) {
		StringBuilder content = new StringBuilder();
		try {
			String fileCharsetName = getFileCharsetName(path);
			System.out.println("文件的编码格式为:" + fileCharsetName);
			// The stream is declared separately so it is closed even when the
			// reader cannot be created (e.g. unsupported charset name).
			try (InputStream is = new FileInputStream(path);
					BufferedReader br = new BufferedReader(new InputStreamReader(is, fileCharsetName))) {
				String line;
				boolean isFirst = true;
				while ((line = br.readLine()) != null) {
					if (isFirst) {
						isFirst = false;
						// The UTF-8 decoder keeps the BOM as a regular character.
						if (!line.isEmpty() && line.charAt(0) == BOM_CHAR) {
							line = line.substring(1);
						}
					} else {
						content.append(System.lineSeparator());
					}
					content.append(line);
				}
			}
		} catch (Exception e) {
			// Best effort by design: report the problem and return what we have.
			e.printStackTrace();
			System.err.println("读取文件:" + path + "失败!");
		}
		return content.toString();
	}

	/**
	 * Guesses the charset name of a file.
	 *
	 * <ul>
	 * <li>bytes FF FE at the start: {@code "UTF-16"}</li>
	 * <li>bytes FE FF at the start: {@code "Unicode"}</li>
	 * <li>bytes EF BB BF at the start (UTF-8 BOM): {@code "UTF-8"}</li>
	 * <li>no BOM, but the whole file decodes as strict UTF-8 and contains a
	 * non-ASCII character: {@code "UTF-8"}</li>
	 * <li>anything else, including pure ASCII and empty files: {@code "GBK"}</li>
	 * </ul>
	 *
	 * <p>The chosen name is also printed to standard output. For files without a
	 * BOM the whole file is read once to validate it.
	 *
	 * @param fileName path of the file to inspect
	 * @return the guessed charset name
	 * @throws IOException if the file cannot be opened or read
	 */
	public static String getFileCharsetName(String fileName) throws IOException {
		byte[] head = new byte[3];
		try (InputStream inputStream = new FileInputStream(fileName)) {
			// A single read() may return fewer bytes than requested, so keep
			// reading until the header is full or the file ends. Bytes that are
			// never filled stay 0 and therefore match no BOM.
			int filled = 0;
			while (filled < head.length) {
				int n = inputStream.read(head, filled, head.length - filled);
				if (n < 0) {
					break;
				}
				filled += n;
			}
		}

		String charsetName;
		if (head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
			charsetName = "UTF-16";
		} else if (head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
			// Kept for backward compatibility with existing callers.
			// NOTE(review): "Unicode" appears to be a JDK alias for UTF-16 - confirm on the target runtime.
			charsetName = "Unicode";
		} else if (head[0] == (byte) 0xEF && head[1] == (byte) 0xBB && head[2] == (byte) 0xBF) {
			charsetName = "UTF-8";
		} else if (isNonAsciiUtf8(fileName)) {
			charsetName = "UTF-8";
		} else {
			charsetName = DEFAULT_CHARSET_NAME;
		}
		System.out.println(charsetName);
		return charsetName;
	}

	/**
	 * Tells whether the file is well-formed UTF-8 that contains at least one
	 * non-ASCII character. Pure ASCII yields {@code false} so such files keep
	 * the default charset.
	 */
	private static boolean isNonAsciiUtf8(String fileName) throws IOException {
		// REPORT makes the decoder throw on invalid input instead of replacing it.
		CharsetDecoder strictUtf8 = StandardCharsets.UTF_8.newDecoder()
				.onMalformedInput(CodingErrorAction.REPORT)
				.onUnmappableCharacter(CodingErrorAction.REPORT);
		boolean sawNonAscii = false;
		try (InputStream in = new FileInputStream(fileName);
				InputStreamReader reader = new InputStreamReader(in, strictUtf8)) {
			char[] buffer = new char[8192];
			int n;
			// Read to the end even after a non-ASCII character was seen: a later
			// invalid sequence must still disqualify the file.
			while ((n = reader.read(buffer)) != -1) {
				for (int i = 0; !sawNonAscii && i < n; i++) {
					sawNonAscii = buffer[i] > 0x7F;
				}
			}
		} catch (CharacterCodingException e) {
			return false;
		}
		return sawNonAscii;
	}
}

获取zip文件编码格式(解决读取乱码)

package com.file;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;

import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;
import info.monitorenter.cpdetector.io.UnicodeDetector;
/**
 * Detects the character encoding of a stream with the third-party cpdetector library.
 * <ol>
 * <li>cpdetector ships several detector implementations (ParsingDetector,
 * JChardetFacade, ASCIIDetector, UnicodeDetector) that are registered through
 * {@code add}.</li>
 * <li>The proxy uses the result of the first detector that returns a non-null
 * answer.</li>
 * <li>cpdetector works statistically, so the result is not guaranteed to be
 * correct.</li>
 * </ol>
 */
public class FileCharsetDetector {

    /** Charset name returned when detection fails or yields nothing. */
    private static final String DEFAULT_CHARSET_NAME = "GBK";

    /**
     * Detects the encoding of the given stream with cpdetector.
     *
     * <p>The stream is consumed and closed by this method; callers that want to
     * read the content afterwards must open a fresh stream. The detected name is
     * also printed to standard output.
     *
     * @param is stream to inspect; it is closed before this method returns
     * @return the detected charset name, {@code "ISO_8859_1"} when US-ASCII was
     *         detected, or {@code "GBK"} when nothing could be detected
     */
    public static String getFileEncode(InputStream is) {
        // Wrapping in a BufferedInputStream is the key step for zip entry streams.
        // NOTE(review): presumably because detection needs mark/reset support - confirm against cpdetector docs.
        BufferedInputStream bis = (is instanceof BufferedInputStream)
                ? (BufferedInputStream) is
                : new BufferedInputStream(is);

        CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
        detector.add(new ParsingDetector(false));
        detector.add(UnicodeDetector.getInstance());
        detector.add(JChardetFacade.getInstance()); // uses classes from chardet.jar internally
        detector.add(ASCIIDetector.getInstance());

        Charset charset = null;
        try (BufferedInputStream in = bis) {
            // Passing Integer.MAX_VALUE as the length is the key call for zip content.
            charset = detector.detectCodepage(in, Integer.MAX_VALUE);
        } catch (Exception e) {
            // Best effort: keep the fallback charset, but do not hide the failure.
            System.err.println("Charset detection failed, falling back to "
                    + DEFAULT_CHARSET_NAME + ": " + e);
        }

        String charsetName = DEFAULT_CHARSET_NAME;
        if (charset != null) {
            charsetName = "US-ASCII".equals(charset.name()) ? "ISO_8859_1" : charset.name();
        }
        System.out.println(charsetName);
        return charsetName;
    }

    /**
     * Prints every entry of a hard-coded zip file line by line, decoding each
     * entry with the charset detected for it.
     *
     * @param args unused
     * @throws IOException if the zip file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        String path = "D:\\20210827092009_erqi.zip";
        File file = new File(path);
        // One ZipFile for the whole run instead of a new, never-closed one per entry.
        try (ZipFile zipFile = new ZipFile(file);
                ZipInputStream zipInput = new ZipInputStream(new FileInputStream(file))) {
            ZipEntry zipEntry;
            while ((zipEntry = zipInput.getNextEntry()) != null) {
                // Detection consumes and closes its stream, so the entry is opened a second time for reading.
                String encode = getFileEncode(zipFile.getInputStream(zipEntry));
                try (BufferedReader bufferRead = new BufferedReader(
                        new InputStreamReader(zipFile.getInputStream(zipEntry), encode))) {
                    String strInsert;
                    while ((strInsert = bufferRead.readLine()) != null) {
                        System.out.println(strInsert);
                    }
                }
            }
        }
    }
}

jar 包下载地址:https://download.csdn.net/download/m0_37987151/21615417

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值