java检测文件编码——cpdetector

最新推荐文章于 2023-03-22 15:02:59 发布

走慢一点点

最新推荐文章于 2023-03-22 15:02:59 发布

阅读量1.5w

点赞数 4

分类专栏： JAVA 文章标签：编码

本文链接：https://blog.csdn.net/wuseyukui/article/details/45799207

版权

JAVA 专栏收录该内容

31 篇文章 1 订阅

订阅专栏

源码下载：
http://jaist.dl.sourceforge.net/project/cpdetector/cpdetector/sources/cpdetector_eclipse_project_1.0.10.zip
jar包下载：
https://sourceforge.net/projects/cpdetector/?source=typ_redirect

cpdetector是一个可以自动检测文本编码格式的项目。

detector按照“谁最先返回非空的探测结果，就以该结果为准”的原则返回探测到的字符集编码。
使用需要用到三个第三方JAR包：antlr.jar、chardet.jar和cpdetector.jar
cpDetector是基于统计学原理的，不保证完全正确。

以下是一个简单的servlet示例：读取xxx.txt文件中的内容，并以html的方式返回给浏览器。在实现的过程中，遇到的最大问题是浏览器打开后中文乱码，原因是.txt文件保存时的编码不统一，所以在“out.println(new String(buffer, charset));”时charset不能写死，而应该通过某种途径获取.txt文件的编码格式。获取的方式网上主要有以下三种，亲测第三种解决了问题，第一、第二种方法都不完善。

package com.hwc.a.servlet;

import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;
import info.monitorenter.cpdetector.io.UnicodeDetector;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

public class TxtToHtmlServlet extends HttpServlet {
    private static final long serialVersionUID = 1L;

    public void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        String path = request.getParameter("path");
        if (path != null && !"".equals(path)) {
            // TODO 开始下载
            path = new String(path.getBytes("ISO-8859-1"), "utf-8");
            InputStream fis = null;
            PrintWriter out = null;
            try {
                // path是指欲下载的文件的路径。
                // File file = new File(request.getRealPath("/")+"/"+path);
                File file = new File(path);
                // 取得文件名。
                String filename = file.getName();
                // 取得文件的后缀名。
                filename = filename.substring(0, filename.lastIndexOf("."));

                // 以流的形式下载文件。
                fis = new BufferedInputStream(new FileInputStream(file));
                byte[] buffer = new byte[fis.available()];
                fis.read(buffer);
                // 清空response
                response.reset();

                String charset = getFileEncode(path);
                System.out.println("============getFileEncode charset:" + charset);
                if (charset == null) {
                    charset = getCharset(path);
                    System.out.println("============getCharset charset:" + charset);
                }

                response.setHeader("Content-type", "text/html;charset="+ charset);
                response.setContentType("text/html;charset=" + charset);
                out = response.getWriter();
                out.println(new String(buffer, charset));
                out.flush();
            } catch (IOException ex) {
                ex.printStackTrace();
            } finally {
                if (fis != null) {
                    fis.close();
                }
                if (out != null) {
                    out.close();
                }
            }
        }
    }


    /**
     * 方法一： 仅作参考，不准确
     * @param fileName
     * @return
     * @throws IOException
     */
    private String getCharset(String fileName) throws IOException {

        BufferedInputStream bin = new BufferedInputStream(new FileInputStream(
                fileName));
        int p = (bin.read() << 8) + bin.read();

        String code = null;

        switch (p) {
        case 0xefbb:
            code = "UTF-8";
            break;
        case 0xfffe:
            code = "Unicode";
            break;
        case 0xfeff:
            code = "UTF-16BE";
            break;
        default:
            code = "GB2312";
        }
        return code;
    }

    /**
     * 方法二： 仅作参考，不准确
     * @param head
     * @return
     */
    private String codetype(byte[] head) {
        byte[] codehead = new byte[4];
        // 截取数组
        System.arraycopy(head, 0, codehead, 0, 4);
        String code = "";
        if (head[0] == -1 && head[1] == -2) {
            code = "UTF-16";
        } else if (head[0] == -2 && head[1] == -1) {
            code = "Unicode";
        } else if (head[0] == -17 && head[1] == -69 && head[2] == -65)
            code = "UTF-8";
        else {
            code = "gb2312";
        }
        return code;
    }

    /**
     * 方法三：比较准确，解决了实际问题
     * @param filePath
     * @return
     */
    public static String getFileEncode(String filePath) {
        String charsetName = null;
        try {
            File file = new File(filePath);
            CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
            detector.add(new ParsingDetector(false));
            detector.add(JChardetFacade.getInstance());
            detector.add(ASCIIDetector.getInstance());
            detector.add(UnicodeDetector.getInstance());
            java.nio.charset.Charset charset = null;
            charset = detector.detectCodepage(file.toURI().toURL());
            if (charset != null) {
                charsetName = charset.name();
            } else {
                charsetName = "UTF-8";
            }
        } catch (Exception ex) {
            ex.printStackTrace();
            return null;
        }
        return charsetName;
    }

    public void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        doGet(request, response);
    }
}

走慢一点点

关注

4
点赞
踩
12

收藏

觉得还不错? 一键收藏
3
评论
java检测文件编码——cpdetector

cpdetector一个可以自动检测文本编码格式的项目detector按照“谁最先返回非空的探测结果，就以该结果为准”的原则返回探测到的字符集编码。使用需要用到三个第三方JAR包：antlr.jar、chardet.jar和cpdetector.jar cpDetector是基于统计学原理的，不保证完全正确。以下是读取xxx.txt文件中的内容，以html的方式返回给浏览器的简单serv
复制链接

扫一扫