java 检测文件类型

最新推荐文章于 2024-04-19 18:23:25 发布

历经沧桑的少年

最新推荐文章于 2024-04-19 18:23:25 发布

阅读量323

点赞数

分类专栏：笔记　文章标签：java

本文链接：https://blog.csdn.net/qq_40530899/article/details/93746887

版权

package com.thinkgem.jeesite.modules.utils;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.net.URL;

public class EncodingDetect {
// Encoding identifiers. Each value is used as an index into the codings and
// nicename arrays and into the per-encoding score array built by
// detectEncoding(byte[]); the detectEncoding methods return one of them.
private static final int GB2312 = 0;
private static final int GBK = 1;
private static final int HZ = 2;
private static final int BIG5 = 3;
private static final int EUC_TW = 4;
private static final int ISO_2022_CN = 5;
private static final int UTF8 = 6;
private static final int UNICODE = 7;
private static final int ASCII = 8;
private static final int OTHER = 9;

// Number of identifiers above; sizes every array indexed by them.
private static final int TOTAL_ENCODINGS = 10;

// Frequency tables to hold the GB, GBK, Big5, and EUC-TW character
// frequencies. They are allocated in the constructor and indexed as
// [row][column], where row and column are derived from the lead and trail
// byte of a two-byte character.
private int GBFreq[][];
private int GBKFreq[][];
private int Big5Freq[][];
private int EUC_TWFreq[][];
// int UnicodeFreq[94][128];

// Display names and identifier strings, indexed by the encoding constants.
// Both arrays are assigned by the constructor, so they remain null until an
// EncodingDetect instance has been created.
public static String[] nicename;
public static String[] codings;

public EncodingDetect() {
    // Initialize the Frequency Table for GB, Big5, EUC-TW
    GBFreq = new int[94][94];
    GBKFreq = new int[126][191];
    Big5Freq = new int[94][158];
    EUC_TWFreq = new int[94][94];

    codings = new String[TOTAL_ENCODINGS];
    codings[GB2312] = "GB2312";
    codings[GBK] = "GBK";
    codings[HZ] = "HZ";
    codings[BIG5] = "BIG5";
    codings[EUC_TW] = "CNS11643";
    codings[ISO_2022_CN] = "ISO2022CN";
    codings[UTF8] = "UTF8";
    codings[UNICODE] = "Unicode";
    codings[ASCII] = "ASCII";
    codings[OTHER] = "OTHER";

    nicename = new String[TOTAL_ENCODINGS];
    nicename[GB2312] = "GB2312";
    nicename[GBK] = "GBK";
    nicename[HZ] = "HZ";
    nicename[BIG5] = "Big5";
    nicename[EUC_TW] = "CNS 11643";
    nicename[ISO_2022_CN] = "ISO 2022-CN";
    nicename[UTF8] = "UTF-8";
    nicename[UNICODE] = "Unicode";
    nicename[ASCII] = "ASCII";
    nicename[OTHER] = "OTHER";

    initialize_frequencies();
}

/**
 * Function : detectEncoding
 * Arguments: URL
 * Returns : One of the encoding constants (GB2312, GBK, HZ, BIG5, EUC_TW,
 * ISO_2022_CN, UTF8, UNICODE, ASCII, or OTHER)
 * Description: This function reads up to 10000 bytes of the URL contents
 * and assigns them a probability score for each encoding type. The encoding
 * type with the highest score is returned; OTHER is returned when the URL
 * cannot be read.
 */

public int detectEncoding(URL testurl) {
    // Only the first 10000 bytes of the resource are sampled. The buffer is
    // handed to the detector at its full length, so any unread tail stays
    // zero-filled.
    byte[] rawtext = new byte[10000];
    int bytesread = 0, byteoffset = 0;
    int guess = OTHER;
    InputStream chinesestream;

    try {
        chinesestream = testurl.openStream();
        try {
            // read() may deliver the content in several pieces; keep going
            // until the buffer is full or the stream is exhausted.
            while (byteoffset < rawtext.length
                    && (bytesread = chinesestream.read(rawtext, byteoffset,
                            rawtext.length - byteoffset)) > 0) {
                byteoffset += bytesread;
            }
        } finally {
            // Release the connection even when read() fails. A failure in
            // close() still reaches the catch below.
            chinesestream.close();
        }
        guess = detectEncoding(rawtext);

    } catch (Exception e) {
        // Best effort: report the problem and fall back to OTHER.
        System.err.println("Error loading or using URL " + e.toString());
        guess = OTHER;
    }

    return guess;
}

/**
 * Function : detectEncoding
 * Arguments: File
 * Returns : One of the encoding constants (GB2312, GBK, HZ, BIG5, EUC_TW,
 * ISO_2022_CN, UTF8, UNICODE, ASCII, or OTHER)
 * Description: This function reads the file into memory and assigns its
 * bytes a probability score for each encoding type. The encoding type with
 * the highest score is returned.
 */

public int detectEncoding(File testfile) {
    FileInputStream chinesefile = null;
    byte[] rawtext;

    // The whole file is loaded into memory; the length is narrowed to int,
    // so files of 2 GB or more are not supported.
    rawtext = new byte[(int) testfile.length()];
    try {
        chinesefile = new FileInputStream(testfile);
        // A single read() is allowed to return fewer bytes than requested,
        // so keep reading until the buffer is full or the file ends.
        int offset = 0;
        int count;
        while (offset < rawtext.length
                && (count = chinesefile.read(rawtext, offset,
                        rawtext.length - offset)) > 0) {
            offset += count;
        }
    } catch (Exception e) {
        // Best effort: report the problem and score whatever was read.
        System.err.println("Error: " + e);
    } finally {
        // Always release the file handle.
        if (chinesefile != null) {
            try {
                chinesefile.close();
            } catch (Exception e) {
                System.err.println("Error: " + e);
            }
        }
    }

    return detectEncoding(rawtext);
}

/**
 * Function : detectEncoding
 * Arguments: byte array
 * Returns : One of the encoding constants (GB2312, GBK, HZ, BIG5, EUC_TW,
 * ISO_2022_CN, UTF8, UNICODE, ASCII, or OTHER)
 * Description: This function looks at the byte array and assigns it a
 * probability score for each encoding type. The encoding type with the
 * highest score is returned, or OTHER when no score is above 50.
 */

public int detectEncoding(byte[] rawtext) {
    // One score per encoding constant; the OTHER slot is pinned to zero.
    final int[] scores = new int[TOTAL_ENCODINGS];
    scores[GB2312] = gb2312_probability(rawtext);
    scores[GBK] = gbk_probability(rawtext);
    scores[HZ] = hz_probability(rawtext);
    scores[BIG5] = big5_probability(rawtext);
    scores[EUC_TW] = euc_tw_probability(rawtext);
    scores[ISO_2022_CN] = iso_2022_cn_probability(rawtext);
    scores[UTF8] = utf8_probability(rawtext);
    scores[UNICODE] = utf16_probability(rawtext);
    scores[ASCII] = ascii_probability(rawtext);
    scores[OTHER] = 0;

    // Pick the highest score; on a tie the lowest-numbered encoding wins
    // because only a strictly greater score replaces the current best.
    int winner = OTHER;
    int top = 0;
    for (int candidate = 0; candidate < scores.length; candidate++) {
        final int score = scores[candidate];
        if (score > top) {
            top = score;
            winner = candidate;
        }
    }

    // A best score of 50 or less is not trusted.
    return top > 50 ? winner : OTHER;
}

/*
 * Function: gb2312_probability Argument: pointer to byte array Returns :
 * number from 0 to 100 representing probability text in array uses GB-2312
 * encoding
 */

int gb2312_probability(byte[] rawtext) {
    // The last byte is never inspected as a lead byte because a two-byte
    // character needs a trail byte after it.
    final int limit = rawtext.length - 1;

    // Counters start at 1 (and the frequency ceiling at 1) so the ratios
    // below never divide by zero.
    int highByteChars = 1;
    int inRangeChars = 1;
    long freqSum = 0;
    long freqCeiling = 1;

    int pos = 0;
    while (pos < limit) {
        final int lead = rawtext[pos] & 0xFF;
        if (lead < 0x80) {
            // 7-bit byte: not part of a GB2312 two-byte character.
            pos++;
            continue;
        }

        highByteChars++;
        final int trail = rawtext[pos + 1] & 0xFF;
        final boolean leadOk = 0xA1 <= lead && lead <= 0xF7;
        final boolean trailOk = 0xA1 <= trail && trail <= 0xFE;
        if (leadOk && trailOk) {
            inRangeChars++;
            freqCeiling += 500;
            final int r = lead - 0xA1;
            final int c = trail - 0xA1;
            final int weight = GBFreq[r][c];
            if (weight != 0) {
                freqSum += weight;
            } else if (15 <= r && r < 55) {
                // No table entry: rows 15..54 get a flat default weight.
                freqSum += 200;
            }
        }
        // A high-bit byte always consumes its trail byte as well.
        pos += 2;
    }

    // Half of the score comes from the share of high-bit characters that
    // fall in the GB2312 ranges, half from their frequency weights.
    final float rangeShare = 50 * ((float) inRangeChars / (float) highByteChars);
    final float freqShare = 50 * ((float) freqSum / (float) freqCeiling);

    return (int) (rangeShare + freqShare);
}

/*
 * Function: gbk_probability Argument: byte array Returns : number from 0 to
 * 100 representing probability text in array uses GBK encoding
 */

int gbk_probability(byte[] rawtext) {
    // Scores how well the bytes fit GBK. Half of the score is the share of
    // high-bit characters whose lead/trail bytes fall in a valid range, the
    // other half is derived from the frequency tables. The result is
    // reduced by one so that plain GB2312 text prefers the GB2312 score.
    int i, rawtextlen;

    // Counters start at 1 (totalfreq too) so the ratios never divide by 0.
    int dbchars = 1, gbchars = 1;
    long gbfreq = 0, totalfreq = 1;
    float rangeval, freqval;
    int row, column;

    // Check to see if characters fit into acceptable ranges. The last byte
    // is never used as a lead byte because it has no trail byte.
    rawtextlen = rawtext.length;
    for (i = 0; i < rawtextlen - 1; i++) {
        if (rawtext[i] >= 0) {
            // 7-bit byte: not part of a two-byte character.
            continue;
        }

        dbchars++;
        final int lead = rawtext[i] & 0xFF;
        final int trail = rawtext[i + 1] & 0xFF;

        if (0xA1 <= lead && lead <= 0xF7
                && 0xA1 <= trail && trail <= 0xFE) {
            // Original GB range: scored with the GB2312 table.
            gbchars++;
            totalfreq += 500;
            row = lead - 0xA1;
            column = trail - 0xA1;
            if (GBFreq[row][column] != 0) {
                gbfreq += GBFreq[row][column];
            } else if (15 <= row && row < 55) {
                // No table entry: rows 15..54 get a flat default weight.
                gbfreq += 200;
            }
        } else if (0x81 <= lead && lead <= 0xFE
                && ((0x40 <= trail && trail <= 0x7E)
                        || (0x80 <= trail && trail <= 0xFE))) {
            // Extended GB range: scored with the GBK table.
            gbchars++;
            totalfreq += 500;
            row = lead - 0x81;
            // Both trail-byte ranges share the 0x40 origin, giving columns
            // 0..62 and 64..190 of the 191-column table. Offsetting the
            // upper range by 0x80 instead would fold it onto the columns of
            // the lower range and leave columns 127..190 unreachable.
            column = trail - 0x40;
            if (GBKFreq[row][column] != 0) {
                gbfreq += GBKFreq[row][column];
            }
        }
        // A high-bit byte always consumes its trail byte as well.
        i++;
    }
    rangeval = 50 * ((float) gbchars / (float) dbchars);
    freqval = 50 * ((float) gbfreq / (float) totalfreq);

    // For regular GB files, this would give the same score, so I handicap
    // it slightly
    return (int) (rangeval + freqval) - 1;
}

/*
 * Function: hz_probability Argument: byte array Returns : number from 0 to
 * 100 representing probability text in array uses HZ encoding
 */

int hz_probability(byte[] rawtext) {
    // Scores how well the bytes fit HZ. Up to 50 points come from how many
    // "~{" shift-in sequences were seen, the rest from the GB frequency
    // table applied to the byte pairs that follow a shift-in.
    int i, rawtextlen;
    long hzfreq = 0, totalfreq = 1; // totalfreq starts at 1: no divide by 0
    float rangeval, freqval;
    int hzstart = 0;
    int row, column;

    rawtextlen = rawtext.length;

    // Every escape sequence is two bytes long, so the final byte can never
    // start one. Stopping one byte early keeps rawtext[i + 1] in bounds
    // when the input ends with '~'.
    for (i = 0; i < rawtextlen - 1; i++) {
        if (rawtext[i] != '~') {
            continue;
        }

        if (rawtext[i + 1] == '{') {
            hzstart++;
            i += 2;
            // Walk the shifted-in section two bytes at a time.
            while (i < rawtextlen - 1) {
                if (rawtext[i] == 0x0A || rawtext[i] == 0x0D) {
                    // A line break ends the section.
                    break;
                } else if (rawtext[i] == '~' && rawtext[i + 1] == '}') {
                    // Shift-out sequence.
                    i++;
                    break;
                }

                // Compare as unsigned values: a Java byte is signed, so a
                // direct comparison against 0xA1..0xF7 could never match.
                final int lead = rawtext[i] & 0xFF;
                final int trail = rawtext[i + 1] & 0xFF;
                final int base;
                if (0x21 <= lead && lead <= 0x77
                        && 0x21 <= trail && trail <= 0x77) {
                    base = 0x21; // 7-bit pair
                } else if (0xA1 <= lead && lead <= 0xF7
                        && 0xA1 <= trail && trail <= 0xF7) {
                    base = 0xA1; // 8-bit pair
                } else {
                    base = -1; // not a scorable pair
                }

                if (base >= 0) {
                    row = lead - base;
                    column = trail - base;
                    totalfreq += 500;
                    if (GBFreq[row][column] != 0) {
                        hzfreq += GBFreq[row][column];
                    } else if (15 <= row && row < 55) {
                        // No table entry: rows 15..54 get a default weight.
                        hzfreq += 200;
                    }
                }
                i += 2;
            }
        } else if (rawtext[i + 1] == '}' || rawtext[i + 1] == '~') {
            // Stray shift-out or "~~": consume the second byte.
            i++;
        }
    }

    if (hzstart > 4) {
        rangeval = 50;
    } else if (hzstart > 1) {
        rangeval = 41;
    } else if (hzstart > 0) {
        // Only 39 in case the sequence happened to occur in otherwise
        // non-Hz text
        rangeval = 39;
    } else {
        rangeval = 0;
    }
    freqval = 50 * ((float) hzfreq / (float) totalfreq);

    return (int) (rangeval + freqval);
}

/**
 * Function: big5_probability Argument: byte array Returns : number from 0
 * to 100 representing probability text in array uses Big5 encoding
 */

int big5_probability(byte[] rawtext) {
    int score = 0;
    int i, rawtextlen = 0;
    int dbchars = 1, bfchars = 1;
    float rangeval = 0, freqval = 0;
    long bffreq = 0, totalfreq = 1;
    int row, column;

    // Check to see if characters fit into acceptable ranges

    rawtextlen = rawtext.length;
    for (i = 0; i < rawtextlen - 1; i++) {
        if (rawtext[i] >= 0) {
            // asciichars++;
        } else {
            dbchars++;
            if ((byte) 0xA1 <= rawtext[i]
                    && rawtext[i] <= (byte) 0xF9
                    && (((byte) 0x40 <= rawtext[i + 1] && rawtext[i + 1] <= (byte) 0x7E) || ((byte) 0xA1 <= rawtext[i + 1] && rawtext[i + 1] <= (byte) 0xFE))) {
                bfchars++;
                totalfreq += 500;
                row = rawtext[i] + 256 - 0xA1;
                if (0x40 <= rawtext[i + 1] && rawtext[i + 1] <= 0x7E) {
                    column = rawtext[i + 1] - 0x40;
                } else {
                    column = rawtext[i + 1] + 256 - 0x61;
                }
                if (Big5Freq[row][column] != 0) {
                    bffreq += Big5Freq[row][column];
                } else if (3 <= row && ro

最低0.47元/天解锁文章

历经沧桑的少年

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
java 检测文件类型

package com.thinkgem.jeesite.modules.utils;import java.io.File;import java.io.FileInputStream;import java.io.InputStream;import java.net.URL;public class EncodingDetect {private static final int...
复制链接

扫一扫