判断原理:预先定义好常用格式的文件后缀,以及对应文件字节码的前N位(即文件头),作为判断的依据。
一般而言,如果手工修改文件的后缀,其内部文件字节码是不会发生变化的,以此来作为判断的标准。
因此,工作中,定义一个简要的工具来对常用的文件格式进行校验也是必要的。
以下是定义好的常用几种文件格式的校验,若有需要,可以自行增加。
特殊情况:目前无法校验文本文件(*.txt),因为文本文件没有固定的文件头,读取到的前几个字节几乎各不相同。当然,还可能存在其他无法校验的格式。
字节转换为十六进制工具类:
import java.io.InputStream;
/**
 * Helpers for reading the leading bytes of a stream and rendering bytes as
 * upper-case hexadecimal text.
 */
public final class BytesUtil {

    /** Digit table for hex rendering; avoids locale-dependent case conversion. */
    private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();

    private BytesUtil() {
        // static utility, not meant to be instantiated
    }

    /**
     * Fills {@code buff} from {@code in} and returns the whole buffer as hex.
     *
     * <p>If the stream ends before the buffer is full, the unread tail keeps
     * whatever the caller put there (zeros for a fresh array) and is still
     * included in the returned text.
     *
     * @param in   stream to read from
     * @param buff destination buffer; its length decides how many bytes are requested
     * @return upper-case hex of {@code buff}, or {@code null} when {@code buff} is empty
     * @throws RuntimeException if reading fails; the original exception is kept as the cause
     */
    public static String readBytesToHex(InputStream in, byte[] buff) {
        readBuff(in, buff);
        return bytesToHex(buff);
    }

    /**
     * Reads from {@code in} until {@code buff} is full or the stream ends.
     *
     * <p>A single {@link InputStream#read(byte[], int, int)} call may return
     * fewer bytes than requested, so the read is repeated until the buffer is
     * filled or end of stream is reached.
     *
     * @param in   stream to read from
     * @param buff destination buffer
     * @return {@code false} only when the stream was already at end of stream
     *         and nothing could be read; {@code true} otherwise
     * @throws RuntimeException if reading fails; the original exception is kept as the cause
     */
    public static boolean readBuff(InputStream in, byte[] buff) {
        try {
            int filled = 0;
            while (filled < buff.length) {
                int count = in.read(buff, filled, buff.length - filled);
                if (count < 0) {
                    break; // end of stream
                }
                filled += count;
            }
            return filled > 0 || buff.length == 0;
        } catch (Exception e) {
            // Keep the cause so the stack trace of the real failure is not lost.
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    /**
     * Renders bytes as upper-case hexadecimal, two characters per byte.
     *
     * @param buff bytes to render
     * @return the hex text, or {@code null} when {@code buff} is {@code null} or empty
     */
    public static String bytesToHex(byte[] buff) {
        if (buff == null || buff.length == 0) {
            return null;
        }
        StringBuilder hex = new StringBuilder(buff.length * 2);
        for (byte b : buff) {
            int unsigned = b & 0xFF;
            hex.append(HEX_DIGITS[unsigned >>> 4]).append(HEX_DIGITS[unsigned & 0x0F]);
        }
        return hex.toString();
    }
}
文件类型校验工具类:
import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
/**
* 文件类型校验工具
*/
@Slf4j
public final class FileTypeCheckUtil {
public static boolean checkType(File file) {
try (FileInputStream in = new FileInputStream(file)) {
return checkType(in, file.getName());
} catch (IOException e) {
throw new RuntimeException(e.getMessage());
}
}
public static boolean checkType(InputStream in, String fileName) {
return doCheckRowType(in, fileName, Type.values());
}
public static boolean checkRawType(File file, Type targetType) {
if (file == null) {
throw new RuntimeException("校验的文件为空");
}
try (FileInputStream in = new FileInputStream(file)) {
return checkRowType(in, file.getName(), targetType);
} catch (IOException e) {
throw new RuntimeException(e.getMessage());
}
}
public static boolean checkRowType(InputStream in, String fileName, Type targetType) {
return doCheckRowType(in, fileName, targetType);
}
private static boolean doCheckRowType(InputStream in, String fileName, Type... targetTypes) {
if (in == null) {
throw new RuntimeException("校验的文件流为空");
}
if (fileName == null) {
throw new RuntimeException("校验的文件名为空");
}
String[] arr = fileName.split("\\.");
if (arr.length < 2) {
throw new RuntimeException("校验的文件不是特定格式的文件");
}
String fileSuffix = arr[arr.length - 1];
Optional<Type> op = Arrays.stream(targetTypes).filter(f -> Arrays.stream(f.suffix).anyMatch(s -> s.equalsIgnoreCase(fileSuffix))).findAny();
if (!op.isPresent()) {
throw new RuntimeException("无法校验当前文件格式:" + fileSuffix);
}
Type type = op.get();
byte[] buff = new byte[type.hexCode.length() / 2];
String hex = BytesUtil.readBytesToHex(in, buff);
boolean matched = type.getHexCode().equals(hex);
if (!matched) {
Optional<Type> optional = Arrays.stream(Type.values()).filter(c -> c.getHexCode().equals(hex)).findAny();
optional.ifPresent(t -> log.error("“{}”文件后缀被篡改, 原文件类型:{}", fileName, t.getSuffix()));
}
return matched;
}
//文件的字节码头
@Getter
public enum Type {
XLS_DOC_PPT("D0CF11E0A1B11AE1", "xls", "doc", "ppt"),
XLSX_DOCX_PPTX("504B0304", "xlsx", "docx", "pptx"),
PDF("255044462D312E", "pdf"),
LIBRE_OFFICE("504B030414", "odg", "odp", "ods", "odt"),
ZIP("504B03040A", "zip"),
RAR("52617221", "rar"),
GZ("1F8B08", "gz"),
JPEG("FFD8FF", "jpeg"),
JPG("FFD8FFE0", "jpg"),
PNG("89504E47", "png"),
GIF("47494638", "gif"),
WEBP("52494646", "webp"),
BMP("424D", "bmp"),
BPMN("3C3F786D6C", "bpmn");
Type(String hexCode, String... suffix) {
this.hexCode = hexCode;
this.suffix = suffix;
}
private final String hexCode;
private final String[] suffix;
public static List<String> allSuffix() {
List<String> r = new ArrayList<>();
for (Type type : Type.values()) {
r.addAll(Arrays.asList(type.getSuffix()));
}
return r;
}
}
}