获取 文件 后缀 c++


    正常情况下可以通过SHGetFileInfoA 或 PathFindExtension 获取文件的类型,当文件无后缀时就需要想别的方法了。
  

   以下以DOC和DOCX文件类型为例子:
   

    正常情况生成的doc文件其二进制数据如下:
   

   正常情况docx文件其二进制数据如下:


   通过对比可以看到doc和docx的二进制数据开头都不一样,因此在没有文件后缀的情况下可以从该方面来判断文件类型。

   以下是c++代码:

	   {
		// Classify a file as DOC or DOCX from its first four bytes only.
		// The result is left in fileType (NONE when the file cannot be
		// opened, is shorter than four bytes, or matches no signature).
		//
		// This is a coarse test: every file that starts with the ZIP local
		// header signature is reported as DOCX, and every file that starts
		// with the compound-file signature is reported as DOC.
		const char* files = "c:\\sdfsdf";   // string literals are const
		enum { NONE, DOC, DOCX };
		int fileType = NONE;

		FILE* file = fopen(files, "rb");
		if (file)
		{
			unsigned char buff[32] = { 0 };
			size_t length = fread(buff, 1, 10, file);

			// Only four bytes are inspected, so four bytes are enough
			// (the previous "> 4" test rejected a file of exactly 4 bytes).
			if (length >= 4)
			{
				if (buff[0] == 0x50 && buff[1] == 0x4B && buff[2] == 0x03 && buff[3] == 0x04)
				{
					// "PK\x03\x04": normally created docx
					fileType = DOCX;
				}
				else if (buff[0] == 0xD0 && buff[1] == 0xCF && buff[2] == 0x11 && buff[3] == 0xE0)
				{
					// D0 CF 11 E0: normally created doc
					fileType = DOC;
				}
				else if (buff[0] == 0x7B && buff[1] == 0x5C && buff[2] == 0x72 && buff[3] == 0x74)
				{
					// "{\rt": doc that was not created the normal way
					fileType = DOC;
				}
			}
			fclose(file);
		}
	   }

         注: 通过查看已知文件的二进制数据可以判断相应类型的文件

    第一次补充:
         通过上面简单的二进制判断可以处理已知文件,但准确率不高。要提高精确度可以使用StructuredStorageHeader结构读取文件流头部信息,再结合具体信息,从而提高判断的精确度。 
   

#include <Shlwapi.h>
#include <Shellapi.h>

// Integer aliases used by the StructuredStorageHeader field declarations.
// The byte counts in the trailing comments are the sizes the header layout
// relies on.
// NOTE(review): "unsigned long" is 4 bytes on Windows compilers but may be
// 8 bytes elsewhere -- confirm before building this on another platform.
typedef unsigned long ULONG;    // 4 Bytes
typedef unsigned short USHORT;  // 2 Bytes
typedef short OFFSET;           // 2 Bytes
typedef ULONG SECT;             // 4 Bytes
typedef ULONG FSINDEX;          // 4 Bytes
typedef USHORT FSOFFSET;        // 2 Bytes
//typedef USHORT WCHAR;           // 2 Bytes
typedef ULONG DFSIGNATURE;      // 4 Bytes
typedef unsigned char BYTE;     // 1 Byte
typedef unsigned short WORD;    // 2 Bytes
typedef unsigned long DWORD;    // 4 Bytes
//typedef ULONG SID;              // 4 Bytes
typedef GUID CLSID;             // 16 Bytes

// In-memory image of a compound-file header. The code in this file fills it
// by memcpy from the first sizeof(StructuredStorageHeader) bytes of a file.
// Each field comment gives [offset from start, length in bytes]; the listed
// offsets and lengths add up to 512 bytes (4CH + 436).
// NOTE(review): the offsets hold only if the compiler inserts no padding and
// ULONG is 4 bytes -- confirm for the target toolchain.
struct StructuredStorageHeader { // [offset from start (bytes), length (bytes)]
	BYTE _abSig[8];             // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1,
	// 0x1a, 0xe1} for current version
	CLSID _clsid;               // [08H,16] reserved must be zero (WriteClassStg/
	// GetClassFile uses root directory class id)
	USHORT _uMinorVersion;      // [18H,02] minor version of the format: 33 is
	// written by reference implementation
	USHORT _uDllVersion;        // [1AH,02] major version of the dll/format: 3 for
	// 512-byte sectors, 4 for 4 KB sectors
	USHORT _uByteOrder;         // [1CH,02] 0xFFFE: indicates Intel byte-ordering
	USHORT _uSectorShift;       // [1EH,02] size of sectors in power-of-two;
	// typically 9 indicating 512-byte sectors
	USHORT _uMiniSectorShift;   // [20H,02] size of mini-sectors in power-of-two;
	// typically 6 indicating 64-byte mini-sectors
	USHORT _usReserved;         // [22H,02] reserved, must be zero
	ULONG _ulReserved1;         // [24H,04] reserved, must be zero
	FSINDEX _csectDir;          // [28H,04] must be zero for 512-byte sectors,
	// number of SECTs in directory chain for 4 KB
	// sectors
	FSINDEX _csectFat;          // [2CH,04] number of SECTs in the FAT chain
	SECT _sectDirStart;         // [30H,04] first SECT in the directory chain
	DFSIGNATURE _signature;     // [34H,04] signature used for transactions; must
	// be zero. The reference implementation
	// does not support transactions
	ULONG _ulMiniSectorCutoff;  // [38H,04] maximum size for a mini stream;
	// typically 4096 bytes
	SECT _sectMiniFatStart;     // [3CH,04] first SECT in the MiniFAT chain
	FSINDEX _csectMiniFat;      // [40H,04] number of SECTs in the MiniFAT chain
	SECT _sectDifStart;         // [44H,04] first SECT in the DIFAT chain
	FSINDEX _csectDif;          // [48H,04] number of SECTs in the DIFAT chain
	SECT _sectFat[109];         // [4CH,436] the SECTs of first 109 FAT sectors
};
#pragma warning(disable:4996)

void main(int argc, char* argv[])
{
	{
#ifdef _DEBUG
		char* files = "sss";
#else
		char* files = argv[1];
#endif
		FILE* file;
		file = fopen(files, "rb");
		enum{ NONE, DOC, DOCX };
		int fileType = 0;
		if (file)
		{
			unsigned char  buff[512] = { 0 };
			StructuredStorageHeader Head;
			int lenth = sizeof(StructuredStorageHeader);
			size_t length = fread(buff, 1, lenth, file); 
			if (length == lenth)
			{
				memcpy(&Head, buff, lenth);
				CLSID docID = { 0 };
				if (docID == Head._clsid)
				{
					fileType = DOC;
				}
				else
				{
					CLSID docxID = { 0 };
					docxID.Data1 = 0x4e870000;
					docxID.Data2 = 0x40e2;
					if (docxID == Head._clsid) //wps
					{
						fileType = DOCX;
					}
					else
					{
						docxID.Data1 = 8;
						docxID.Data2 = 0x21;
						docxID.Data3 = 0x4fbd;
						docxID.Data4[0] = 92;
						docxID.Data4[1] = 82;
						docxID.Data4[2] = 0x9d;
						docxID.Data4[3] = 1;
						docxID.Data4[4] = 0;
						docxID.Data4[5] = 0;
						docxID.Data4[6] = 0x29;
						docxID.Data4[7] = 7; 
						if (docxID == Head._clsid) //office
						{
							fileType = DOCX;
						}						 
					} 
				}
				if (!fileType && buff[0] == 0x7B && buff[1] == 0x5C && buff[2] == 0x72 && buff[3] == 0x74)  //非正常创建doc
				{
					fileType = DOC; //
				}
				printf("%08x-%04x-%x-%02x%02x%02x%02x%02x%02x", Head._clsid.Data1, Head._clsid.Data2, Head._clsid.Data3,
					Head._clsid.Data4[2], Head._clsid.Data4[3],
					Head._clsid.Data4[4], Head._clsid.Data4[5],
					Head._clsid.Data4[6], Head._clsid.Data4[7]);
			}
			fclose(file);
		}
		return;
	}
}


参考:
https://en.wikipedia.org/wiki/Compound_File_Binary_Format   --StructuredStorageHeader

第二次补充:
      在实验过程中,会误判ppt为doc,在查看了ppt文件对应的二进制数据后,重新优化了代码:
FILE* file;
		file = fopen(files, "rb");
		enum{ NONE, DOC, DOCX };
		int fileType = 0;
		if (file)
		{
			unsigned char  buff[512] = { 0 };
			StructuredStorageHeader Head;
			int lenth = sizeof(StructuredStorageHeader);
			size_t length = fread(buff, 1, lenth, file); 
			if (length == lenth)
			{
				memcpy(&Head, buff, lenth);
				CLSID docID = { 0 };
				if (docID == Head._clsid) 
				{
					if(buff[60] ==0x31||  //office doc   //区分 ppt doc
						(buff[60] == 2 && buff[80] == 0xFF))    //wps doc
						fileType = DOC;
				}
				else
				{
					if (Head._abSig[0] == 'P' && Head._abSig[1] == 'K')
					{
						CLSID docxID = { 0 };
						docxID.Data1 = 0x4e870000;
						docxID.Data2 = 0x40e2;
						if (docxID == Head._clsid) //wps
						{
							fileType = DOCX;
						}
						else
						{
							docxID.Data1 = 8;
							docxID.Data2 = 0x21;
							if (docxID.Data1 == Head._clsid.Data1 && docxID.Data2 == Head._clsid.Data2) //office
							{
								fileType = DOCX;
							}
						}
					}
				}
				if (!fileType && buff[0] == 0x7B && buff[1] == 0x5C && buff[2] == 0x72 && buff[3] == 0x74)  //非正常创建doc
				{
					fileType = DOC; //
				}
				printf("%08x-%04x-%x-%02x%02x%02x%02x%02x%02x", Head._clsid.Data1, Head._clsid.Data2, Head._clsid.Data3,
					Head._clsid.Data4[2], Head._clsid.Data4[3],
					Head._clsid.Data4[4], Head._clsid.Data4[5],
					Head._clsid.Data4[6], Head._clsid.Data4[7]);
			}
			fclose(file);
		}
		printf("%s", fileType == 1? "DOC":fileType == 2? "DOCX":"NOLL");

第三次补充:
     由于word支持打开的子版本太多,导致有许多可以用word打开的文件没有成功判断,现更改判断方法:
/*
 * _FLY_STRING_FindBitSub - binary-safe search for a byte pattern.
 *
 *   fullStr      pointer to the bytes to search
 *   fullstrLent  number of bytes available at fullStr (non-negative)
 *   subStr       pointer to the pattern bytes
 *   subStrLen    number of bytes in the pattern (non-negative)
 *   rstValue     lvalue that receives true (non-zero) when the pattern occurs
 *                anywhere in the searched bytes, false (0) otherwise
 *
 * Both buffers are compared as unsigned bytes and NUL bytes are ordinary
 * data. An empty pattern always matches. Every argument is evaluated exactly
 * once, so passing strlen(...) as a length is not repeated per iteration.
 *
 * The previous implementation reset the pattern index on a mismatch without
 * re-examining the current byte, so it missed matches such as "ab" inside
 * "aab". This version tries every start position instead.
 */
#define _FLY_STRING_FindBitSub(fullStr, fullstrLent, subStr, subStrLen, rstValue) \
do { \
	const unsigned char *fly_hay_ = (const unsigned char *)(fullStr); \
	const unsigned char *fly_pat_ = (const unsigned char *)(subStr); \
	const size_t fly_hay_len_ = (size_t)(fullstrLent); \
	const size_t fly_pat_len_ = (size_t)(subStrLen); \
	size_t fly_start_ = 0; \
	int fly_hit_ = (fly_pat_len_ == 0); \
	while (!fly_hit_ && fly_pat_len_ <= fly_hay_len_ && \
	       fly_start_ <= fly_hay_len_ - fly_pat_len_) { \
		size_t fly_k_ = 0; \
		while (fly_k_ < fly_pat_len_ && \
		       fly_hay_[fly_start_ + fly_k_] == fly_pat_[fly_k_]) { \
			fly_k_++; \
		} \
		fly_hit_ = (fly_k_ == fly_pat_len_); \
		fly_start_++; \
	} \
	(rstValue) = (fly_hit_ != 0); \
} while (0)
/*
 * Third revision: classify argv[1] as DOC / DOCX and print "DOC", "DOCX" or
 * "NOLL".
 *
 * Detection steps (searches use the _FLY_STRING_FindBitSub macro defined
 * earlier in this file):
 *   1. Header starts with D0 CF 11 E0 (doc / ppt / xls share it):
 *        - the 32 bytes following the 512-byte header contain a fixed
 *          13-byte pattern                                   -> DOC (wps)
 *        - otherwise the last 512 bytes of the file contain
 *          "Word.Document"                                   -> DOC (office)
 *   2. Header starts with "PK": the last 512 bytes contain "ord"
 *                                                            -> DOCX
 *   3. Nothing matched and the file starts with "{\rt"       -> DOC
 *
 * Returns 1 when no path was supplied (after printing "NOLL"), 0 otherwise.
 *
 * Changes from the previous revision:
 *   - main returns int and argv[1] is only used after checking argc;
 *   - the header is kept in its own buffer, so step 3 always looks at the
 *     real start of the file rather than at whatever a later read left in
 *     the shared buffer;
 *   - step 3 now also runs for files shorter than 512 bytes;
 *   - the three nested variables all named "rst" are gone.
 */
int main(int argc, char* argv[])
{
	enum { NONE, DOC, DOCX };
	enum { HEADER_LEN = 512, PROBE_LEN = 32, TAIL_LEN = 512 };
	int fileType = NONE;

	if (argc < 2)
	{
		printf("%s", "NOLL");
		return 1;
	}

	FILE* file = fopen(argv[1], "rb");
	if (file)
	{
		unsigned char header[HEADER_LEN] = { 0 };
		unsigned char scratch[TAIL_LEN] = { 0 };
		const size_t headerLen = fread(header, 1, HEADER_LEN, file);

		if (headerLen == HEADER_LEN)
		{
			if (header[0] == 0xD0 && header[1] == 0xCF && header[2] == 0x11 && header[3] == 0xE0)
			{
				// The stream is positioned right after the header, so this
				// reads file offsets 512..543.
				const size_t probeLen = fread(scratch, 1, PROBE_LEN, file);
				if (probeLen == PROBE_LEN)
				{
					static const unsigned char wpsDoc[] = {
						0xFD, 0xFF, 0xFF, 0xFF, 0x05, 0x00, 0, 0,
						0xFE, 0xFF, 0xFF, 0xFF, 0x04
					};
					const size_t wpsDocLen = sizeof(wpsDoc);
					bool found = false;

					_FLY_STRING_FindBitSub(scratch, probeLen, wpsDoc, wpsDocLen, found);

					if (!found && fseek(file, -(long)TAIL_LEN, SEEK_END) == 0)
					{
						static const char wordMarker[] = "Word.Document";
						const size_t wordMarkerLen = sizeof(wordMarker) - 1;
						const size_t tailLen = fread(scratch, 1, TAIL_LEN, file);

						_FLY_STRING_FindBitSub(scratch, tailLen, wordMarker, wordMarkerLen, found);
					}

					if (found)
					{
						fileType = DOC;
					}
				}
			}
			else if (header[0] == 'P' && header[1] == 'K')
			{
				if (fseek(file, -(long)TAIL_LEN, SEEK_END) == 0)
				{
					// "ord" covers "word", "Word" and "ord/".
					static const char ordMarker[] = "ord";
					const size_t ordMarkerLen = sizeof(ordMarker) - 1;
					const size_t tailLen = fread(scratch, 1, TAIL_LEN, file);
					bool found = false;

					_FLY_STRING_FindBitSub(scratch, tailLen, ordMarker, ordMarkerLen, found);

					if (found)
					{
						fileType = DOCX;
					}
				}
			}
		}

		// "{\rt" at the start: doc that was not created the normal way.
		if (fileType == NONE && headerLen >= 4 &&
			header[0] == 0x7B && header[1] == 0x5C && header[2] == 0x72 && header[3] == 0x74)
		{
			fileType = DOC;
		}

		fclose(file);
	}

	printf("%s", fileType == DOC ? "DOC" : fileType == DOCX ? "DOCX" : "NOLL");
	return 0;
}
参考:

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值