最近接到朋友提出的一个关于大文件分解的问题:文件大小不确定,从几百 MB 到几个 GB 不等。对于这样的问题,采用内存映射文件来处理是最为高效的手段。
需求是这样的:文件以 136 个字节的文件头开始,后面记录的是一条一条的数据包。每个数据包的结构如下:数据包头 4 个字节;文件序号 4 个字节;数据包大小 4 个字节(代码按大端序读取,该大小包含这 12 个字节的包头);其后是数据包内容。要求把大文件分解为 100 MB 左右的小文件。根据该文件结构,代码实现如下:
//FileInfoDef.h
#ifndef __INCLUDE_FILEINFODEF_H__
#define __INCLUDE_FILEINFODEF_H__
#include <iostream>
using namespace std;
// Integer aliases used throughout the splitter. The width in each name
// assumes char/short/int are 8/16/32 bits wide on the target platform.

// Signed (char keeps the platform's default signedness).
typedef char  int8;
typedef short int16;
typedef int   int32;

// Unsigned.
typedef unsigned char  uint8;
typedef unsigned short uint16;
typedef unsigned int   uint32;

// Number of packages stored in one output file before the next file is started.
const size_t PACKAGE_NUM = 1500;

// Initial capacity, in bytes, of the buffer that holds one package copied
// out of the big file (20 MiB).
const size_t MAX_PACKAGE_SIZE = 20 * 1024 * 1024;

// Largest input file, in bytes, that is currently supported
// (20000 blocks of 64 KiB). Not referenced by the code visible here.
const size_t MAX_FILE_SIZE = 20000 * 65536;
typedef struct FilePos
{
uint32 NextFileFistCount; //记录一个数据包映射到另外一个试图的数据大小
int32 NextPackageHeadCnt; //记录一个数据包包头映射到另外一个试图长度
char PackageHead[12];
FilePos()
{
memset(this,0,sizeof(FilePos));
}
}FILE_POS_STRU;
#endif
//ByteReader.h
#ifndef __INCLUDE_BYTEREADER_H__
#define __INCLUDE_BYTEREADER_H__
#include "FileInfoDef.h"
// Sequential reader over a caller-supplied byte buffer. Every successful read
// moves an internal cursor forward; reads that would run past the end of the
// buffer are refused.
class CByteReader
{
public:
    // buffer: bytes to read from (the pointer is stored, the data is not copied).
    // maxNum: number of readable bytes in buffer.
    CByteReader(const char *buffer,size_t maxNum);
    virtual ~CByteReader(void);

    // Single-value reads.
    char GetByte();
    uint16 GetShort();
    uint32 GetInt();

    // Bulk reads of n elements into the caller's array p.
    void GetNByte(char *p, size_t n);
    void GetNShort(uint16 *p, size_t n);
    void GetNInt(uint32 *p, size_t n);

    // Move the cursor forward by n bytes without reading them.
    void Skip(size_t n);

private:
    char * m_buffer;   // read cursor inside the buffer
    size_t m_curByte;  // number of bytes consumed so far
    size_t m_maxByte;  // total number of readable bytes
};
#endif
//ByteReader.cpp
#include "ByteReader.h"
#include <iostream>
using namespace std;
// Attach the reader to `buffer`, of which `maxNum` bytes may be read.
// Only the pointer is kept; the bytes are not copied.
CByteReader::CByteReader( const char *buffer ,size_t maxNum)
    : m_buffer(const_cast<char*>(buffer))
    , m_curByte(0)
    , m_maxByte(maxNum)
{
}
// The reader frees nothing here; it only resets its own bookkeeping.
CByteReader::~CByteReader(void)
{
    m_maxByte = 0;
    m_curByte = 0;
    m_buffer = NULL;
}
// Read the next four bytes as an unsigned 32-bit integer, most significant
// byte first, and advance the cursor by four.
// Returns 0 (cursor unchanged) when fewer than four bytes remain.
uint32 CByteReader::GetInt()
{
    uint32 ret = 0;
    if (m_curByte + 4 > m_maxByte)
    {
        printf("Current Byte is Larger than Max Num!!!");
        return ret;
    }
    for (size_t i = 0; i < 4; i++)
    {
        // Widen to uint32 before shifting: shifting the int-promoted byte left
        // by 24 overflows a signed int for bytes >= 0x80.
        const uint32 octet = static_cast<unsigned char>(*m_buffer);
        ret = (ret << 8) | octet;
        m_buffer++;
    }
    m_curByte += 4;
    return ret;
}
// Read the next two bytes as an unsigned 16-bit integer, most significant
// byte first, and advance the cursor by two.
// Returns 0 (cursor unchanged) when fewer than two bytes remain.
uint16 CByteReader::GetShort()
{
    uint16 ret = 0;
    if (m_curByte + 2 > m_maxByte)
    {
        printf("Current Byte is Larger than Max Num!!!");
        return ret;
    }
    for (size_t i = 0; i < 2; i++)
    {
        // Go through unsigned char first. Converting a (possibly signed) char
        // straight to uint16 sign-extends bytes >= 0x80 to 0xFFxx, which would
        // overwrite the high byte when the low byte is merged in.
        const uint16 octet = static_cast<unsigned char>(*m_buffer);
        ret = static_cast<uint16>((ret << 8) | octet);
        m_buffer++;
    }
    m_curByte += 2;
    return ret;
}
// Return the byte under the cursor and step past it.
// Returns 0 (cursor unchanged) when no byte remains.
char CByteReader::GetByte()
{
    if (m_maxByte < m_curByte + 1)
    {
        printf("Current Byte is Larger than Max Num!!!");
        return 0;
    }
    ++m_curByte;
    return *m_buffer++;
}
// Move the cursor forward by n bytes. Nothing happens (apart from the
// diagnostic) when fewer than n bytes remain.
void CByteReader::Skip( size_t n )
{
    const size_t target = m_curByte + n;
    if (target > m_maxByte)
    {
        printf("Current Byte is Larger than Max Num!!!");
        return;
    }
    m_curByte = target;
    m_buffer += n;
}
// Fill p[0..n) with n consecutive 32-bit values read via GetInt().
// Nothing is read when fewer than n*4 bytes remain.
void CByteReader::GetNInt(uint32 *p, size_t n )
{
    if (m_maxByte < m_curByte + n*4)
    {
        printf("Current Byte is Larger than Max Num!!!");
        return;
    }
    uint32 *out = p;
    for (size_t left = n; left > 0; --left)
    {
        *out++ = GetInt();
    }
}
// Fill p[0..n) with n consecutive 16-bit values read via GetShort().
// Nothing is read when fewer than n*2 bytes remain.
void CByteReader::GetNShort( uint16 *p, size_t n )
{
    if (m_maxByte < m_curByte + n*2)
    {
        printf("Current Byte is Larger than Max Num!!!");
        return;
    }
    uint16 *out = p;
    for (size_t left = n; left > 0; --left)
    {
        *out++ = GetShort();
    }
}
// Copy the next n bytes into p and step past them.
// Nothing is copied when fewer than n bytes remain.
void CByteReader::GetNByte( char *p, size_t n )
{
    const size_t target = m_curByte + n;
    if (target > m_maxByte)
    {
        printf("Current Byte is Larger than Max Num!!!");
        return;
    }
    memcpy(p, m_buffer, n);
    m_buffer += n;
    m_curByte = target;
}
//ByteExChange.h
#ifndef __INCLUDE_BYTEEXCHANGE_H__
#define __INCLUDE_BYTEEXCHANGE_H__
// Holder for two static string-conversion helpers (wide <-> narrow).
class CByteExChange
{
public:
    CByteExChange(void);
    ~CByteExChange(void);

    // NOTE: the returned buffer must be released by the caller once it is
    // no longer needed.
    static wchar_t* AnsiToUnicode( const char* szStr );
    static char* UnicodeToAnsi( const wchar_t*szStr );
};
#endif
//ByteExChange.cpp
#include "ByteExChange.h"
#include <Windows.h>
// Nothing to set up.
CByteExChange::CByteExChange(void) {}

// Nothing to release.
CByteExChange::~CByteExChange(void) {}
// Convert a NUL-terminated wide string (wchar_t*) to a NUL-terminated
// multi-byte string (char*) in the system ANSI code page (CP_ACP).
// Returns a buffer allocated with new[] that the caller must delete[], or
// NULL when szStr is NULL or the conversion fails.
char* CByteExChange::UnicodeToAnsi( const wchar_t*szStr )
{
    if (NULL == szStr)
    {
        return NULL;
    }
    // First call only measures: the result includes the terminating NUL
    // because the input length is given as -1.
    const int nLen = WideCharToMultiByte( CP_ACP, 0, szStr, -1, NULL, 0, NULL, NULL );
    if (nLen <= 0)
    {
        return NULL;
    }
    char* pResult = new char[nLen];
    // The conversion itself can still fail; never hand back a buffer with
    // indeterminate, unterminated contents.
    if (0 == WideCharToMultiByte(CP_ACP, 0, szStr, -1, pResult, nLen, NULL, NULL ))
    {
        delete [] pResult;
        return NULL;
    }
    return pResult;
}
// Convert a NUL-terminated multi-byte string in the system ANSI code page
// (CP_ACP) to a NUL-terminated wide string.
// Returns a buffer allocated with new[] that the caller must delete[], or
// NULL when szStr is NULL or the conversion fails.
wchar_t* CByteExChange::AnsiToUnicode( const char* szStr )
{
    if (NULL == szStr)
    {
        return NULL;
    }
    // First call only measures: the result includes the terminating NUL
    // because the input length is given as -1.
    const int nLen = MultiByteToWideChar( CP_ACP, MB_PRECOMPOSED, szStr, -1, NULL, 0 );
    if (nLen <= 0)
    {
        return NULL;
    }
    wchar_t* pResult = new wchar_t[nLen];
    // The conversion itself can still fail; never hand back a buffer with
    // indeterminate, unterminated contents.
    if (0 == MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, szStr, -1, pResult, nLen ))
    {
        delete [] pResult;
        return NULL;
    }
    return pResult;
}
//FileProcss.h
#ifndef __INCLUDE_PARSEFILE_H__
#define __INCLUDE_PARSEFILE_H__
#include <string>
#include <list>
#include <fstream>
#include <Windows.h>
#include "FileInfoDef.h"
using namespace std;
// Splits one large package file into several smaller files by walking the
// input through a sequence of memory-mapped views.
class CFileProcess
{
public:
    CFileProcess(void);
    virtual ~CFileProcess(void);

    // strFileName: path of the file to split; strDes: output directory.
    void SplitFile(string strFileName,string strDes);

private:
    // Per-view processing.
    void ProcessFile( DWORD dwBlockBytes, string strFileName);
    bool JudgePackageHeader();

    // Output handling.
    void WriteContentToFile(string & strFileName,size_t CountUint,bool bFlag = false);
    void ResizeBuffer(size_t Count);

    // Name helpers.
    string GetFileNameNoSuffix( string &strFileName, string strDes );
    string GetSmallFileName(string & strFileName );

    // Declared only; no definition appears in the code visible here.
    void GetFileList(string filePath,list<string> &fileList,string strType);

private:
    char *m_buf;                 // read cursor inside the current mapped view
    char *m_pBuf;                // start of the current mapped view
    char *m_packageArray;        // buffer holding the contents of one package
    __int64 m_FileSize;          // size of the current mapped view
    __int64 m_dwBlockSize;       // bytes of the current view still to be processed
    size_t m_counter;            // packages written to the current output file
    size_t m_fileNo;             // sequence number of the next output file
    bool m_isChangeFile;         // true when a new output file has to be opened
    char *m_fileHeader;          // copy of the input file's header
    FILE_POS_STRU m_stfilepos;   // state of a package split across two views
    size_t m_bufSize;            // capacity of m_packageArray

private:
    fstream m_fout;              // stream of the output file being written
};
#endif
//FileProcess.cpp
#include "FileProcess.h"
#include "ByteReader.h"
#include "ByteExChange.h"
// Exported C entry point: split the file strFileName into smaller files
// placed under the directory strDes.
// Both arguments must be NUL-terminated strings; a NULL argument is reported
// and ignored. No C++ exception is allowed to leave this function because the
// caller may not be C++.
extern "C" __declspec(dllexport) void __stdcall SplitFile(char * strFileName,char* strDes)
{
    // Building a std::string from a null pointer is undefined behaviour.
    if (NULL == strFileName || NULL == strDes)
    {
        printf("SplitFile: file name and destination must not be NULL\n");
        return;
    }
    try
    {
        CFileProcess file;
        file.SplitFile(strFileName,strDes);
    }
    catch (...)
    {
        // Report instead of letting the exception cross the extern "C" boundary.
        printf("SplitFile: unexpected failure while splitting the file\n");
    }
}
// Put every member into a defined state, then allocate the package buffer
// (MAX_PACKAGE_SIZE bytes) and the 136-byte file-header buffer.
// As before, an allocation failure does not propagate out of the constructor;
// it is reported and both buffers are left NULL, so the destructor stays safe.
CFileProcess::CFileProcess(void)
    : m_buf(NULL)
    , m_pBuf(NULL)
    , m_packageArray(NULL)
    , m_FileSize(0)
    , m_dwBlockSize(0)
    , m_counter(0)
    , m_fileNo(0)
    , m_isChangeFile(true)
    , m_fileHeader(NULL)
    , m_bufSize(0)
{
    try
    {
        m_packageArray = new char[MAX_PACKAGE_SIZE];
        m_bufSize = MAX_PACKAGE_SIZE;
        m_fileHeader = new char[136];
        memset(m_fileHeader,0,136);
    }
    catch (...)
    {
        // Roll back a partial allocation so the members never hold garbage.
        delete [] m_packageArray;
        m_packageArray = NULL;
        m_bufSize = 0;
        m_fileHeader = NULL;
        printf("CFileProcess: failed to allocate working buffers\n");
    }
}
// Release the package buffer and the file-header buffer, then close the
// output stream if it is still open.
CFileProcess::~CFileProcess(void)
{
    // delete[] on a null pointer does nothing, so no guards are required.
    delete [] m_packageArray;
    m_packageArray = NULL;

    delete [] m_fileHeader;
    m_fileHeader = NULL;

    if (m_fout.is_open())
    {
        m_fout.close();
    }
}
// Split the file strFileName into smaller files written under strDes.
//
// The input is opened read-only and walked through a sequence of mapped
// views of at most 5000 allocation granules each. The first 136 bytes of the
// file are kept as the file header; the rest of every view is handed to
// ProcessFile(). Errors are reported with printf and end the operation; all
// handles and views acquired so far are released on every path.
void CFileProcess::SplitFile(string strFileName,string strDes)
{
    if (NULL == m_fileHeader || NULL == m_packageArray)
    {
        printf("SplitFile: working buffers are not available\n");
        return;
    }

    // Pure string work first, so that a failure here cannot leak a handle.
    string strFileNameNoSuffix = GetFileNameNoSuffix(strFileName, strDes);

    // Convert the file name and open the input file.
    wchar_t * wText = CByteExChange::AnsiToUnicode(strFileName.c_str());
    HANDLE hFile = ::CreateFile(wText,GENERIC_READ,FILE_SHARE_READ,
        NULL,OPEN_EXISTING,FILE_FLAG_RANDOM_ACCESS,NULL);
    // Capture the error code before anything else can overwrite it.
    const DWORD dwOpenError = (INVALID_HANDLE_VALUE == hFile) ? GetLastError() : 0;
    delete [] wText;
    if (INVALID_HANDLE_VALUE == hFile)
    {
        printf("CreateFile Failed,ErrorCode is %lu",dwOpenError);
        return;
    }

    // Total file size. A low part of INVALID_FILE_SIZE is an error only when
    // the last-error code says so.
    DWORD dwFileSizeHigh = 0;
    const DWORD dwFileSizeLow = GetFileSize(hFile, &dwFileSizeHigh);
    if (INVALID_FILE_SIZE == dwFileSizeLow && NO_ERROR != GetLastError())
    {
        printf("GetFileSize Failed,ErrorCode is %lu\n",GetLastError());
        ::CloseHandle(hFile);
        return;
    }
    __int64 qwFileSize = dwFileSizeLow;
    qwFileSize |= (((__int64)dwFileSizeHigh) << 32);
    if (qwFileSize < 136)
    {
        // Too short to even contain the file header.
        printf("SplitFile: file is smaller than the 136-byte file header\n");
        ::CloseHandle(hFile);
        return;
    }

    // Create the file-mapping object.
    HANDLE hFileMap = CreateFileMapping(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
    if (NULL == hFileMap)
    {
        printf("CreateFileMapping Failed,ErrorCode is %lu",GetLastError());
        ::CloseHandle(hFile);
        return;
    }

    // View offsets must be multiples of the allocation granularity, so the
    // view size is a whole number of granules.
    SYSTEM_INFO SysInfo;
    GetSystemInfo(&SysInfo);
    const DWORD dwGran = SysInfo.dwAllocationGranularity;
    const DWORD dwMaxViewBytes = 5000 * dwGran;

    __int64 qwFileOffset = 0;
    DWORD dwBlockBytes = dwMaxViewBytes;
    if (qwFileSize < dwMaxViewBytes)
    {
        dwBlockBytes = (DWORD)qwFileSize;
    }
    m_FileSize = dwBlockBytes;

    bool isFirstView = true;
    // Walk the file one view at a time.
    while(qwFileSize > 0)
    {
        char *pView = (char *)MapViewOfFile(hFileMap,FILE_MAP_READ,(DWORD)
            (qwFileOffset>>32),(DWORD)(qwFileOffset&0xFFFFFFFF),dwBlockBytes);
        if (NULL == pView)
        {
            printf("Create MapView failed, ErrorCode is %lu\n",GetLastError());
            break;
        }
        m_pBuf = pView;
        m_buf = pView;
        m_dwBlockSize = dwBlockBytes;

        if (isFirstView)
        {
            // The very first view starts with the file header.
            memcpy(m_fileHeader, m_buf, 136);
            m_buf += 136;
            m_dwBlockSize -= 136;
            isFirstView = false;
        }

        ProcessFile(dwBlockBytes, strFileNameNoSuffix);

        // UnmapViewOfFile needs the address MapViewOfFile returned, not the
        // cursor that ProcessFile has moved forward.
        UnmapViewOfFile(pView);
        m_buf = NULL;
        m_pBuf = NULL;

        // Advance to the next view; the last one may be shorter.
        qwFileOffset += dwBlockBytes;
        qwFileSize -= dwBlockBytes;
        if (qwFileSize > 0 && qwFileSize < dwBlockBytes)
        {
            dwBlockBytes = static_cast<DWORD>(qwFileSize);
        }
        m_FileSize = dwBlockBytes;
    }
    ::CloseHandle(hFileMap);
    ::CloseHandle(hFile);
}
// Build the common prefix of the output file names: strDes followed by the
// input's file name (including its leading '\') cut at the first ".bin".
// Example: ("F:\\in\\test4.bin", "F:\\out") -> "F:\\out\\test4".
// A file name without any '\' is treated as a bare name and a '\' is
// inserted, instead of throwing std::out_of_range from substr().
string CFileProcess::GetFileNameNoSuffix( string &strFileName, string strDes )
{
    const size_t sepPos = strFileName.rfind('\\');
    string temp = (string::npos == sepPos) ? ("\\" + strFileName)
                                           : strFileName.substr(sepPos);
    // When ".bin" is absent find() yields npos and substr keeps everything.
    const size_t extPos = temp.find(".bin");
    temp = temp.substr(0, extPos);
    return strDes + temp;
}
// Extract the packages contained in the current mapped view and append them
// to the output files.
//
// On entry m_buf points at the first unprocessed byte of the view and
// m_dwBlockSize holds the number of unprocessed bytes. A package is a 12-byte
// header (4 bytes 0xFF magic, 4 bytes sequence number, 4 bytes big-endian
// total size including the header) followed by its payload.
//
// A package may be cut by the view boundary:
//  * header cut: the bytes seen so far are kept in m_stfilepos.PackageHead
//    and m_stfilepos.NextPackageHeadCnt records how many are still missing;
//  * payload cut: the bytes seen so far are kept in m_packageArray and
//    m_stfilepos.NextFileFistCount records how many are still missing.
// The package is written only once it is complete, so it always lands in a
// single output file. A package that is still incomplete when the input ends
// is therefore not written.
//
// dwBlockBytes: size of the view; unused here because the caller has already
//               put the usable byte count into m_dwBlockSize.
// strFileName:  output path prefix passed on to GetSmallFileName().
void CFileProcess::ProcessFile( DWORD dwBlockBytes, string strFileName )
{
    (void)dwBlockBytes;
    const uint32 kHeadLen = 12;

    if (m_dwBlockSize <= 0)
    {
        return;
    }

    // True when a package started in an earlier view must be completed first.
    bool bResume = (m_stfilepos.NextFileFistCount > 0);

    // 1. Finish a header that was cut by the previous view boundary.
    if (m_stfilepos.NextPackageHeadCnt > 0)
    {
        const uint32 missing = static_cast<uint32>(m_stfilepos.NextPackageHeadCnt);
        const uint32 have = kHeadLen - missing;
        const uint32 take = (m_dwBlockSize < missing) ? static_cast<uint32>(m_dwBlockSize) : missing;
        // Append after the bytes that are already stored.
        memcpy(m_stfilepos.PackageHead + have, m_buf, take);
        m_buf += take;
        m_dwBlockSize -= take;
        m_stfilepos.NextPackageHeadCnt = static_cast<int32>(missing - take);
        if (m_stfilepos.NextPackageHeadCnt > 0)
        {
            return; // this view was too short to complete the header
        }

        // Same magic check that JudgePackageHeader() applies to whole headers.
        for (uint32 i = 0; i < 4; ++i)
        {
            if (static_cast<unsigned char>(m_stfilepos.PackageHead[i]) != 0xFF)
            {
                return;
            }
        }
        CByteReader headReader(m_stfilepos.PackageHead, kHeadLen);
        headReader.Skip(8); // skip magic and sequence number
        const uint32 total = headReader.GetInt();
        if (total < kHeadLen)
        {
            printf("Invalid package size %u\n", total);
            return;
        }
        // Resize before copying: ResizeBuffer() does not keep old contents.
        ResizeBuffer(total);
        memcpy(m_packageArray, m_stfilepos.PackageHead, kHeadLen);
        m_stfilepos.NextFileFistCount = total - kHeadLen;
        bResume = true;
    }

    // 2. Finish a package whose first part is already in m_packageArray.
    if (bResume)
    {
        // The stored part always contains the header, hence the total size.
        CByteReader sizeReader(m_packageArray, kHeadLen);
        sizeReader.Skip(8);
        const uint32 total = sizeReader.GetInt();
        const uint32 missing = m_stfilepos.NextFileFistCount;
        const uint32 take = (m_dwBlockSize < missing) ? static_cast<uint32>(m_dwBlockSize) : missing;
        memcpy(m_packageArray + (total - missing), m_buf, take);
        m_buf += take;
        m_dwBlockSize -= take;
        m_stfilepos.NextFileFistCount = missing - take;
        if (m_stfilepos.NextFileFistCount > 0)
        {
            return; // the package continues in yet another view
        }
        string strName = GetSmallFileName(strFileName);
        WriteContentToFile(strName, total);
    }

    // 3. Packages that start inside this view.
    while (m_dwBlockSize > 0)
    {
        // Fewer than 12 bytes left: keep them until the next view arrives.
        if (m_dwBlockSize < kHeadLen)
        {
            const uint32 have = static_cast<uint32>(m_dwBlockSize);
            memcpy(m_stfilepos.PackageHead, m_buf, have);
            m_stfilepos.NextPackageHeadCnt = static_cast<int32>(kHeadLen - have);
            m_buf += have;
            m_dwBlockSize = 0;
            break;
        }

        // Checks the 4 magic bytes and moves m_buf past them.
        if (!JudgePackageHeader())
        {
            break;
        }
        CByteReader sizeReader(m_buf, 8);
        sizeReader.Skip(4); // skip the sequence number
        const uint32 total = sizeReader.GetInt();
        m_buf -= 4; // back to the first byte of the package

        // A size below the header length is corrupt and, being possibly 0,
        // would keep the loop from ever advancing.
        if (total < kHeadLen)
        {
            printf("Invalid package size %u\n", total);
            break;
        }

        ResizeBuffer(total);
        if (total > m_dwBlockSize)
        {
            // Only the first part is in this view: keep it for the next call.
            const size_t avail = static_cast<size_t>(m_dwBlockSize);
            memcpy(m_packageArray, m_buf, avail);
            m_stfilepos.NextFileFistCount = static_cast<uint32>(total - avail);
            m_buf += avail;
            m_dwBlockSize = 0;
            break;
        }

        memcpy(m_packageArray, m_buf, total);
        m_buf += total;
        m_dwBlockSize -= total;
        string strName = GetSmallFileName(strFileName);
        WriteContentToFile(strName, total);
    }
}
// Consume the four bytes at m_buf and report whether all of them are 0xFF.
// m_buf is moved forward by four in either case.
bool CFileProcess::JudgePackageHeader()
{
    unsigned char magic[4] = {0};
    memcpy(magic, m_buf, sizeof(magic));
    m_buf += sizeof(magic);

    for (size_t i = 0; i < sizeof(magic); ++i)
    {
        if (magic[i] != 0xFF)
        {
            return false;
        }
    }
    return true;
}
// Append the first CountUint bytes of m_packageArray to the current output
// file.
//
// strFileName: name of the output file; used only when a new file has to be
//              opened (package counter is 0 and no file is open).
// CountUint:   number of bytes to write.
// bFlag:       true when the bytes are a continuation of a package that has
//              already been counted, so the package counter is not advanced.
//
// A new file starts with the 136-byte file header. After PACKAGE_NUM counted
// packages the file is closed and the next file number is selected.
void CFileProcess::WriteContentToFile(string & strFileName,size_t CountUint,bool bFlag)
{
    // Open a new file only when none is open: calling open() on an open
    // stream fails and leaves the stream unusable for further writes.
    if (0 == m_counter && !m_fout.is_open())
    {
        // Drop error flags left over from the previous file.
        m_fout.clear();
        m_fout.open(strFileName.c_str(), ios::out|ios::binary);
        // A failed open() sets failbit, not badbit, so test is_open().
        if (m_fout.is_open())
        {
            m_fout.write(m_fileHeader,136);
        }
        else
        {
            printf("Open output file failed: %s\n", strFileName.c_str());
        }
    }
    if (m_fout.is_open())
    {
        m_fout.write(m_packageArray,static_cast<std::streamsize>(CountUint));
        m_fout.flush();
    }
    // Counting continues even when the file could not be opened, so that the
    // rotation to the next output file still takes place.
    if (!bFlag)
    {
        m_counter += 1;
    }
    if (PACKAGE_NUM == m_counter)
    {
        m_counter = 0;
        m_fileNo += 1;
        m_isChangeFile = true;
        if (m_fout.is_open())
        {
            m_fout.close();
        }
    }
}
// Return the name of the next output file, "<strFileName>_NNNN.bin" with the
// current file number zero-padded to at least four digits, and clear the
// "new file needed" flag.
// Returns an empty string when no new file is due (the flag is not set).
string CFileProcess::GetSmallFileName(string & strFileName )
{
    string strName("");
    if (m_isChangeFile)
    {
        strName = strFileName;
        // Large enough for '_' plus any unsigned int. m_fileNo is a size_t,
        // so it is converted explicitly to match the %u conversion.
        char temp[32] = {'\0'};
        sprintf_s(temp,"_%04u",static_cast<unsigned int>(m_fileNo));
        strName += temp;
        strName += ".bin";
        m_isChangeFile = false;
    }
    return strName;
}
// Make sure m_packageArray can hold at least Count bytes.
// The buffer only ever grows. Its previous contents are NOT preserved when
// it is reallocated, so call this before filling the buffer.
void CFileProcess::ResizeBuffer( size_t Count )
{
    if(Count > m_bufSize)
    {
        // Allocate first: if new throws, the old buffer and its recorded
        // size are still valid and nothing is left dangling.
        char *pNew = new char[Count];
        delete [] m_packageArray;
        m_packageArray = pNew;
        m_bufSize = Count;
    }
}
//Interface.h
# ifndef __INCLUDE_INTERFACE_H__
# define __INCLUDE_INTERFACE_H__
#include "FileProcess.h"
/************************************************************************/
/* Purpose: split one large file into several smaller files.
/* strFileName[in]: input file name, e.g. F:\\上海项目\\test4\\test4.bin
/* strDes[in]: directory that receives the split files, e.g. F:\\上海项目
/************************************************************************/
extern "C" __declspec(dllexport) void __stdcall SplitFile(char * strFileName,char* strDes);
# endif