一步一步教你写pdf文件

最新推荐文章于 2024-04-18 18:50:56 发布

砖厂码农

最新推荐文章于 2024-04-18 18:50:56 发布

阅读量1.8k

点赞数

分类专栏：杂七杂八文章标签： C++CLI PDF pdf 交叉引用表

本文链接：https://blog.csdn.net/luo222/article/details/8262648

版权

杂七杂八专栏收录该内容

9 篇文章 0 订阅

订阅专栏

一步一步教你写PDF文件

PDF作为一种跨平台的文件格式，越来越受到用户的欢迎。现在除了adobe官方提供的pdflib之外，还有很多第三方的库可以实现PDF文件的创建、修改和格式转换。PDF文档采用的是二进制和文本混排的方式。近期因项目需要，对pdf文件的结构做了一些研究。最终，领导决定采用第三方库，这些研究没能用得上。在此跟大家分享一下，算是抛砖引玉。关于pdf文件的结构，网上有很多介绍，在此不再赘述，请参看相关资料。

pdf文件可以包含文本、图片、视频、动画、3D数据等，本人就拿最简单的文本举例。那么我们就从“Hello, world!”开始吧。本例截图如下：

由于pdf文件的解析中需要依靠交叉引用表，交叉引用表记录的是每一个obj相对于文件始点的位置（以后称“地址”），我们可以写一个函数，求出当前文件的位置。

// Reports the current position of pFile as a byte offset from the start of
// the file. The PDF writer uses this value as an object's "address" in the
// cross-reference table.
//
// pFile: an open stdio stream.
// Returns: the value produced by ftell() for the stream.
long getObjLocation(FILE* pFile)
{
    // Zero-distance seek relative to the current position; its status is
    // intentionally not inspected.
    (void)fseek(pFile, 0, SEEK_CUR);

    const long offset = ftell(pFile);
    return offset;
}

随着ISO标准的变化，pdf的文件格式也有一些调整。文件开始的地方标明文件对应的版本格式。

// Emits the PDF version header line and records where the next object will
// begin.
//
// pFile:       output stream the header is written to.
// pdfLocation: receives the file offset reached after the header, i.e. the
//              position at which the following object starts.
void writeHeader(FILE* pFile,vector<long>& pdfLocation)
{
    static const char header[] = "%PDF-1.7\n\n";
    fwrite(header, 1, strlen(header), pFile);

    const long nextObjOffset = getObjLocation(pFile);
    pdfLocation.push_back(nextObjOffset);
}

解析pdf文件时，首先会从入口点开始。

// Writes object 1, the document catalog (the entry point a reader starts
// from), and records where the next object will begin.
//
// pFile:       output stream the object is written to.
// pdfLocation: receives the file offset reached after this object.
void writeCatalog(FILE* pFile,vector<long>& pdfLocation)
{
    // A single literal is both measured and written, so the byte count can
    // never drift away from the data actually emitted.
    static const char kCatalog[] =
        "1 0 obj % entry point\n<<\n /Type /Catalog\n /Pages 3 0 R\n>>\nendobj\n\n";

    // fwrite takes (buffer, element size, element count, stream); the element
    // size of 1 makes the count a byte count.
    fwrite(kCatalog, 1, sizeof(kCatalog) - 1, pFile);
    pdfLocation.push_back(getObjLocation(pFile));
}

pdf可以有很多页，每一页的页面大小、内容可以各不相同，本例中只有一页

void writePages(FILE* pFile,vector<long>& pdfLocation)
{
fwrite("3 0 obj\n<<\n /Type /Pages\n /MediaBox [ 0 0 200 200 ]\n

/Count 1\n /Kids [ 4 0 R ]\n>>\nendobj\n\n",

strlen("3 0 obj\n<<\n /Type /Pages\n /MediaBox [ 0 0 200 200 ]\n

/Count 1\n /Kids [ 4 0 R ]\n>>\nendobj\n\n"),pFile);
pdfLocation.push_back(currentLocation(pFile));
}

创建page。

// Writes object 4, the single page. It points back to its parent page tree
// (object 3), names font resource /F1 (object 5) and references the content
// stream (object 6). Afterwards the offset of the next object is recorded.
//
// pFile:       output stream the object is written to.
// pdfLocation: receives the file offset reached after this object.
void writePage(FILE* pFile,vector<long>& pdfLocation)
{
    // The text is defined once so that the data written and the length passed
    // to fwrite always describe the same bytes. /Parent refers to object 3,
    // the number used by the page-tree object.
    static const char kPage[] =
        "4 0 obj\n<<\n /Type /Page\n /Parent 3 0 R\n /Resources <<\n/Font <<"
        "\n/F1 5 0 R \n>>\n >>\n /Contents 6 0 R\n>>\nendobj\n\n";

    fwrite(kPage, 1, sizeof(kPage) - 1, pFile);
    pdfLocation.push_back(getObjLocation(pFile));
}

写入文本内容。

// Writes object 6, the page content stream that draws "Hello, world!" with
// font /F1 at 12pt, and records where the next item in the file begins.
//
// pFile:       output stream the object is written to.
// pdfLocation: receives the file offset reached after this object.
void writeContents(FILE* pFile,vector<long>& pdfLocation)
{
    // The stream payload is defined exactly once; /Length is derived from
    // this very buffer so the declared length always matches what is written.
    // "Tj" is the show-text operator; without it the string is never painted.
    static const char kStream[] =
        "BT\n70 50 TD\n/F1 12 Tf\n(Hello, world!) Tj\nET";
    const unsigned long streamLength =
        static_cast<unsigned long>(sizeof(kStream) - 1);

    // The newline after "stream" and the one before "endstream" delimit the
    // payload and are not counted in /Length. "%%" emits a literal '%', which
    // starts a PDF comment.
    fprintf(pFile,
            "6 0 obj %% page content\n<<\n /Length %lu\n>>\n"
            "stream\n%s\nendstream\nendobj\n\n",
            streamLength,
            kStream);

    pdfLocation.push_back(getObjLocation(pFile));
}

定义字体。

// Writes object 5, a Type1 font dictionary selecting the Times-Roman base
// font, and records where the next object begins.
//
// pFile:       output stream the object is written to.
// pdfLocation: receives the file offset reached after this object.
void writeFont(FILE* pFile,vector<long>& pdfLocation)
{
    // A single literal is measured and written. The base font name must be
    // the one token "/Times-Roman"; a space inside it would end the name
    // early.
    static const char kFont[] =
        "5 0 obj\n<<\n /Type /Font\n /Subtype /Type1\n /BaseFont /Times-Roman\n>>\nendobj\n\n";

    fwrite(kFont, 1, sizeof(kFont) - 1, pFile);
    pdfLocation.push_back(getObjLocation(pFile));
}

       创建交叉引用表。习惯上从第0个obj开始，第0个不存在，标记为删除。根据情况交叉引用表可以不止一个。
// Forward declaration: the per-entry helper is defined later in the file.
void writeObjXref(FILE* pFile,long objRef);

// Writes the cross-reference section: the "xref" keyword, a single subsection
// header starting at object 0, the free entry for object 0, and one in-use
// entry per recorded object offset.
//
// pFile:       output stream the table is written to.
// pdfLocation: recorded offsets. Every element except the last is emitted as
//              an object entry, in order; the last element is skipped here
//              (writePdf stores the offset of the xref section itself there).
//
// NOTE(review): entries in one subsection describe consecutively numbered
// objects, so the i-th offset is reported as object i+1 -- confirm that the
// object numbers used by the writers match this.
void writeXref(FILE* pFile,const vector<long> pdfLocation)
{
    // Guard the subtraction: on an empty vector size() - 1 would wrap around.
    const size_t objectCount = pdfLocation.empty() ? 0 : pdfLocation.size() - 1;

    // Subsection header: first object number and entry count (the object
    // entries plus the free entry for object 0). The size_t is converted
    // explicitly so it matches the %lu conversion.
    fprintf(pFile, "xref\n0 %lu\n", static_cast<unsigned long>(objectCount + 1));

    // Every xref entry is exactly 20 bytes including its end-of-line marker,
    // hence the space before '\n'.
    fputs("0000000000 65535 f \n", pFile);

    for (size_t i = 0; i < objectCount; ++i)
    {
        writeObjXref(pFile, pdfLocation[i]);
    }
}

// Writes the cross-reference entry for a single in-use object.
//
// pFile:  output stream the entry is written to.
// objRef: byte offset of the object from the start of the file.
//
// The entry is a 10-digit zero-padded offset, a 5-digit generation number
// (always 00000 here), the keyword 'n' and a two-byte end-of-line, for 20
// bytes in total when the offset is non-negative and fits in 10 digits.
void writeObjXref(FILE* pFile,long objRef)
{
    // %ld matches the long argument; no temporary buffer is required.
    fprintf(pFile, "%010ld 00000 n \n", objRef);
}

文件尾的写法跟版本的关系不大，指明obj的个数、入口点、交叉引用表的地址。

// Writes the file trailer: the trailer dictionary (/Size and /Root), the
// "startxref" keyword followed by the offset of the cross-reference section,
// and the "%%EOF" end-of-file marker.
//
// pFile:       output stream the trailer is written to.
// pdfLocation: recorded offsets; its element count is written as /Size and
//              its last element as the startxref offset. When it is empty
//              there is no offset to report, so nothing is written.
void writeTraile(FILE* pFile,const vector<long> pdfLocation)
{
    if (pdfLocation.empty())
    {
        return;
    }

    // The size_t is converted explicitly so that every variadic argument
    // matches its conversion specifier. "%%%%" produces the two literal '%'
    // characters of "%%EOF".
    fprintf(pFile,
            "trailer\n<</Size %lu/Root 1 0 R>>\nstartxref\n%ld\n%%%%EOF\n",
            static_cast<unsigned long>(pdfLocation.size()),
            pdfLocation.back());
}

最后调用以上函数，创建一个完整的pdf文件。大家可以用记事本打开生成的pdf文件，可以看到pdf文件的组织形式。

// Creates a minimal one-page "Hello, world!" PDF at the given path by calling
// the individual section writers in file order: header, catalog, page tree,
// page, font, content stream, cross-reference table, trailer.
//
// filename: path of the file to create; it is opened in binary write mode.
//
// If the file cannot be opened the function returns without writing anything.
//
// NOTE(review): the object literals number the objects 1, 3, 4, 5 and 6,
// while the cross-reference table is written as one consecutive subsection
// starting at 0 -- confirm the numbering lines up with the recorded offsets.
void writePdf(const char *filename)
{
    FILE* pdfFile = NULL;

    // Bail out instead of handing a NULL stream to fwrite/fclose.
    if (fopen_s(&pdfFile, filename, "wb") != 0 || pdfFile == NULL)
    {
        return;
    }

    // Holds, in order, the start offset of each object and finally the offset
    // of the cross-reference section.
    vector<long> pdfLocation;

    writeHeader(pdfFile, pdfLocation);
    writeCatalog(pdfFile, pdfLocation);
    writePages(pdfFile, pdfLocation);
    writePage(pdfFile, pdfLocation);
    writeFont(pdfFile, pdfLocation);
    writeContents(pdfFile, pdfLocation);
    writeXref(pdfFile, pdfLocation);
    writeTraile(pdfFile, pdfLocation);

    fclose(pdfFile);
    pdfFile = NULL;
}

目前市面上已经有很多成熟的第三方库，基本能满足大家的需求。鄙人在此抛砖引玉，只是出于帮助大家一起理解pdf文件结构的目的，不鼓励重复造轮子。匆忙写就，如有不当之处，欢迎拍砖。

ps：本文采用C++/CLI 托管代码，简单修改可用于C#。为了排版需要，代码中插入了很多空格，请不要简单拷贝运行。