0x01 pdf文件
关于PDF的完整文件格式,Adobe公司已经发布了公开的规范文档,网上也有大量相关资料可以查阅。
比如文件格式 :
http://www.2cto.com/Article/201011/77380.html
此外,网上也有封装好接口的开源库(如下面的 PDFlib Lite),其代码支持目前的主流平台。
开源代码 :
windows http://www.pdflib.com/binaries/PDFlib/705/PDFlib-Lite-7.0.5p3.zip
linux http://www.pdflib.com/binaries/PDFlib/705/PDFlib-Lite-7.0.5p3.tar.gz
mac os http://www.pdflib.com/binaries/PDFlib/705/PDFlib-Lite-7.0.5p3.dmg
0x02 格式解析
Python脚本 pdf-parser(国外牛人写的)
作者:Didier Stevens(https://DidierStevens.com)
#!/usr/bin/python
# Script metadata.
__description__ = 'pdf-parser, use it to parse a PDF document'
__author__ = 'Didier Stevens'
__version__ = '0.6.5'
__date__ = '2016/07/27'
# Oldest and newest interpreter versions, as (major, minor, micro) tuples.
# NOTE(review): presumably compared against sys.version_info by a version
# check elsewhere in the file - confirm.
__minimum_python_version__ = (2, 5, 1)
__maximum_python_version__ = (3, 4, 3)
"""
Source code put in public domain by Didier Stevens, no Copyright
https://DidierStevens.com
Use at your own risk
History:
2008/05/02: continue
2008/05/03: continue
2008/06/02: streams
2008/10/19: refactor, grep & extract functionality
2008/10/20: reference
2008/10/21: cleanup
2008/11/12: V0.3 dictionary parser
2008/11/13: option elements
2008/11/14: continue
2009/05/05: added /ASCIIHexDecode support (thanks Justin Prosco)
2009/05/11: V0.3.1 updated usage, added --verbose and --extract
2009/07/16: V0.3.2 Added Canonicalize (thanks Justin Prosco)
2009/07/18: bugfix EqualCanonical
2009/07/24: V0.3.3 Added --hash option
2009/07/25: EqualCanonical for option --type, added option --nocanonicalizedoutput
2009/07/28: V0.3.4 Added ASCII85Decode support
2009/08/01: V0.3.5 Updated ASCIIHexDecode to support whitespace obfuscation
2009/08/30: V0.3.6 TestPythonVersion
2010/01/08: V0.3.7 Added RLE and LZW support (thanks pARODY); added dump option
2010/01/09: Fixed parsing of incomplete startxref
2010/09/22: V0.3.8 Changed dump option, updated PrettyPrint, added debug option
2011/12/17: fixed bugs empty objects
2012/03/11: V0.3.9 fixed bugs double nested [] in PrettyPrintSub (thanks kurt)
2013/01/11: V0.3.10 Extract and dump bug fixes by Priit; added content option
2013/02/16: Performance improvement in cPDFTokenizer by using StringIO for token building by Christophe Vandeplas; xrange replaced with range
2013/02/16: V0.4.0 added http/https support; added error handling for missing file or URL; ; added support for ZIP file with password 'infected'
2013/03/13: V0.4.1 fixes for Python 3
2013/04/11: V0.4.2 modified PrettyPrintSub for strings with unprintable characters
2013/05/04: Added options searchstream, unfiltered, casesensitive, regex
2013/09/18: V0.4.3 fixed regression bug -w option
2014/09/25: V0.5.0 added option -g
2014/09/29: Added PrintGenerateObject and PrintOutputObject
2014/12/05: V0.6.0 Added YARA support
2014/12/09: cleanup, refactoring
2014/12/13: Python 3 fixes
2015/01/11: Added support for multiple YARA rule files; added request to search in trailer
2015/01/31: V0.6.1 Added optionyarastrings
2015/02/09: Added decoders
2015/04/05: V0.6.2 Added generateembedded
2015/04/06: fixed bug reported by Kurt for stream produced by Ghostscript where endstream is not preceded by whitespace; fixed prettyprint bug
2015/04/24: V0.6.3 when option dump's filename is -, content is dumped to stdout
2015/08/12: V0.6.4 option hash now also calculates hashes of streams when selecting or searching objects; and displays hexasciidump first line
2016/07/27: V0.6.5 bugfix whitespace 0x00 0x0C after stream 0x0D 0x0A reported by @mr_me
Todo:
- handle printf todo
- support for JS hex string EC61C64349DB8D88AF0523C4C06E0F4D.pdf.vir
"""
import re
import optparse
import zlib
import binascii
import hashlib
import sys
import zipfile
import time
import os
if sys.version_info[0] >= 3:
from io import StringIO
import urllib.request
urllib23 = urllib.request
else:
from cStringIO import StringIO
import urllib2
urllib23 = urllib2
try:
import yara
except:
pass
# Character classes: returned by CharacterClass() and stored as the first
# element of every token tuple produced by the tokenizer.
CHAR_WHITESPACE, CHAR_DELIMITER, CHAR_REGULAR = 1, 2, 3

# Parser contexts: the section of the document the parser is currently inside.
CONTEXT_NONE, CONTEXT_OBJ, CONTEXT_XREF, CONTEXT_TRAILER = 1, 2, 3, 4

# Type tags for the top-level PDF elements.
# NOTE(review): presumably stored on the cPDFElement* classes - confirm.
(
    PDF_ELEMENT_COMMENT,
    PDF_ELEMENT_INDIRECT_OBJECT,
    PDF_ELEMENT_XREF,
    PDF_ELEMENT_TRAILER,
    PDF_ELEMENT_STARTXREF,
    PDF_ELEMENT_MALFORMED,
) = (1, 2, 3, 4, 5, 6)

# NOTE(review): presumably the number of bytes shown per line of a hex/ASCII
# dump - confirm against the dump helpers.
dumplinelength = 16
# Convert 2 Bytes If Python 3
def C2BIP3(string):
    """Return `string` as a bytes object on Python 3; unchanged on Python 2.

    On Python 3 every character is mapped to the byte with the same ordinal,
    so characters above 0xFF raise ValueError. A value that is already bytes
    is returned as is, which makes the helper safe to apply twice.
    """
    if sys.version_info[0] > 2:
        # Iterating bytes yields ints, which ord() rejects with TypeError,
        # so already-converted input must be passed through untouched.
        if isinstance(string, bytes):
            return string
        return bytes([ord(character) for character in string])
    return string
# CIC: Call If Callable
def CIC(expression):
    """Return the result of calling `expression` when it is callable.

    Any non-callable value is handed back unchanged.
    """
    return expression() if callable(expression) else expression
# IFF: IF Function
def IFF(expression, valueTrue, valueFalse):
    """Select `valueTrue` or `valueFalse` depending on the truth of `expression`.

    The selected value is passed through CIC(), so a callable is invoked and
    its result returned; the value that was not selected is never called.
    """
    selected = valueTrue if expression else valueFalse
    return CIC(selected)
def Timestamp(epoch=None):
    """Format a point in time as 'YYYYMMDD-HHMMSS' using the local timezone.

    `epoch` is a number of seconds since the epoch; when it is None the
    current time is used.
    """
    moment = time.localtime() if epoch is None else time.localtime(epoch)
    year, month, day, hour, minute, second = moment[:6]
    return '%04d%02d%02d-%02d%02d%02d' % (year, month, day, hour, minute, second)
def CopyWithoutWhiteSpace(content):
    """Return a new list holding the tokens of `content` that are not whitespace.

    Each token is a sequence whose first element is its character class;
    tokens classed CHAR_WHITESPACE are left out and the order is preserved.
    """
    return [token for token in content if token[0] != CHAR_WHITESPACE]
def Obj2Str(content):
    """Join the text of all non-whitespace tokens of `content` into one string.

    Each token's text is rendered with repr() and the first and last character
    of that rendering (the quotes) are dropped before concatenation.
    """
    pieces = []
    for token in CopyWithoutWhiteSpace(content):
        quoted = repr(token[1])
        pieces.append(quoted[1:-1])
    return ''.join(pieces)
class cPDFDocument:
    """Byte-at-a-time reader over a PDF source, with push-back support.

    `file` may be an http:// or https:// URL, the path of a ZIP archive (its
    first member is opened with the password 'infected'), or the path of a
    plain file. `position` is the offset of the byte most recently handed
    out by byte(), starting at -1.
    """

    # Set to True on the instance once the underlying stream has been
    # exhausted and closed.
    _eof = False

    def __init__(self, file):
        """Open `file` for reading.

        If the source cannot be opened, an error message and the exception
        are printed and the process exits through sys.exit().
        """
        self.file = file
        lowered = file.lower()
        if lowered.startswith(('http://', 'https://')):
            try:
                # urlopen() only accepts the timeout keyword from Python 2.6 on.
                if sys.hexversion >= 0x020601F0:
                    self.infile = urllib23.urlopen(file, timeout=5)
                else:
                    self.infile = urllib23.urlopen(file)
            except urllib23.URLError:
                # URLError is the base class of HTTPError, so this also
                # reports connection-level failures (DNS, refused connection)
                # instead of letting them escape as a traceback.
                print('Error accessing URL %s' % file)
                print(sys.exc_info()[1])
                sys.exit()
        elif lowered.endswith('.zip'):
            try:
                self.zipfile = zipfile.ZipFile(file, 'r')
                self.infile = self.zipfile.open(self.zipfile.infolist()[0], 'r', C2BIP3('infected'))
            except Exception:
                # Exception rather than a bare except, so KeyboardInterrupt
                # and SystemExit are not reported as file errors.
                print('Error opening file %s' % file)
                print(sys.exc_info()[1])
                sys.exit()
        else:
            try:
                self.infile = open(file, 'rb')
            except Exception:
                print('Error opening file %s' % file)
                print(sys.exc_info()[1])
                sys.exit()
        self.ungetted = []
        self.position = -1

    def byte(self):
        """Return the next byte as an int, or None at end of input.

        Bytes pushed back with unget() are returned first, most recent first.
        When the stream is exhausted it is closed; further calls keep
        returning None instead of reading from the closed stream.
        """
        if self.ungetted:
            self.position += 1
            return self.ungetted.pop()
        if self._eof:
            return None
        inbyte = self.infile.read(1)
        if not inbyte:
            self.infile.close()
            self._eof = True
            return None
        self.position += 1
        return ord(inbyte)

    def unget(self, byte):
        """Push `byte` back so that the next call to byte() returns it."""
        self.position -= 1
        self.ungetted.append(byte)
def CharacterClass(byte):
    """Classify a byte value as whitespace, delimiter or regular character.

    Returns CHAR_WHITESPACE, CHAR_DELIMITER or CHAR_REGULAR.
    """
    # NUL, TAB, LF, FF, CR, SPACE
    whitespace = (0, 9, 10, 12, 13, 32)
    # ( ) < > [ ] { } / %
    delimiters = (0x28, 0x29, 0x3C, 0x3E, 0x5B, 0x5D, 0x7B, 0x7D, 0x2F, 0x25)
    if byte in whitespace:
        return CHAR_WHITESPACE
    if byte in delimiters:
        return CHAR_DELIMITER
    return CHAR_REGULAR
def IsNumeric(str):
    """Tell whether `str` consists solely of one or more ASCII digits.

    Returns the match object (truthy) when it does and None otherwise, so the
    result can be used directly in a condition.
    """
    # re.match anchors at the start; \Z anchors at the very end, so input with
    # a digit prefix only ('12abc') or a trailing newline is rejected.
    return re.match(r'[0-9]+\Z', str)
class cPDFTokenizer:
    """Turn the bytes of a PDF source into (character class, text) tokens."""

    def __init__(self, file):
        """Open `file` through cPDFDocument and start with no pushed-back tokens."""
        self.oPDF = cPDFDocument(file)
        self.ungetted = []

    def _ReleaseLookahead(self):
        """Return the look-ahead byte to the document, or drop the document at end of input."""
        if self.byte is None:
            self.oPDF = None
        else:
            self.oPDF.unget(self.byte)

    def _ReadRun(self, characterClass):
        """Collect consecutive bytes of `characterClass`, starting with the current byte."""
        buffer = StringIO()
        while self.byte is not None and CharacterClass(self.byte) == characterClass:
            buffer.write(chr(self.byte))
            self.byte = self.oPDF.byte()
        self._ReleaseLookahead()
        self.token = buffer.getvalue()
        return (characterClass, self.token)

    def _ReadAngleBracket(self):
        """Produce '<' / '>' or, when the same byte follows directly, '<<' / '>>'."""
        first = self.byte
        symbol = chr(first)
        self.byte = self.oPDF.byte()
        if self.byte == first:
            return (CHAR_DELIMITER, symbol * 2)
        # Whatever was read instead (None at end of input included) is pushed
        # back so the next call to Token() sees it again.
        self.oPDF.unget(self.byte)
        return (CHAR_DELIMITER, symbol)

    def _ReadComment(self):
        """Collect a '%' comment up to and including its line terminator."""
        buffer = StringIO()
        while self.byte is not None:
            buffer.write(chr(self.byte))
            atEndOfLine = self.byte in (10, 13)
            self.byte = self.oPDF.byte()
            if atEndOfLine:
                break
        if self.byte is None:
            self.oPDF = None
        elif self.byte == 10:
            # A line feed directly after the terminator belongs to the comment.
            buffer.write(chr(self.byte))
        else:
            self.oPDF.unget(self.byte)
        self.token = buffer.getvalue()
        return (CHAR_DELIMITER, self.token)

    def Token(self):
        """Return the next token as a (character class, text) tuple.

        Tokens handed back through unget() are returned first. None is
        returned once the input is exhausted.
        """
        if self.ungetted:
            return self.ungetted.pop()
        if self.oPDF is None:
            return None
        self.byte = self.oPDF.byte()
        if self.byte is None:
            self.oPDF = None
            return None
        characterClass = CharacterClass(self.byte)
        if characterClass in (CHAR_WHITESPACE, CHAR_REGULAR):
            return self._ReadRun(characterClass)
        if self.byte in (0x3C, 0x3E):
            return self._ReadAngleBracket()
        if self.byte == 0x25:
            return self._ReadComment()
        # Every other delimiter is a single-character token.
        return (CHAR_DELIMITER, chr(self.byte))

    def TokenIgnoreWhiteSpace(self):
        """Return the next token that is not whitespace, or None at end of input."""
        while True:
            token = self.Token()
            if token is None or token[0] != CHAR_WHITESPACE:
                return token

    def unget(self, byte):
        """Push a token back; despite its name, `byte` is what Token() returns next."""
        self.ungetted.append(byte)
class cPDFParser:
def __init__(self, file, verbose=False, extract=None):
self.context = CONTEXT_NONE
self.content = []
self.oPDFTokenizer = cPDFTokenizer(file)
self.verbose = verbose
self.extract = extract
def GetObject(self):
while True:
if self.context == CONTEXT_OBJ:
self.token = self.oPDFTokenizer.Token()
else:
self.token = self.oPDFTokenizer.TokenIgnoreWhiteSpace()
if self.token:
if self.token[0] == CHAR_DELIMITER:
if self.token[1][0] == '%':
if self.context == CONTEXT_OBJ:
self.content.append(self.token)
else:
return cPDFElementComment(self.token[1])
elif self.token[1] == '/':
self.token2 = self.oPDFTokenizer.Token()
if self.token2[0] == CHAR_REGULAR:
if self.context != CONTEXT_NONE:
self.content.append((CHAR_DELIMITER, self.token[1] + self.token2[1]))
elif self.verbose:
print('todo 1: %s' % (self.token[1] + self.token2[1]))
else:
self.oPDFTokenizer.unget(self.token2)
if self.context != CONTEXT_NONE:
self.content.append(self.token)
elif self.verbose:
print('todo 2: %d %s' % (self.token[0], repr(self.token[1])))
elif self.context != CONTEXT_NONE:
self.content.append(self.token)
elif self.verbose:
print('todo 3: %d %s' % (self.token[0], repr(self.token[1])))
elif self.token[0] == CHAR_WHITESPACE:
if self.context != CONTEXT_NONE:
self.content.append(self.token)
elif self.verbose:
print('todo 4: %d %s' % (self.token[0], repr(self.token[1])))
else:
if self.context == CONTEXT_OBJ:
if self.token[1] == 'endobj':
self.oPDFElementIndirectObject = cPDFElementIndirectObject(self.objectId, self.objectVersion, self.content)
self.context = CONTEXT_NONE
self.content = []
return self.oPDFElementIndirectObject
else:
self.content.append(self.token)
elif self.context == CONTEXT_TRAILER:
if self.token[1] == 'startxref' or self.token[1] == 'xref':
self.oPDFElementTrailer = cPDFElementTrailer(self.content)
self.oPDFTokenizer.unget(self.token)
self.context = CONTEXT_NONE
self.content = []
return self.oPDFElementTrailer
else:
self.content.append(self.token)
elif self.context == CONTEXT_XREF: