"""Guess the MIME type of a file.
This module defines two useful functions:
guess_type(url, strict=1) -- guess the MIME type and encoding of a URL.
guess_extension(type, strict=1) -- guess the extension for a given MIME type.
It also contains the following, for tuning the behavior:
Data:
knownfiles -- list of files to parse
inited -- flag set when init() has been called
suffix_map -- dictionary mapping suffixes to suffixes
encodings_map -- dictionary mapping suffixes to encodings
types_map -- dictionary mapping suffixes to types
Functions:
init([files]) -- parse a list of files, default knownfiles (on Windows, the
default values are taken from the registry)
read_mime_types(file) -- parse one file, return a dictionary or None
"""
from itertools importcount
importos
importsys
importposixpath
importurllib
try:
import_winreg
except ImportError:
_winreg = None
# Names exported by ``from mimetypes import *``.
__all__ = [
    "guess_type",
    "guess_extension",
    "guess_all_extensions",
    "add_type",
    "read_mime_types",
    "init",
]
# Candidate mime.types files, in the order init() tries them.
# NOTE: "/usr/local/etc/httpd/conf/mime.types" is listed twice; the list is
# kept exactly as-is because it is part of the module's public data.
knownfiles = [
    "/etc/mime.types",
    "/etc/httpd/mime.types",                    # Mac OS X
    "/etc/httpd/conf/mime.types",               # Apache
    "/etc/apache/mime.types",                   # Apache 1
    "/etc/apache2/mime.types",                  # Apache 2
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
]

# Set to True by init() before it builds the database.
inited = False

# Shared MimeTypes instance used by the module-level functions; created by
# init().
_db = None
class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Instance attributes:
      encodings_map -- suffix -> encoding name (copy of the module table)
      suffix_map    -- suffix -> replacement suffix (copy of the module table)
      types_map     -- pair of dicts (non-strict, strict), extension -> type
      types_map_inv -- pair of dicts (non-strict, strict), type -> [extensions]

    The pairs are indexed with the boolean `strict` flag, so index True
    selects the standard types and index False the non-standard ones.
    """

    def __init__(self, filenames=(), strict=True):
        """Create a datastore seeded from the module-level default tables.

        `filenames` is a sequence of mime.types-style files that are read
        (via read()) after the defaults have been loaded; `strict` is
        passed through to read() for each of them.
        """
        if not inited:
            init()
        self.encodings_map = encodings_map.copy()
        self.suffix_map = suffix_map.copy()
        self.types_map = ({}, {})  # dict for (non-strict, strict)
        self.types_map_inv = ({}, {})
        for ext, mimetype in types_map.items():
            self.add_type(mimetype, ext, True)
        for ext, mimetype in common_types.items():
            self.add_type(mimetype, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        self.types_map[strict][ext] = type
        exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in exts:
            exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file based on its URL.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip). The
        mappings are table driven. Encoding suffixes are case
        sensitive; type suffixes are first tried case sensitive, then
        case insensitive.

        The suffixes .tgz, .taz and .tz (case sensitive!) are all
        mapped to '.tar.gz'. (This is table-driven too, using the
        dictionary suffix_map.)

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        scheme, url = urllib.splittype(url)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            # The media type ends at the first ';' before the comma, or at
            # the comma itself when there are no parameters.
            end = semi if semi >= 0 else comma
            mediatype = url[:end]
            if '=' in mediatype or '/' not in mediatype:
                mediatype = 'text/plain'
            return mediatype, None  # never compressed, so encoding is None

        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes (e.g. '.tgz' -> '.tar.gz') until the
        # suffix is no longer an alias.
        while ext in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext])
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            base, ext = posixpath.splitext(base)
        else:
            encoding = None

        # Standard types are always consulted first; the non-standard table
        # is only a fallback when strict is false.  Within each table the
        # exact-case suffix wins over the lower-cased one.
        tables = [self.types_map[True]]
        if not strict:
            tables.append(self.types_map[False])
        for table in tables:
            for candidate in (ext, ext.lower()):
                if candidate in table:
                    return table[candidate], encoding
        return None, encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.'). The extension is not
        guaranteed to have been associated with any particular data stream,
        but would be mapped to the MIME type `type' by guess_type().
        The list is empty when the type is unknown.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        # Work on a copy: appending the non-strict extensions to the stored
        # list would leak them into later strict lookups, and returning the
        # stored list would let callers mutate the database.
        extensions = list(self.types_map_inv[True].get(type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(type, []):
                if ext not in extensions:
                    extensions.append(ext)
        return extensions

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.'). The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type(). If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if not extensions:
            return None
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        with open(filename) as fp:
            self.readfp(fp, strict)

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        `fp` only needs a readline() method. Each line has the form
        "type suffix [suffix ...]"; everything from the first word that
        starts with '#' to the end of the line is ignored.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        while True:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            # Drop a trailing comment (or the whole line if it is one).
            for i, word in enumerate(words):
                if word.startswith('#'):
                    del words[i:]
                    break
            if not words:
                continue
            type, suffixes = words[0], words[1:]
            for suff in suffixes:
                self.add_type(type, '.' + suff, strict)

    def read_windows_registry(self, strict=True):
        """
        Load the MIME types database from Windows registry.

        Does nothing when the _winreg module is not available.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        # Windows only
        if not _winreg:
            return

        def enum_types(mimedb):
            # Yield sub-key names until the registry reports there are no more.
            for i in count():
                try:
                    yield _winreg.EnumKey(mimedb, i)
                except EnvironmentError:
                    break

        default_encoding = sys.getdefaultencoding()
        with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '') as hkcr:
            for subkeyname in enum_types(hkcr):
                # Only check file extensions; skip the others without
                # opening their registry key at all.
                if not subkeyname.startswith("."):
                    continue
                try:
                    with _winreg.OpenKey(hkcr, subkeyname) as subkey:
                        # raises EnvironmentError if no 'Content Type' value
                        mimetype, datatype = _winreg.QueryValueEx(
                            subkey, 'Content Type')
                except EnvironmentError:
                    continue
                if datatype != _winreg.REG_SZ:
                    continue
                try:
                    mimetype = mimetype.encode(default_encoding)
                    subkeyname = subkeyname.encode(default_encoding)
                except UnicodeEncodeError:
                    continue
                self.add_type(mimetype, subkeyname, strict)
def guess_type(url, strict=True):
    """Guess the type of a file based on its URL.

    The result is a (type, encoding) tuple.  `type` is a string of the
    form type/subtype, usable for a MIME Content-type header, or None
    when it can't be guessed (no or unknown suffix).  `encoding` is the
    name of the program used to encode the data (e.g. compress or gzip),
    or None for no encoding.

    The mappings are table driven.  Encoding suffixes are case
    sensitive; type suffixes are first tried case sensitive, then case
    insensitive.  The suffixes .tgz, .taz and .tz (case sensitive!) are
    all mapped to ".tar.gz", which is table-driven too, using the
    dictionary suffix_map.

    When the optional `strict' argument is false, a bunch of commonly
    found but non-standard types are considered as well.
    """
    if _db is None:
        # First use: init() builds the shared database and rebinds _db.
        init()
    type_and_encoding = _db.guess_type(url, strict)
    return type_and_encoding
def guess_all_extensions(type, strict=True):
    """Guess the extensions for a file based on its MIME type.

    The result is a list of strings giving the possible filename
    extensions, each including the leading dot ('.').  An extension is
    not guaranteed to have been associated with any particular data
    stream, but it would be mapped to the MIME type `type' by
    guess_type().  The list comes from the shared database's
    guess_all_extensions() method, which returns a list (possibly
    empty) rather than None.

    When the optional `strict' argument is false, a bunch of commonly
    found but non-standard types are considered as well.
    """
    if _db is None:
        # First use: init() builds the shared database and rebinds _db.
        init()
    extensions = _db.guess_all_extensions(type, strict)
    return extensions
def guess_extension(type, strict=True):
    """Guess the extension for a file based on its MIME type.

    The result is a string giving a filename extension, including the
    leading dot ('.'), or None if no extension can be guessed for
    `type'.  The extension is not guaranteed to have been associated
    with any particular data stream, but it would be mapped to the MIME
    type `type' by guess_type().

    When the optional `strict' argument is false, a bunch of commonly
    found but non-standard types are considered as well.
    """
    if _db is None:
        # First use: init() builds the shared database and rebinds _db.
        init()
    extension = _db.guess_extension(type, strict)
    return extension
def add_type(type, ext, strict=True):
    """Add a mapping between a type and an extension.

    If the extension is already known, the new type replaces the old
    one.  If the type is already known, the extension is added to the
    list of known extensions.

    With a true `strict' the information goes into the list of standard
    types, otherwise into the list of non-standard types.
    """
    if _db is None:
        # First use: init() builds the shared database and rebinds _db.
        init()
    result = _db.add_type(type, ext, strict)
    return result
def init(files=None):
    """(Re)build the shared database and the module-level tables.

    `files` is a list of mime.types-style files to load on top of the
    built-in defaults.  When it is None, the Windows registry is read
    first (if the _winreg module is available) and `knownfiles` is used
    as the file list.  Entries that are not existing regular files are
    skipped.

    On return the module globals suffix_map, encodings_map, types_map and
    common_types refer to the new database's tables and _db to the
    database itself.
    """
    global suffix_map, types_map, encodings_map, common_types
    global inited, _db

    # Flag first, so that MimeTypes.__init__() doesn't call us again.
    inited = True
    database = MimeTypes()

    if files is None:
        if _winreg:
            database.read_windows_registry()
        files = knownfiles
    for path in files:
        if not os.path.isfile(path):
            continue
        database.read(path)

    encodings_map = database.encodings_map
    suffix_map = database.suffix_map
    common_types, types_map = database.types_map[False], database.types_map[True]
    # Make the DB a global variable now that it is fully initialized
    _db = database
def read_mime_types(file):
    """Parse one mime.types-style file and return its type table.

    `file` is the pathname of the file to read.  Returns None when the
    file cannot be opened.  Otherwise returns the strict
    extension -> type dictionary of a fresh MimeTypes instance after the
    file has been read into it; because that instance starts from the
    built-in defaults, the dictionary contains those entries as well.
    Extensions include the leading dot ('.').
    """
    try:
        f = open(file)
    except IOError:
        return None
    # Close the file once parsed instead of leaking the handle.
    with f:
        db = MimeTypes()
        db.readfp(f, True)
        return db.types_map[True]
def _default_mime_types():
    """Install the built-in default tables as module globals.

    Rebinds suffix_map, encodings_map, types_map and common_types to
    freshly built dictionaries.
    """
    global suffix_map, encodings_map, types_map, common_types

    # Shorthand suffixes and the compound suffix each one stands for.
    suffix_map = {
        '.tgz': '.tar.gz',
        '.taz': '.tar.gz',
        '.tz': '.tar.gz',
        '.tbz2': '.tar.bz2',
        '.txz': '.tar.xz',
    }

    # Suffixes that denote an encoding rather than a type.
    encodings_map = {
        '.gz': 'gzip',
        '.Z': 'compress',
        '.bz2': 'bzip2',
        '.xz': 'xz',
    }

    # Before adding new types, make sure they are either registered with IANA,
    # at http://www.isi.edu/in-notes/iana/assignments/media-types
    # or extensions, i.e. using the x- prefix

    # If you add to these, please keep them sorted!
    types_map = {
        '.a': 'application/octet-stream',
        '.ai': 'application/postscript',
        '.aif': 'audio/x-aiff',
        '.aifc': 'audio/x-aiff',
        '.aiff': 'audio/x-aiff',
        '.au': 'audio/basic',
        '.avi': 'video/x-msvideo',
        '.bat': 'text/plain',
        '.bcpio': 'application/x-bcpio',
        '.bin': 'application/octet-stream',
        '.bmp': 'image/x-ms-bmp',
        '.c': 'text/plain',
        # Duplicates (the later entry for a repeated key is the one kept)
        '.cdf': 'application/x-cdf',
        '.cdf': 'application/x-netcdf',
        '.cpio': 'application/x-cpio',
        '.csh': 'application/x-csh',
        '.css': 'text/css',
        '.dll': 'application/octet-stream',
        '.doc': 'application/msword',
        '.dot': 'application/msword',
        '.dvi': 'application/x-dvi',
        '.eml': 'message/rfc822',
        '.eps': 'application/postscript',
        '.etx': 'text/x-setext',
        '.exe': 'application/octet-stream',
        '.gif': 'image/gif',
        '.gtar': 'application/x-gtar',
        '.h': 'text/plain',
        '.hdf': 'application/x-hdf',
        '.htm': 'text/html',
        '.html': 'text/html',
        '.ico': 'image/vnd.microsoft.icon',
        '.ief': 'image/ief',
        '.jpe': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.jpg': 'image/jpeg',
        '.js': 'application/javascript',
        '.ksh': 'text/plain',
        '.latex': 'application/x-latex',
        '.m1v': 'video/mpeg',
        '.man': 'application/x-troff-man',
        '.me': 'application/x-troff-me',
        '.mht': 'message/rfc822',
        '.mhtml': 'message/rfc822',
        '.mif': 'application/x-mif',
        '.mov': 'video/quicktime',
        '.movie': 'video/x-sgi-movie',
        '.mp2': 'audio/mpeg',
        '.mp3': 'audio/mpeg',
        '.mp4': 'video/mp4',
        '.mpa': 'video/mpeg',
        '.mpe': 'video/mpeg',
        '.mpeg': 'video/mpeg',
        '.mpg': 'video/mpeg',
        '.ms': 'application/x-troff-ms',
        '.nc': 'application/x-netcdf',
        '.nws': 'message/rfc822',
        '.o': 'application/octet-stream',
        '.obj': 'application/octet-stream',
        '.oda': 'application/oda',
        '.p12': 'application/x-pkcs12',
        '.p7c': 'application/pkcs7-mime',
        '.pbm': 'image/x-portable-bitmap',
        '.pdf': 'application/pdf',
        '.pfx': 'application/x-pkcs12',
        '.pgm': 'image/x-portable-graymap',
        '.pl': 'text/plain',
        '.png': 'image/png',
        '.pnm': 'image/x-portable-anymap',
        '.pot': 'application/vnd.ms-powerpoint',
        '.ppa': 'application/vnd.ms-powerpoint',
        '.ppm': 'image/x-portable-pixmap',
        '.pps': 'application/vnd.ms-powerpoint',
        '.ppt': 'application/vnd.ms-powerpoint',
        '.ps': 'application/postscript',
        '.pwz': 'application/vnd.ms-powerpoint',
        '.py': 'text/x-python',
        '.pyc': 'application/x-python-code',
        '.pyo': 'application/x-python-code',
        '.qt': 'video/quicktime',
        '.ra': 'audio/x-pn-realaudio',
        '.ram': 'application/x-pn-realaudio',
        '.ras': 'image/x-cmu-raster',
        '.rdf': 'application/xml',
        '.rgb': 'image/x-rgb',
        '.roff': 'application/x-troff',
        '.rtx': 'text/richtext',
        '.sgm': 'text/x-sgml',
        '.sgml': 'text/x-sgml',
        '.sh': 'application/x-sh',
        '.shar': 'application/x-shar',
        '.snd': 'audio/basic',
        '.so': 'application/octet-stream',
        '.src': 'application/x-wais-source',
        '.sv4cpio': 'application/x-sv4cpio',
        '.sv4crc': 'application/x-sv4crc',
        '.swf': 'application/x-shockwave-flash',
        '.t': 'application/x-troff',
        '.tar': 'application/x-tar',
        '.tcl': 'application/x-tcl',
        '.tex': 'application/x-tex',
        '.texi': 'application/x-texinfo',
        '.texinfo': 'application/x-texinfo',
        '.tif': 'image/tiff',
        '.tiff': 'image/tiff',
        '.tr': 'application/x-troff',
        '.tsv': 'text/tab-separated-values',
        '.txt': 'text/plain',
        '.ustar': 'application/x-ustar',
        '.vcf': 'text/x-vcard',
        '.wav': 'audio/x-wav',
        '.wiz': 'application/msword',
        '.wsdl': 'application/xml',
        '.xbm': 'image/x-xbitmap',
        '.xlb': 'application/vnd.ms-excel',
        # Duplicates (the later entry for a repeated key is the one kept)
        '.xls': 'application/excel',
        '.xls': 'application/vnd.ms-excel',
        '.xml': 'text/xml',
        '.xpdl': 'application/xml',
        '.xpm': 'image/x-xpixmap',
        '.xsl': 'application/xml',
        '.xwd': 'image/x-xwindowdump',
        '.zip': 'application/zip',
    }

    # These are non-standard types, commonly found in the wild.  They will
    # only match if strict=0 flag is given to the API methods.

    # Please sort these too
    common_types = {
        '.jpg': 'image/jpg',
        '.mid': 'audio/midi',
        '.midi': 'audio/midi',
        '.pct': 'image/pict',
        '.pic': 'image/pict',
        '.pict': 'image/pict',
        '.rtf': 'application/rtf',
        '.xul': 'text/xul',
    }


# Populate the module-level tables at import time.
_default_mime_types()
# Command-line interface: guess the type (or, with -e, the extension) for
# each argument and print the result.
if __name__ == '__main__':
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type
Options:
--help / -h -- print this message and exit
--lenient / -l -- additionally search of some common, but non-standard
types.
--extension / -e -- guess extension instead of type
More than one type argument may be given.
"""

    def usage(code, msg=''):
        """Print the usage text, then `msg` if given, and exit with `code`."""
        print(USAGE)
        if msg:
            print(msg)
        sys.exit(code)

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hle',
                                   ['help', 'lenient', 'extension'])
    except getopt.error as msg:
        usage(1, msg)

    strict = 1
    extension = 0
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-l', '--lenient'):
            strict = 0
        elif opt in ('-e', '--extension'):
            extension = 1

    # Each print below takes a single pre-formatted string so the output is
    # the same whether print is a statement or a function.
    for gtype in args:
        if extension:
            guess = guess_extension(gtype, strict)
            if not guess:
                print("I don't know anything about type %s" % (gtype,))
            else:
                print(guess)
        else:
            guess, encoding = guess_type(gtype, strict)
            if not guess:
                print("I don't know anything about type %s" % (gtype,))
            else:
                print('type: %s encoding: %s' % (guess, encoding))