spark python pickle对象_python/pyspark/cloudpickle.py · 837211851/spark - Gitee.com

CloudPickler是一个Python模块,用于扩展标准pickle模块的功能,能序列化和反序列化lambda函数、嵌套函数、主模块和其他不可序列化对象。它将函数转换为编译的字节码,并处理模块导入,确保在反序列化时正确处理全局环境。此外,CloudPickler还处理内置类型、模块、生成器、迭代器和缓冲区等对象的序列化。
摘要由CSDN通过智能技术生成

"""

This class is defined to override standard pickle functionality

The goals of it follow:

-Serialize lambdas and nested functions to compiled byte code

-Deal with main module correctly

-Deal with other non-serializable objects

It does not include an unpickler, as standard python unpickling suffices.

This module was extracted from the `cloud` package, developed by `PiCloud, Inc.

`_.

Copyright (c) 2012, Regents of the University of California.

Copyright (c) 2009 `PiCloud, Inc. `_.

All rights reserved.

Redistribution and use in source and binary forms, with or without

modification, are permitted provided that the following conditions

are met:

* Redistributions of source code must retain the above copyright

notice, this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright

notice, this list of conditions and the following disclaimer in the

documentation and/or other materials provided with the distribution.

* Neither the name of the University of California, Berkeley nor the

names of its contributors may be used to endorse or promote

products derived from this software without specific prior written

permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED

TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF

LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""

from __future__ import print_function

import operator

import os

import io

import pickle

import struct

import sys

import types

from functools import partial

import itertools

import dis

import traceback

if sys.version < '3':

from pickle import Pickler

try:

from cStringIO import StringIO

except ImportError:

from StringIO import StringIO

PY3 = False

else:

types.ClassType = type

from pickle import _Pickler as Pickler

from io import BytesIO as StringIO

PY3 = True

#relevant opcodes

STORE_GLOBAL = dis.opname.index('STORE_GLOBAL')

DELETE_GLOBAL = dis.opname.index('DELETE_GLOBAL')

LOAD_GLOBAL = dis.opname.index('LOAD_GLOBAL')

GLOBAL_OPS = [STORE_GLOBAL, DELETE_GLOBAL, LOAD_GLOBAL]

HAVE_ARGUMENT = dis.HAVE_ARGUMENT

EXTENDED_ARG = dis.EXTENDED_ARG

def islambda(func):

return getattr(func,'__name__') == ''

_BUILTIN_TYPE_NAMES = {}

for k, v in types.__dict__.items():

if type(v) is type:

_BUILTIN_TYPE_NAMES[v] = k

def _builtin_type(name):

return getattr(types, name)

class CloudPickler(Pickler):

dispatch = Pickler.dispatch.copy()

def __init__(self, file, protocol=None):

Pickler.__init__(self, file, protocol)

# set of modules to unpickle

self.modules = set()

# map ids to dictionary. used to ensure that functions can share global env

self.globals_ref = {}

def dump(self, obj):

self.inject_addons()

try:

return Pickler.dump(self, obj)

except RuntimeError as e:

if 'recursion' in e.args[0]:

msg = """Could not pickle object as excessively deep recursion required."""

raise pickle.PicklingError(msg)

except pickle.PickleError:

raise

except Exception as e:

if "'i' format requires" in e.message:

msg = "Object too large to serialize: " + e.message

else:

msg = "Could not serialize object: " + e.__class__.__name__ + ": " + e.message

print_exec(sys.stderr)

raise pickle.PicklingError(msg)

def save_memoryview(self, obj):

"""Fallback to save_string"""

Pickler.save_string(self, str(obj))

def save_buffer(self, obj):

"""Fallback to save_string"""

Pickler.save_string(self,str(obj))

if PY3:

dispatch[memoryview] = save_memoryview

else:

dispatch[buffer] = save_buffer

def save_unsupported(self, obj):

raise pickle.PicklingError("Cannot pickle objects of type %s" % type(obj))

dispatch[types.GeneratorType] = save_unsupported

# itertools objects do not pickle!

for v in itertools.__dict__.values():

if type(v) is type:

dispatch[v] = save_unsupported

def save_module(self, obj):

"""

Save a module as an import

"""

self.modules.add(obj)

self.save_reduce(subimport, (obj.__name__,), obj=obj)

dispatch[types.ModuleType] = save_module

def save_codeobject(self, obj):

"""

Save a code object

"""

if PY3:

args = (

obj.co_argcount, obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize,

obj.co_flags, obj.co_code, obj.co_consts, obj.co_names, obj.co_varnames,

obj.co_filename, obj.co_name, obj.co_firstlineno, obj.co_lnotab, obj.co_freevars,

obj.co_cellvars

)

else:

args = (

obj.co_argcount, obj.co_nlocals, obj.co_stacksize, obj.co_flags, obj.co_code,

obj.co_consts, obj.co_names, obj.co_varnames, obj.co_filename, obj.co_name,

obj.co_firstlineno, obj.co_lnotab, obj.co_freevars, obj.co_cellvars

)

self.save_reduce(types.CodeType, args, obj=obj)

dispatch[types.CodeType] = save_codeobject

def save_function(self, obj, name=None):

""" Registered with the dispatch to handle all function types.

Determines what kind of function obj is (e.g. lambda, defined at

interactive prompt, etc) and handles the pickling appropriately.

"""

write = self.write

if name is None:

name = obj.__name__

try:

# whichmodule() could fail, see

# https://bitbucket.org/gutworth/six/issues/63/importing-six-breaks-pickling

modname = pickle.whichmodule(obj, name)

except Exception:

modname = None

# print('which gives %s %s %s' % (modname, obj, name))

try:

themodule = sys.modules[modname]

except KeyError:

# eval'd items such as namedtuple give invalid items for their function __module__

modname = '__main__'

if modname == '__main__':

themodule = None

if themodule:

self.modules.add(themodule)

if getattr(themodule, name, None) is obj:

return self.save_global(obj, name)

# if func is lambda, def'ed at prompt, is in main, or is nested, then

# we'll pickle the actual function object rather than simply saving a

# reference (as is done in default pickler), via save_function_tuple.

if islambda(obj) or obj.__code__.co_filename == '' or themodule is None:

#print("save global", islambda(obj), obj.__code__.co_filename, modname, themodule)

self.save_function_tuple(obj)

return

else:

# func is nested

klass = getattr(themodule, name, None)

if klass is None or klass is not obj:

self.save_function_tuple(obj)

return

if obj.__dict__:

# essentially save_reduce, but workaround needed to avoid recursion

self.save(_restore_attr)

write(pickle.MARK + pickle.GLOBAL + modname + '\n' + name + '\n')

self.memoize(obj)

self.save(obj.__dict__)

write(pickle.TUPLE + pickle.REDUCE)

else:

write(pickle.GLOBAL + modname + '\n' + name + '\n')

self.memoize(obj)

dispatch[types.FunctionType] = save_function

def save_function_tuple(self, func):

""" Pickles an actual func object.

A func comprises: code, globals, defaults, closure, and dict. We

extract and save these, injecting reducing functions at certain points

to recreate the func object. Keep in mind that some of these pieces

can contain a ref to the func itself. Thus, a naive save on these

pieces could trigger an infinite loop of save's. To get around that,

we first create a skeleton func object using just the code (this is

safe, since this won't contain a ref to the func), and memoize it as

soon as it's created. The other stuff can then be filled in later.

"""

save = self.save

write = self.write

code, f_globals, defaults, closure, dct, base_globals = self.extract_func_data(func)

save(_fill_function) # skeleton function updater

write(pickle.MARK) # beginning of tuple that _fill_function expects

# create a skeleton function object and memoize it

save(_make_skel_func)

save((code, closure, base_globals))

write(pickle.REDUCE)

self.memoize(func)

# save the rest of the func data needed by _fill_function

save(f_globals)

save(defaults)

save(dct)

save(func.__module__)

write(pickle.TUPLE)

write(pickle.REDUCE) # applies _fill_function on the tuple

@staticmethod

def extract_code_globals(co):

"""

Find all globals names read or written to by codeblock co

"""

code = co.co_code

if not PY3:

code = [ord(c) for c in code]

names = co.co_names

out_names = set()

n = len(code)

i = 0

extended_arg = 0

while i < n:

op = code[i]

i += 1

if op >= HAVE_ARGUMENT:

oparg = code[i] + code[i+1] * 256 + extended_arg

extended_arg = 0

i += 2

if op == EXTENDED_ARG:

extended_arg = oparg*65536

if op in GLOBAL_OPS:

out_names.add(names[oparg])

# see if nested function have any global refs

if co.co_consts:

for const in co.co_consts:

if type(const) is types.CodeType:

out_names |= CloudPickler.extract_code_globals(const)

return out_names

def extract_func_data(self, func):

"""

Turn the function into a tuple of data necessary to recreate it:

code, globals, defaults, closure, dict

"""

code = func.__code__

# extract all global ref's

func_global_refs = self.extract_code_globals(code)

# process all variables referenced by global environment

f_globals = {}

for var in func_global_refs:

if var in func.__globals__:

f_globals[var] = func.__globals__[var]

# defaults requires no processing

defaults = func.__defaults__

# process closure

closure = [c.cell_contents for c in func.__closure__] if func.__closure__ else []

# save the dict

dct = func.__dict__

base_globals = self.globals_ref.get(id(func.__globals__), {})

self.globals_ref[id(func.__globals__)] = base_globals

return (code, f_globals, defaults, closure, dct, base_globals)

def save_builtin_function(self, obj):

if obj.__module__ is "__builtin__":

return self.save_global(obj)

return self.save_function(obj)

dispatch[types.BuiltinFunctionType] = save_builtin_function

def save_global(self, obj, name=None, pack=struct.pack):

if obj.__module__ == "__builtin__" or obj.__module__ == "builtins":

if obj in _BUILTIN_TYPE_NAMES:

return self.save_reduce(_builtin_type, (_BUILTIN_TYPE_NAMES[obj],), obj=obj)

if name is None:

name = obj.__name__

modname = getattr(obj, "__module__", None)

if modname is None:

try:

# whichmodule() could fail, see

# https://bitbucket.org/gutworth/six/issues/63/importing-six-breaks-pickling

modname = pickle.whichmodule(obj, name)

except Exception:

modname = '__main__'

if modname == '__main__':

themodule = None

else:

__import__(modname)

themodule = sys.modules[modname]

self.modules.add(themodule)

if hasattr(themodule, name) and getattr(themodule, name) is obj:

return Pickler.save_global(self, obj, name)

typ = type(obj)

if typ is not obj and isinstance(obj, (type, types.ClassType)):

d = dict(obj.__dict__) # copy dict proxy to a dict

if not isinstance(d.get('__dict__', None), property):

# don't extract dict that are properties

d.pop('__dict__', None)

d.pop('__weakref__', None)

# hack as __new__ is stored differently in the __dict__

new_override = d.get('__new__', None)

if new_override:

d['__new__'] = obj.__new__

# workaround for namedtuple (hijacked by PySpark)

if getattr(obj, '_is_namedtuple_', False):

self.save_reduce(_load_namedtuple, (obj.__name__, obj._fields))

return

self.save(_load_class)

self.save_reduce(typ, (obj.__name__, obj.__bases__, {"__doc__": obj.__doc__}), obj=obj)

d.pop('__doc__', None)

# handle property and staticmethod

dd = {}

for k, v in d.items():

if isinstance(v, property):

k = ('property', k)

v = (v.fget, v.fset, v.fdel, v.__doc__)

elif isinstance(v, staticmethod) and hasattr(v, '__func__'):

k = ('staticmethod', k)

v = v.__func__

elif isinstance(v, classmethod) and hasattr(v, '__func__'):

k = ('classmethod', k)

v = v.__func__

dd[k] = v

self.save(dd)

self.write(pickle.TUPLE2)

self.write(pickle.REDUCE)

else:

raise pickle.PicklingError("Can't pickle %r" % obj)

dispatch[type] = save_global

dispatch[types.ClassType] = save_global

def save_instancemethod(self, obj):

# Memoization rarely is ever useful due to python bounding

if PY3:

self.save_reduce(types.MethodType, (obj.__func__, obj.__self__), obj=obj)

else:

self.save_reduce(types.MethodType, (obj.__func__, obj.__self__, obj.__self__.__class__),

obj=obj)

dispatch[types.MethodType] = save_instancemethod

def save_inst(self, obj):

"""Inner logic to save instance. Based off pickle.save_inst

Supports __transient__"""

cls = obj.__class__

memo = self.memo

write = self.write

save = self.save

if hasattr(obj, '__getinitargs__'):

args = obj.__getinitargs__()

len(args) # XXX Assert it's a sequence

pickle._keep_alive(args, memo)

else:

args = ()

write(pickle.MARK)

if self.bin:

save(cls)

for arg in args:

save(arg)

write(pickle.OBJ)

else:

for arg in args:

save(arg)

write(pickle.INST + cls.__module__ + '\n' + cls.__name__ + '\n')

self.memoize(obj)

try:

getstate = obj.__getstate__

except AttributeError:

stuff = obj.__dict__

#remove items if transient

if hasattr(obj, '__transient__'):

transient = obj.__transient__

stuff = stuff.copy()

for k in list(stuff.keys()):

if k in transient:

del stuff[k]

else:

stuff = getstate()

pickle._keep_alive(stuff, memo)

save(stuff)

write(pickle.BUILD)

if not PY3:

dispatch[types.InstanceType] = save_inst

def save_property(self, obj):

# properties not correctly saved in python

self.save_reduce(property, (obj.fget, obj.fset, obj.fdel, obj.__doc__), obj=obj)

dispatch[property] = save_property

def save_itemgetter(self, obj):

"""itemgetter serializer (needed for namedtuple support)"""

class Dummy:

def __getitem__(self, item):

return item

items = obj(Dummy())

if not isinstance(items, tuple):

items = (items, )

return self.save_reduce(operator.itemgetter, items)

if type(operator.itemgetter) is type:

dispatch[operator.itemgetter] = save_itemgetter

def save_attrgetter(self, obj):

"""attrgetter serializer"""

class Dummy(object):

def __init__(self, attrs, index=None):

self.attrs = attrs

self.index = index

def __getattribute__(self, item):

attrs = object.__getattribute__(self, "attrs")

index = object.__getattribute__(self, "index")

if index is None:

index = len(attrs)

attrs.append(item)

else:

attrs[index] = ".".join([attrs[index], item])

return type(self)(attrs, index)

attrs = []

obj(Dummy(attrs))

return self.save_reduce(operator.attrgetter, tuple(attrs))

if type(operator.attrgetter) is type:

dispatch[operator.attrgetter] = save_attrgetter

def save_reduce(self, func, args, state=None,

listitems=None, dictitems=None, obj=None):

"""Modified to support __transient__ on new objects

Change only affects protocol level 2 (which is always used by PiCloud"""

# Assert that args is a tuple or None

if not isinstance(args, tuple):

raise pickle.PicklingError("args from reduce() should be a tuple")

# Assert that func is callable

if not hasattr(func, '__call__'):

raise pickle.PicklingError("func from reduce should be callable")

save = self.save

write = self.write

# Protocol 2 special case: if func's name is __newobj__, use NEWOBJ

if self.proto >= 2 and getattr(func, "__name__", "") == "__newobj__":

#Added fix to allow transient

cls = args[0]

if not hasattr(cls, "__new__"):

raise pickle.PicklingError(

"args[0] from __newobj__ args has no __new__")

if obj is not None and cls is not obj.__class__:

raise pickle.PicklingError(

"args[0] from __newobj__ args has the wrong class")

args = args[1:]

save(cls)

#Don't pickle transient entries

if hasattr(obj, '__transient__'):

transient = obj.__transient__

state = state.copy()

for k in list(state.keys()):

if k in transient:

del state[k]

save(args)

write(pickle.NEWOBJ)

else:

save(func)

save(args)

write(pickle.REDUCE)

if obj is not None:

self.memoize(obj)

# More new special cases (that work with older protocols as

# well): when __reduce__ returns a tuple with 4 or 5 items,

# the 4th and 5th item should be iterators that provide list

# items and dict items (as (key, value) tuples), or None.

if listitems is not None:

self._batch_appends(listitems)

if dictitems is not None:

self._batch_setitems(dictitems)

if state is not None:

save(state)

write(pickle.BUILD)

def save_partial(self, obj):

"""Partial objects do not serialize correctly in python2.x -- this fixes the bugs"""

self.save_reduce(_genpartial, (obj.func, obj.args, obj.keywords))

if sys.version_info < (2,7): # 2.7 supports partial pickling

dispatch[partial] = save_partial

def save_file(self, obj):

"""Save a file"""

try:

import StringIO as pystringIO #we can't use cStringIO as it lacks the name attribute

except ImportError:

import io as pystringIO

if not hasattr(obj, 'name') or not hasattr(obj, 'mode'):

raise pickle.PicklingError("Cannot pickle files that do not map to an actual file")

if obj is sys.stdout:

return self.save_reduce(getattr, (sys,'stdout'), obj=obj)

if obj is sys.stderr:

return self.save_reduce(getattr, (sys,'stderr'), obj=obj)

if obj is sys.stdin:

raise pickle.PicklingError("Cannot pickle standard input")

if hasattr(obj, 'isatty') and obj.isatty():

raise pickle.PicklingError("Cannot pickle files that map to tty objects")

if 'r' not in obj.mode:

raise pickle.PicklingError("Cannot pickle files that are not opened for reading")

name = obj.name

try:

fsize = os.stat(name).st_size

except OSError:

raise pickle.PicklingError("Cannot pickle file %s as it cannot be stat" % name)

if obj.closed:

#create an empty closed string io

retval = pystringIO.StringIO("")

retval.close()

elif not fsize: #empty file

retval = pystringIO.StringIO("")

try:

tmpfile = file(name)

tst = tmpfile.read(1)

except IOError:

raise pickle.PicklingError("Cannot pickle file %s as it cannot be read" % name)

tmpfile.close()

if tst != '':

raise pickle.PicklingError("Cannot pickle file %s as it does not appear to map to a physical, real file" % name)

else:

try:

tmpfile = file(name)

contents = tmpfile.read()

tmpfile.close()

except IOError:

raise pickle.PicklingError("Cannot pickle file %s as it cannot be read" % name)

retval = pystringIO.StringIO(contents)

curloc = obj.tell()

retval.seek(curloc)

retval.name = name

self.save(retval)

self.memoize(obj)

if PY3:

dispatch[io.TextIOWrapper] = save_file

else:

dispatch[file] = save_file

"""Special functions for Add-on libraries"""

def inject_numpy(self):

numpy = sys.modules.get('numpy')

if not numpy or not hasattr(numpy, 'ufunc'):

return

self.dispatch[numpy.ufunc] = self.__class__.save_ufunc

def save_ufunc(self, obj):

"""Hack function for saving numpy ufunc objects"""

name = obj.__name__

numpy_tst_mods = ['numpy', 'scipy.special']

for tst_mod_name in numpy_tst_mods:

tst_mod = sys.modules.get(tst_mod_name, None)

if tst_mod and name in tst_mod.__dict__:

return self.save_reduce(_getobject, (tst_mod_name, name))

raise pickle.PicklingError('cannot save %s. Cannot resolve what module it is defined in'

% str(obj))

def inject_addons(self):

"""Plug in system. Register additional pickling functions if modules already loaded"""

self.inject_numpy()

# Shorthands for legacy support

def dump(obj, file, protocol=2):

CloudPickler(file, protocol).dump(obj)

def dumps(obj, protocol=2):

file = StringIO()

cp = CloudPickler(file,protocol)

cp.dump(obj)

return file.getvalue()

#hack for __import__ not working as desired

def subimport(name):

__import__(name)

return sys.modules[name]

# restores function attributes

def _restore_attr(obj, attr):

for key, val in attr.items():

setattr(obj, key, val)

return obj

def _get_module_builtins():

return pickle.__builtins__

def print_exec(stream):

ei = sys.exc_info()

traceback.print_exception(ei[0], ei[1], ei[2], None, stream)

def _modules_to_main(modList):

"""Force every module in modList to be placed into main"""

if not modList:

return

main = sys.modules['__main__']

for modname in modList:

if type(modname) is str:

try:

mod = __import__(modname)

except Exception as e:

sys.stderr.write('warning: could not import %s\n. '

'Your function may unexpectedly error due to this import failing;'

'A version mismatch is likely. Specific error was:\n' % modname)

print_exec(sys.stderr)

else:

setattr(main, mod.__name__, mod)

#object generators:

def _genpartial(func, args, kwds):

if not args:

args = ()

if not kwds:

kwds = {}

return partial(func, *args, **kwds)

def _fill_function(func, globals, defaults, dict, module):

""" Fills in the rest of function data into the skeleton function object

that were created via _make_skel_func().

"""

func.__globals__.update(globals)

func.__defaults__ = defaults

func.__dict__ = dict

func.__module__ = module

return func

def _make_cell(value):

return (lambda: value).__closure__[0]

def _reconstruct_closure(values):

return tuple([_make_cell(v) for v in values])

def _make_skel_func(code, closures, base_globals = None):

""" Creates a skeleton function object that contains just the provided

code and the correct number of cells in func_closure. All other

func attributes (e.g. func_globals) are empty.

"""

closure = _reconstruct_closure(closures) if closures else None

if base_globals is None:

base_globals = {}

base_globals['__builtins__'] = __builtins__

return types.FunctionType(code, base_globals,

None, None, closure)

def _load_class(cls, d):

"""

Loads additional properties into class `cls`.

"""

for k, v in d.items():

if isinstance(k, tuple):

typ, k = k

if typ == 'property':

v = property(*v)

elif typ == 'staticmethod':

v = staticmethod(v)

elif typ == 'classmethod':

v = classmethod(v)

setattr(cls, k, v)

return cls

def _load_namedtuple(name, fields):

"""

Loads a class generated by namedtuple

"""

from collections import namedtuple

return namedtuple(name, fields)

"""Constructors for 3rd party libraries

Note: These can never be renamed due to client compatibility issues"""

def _getobject(modname, attribute):

mod = __import__(modname, fromlist=[attribute])

return mod.__dict__[attribute]

一键复制

编辑

Web IDE

原始数据

按行查看

历史

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值