Merged
6 changes: 6 additions & 0 deletions CHANGES.md
@@ -4,6 +4,12 @@
- Add support for pickling interactively defined dataclasses.
([issue #245](https://github.com/cloudpipe/cloudpickle/pull/245))

- Global variables contained in pickle strings will override existing
variables when loaded in their new environment. This restores the (previously
untested) behavior of cloudpickle prior to changes done in 0.5.4 for
functions defined in the `__main__` module, and 0.6.0/1 for other dynamic
functions.
Contributor:

There is no "override" anymore because we create a new empty namespace each time. May I suggest the following:

- Global variables referenced by functions pickled by cloudpickle are now
  unpickled in a new and isolated namespace scoped by the CloudPickler instance.
  This restores the (previously untested) behavior of cloudpickle prior to changes
  done in 0.5.4 for functions defined in the `__main__` module, and 0.6.0/1 for
  other dynamic functions.

Member Author:
@pierreglaser, Feb 7, 2019

Yes, you are right, this was stale. I'll let the downstream integration test finish before pushing a new commit.
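
For context, here is a minimal sketch of the isolation described above (illustrative only, not part of the changelog; it assumes `f0` and `f1` are defined in `__main__` so that cloudpickle serializes them by value):

```python
import pickle
import cloudpickle

VARIABLE = "default_value"

def f0():
    global VARIABLE
    VARIABLE = "changed_by_f0"

def f1():
    return VARIABLE

# Pickled in the same dumps call, the clones share one fresh namespace...
cloned_f0, cloned_f1 = pickle.loads(cloudpickle.dumps([f0, f1]))
cloned_f0()
assert cloned_f1() == "changed_by_f0"

# ...and that namespace is isolated from the original module: the
# module-level VARIABLE is left untouched.
assert VARIABLE == "default_value"
assert f1() == "default_value"
```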



0.7.0
=====
70 changes: 25 additions & 45 deletions cloudpickle/cloudpickle.py
@@ -79,22 +79,6 @@
PY3 = True


# Container for the global namespace to ensure consistent unpickling of
# functions defined in dynamic modules (modules not registered in sys.modules).
_dynamic_modules_globals = weakref.WeakValueDictionary()


class _DynamicModuleFuncGlobals(dict):
"""Global variables referenced by a function defined in a dynamic module

To avoid leaking references we store such context in a WeakValueDictionary
instance. However instances of python builtin types such as dict cannot
be used directly as values in such a construct, hence the need for a
derived class.
"""
pass


def _make_cell_set_template_code():
"""Get the Python compiler to emit LOAD_FAST(arg); STORE_DEREF

@@ -670,17 +654,17 @@ def extract_func_data(self, func):
# save the dict
dct = func.__dict__

base_globals = self.globals_ref.get(id(func.__globals__), None)
if base_globals is None:
# For functions defined in a well behaved module use
# vars(func.__module__) for base_globals. This is necessary to
# share the global variables across multiple pickled functions from
# this module.
if func.__module__ is not None:
base_globals = func.__module__
else:
base_globals = {}
self.globals_ref[id(func.__globals__)] = base_globals
# base_globals represents the future global namespace of func at
# unpickling time. Looking it up and storing it in globals_ref allows
# functions that share the same globals at pickling time to also share
# them once unpickled, on one condition: since globals_ref is an
# attribute of a CloudPickler instance, and since a new CloudPickler is
# created for each call to cloudpickle.dump or cloudpickle.dumps, the
# functions also need to be saved within the same invocation of
# cloudpickle.dump/cloudpickle.dumps (for example:
# cloudpickle.dumps([f1, f2])). There is no such limitation when using
# CloudPickler.dump, as long as the multiple invocations are bound to
# the same CloudPickler instance.
base_globals = self.globals_ref.setdefault(id(func.__globals__), {})

return (code, f_globals, defaults, closure, dct, base_globals)
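
As an aside, a hedged sketch of the sharing rule documented in the comment above (the function names are illustrative and assume a script running as `__main__`, so the functions are pickled by value):

```python
import pickle
import cloudpickle

def f():
    return "f"

def g():
    return "g"

# f and g share __globals__ at pickling time, so a single cloudpickle.dumps
# call records one shared base_globals entry in globals_ref for both clones.
clone_f, clone_g = pickle.loads(cloudpickle.dumps([f, g]))
assert clone_f.__globals__ is clone_g.__globals__

# Two separate dumps calls use two CloudPickler instances, hence two
# distinct globals_ref mappings: the clones get separate namespaces.
other_f = pickle.loads(cloudpickle.dumps(f))
other_g = pickle.loads(cloudpickle.dumps(g))
assert other_f.__globals__ is not other_g.__globals__
```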

@@ -1096,10 +1080,16 @@ def _fill_function(*args):
else:
raise ValueError('Unexpected _fill_value arguments: %r' % (args,))

# Only set global variables that do not exist.
for k, v in state['globals'].items():
if k not in func.__globals__:
func.__globals__[k] = v
# - At pickling time, any dynamic global variable used by func is
# serialized by value (in state['globals']).
# - At unpickling time, func's __globals__ attribute is initialized by
# first retrieving an empty, isolated namespace that will be shared
# with other functions pickled from the same original module by the
# same CloudPickler instance, and then updated with the content of
# state['globals'] so that the shared namespace ends up containing
# all the global variables specifically referenced by this function.
func.__globals__.update(state['globals'])

func.__defaults__ = state['defaults']
func.__dict__ = state['dict']
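
To illustrate the two bullet points above, a small sketch (names are hypothetical; it assumes the function is defined in `__main__` so it is pickled by value): only the globals actually referenced by the function travel in `state['globals']` and end up in the clone's namespace.

```python
import pickle
import cloudpickle

USED = 1
UNUSED = 2

def lookup():
    return USED

clone = pickle.loads(cloudpickle.dumps(lookup))

# USED was referenced by lookup, so it was serialized by value and copied
# into the clone's isolated namespace; UNUSED never leaves the module.
assert clone() == 1
assert "USED" in clone.__globals__
assert "UNUSED" not in clone.__globals__
```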
@@ -1137,21 +1127,11 @@ def _make_skel_func(code, cell_count, base_globals=None):
code and the correct number of cells in func_closure. All other
func attributes (e.g. func_globals) are empty.
"""
if base_globals is None:
# This is backward-compatibility code: for cloudpickle versions between
# 0.5.4 and 0.7, base_globals could be a string or None. base_globals
# should now always be a dictionary.
if base_globals is None or isinstance(base_globals, str):
base_globals = {}
elif isinstance(base_globals, str):
base_globals_name = base_globals
try:
# First try to reuse the globals from the module containing the
# function. If it is not possible to retrieve it, fallback to an
# empty dictionary.
base_globals = vars(importlib.import_module(base_globals))
except ImportError:
base_globals = _dynamic_modules_globals.get(
base_globals_name, None)
if base_globals is None:
base_globals = _DynamicModuleFuncGlobals()
_dynamic_modules_globals[base_globals_name] = base_globals

base_globals['__builtins__'] = __builtins__
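
A standalone sketch of the backward-compatibility rule stated in the comment above (the helper name is hypothetical, not part of the diff): legacy pickles produced by cloudpickle 0.5.4 to 0.7 may carry a module-name string or `None` as `base_globals`; both are now mapped to a fresh dict before `__builtins__` is injected.

```python
def _coerce_base_globals(base_globals):
    # Hypothetical helper mirroring the check in _make_skel_func: strings
    # (module names) and None coming from old pickle strings are replaced
    # by a fresh, empty namespace.
    if base_globals is None or isinstance(base_globals, str):
        base_globals = {}
    base_globals.setdefault('__builtins__', __builtins__)
    return base_globals

assert _coerce_base_globals(None) == {'__builtins__': __builtins__}
assert _coerce_base_globals('mod') == {'__builtins__': __builtins__}
assert _coerce_base_globals({'x': 1})['x'] == 1
```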

228 changes: 63 additions & 165 deletions tests/cloudpickle_test.py
@@ -451,57 +451,6 @@ def method(self, x):
mod1, mod2 = pickle_depickle([mod, mod])
self.assertEqual(id(mod1), id(mod2))

def test_dynamic_modules_globals(self):
# _dynamic_modules_globals is a WeakValueDictionary, so if a value
# in this dict (containing a set of global variables from a dynamic
# module created in the parent process) has no other reference than in
# this dict in the child process, it will be garbage collected.

# We first create a module
mod = types.ModuleType('mod')
code = '''
x = 1
def func():
return
'''
exec(textwrap.dedent(code), mod.__dict__)

pickled_module_path = os.path.join(self.tmpdir, 'mod_f.pkl')
child_process_script = '''
import pickle
from cloudpickle.cloudpickle import _dynamic_modules_globals
import gc
with open("{pickled_module_path}", 'rb') as f:
func = pickle.load(f)

# A dictionary storing the globals of the newly unpickled function
# should have been created
assert list(_dynamic_modules_globals.keys()) == ['mod']

# func.__globals__ is the only non-weak reference to
# _dynamic_modules_globals['mod']. By deleting func, we delete also
# _dynamic_modules_globals['mod']
del func
gc.collect()

# There is no reference to the globals of func since func has been
# deleted and _dynamic_modules_globals is a WeakValueDictionary,
# so _dynamic_modules_globals should now be empty
assert list(_dynamic_modules_globals.keys()) == []
'''

child_process_script = child_process_script.format(
pickled_module_path=_escape(pickled_module_path))

try:
with open(pickled_module_path, 'wb') as f:
cloudpickle.dump(mod.func, f, protocol=self.protocol)

assert_run_python_script(textwrap.dedent(child_process_script))

finally:
os.unlink(pickled_module_path)

def test_module_locals_behavior(self):
# Makes sure that a local function defined in another module is
# correctly serialized. This notably checks that the globals are
@@ -1083,20 +1032,42 @@ def f0():
def f1():
return VARIABLE

cloned_f0 = {clone_func}(f0, protocol={protocol})
cloned_f1 = {clone_func}(f1, protocol={protocol})
assert f0.__globals__ is f1.__globals__

# pickle f0 and f1 inside the same pickle_string
cloned_f0, cloned_f1 = {clone_func}([f0, f1], protocol={protocol})

# cloned_f0 and cloned_f1 now share a global namespace that is isolated
# from any previously existing namespace
assert cloned_f0.__globals__ is cloned_f1.__globals__
assert cloned_f0.__globals__ is not f0.__globals__

# pickle f1 another time, but in a new pickle string
pickled_f1 = dumps(f1, protocol={protocol})

# Change the value of the global variable
# Change the value of the global variable in f0's new global namespace
cloned_f0()

# Ensure that the global variable is the same for another function
result_f1 = cloned_f1()
assert result_f1 == "changed_by_f0", result_f1

# Ensure that unpickling the global variable does not change its value
result_pickled_f1 = loads(pickled_f1)()
assert result_pickled_f1 == "changed_by_f0", result_pickled_f1
# thanks to cloudpickle's namespace isolation, unpickling and calling
# the clones of f0 and f1 does not affect the globals of already
# existing modules
assert VARIABLE == "default_value", VARIABLE

# Ensure that cloned_f1 and cloned_f0 share the same globals, as f1 and
# f0 shared the same globals at pickling time, and cloned_f1 was
# depickled from the same pickle string as cloned_f0
shared_global_var = cloned_f1()
assert shared_global_var == "changed_by_f0", shared_global_var

# f1 is unpickled a second time, but because pickled_f1 is a different
# pickle string from the one cloned_f0 and cloned_f1 were loaded from,
# new_cloned_f1 will not share their globals
new_cloned_f1 = loads(pickled_f1)
assert new_cloned_f1.__globals__ is not cloned_f1.__globals__
assert new_cloned_f1.__globals__ is not f1.__globals__

# get the value of new_cloned_f1's VARIABLE
new_global_var = new_cloned_f1()
assert new_global_var == "default_value", new_global_var
"""
for clone_func in ['local_clone', 'subprocess_pickle_echo']:
code = code_template.format(protocol=self.protocol,
@@ -1115,117 +1086,44 @@ def f0():
def f1():
return _TEST_GLOBAL_VARIABLE

cloned_f0 = cloudpickle.loads(cloudpickle.dumps(
f0, protocol=self.protocol))
cloned_f1 = cloudpickle.loads(cloudpickle.dumps(
f1, protocol=self.protocol))
# pickle f0 and f1 inside the same pickle_string
cloned_f0, cloned_f1 = pickle_depickle([f0, f1],
protocol=self.protocol)

# cloned_f0 and cloned_f1 now share a global namespace that is
# isolated from any previously existing namespace
assert cloned_f0.__globals__ is cloned_f1.__globals__
assert cloned_f0.__globals__ is not f0.__globals__

# pickle f1 another time, but in a new pickle string
pickled_f1 = cloudpickle.dumps(f1, protocol=self.protocol)

# Change the value of the global variable
# Change the global variable's value in f0's new global namespace
cloned_f0()
assert _TEST_GLOBAL_VARIABLE == "changed_by_f0"

# Ensure that the global variable is the same for another function
result_cloned_f1 = cloned_f1()
assert result_cloned_f1 == "changed_by_f0", result_cloned_f1
assert f1() == result_cloned_f1

# Ensure that unpickling the global variable does not change its
# value
result_pickled_f1 = cloudpickle.loads(pickled_f1)()
assert result_pickled_f1 == "changed_by_f0", result_pickled_f1
# unpickling and calling the clones of f0 and f1 should not affect
# the globals of already existing modules
assert _TEST_GLOBAL_VARIABLE == "default_value"

# Ensure that cloned_f1 and cloned_f0 share the same globals, as f1
# and f0 shared the same globals at pickling time, and cloned_f1
# was depickled from the same pickle string as cloned_f0
shared_global_var = cloned_f1()
assert shared_global_var == "changed_by_f0", shared_global_var

# f1 is unpickled a second time, but because pickled_f1 is a
# different pickle string from the one cloned_f0 and cloned_f1 were
# loaded from, new_cloned_f1 will not share their globals.
new_cloned_f1 = pickle.loads(pickled_f1)
assert new_cloned_f1.__globals__ is not cloned_f1.__globals__
assert new_cloned_f1.__globals__ is not f1.__globals__

# get the value of new_cloned_f1's _TEST_GLOBAL_VARIABLE
new_global_var = new_cloned_f1()
assert new_global_var == "default_value", new_global_var
finally:
_TEST_GLOBAL_VARIABLE = orig_value

def test_function_from_dynamic_module_with_globals_modifications(self):
# This test verifies that the global variable state of a function
# defined in a dynamic module in a child process is not reset by
# subsequent unpickling.

# first, we create a dynamic module in the parent process
mod = types.ModuleType('mod')
code = '''
GLOBAL_STATE = "initial value"

def func_defined_in_dynamic_module(v=None):
global GLOBAL_STATE
if v is not None:
GLOBAL_STATE = v
return GLOBAL_STATE
'''
exec(textwrap.dedent(code), mod.__dict__)

with_initial_globals_file = os.path.join(
self.tmpdir, 'function_with_initial_globals.pkl')
with_modified_globals_file = os.path.join(
self.tmpdir, 'function_with_modified_globals.pkl')

try:
# Simple sanity check on the function's output
assert mod.func_defined_in_dynamic_module() == "initial value"

# The function of mod is pickled two times, with two different
# values for the global variable GLOBAL_STATE.
# Then we launch a child process that sequentially unpickles the
# two functions. Those unpickle functions should share the same
# global variables in the child process:
# Once the first function gets unpickled, mod is created and
# tracked in the child environment. This state is preserved
# when unpickling the second function whatever the global variable
# GLOBAL_STATE's value at the time of pickling.

with open(with_initial_globals_file, 'wb') as f:
cloudpickle.dump(mod.func_defined_in_dynamic_module, f)

# Change the mod's global variable
mod.GLOBAL_STATE = 'changed value'

# At this point, mod.func_defined_in_dynamic_module()
# returns the updated value. Let's pickle it again.
assert mod.func_defined_in_dynamic_module() == 'changed value'
with open(with_modified_globals_file, 'wb') as f:
cloudpickle.dump(mod.func_defined_in_dynamic_module, f,
protocol=self.protocol)

child_process_code = """
import pickle

with open({with_initial_globals_file!r},'rb') as f:
func_with_initial_globals = pickle.load(f)

# At this point, a module called 'mod' should exist in
# _dynamic_modules_globals. Further function loading
# will use the globals living in mod.

assert func_with_initial_globals() == 'initial value'

# Load a function with initial global variable that was
# pickled after a change in the global variable
with open({with_modified_globals_file!r},'rb') as f:
func_with_modified_globals = pickle.load(f)

# assert that this unpickling did not modify the value of
# the local
assert func_with_modified_globals() == 'initial value'

# Update the value from the child process and check that
# unpickling again does not reset our change.
assert func_with_initial_globals('new value') == 'new value'
assert func_with_modified_globals() == 'new value'

with open({with_initial_globals_file!r},'rb') as f:
func_with_initial_globals = pickle.load(f)
assert func_with_initial_globals() == 'new value'
assert func_with_modified_globals() == 'new value'
""".format(
with_initial_globals_file=_escape(with_initial_globals_file),
with_modified_globals_file=_escape(with_modified_globals_file))
assert_run_python_script(textwrap.dedent(child_process_code))

finally:
os.unlink(with_initial_globals_file)
os.unlink(with_modified_globals_file)

@pytest.mark.skipif(sys.version_info >= (3, 0),
reason="hardcoded pickle bytes for 2.7")
def test_function_pickle_compat_0_4_0(self):