diff --git a/cloudpickle/cloudpickle.py b/cloudpickle/cloudpickle.py index 842723539..cd39e92ba 100644 --- a/cloudpickle/cloudpickle.py +++ b/cloudpickle/cloudpickle.py @@ -78,6 +78,27 @@ PY3 = True +# caches dynamic modules that are not referenced in sys.modules +_dynamic_modules_globals = weakref.WeakValueDictionary() + +# _dynamic_modules_globals will store base_globals variables. Or, base_globals +# end up being dicts, and built-in dict instances cannot be weakly referenced. +# Therefore, we create a mirror of the dict type, and at the mutation of +# base globals from string to dict, we use the _DynamicModuleFuncGlobals +# constructor instead of the builtin dict one. + + +class _DynamicModuleFuncGlobals(dict): + """Global variables referenced by a function defined in a dynamic module + + To avoid leaking references we store such context in a WeakValueDictionary + instance. However instances of python builtin types such as dict cannot + be used directly as values in such a construct, hence the need for a + derived class. + """ + pass + + def _make_cell_set_template_code(): """Get the Python compiler to emit LOAD_FAST(arg); STORE_DEREF @@ -1090,12 +1111,17 @@ def _make_skel_func(code, cell_count, base_globals=None): if base_globals is None: base_globals = {} elif isinstance(base_globals, str): + base_globals_name = base_globals if sys.modules.get(base_globals, None) is not None: - # this checks if we can import the previous environment the object + # This checks if we can import the previous environment the object # lived in base_globals = vars(sys.modules[base_globals]) else: - base_globals = {} + base_globals = _dynamic_modules_globals.get( + base_globals_name, None) + if base_globals is None: + base_globals = _DynamicModuleFuncGlobals() + _dynamic_modules_globals[base_globals_name] = base_globals base_globals['__builtins__'] = __builtins__ diff --git a/tests/cloudpickle_test.py b/tests/cloudpickle_test.py index 41f185851..22fb0cfd6 100644 --- a/tests/cloudpickle_test.py +++ b/tests/cloudpickle_test.py @@ -4,6 +4,7 @@ import collections import base64 import functools +import gc import imp from io import BytesIO import itertools @@ -43,6 +44,7 @@ import cloudpickle from cloudpickle.cloudpickle import _find_module, _make_empty_cell, cell_set +from cloudpickle.cloudpickle import _dynamic_modules_globals from .testutils import subprocess_pickle_echo from .testutils import assert_run_python_script @@ -440,6 +442,59 @@ def method(self, x): mod1, mod2 = pickle_depickle([mod, mod]) self.assertEqual(id(mod1), id(mod2)) + def test_dynamic_modules_globals(self): + # _dynamic_modules_globals is a WeakValueDictionary, so if a value + # in this dict (containing a set of global variables from a dynamic + # module created in the parent process) has no other reference than in + # this dict in the child process, it will be garbage collected. + + # We first create a module + mod = imp.new_module('mod') + code = ''' + x = 1 + def func(): + return + ''' + exec(textwrap.dedent(code), mod.__dict__) + + pickled_module_path = 'mod_f.pkl' + + child_process_script = ''' + import pickle + from cloudpickle.cloudpickle import _dynamic_modules_globals + import gc + with open("{pickled_module_path}", 'rb') as f: + func = pickle.load(f) + + # A dictionnary storing the globals of the newly unpickled function + # should have been created + assert list(_dynamic_modules_globals.keys()) == ['mod'] + + # func.__globals__ is the only non-weak reference to + # _dynamic_modules_globals['mod']. By deleting func, we delete also + # _dynamic_modules_globals['mod'] + del func + gc.collect() + + # There is no reference to the globals of func since func has been + # deleted and _dynamic_modules_globals is a WeakValueDictionary, + # so _dynamic_modules_globals should now be empty + assert list(_dynamic_modules_globals.keys()) == [] + ''' + + child_process_script = child_process_script.format( + pickled_module_path=pickled_module_path) + + try: + with open(pickled_module_path, 'wb') as f: + cloudpickle.dump(mod.func, f) + + assert_run_python_script(textwrap.dedent(child_process_script)) + + finally: + os.unlink(pickled_module_path) + + def test_load_dynamic_module_in_grandchild_process(self): # Make sure that when loaded, a dynamic module preserves its dynamic # property. Otherwise, this will lead to an ImportError if pickled in @@ -993,6 +1048,95 @@ def f1(): finally: _TEST_GLOBAL_VARIABLE = orig_value + def test_function_from_dynamic_module_with_globals_modifications(self): + """ + this test verifies that: + - any modification in the global variables of a dynamic + module living in a child process won't get overridden + when new objects are unpickled in the child's interpreter + + - vice versa, e.g a modification in the parent process does not + override the value of the variables in the child process + + The two cases are equivalent, and here, the second case is tested. + """ + + # first, we create a dynamic module in the parent process + mod = imp.new_module('mod') + code = ''' + x = 1 + def func_that_relies_on_dynamic_module(v=None): + global x + if v is not None: + x = v + return x + ''' + exec(textwrap.dedent(code), mod.__dict__) + + try: + # simple sanity check on the function's output + assert mod.func_that_relies_on_dynamic_module() == 1 + + # the function of mod is pickled two times, with two different + # values for the global variable x. + + # a child process that sequentially unpickles the + # two functions is then launched + + # once the _first_ function gets unpickled, mod is created and + # tracked in the child environment. Whatever the global variable + # x's value in the second function, it will be overriden by the + # initial value of x in the child environment + + with open('function_with_initial_globals.pk', 'wb') as fid: + cloudpickle.dump(mod.func_that_relies_on_dynamic_module, fid) + + # change the mod's global variable x + mod.x = 2 + + # at this point, mod.func_that_relies_on_dynamic_module() + # returns 2 + assert mod.func_that_relies_on_dynamic_module() == 2 + with open('function_with_modified_globals.pk', 'wb') as fid: + cloudpickle.dump(mod.func_that_relies_on_dynamic_module, fid) + + child_process_code = """ + import pickle + + with open('function_with_initial_globals.pk','rb') as fid: + function_with_initial_globals = pickle.load(fid) + + # at this point, a module called 'mod' should exist in + # _dynamic_modules_globals. further function loading + # will use the globals living in mod + + assert function_with_initial_globals() == 1 + + # load a function with initial global variable x set to 2 + with open('function_with_modified_globals.pk','rb') as fid: + function_with_modified_globals = pickle.load(fid) + + # assert the initial global got overridden by + # _dynamic_modules_globals + assert function_with_modified_globals()==1 + + # both function's global x should point to the + # same variable. + # calling function_with_initial_globals('test_value') + # will change this variable: function_with_modified_globals() + # should return the changed variable + assert function_with_initial_globals('test_value') == 'test_value' + assert function_with_modified_globals() == 'test_value' + """ + + # finally, we execute the code + assert_run_python_script(textwrap.dedent(child_process_code)) + + finally: + # remove the created files + os.unlink('function_with_initial_globals.pk') + os.unlink('function_with_modified_globals.pk') + @pytest.mark.skipif(sys.version_info >= (3, 0), reason="hardcoded pickle bytes for 2.7") def test_function_pickle_compat_0_4_0(self):