From a4863c4cc99c304f78a0f17cfe8ec3d0a8e574e7 Mon Sep 17 00:00:00 2001
From: Davies Liu
Date: Tue, 26 Aug 2014 18:06:27 -0700
Subject: [PATCH] randomize the dirs for each process

---
 python/pyspark/shuffle.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/pyspark/shuffle.py b/python/pyspark/shuffle.py
index 1ebe7df418327..2750f117ba559 100644
--- a/python/pyspark/shuffle.py
+++ b/python/pyspark/shuffle.py
@@ -21,6 +21,7 @@
 import shutil
 import warnings
 import gc
+import random
 
 from pyspark.serializers import BatchedSerializer, PickleSerializer
 
@@ -216,6 +217,9 @@ def _get_dirs(self):
         """ Get all the directories """
         path = os.environ.get("SPARK_LOCAL_DIRS", "/tmp")
         dirs = path.split(",")
+        if len(dirs) > 1:
+            rnd = random.Random(os.getpid() + id(dirs))
+            random.shuffle(dirs, rnd.random)
         return [os.path.join(d, "python", str(os.getpid()), str(id(self)))
                 for d in dirs]
 