From 39bdf30604a325a2ae0fc935adad413006dbacdb Mon Sep 17 00:00:00 2001
From: Dan Osipov
Date: Wed, 18 Jun 2014 14:37:19 -0700
Subject: [PATCH 1/4] Add S3 configuration parameters to the EC2 deploy scripts

---
 ec2/deploy.generic/root/spark-ec2/ec2-variables.sh | 2 ++
 ec2/spark_ec2.py                                   | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh b/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh
index 3570891be804e..740c267fd9866 100644
--- a/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh
+++ b/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh
@@ -30,3 +30,5 @@ export HADOOP_MAJOR_VERSION="{{hadoop_major_version}}"
 export SWAP_MB="{{swap}}"
 export SPARK_WORKER_INSTANCES="{{spark_worker_instances}}"
 export SPARK_MASTER_OPTS="{{spark_master_opts}}"
+export AWS_ACCESS_KEY_ID="{{aws_access_key_id}}"
+export AWS_SECRET_ACCESS_KEY="{{aws_secret_access_key}}"
\ No newline at end of file
diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index bfd07593b92ed..ff3be34b951cf 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -711,7 +711,9 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules):
         "shark_version": shark_v,
         "hadoop_major_version": opts.hadoop_major_version,
         "spark_worker_instances": "%d" % opts.worker_instances,
-        "spark_master_opts": opts.master_opts
+        "spark_master_opts": opts.master_opts,
+        "aws_access_key_id": os.getenv('AWS_ACCESS_KEY_ID'),
+        "aws_secret_access_key": os.getenv('AWS_SECRET_ACCESS_KEY')
     }
 
     # Create a temp directory in which we will place all the files to be

From 7e0da26ef5ca638c493d298c3c09d54326c8b40a Mon Sep 17 00:00:00 2001
From: Dan Osipov
Date: Mon, 8 Sep 2014 10:30:35 -0700
Subject: [PATCH 2/4] Get AWS credentials out of boto connection instance

---
 ec2/spark_ec2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index ff3be34b951cf..3c94395f22c8e 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -712,8 +712,8 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules):
         "hadoop_major_version": opts.hadoop_major_version,
         "spark_worker_instances": "%d" % opts.worker_instances,
         "spark_master_opts": opts.master_opts,
-        "aws_access_key_id": os.getenv('AWS_ACCESS_KEY_ID'),
-        "aws_secret_access_key": os.getenv('AWS_SECRET_ACCESS_KEY')
+        "aws_access_key_id": conn.aws_access_key_id,
+        "aws_secret_access_key": conn.aws_secret_access_key
     }
 
     # Create a temp directory in which we will place all the files to be
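Patch 2 relies on boto exposing whatever credentials it resolved (from environment variables, ~/.boto, or an IAM instance profile) as properties on the connection object, so deploy_files() no longer has to read os.getenv() itself. A minimal sketch of that behavior, assuming boto 2.x; the region name is only an example:

    import boto.ec2

    # boto resolves AWS credentials on its own when the connection is created,
    # e.g. from AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY, ~/.boto, or an IAM
    # instance profile.
    conn = boto.ec2.connect_to_region("us-east-1")

    # The resolved key pair is then available on the connection; this is what
    # the patched deploy_files() copies into the template variables.
    access_key = conn.aws_access_key_id
    secret_key = conn.aws_secret_access_key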
"spark_worker_instances": "%d" % opts.worker_instances, - "spark_master_opts": opts.master_opts, - "aws_access_key_id": conn.aws_access_key_id, - "aws_secret_access_key": conn.aws_secret_access_key + "spark_master_opts": opts.master_opts } + if opts.copy_aws_credentials: + template_vars["aws_access_key_id"] = conn.aws_access_key_id + template_vars["aws_secret_access_key"] = conn.aws_secret_access_key + else: + template_vars["aws_access_key_id"] = "" + template_vars["aws_secret_access_key"] = "" + # Create a temp directory in which we will place all the files to be # deployed after we substitue template parameters in them tmp_dir = tempfile.mkdtemp() From 758da8b2521b19991302ac7666d9c2c6d441f4ea Mon Sep 17 00:00:00 2001 From: Dan Osipov Date: Tue, 16 Sep 2014 10:02:01 -0700 Subject: [PATCH 4/4] Modify documentation to include the new parameter --- docs/ec2-scripts.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ec2-scripts.md b/docs/ec2-scripts.md index f5ac6d894e1eb..b2ca6a9b48f32 100644 --- a/docs/ec2-scripts.md +++ b/docs/ec2-scripts.md @@ -156,6 +156,6 @@ If you have a patch or suggestion for one of these limitations, feel free to # Accessing Data in S3 -Spark's file interface allows it to process data in Amazon S3 using the same URI formats that are supported for Hadoop. You can specify a path in S3 as input through a URI of the form `s3n:///path`. You will also need to set your Amazon security credentials, either by setting the environment variables `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` before your program or through `SparkContext.hadoopConfiguration`. Full instructions on S3 access using the Hadoop input libraries can be found on the [Hadoop S3 page](http://wiki.apache.org/hadoop/AmazonS3). +Spark's file interface allows it to process data in Amazon S3 using the same URI formats that are supported for Hadoop. You can specify a path in S3 as input through a URI of the form `s3n:///path`. To provide AWS credentials for S3 access, launch the Spark cluster with the option `--copy-aws-credentials`. Full instructions on S3 access using the Hadoop input libraries can be found on the [Hadoop S3 page](http://wiki.apache.org/hadoop/AmazonS3). In addition to using a single input file, you can also use a directory of files as input by simply giving the path to the directory.