From 39bdf30604a325a2ae0fc935adad413006dbacdb Mon Sep 17 00:00:00 2001
From: Dan Osipov
Date: Wed, 18 Jun 2014 14:37:19 -0700
Subject: [PATCH 1/4] Add S3 configuration parameters to the EC2 deploy scripts

---
 ec2/deploy.generic/root/spark-ec2/ec2-variables.sh | 2 ++
 ec2/spark_ec2.py                                   | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh b/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh
index 3570891be804e..740c267fd9866 100644
--- a/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh
+++ b/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh
@@ -30,3 +30,5 @@ export HADOOP_MAJOR_VERSION="{{hadoop_major_version}}"
 export SWAP_MB="{{swap}}"
 export SPARK_WORKER_INSTANCES="{{spark_worker_instances}}"
 export SPARK_MASTER_OPTS="{{spark_master_opts}}"
+export AWS_ACCESS_KEY_ID="{{aws_access_key_id}}"
+export AWS_SECRET_ACCESS_KEY="{{aws_secret_access_key}}"
\ No newline at end of file
diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index bfd07593b92ed..ff3be34b951cf 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -711,7 +711,9 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules):
         "shark_version": shark_v,
         "hadoop_major_version": opts.hadoop_major_version,
         "spark_worker_instances": "%d" % opts.worker_instances,
-        "spark_master_opts": opts.master_opts
+        "spark_master_opts": opts.master_opts,
+        "aws_access_key_id": os.getenv('AWS_ACCESS_KEY_ID'),
+        "aws_secret_access_key": os.getenv('AWS_SECRET_ACCESS_KEY')
     }
 
     # Create a temp directory in which we will place all the files to be

From 7e0da26ef5ca638c493d298c3c09d54326c8b40a Mon Sep 17 00:00:00 2001
From: Dan Osipov
Date: Mon, 8 Sep 2014 10:30:35 -0700
Subject: [PATCH 2/4] Get AWS credentials out of boto connection instance

---
 ec2/spark_ec2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index ff3be34b951cf..3c94395f22c8e 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -712,8 +712,8 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules):
         "hadoop_major_version": opts.hadoop_major_version,
         "spark_worker_instances": "%d" % opts.worker_instances,
         "spark_master_opts": opts.master_opts,
-        "aws_access_key_id": os.getenv('AWS_ACCESS_KEY_ID'),
-        "aws_secret_access_key": os.getenv('AWS_SECRET_ACCESS_KEY')
+        "aws_access_key_id": conn.aws_access_key_id,
+        "aws_secret_access_key": conn.aws_secret_access_key
     }
 
     # Create a temp directory in which we will place all the files to be
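Patch 2 relies on boto exposing whatever credentials it resolved (from environment variables, ~/.boto, or an IAM instance profile) as properties on the connection object, so deploy_files() no longer has to read os.getenv() itself. A minimal sketch of that behavior, assuming boto 2.x; the region name is only an example:

    import boto.ec2

    # boto resolves AWS credentials on its own when the connection is created,
    # e.g. from AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY, ~/.boto, or an IAM
    # instance profile.
    conn = boto.ec2.connect_to_region("us-east-1")

    # The resolved key pair is then available on the connection; this is what
    # the patched deploy_files() copies into the template variables.
    access_key = conn.aws_access_key_id
    secret_key = conn.aws_secret_access_key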
"spark_worker_instances": "%d" % opts.worker_instances, - "spark_master_opts": opts.master_opts, - "aws_access_key_id": conn.aws_access_key_id, - "aws_secret_access_key": conn.aws_secret_access_key + "spark_master_opts": opts.master_opts } + if opts.copy_aws_credentials: + template_vars["aws_access_key_id"] = conn.aws_access_key_id + template_vars["aws_secret_access_key"] = conn.aws_secret_access_key + else: + template_vars["aws_access_key_id"] = "" + template_vars["aws_secret_access_key"] = "" + # Create a temp directory in which we will place all the files to be # deployed after we substitue template parameters in them tmp_dir = tempfile.mkdtemp() From 758da8b2521b19991302ac7666d9c2c6d441f4ea Mon Sep 17 00:00:00 2001 From: Dan Osipov Date: Tue, 16 Sep 2014 10:02:01 -0700 Subject: [PATCH 4/4] Modify documentation to include the new parameter --- docs/ec2-scripts.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ec2-scripts.md b/docs/ec2-scripts.md index f5ac6d894e1eb..b2ca6a9b48f32 100644 --- a/docs/ec2-scripts.md +++ b/docs/ec2-scripts.md @@ -156,6 +156,6 @@ If you have a patch or suggestion for one of these limitations, feel free to # Accessing Data in S3 -Spark's file interface allows it to process data in Amazon S3 using the same URI formats that are supported for Hadoop. You can specify a path in S3 as input through a URI of the form `s3n:///path`. You will also need to set your Amazon security credentials, either by setting the environment variables `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` before your program or through `SparkContext.hadoopConfiguration`. Full instructions on S3 access using the Hadoop input libraries can be found on the [Hadoop S3 page](http://wiki.apache.org/hadoop/AmazonS3). +Spark's file interface allows it to process data in Amazon S3 using the same URI formats that are supported for Hadoop. You can specify a path in S3 as input through a URI of the form `s3n:///path`. To provide AWS credentials for S3 access, launch the Spark cluster with the option `--copy-aws-credentials`. Full instructions on S3 access using the Hadoop input libraries can be found on the [Hadoop S3 page](http://wiki.apache.org/hadoop/AmazonS3). In addition to using a single input file, you can also use a directory of files as input by simply giving the path to the directory.