From 85bdf17921cf62f9444dc21feddd8294056b77ea Mon Sep 17 00:00:00 2001 From: Laszlo Bodor Date: Fri, 16 May 2025 09:46:26 +0200 Subject: [PATCH 1/2] TEZ-4631: Include an official script that installs hadoop and tez and runs a simple example DAG --- dev-support/bin/tez_run_example.sh | 119 +++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 dev-support/bin/tez_run_example.sh diff --git a/dev-support/bin/tez_run_example.sh b/dev-support/bin/tez_run_example.sh new file mode 100644 index 0000000000..d139186e3b --- /dev/null +++ b/dev-support/bin/tez_run_example.sh @@ -0,0 +1,119 @@ + +# This script is used to set up a local Hadoop and Tez environment for running a simple word count example. +# Prerequisites +# 1. java is installed and JAVA_HOME is set +# 2. ssh localhost works without password + +# configure this if needed, by default it will use the latest stable versions in the current directory +export TEZ_VERSION=$(curl -s "https://downloads.apache.org/tez/" | grep -oP '\K[0-9]+\.[0-9]+\.[0-9]+(?=/)' | sort -V | tail -1) # e.g. 0.10.4 +export HADOOP_VERSION=$(curl -s "https://downloads.apache.org/hadoop/common/" | grep -oP 'hadoop-\K[0-9]+\.[0-9]+\.[0-9]+(?=/)' | sort -V | tail -1) # e.g. 3.4.1 +export HADOOP_STACK_HOME=$PWD + +echo "Demo script is running in $HADOOP_STACK_HOME with TEZ version $TEZ_VERSION and HADOOP version $HADOOP_VERSION" + +cd $HADOOP_STACK_HOME +wget -nc https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz +wget -nc https://archive.apache.org/dist/tez/$TEZ_VERSION/apache-tez-$TEZ_VERSION-bin.tar.gz + +if [ ! -d "hadoop-$HADOOP_VERSION" ]; then + tar -xzf hadoop-$HADOOP_VERSION.tar.gz +fi + +if [ ! -d "apache-tez-$TEZ_VERSION-bin" ]; then + tar -xzf apache-tez-$TEZ_VERSION-bin.tar.gz +fi + +ln -s hadoop-$HADOOP_VERSION hadoop +ln -s apache-tez-$TEZ_VERSION-bin tez + +export HADOOP_HOME=$HADOOP_STACK_HOME/hadoop +export TEZ_HOME=$HADOOP_STACK_HOME/tez +export HADOOP_CLASSPATH=$TEZ_HOME/*:$TEZ_HOME/lib/*:$TEZ_HOME/conf + +export PATH=$PATH:$HADOOP_HOME/bin + +# https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html#Pseudo-Distributed_Operation +cat < $HADOOP_HOME/etc/hadoop/hdfs-site.xml + + + + + + dfs.replication + 1 + + +EOF + +cat < $HADOOP_HOME/etc/hadoop/core-site.xml + + + + + + fs.defaultFS + hdfs://localhost:9000 + + +EOF + +cat < $HADOOP_HOME/etc/hadoop/yarn-site.xml + + + + + + yarn.nodemanager.aux-services + mapreduce_shuffle + + +EOF + +# optionally stop previous clusters if any +#$HADOOP_HOME/sbin/stop-dfs.sh +#$HADOOP_HOME/sbin/stop-yarn.sh + +hdfs namenode -format + +$HADOOP_HOME/sbin/start-dfs.sh +$HADOOP_HOME/sbin/start-yarn.sh + +hadoop fs -mkdir /apps/ +hadoop fs -mkdir /apps/tez-$TEZ_VERSION +hadoop fs -copyFromLocal $TEZ_HOME/share/tez.tar.gz /apps/tez-$TEZ_VERSION + +# create a simple tez-site.xml +cat < $TEZ_HOME/conf/tez-site.xml + + + + + + tez.lib.uris + /apps/tez-$TEZ_VERSION/tez.tar.gz + + +EOF + +# create a simple input file +cat < ./words.txt +Apple +Banana +Car +Apple +Banana +Car +Dog +Elephant +Friend +Game +EOF + +hadoop fs -copyFromLocal words.txt /words.txt + +# finally run the example +hadoop jar $TEZ_HOME/tez-examples-$TEZ_VERSION.jar orderedwordcount /words.txt /words_out + +# check the output +hadoop fs -ls /words_out +hadoop fs -text /words_out/part-v002-o000-r-00000 \ No newline at end of file From bd94d8bc04ce02a7939ee281111f7ad818aadeb8 Mon Sep 17 00:00:00 2001 From: Laszlo Bodor Date: Tue, 20 May 2025 09:35:46 +0200 Subject: [PATCH 2/2] improvements + PR comments --- dev-support/bin/tez_run_example.sh | 73 +++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 22 deletions(-) mode change 100644 => 100755 dev-support/bin/tez_run_example.sh diff --git a/dev-support/bin/tez_run_example.sh b/dev-support/bin/tez_run_example.sh old mode 100644 new mode 100755 index d139186e3b..7e980b9484 --- a/dev-support/bin/tez_run_example.sh +++ b/dev-support/bin/tez_run_example.sh @@ -4,30 +4,42 @@ # 1. java is installed and JAVA_HOME is set # 2. ssh localhost works without password -# configure this if needed, by default it will use the latest stable versions in the current directory -export TEZ_VERSION=$(curl -s "https://downloads.apache.org/tez/" | grep -oP '\K[0-9]+\.[0-9]+\.[0-9]+(?=/)' | sort -V | tail -1) # e.g. 0.10.4 -export HADOOP_VERSION=$(curl -s "https://downloads.apache.org/hadoop/common/" | grep -oP 'hadoop-\K[0-9]+\.[0-9]+\.[0-9]+(?=/)' | sort -V | tail -1) # e.g. 3.4.1 -export HADOOP_STACK_HOME=$PWD +# All parameters are optional: +# TEZ_VERSION: defaults to the latest version available on the Apache Tez download page +# HADOOP_VERSION: defaults to the version which belongs to the TEZ_VERSION +# TEZ_EXAMPLE_WORKING_DIR: defaults to the current working directory -echo "Demo script is running in $HADOOP_STACK_HOME with TEZ version $TEZ_VERSION and HADOOP version $HADOOP_VERSION" +# TEZ_VERSION comes from environment variable or is fetched from the Apache Tez download page +export TEZ_VERSION=${TEZ_VERSION:=$(curl -s "https://downloads.apache.org/tez/" | grep --color=never -o '[0-9]\+\.[0-9]\+\.[0-9]\+' | sed -n '/\/$/!p' | sort -V | tail -1)} # e.g. 0.10.4 +export TEZ_EXAMPLE_WORKING_DIR=${TEZ_EXAMPLE_WORKING_DIR:=$PWD} +cd $TEZ_EXAMPLE_WORKING_DIR -cd $HADOOP_STACK_HOME -wget -nc https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz +echo "TEZ_VERSION: $TEZ_VERSION" wget -nc https://archive.apache.org/dist/tez/$TEZ_VERSION/apache-tez-$TEZ_VERSION-bin.tar.gz -if [ ! -d "hadoop-$HADOOP_VERSION" ]; then - tar -xzf hadoop-$HADOOP_VERSION.tar.gz -fi - +# Need to extract the Tez tarball early to get hadoop version it depends on if [ ! -d "apache-tez-$TEZ_VERSION-bin" ]; then tar -xzf apache-tez-$TEZ_VERSION-bin.tar.gz fi -ln -s hadoop-$HADOOP_VERSION hadoop -ln -s apache-tez-$TEZ_VERSION-bin tez +export HADOOP_VERSION=${HADOOP_VERSION:=$(basename apache-tez-$TEZ_VERSION-bin/lib/hadoop-hdfs-client-*.jar | sed -E 's/.*hadoop-hdfs-client-([0-9]+\.[0-9]+\.[0-9]+)\.jar/\1/')} # e.g. 3.4.1 + +cat < $HADOOP_HOME/etc/hadoop/yarn-site.xml EOF # optionally stop previous clusters if any -#$HADOOP_HOME/sbin/stop-dfs.sh -#$HADOOP_HOME/sbin/stop-yarn.sh +$HADOOP_HOME/sbin/stop-dfs.sh +$HADOOP_HOME/sbin/stop-yarn.sh -hdfs namenode -format +rm -rf /tmp/hadoop-$USER/dfs/data +hdfs namenode -format -force $HADOOP_HOME/sbin/start-dfs.sh $HADOOP_HOME/sbin/start-yarn.sh -hadoop fs -mkdir /apps/ -hadoop fs -mkdir /apps/tez-$TEZ_VERSION +hadoop fs -mkdir -p /apps/tez-$TEZ_VERSION hadoop fs -copyFromLocal $TEZ_HOME/share/tez.tar.gz /apps/tez-$TEZ_VERSION # create a simple tez-site.xml @@ -111,9 +123,26 @@ EOF hadoop fs -copyFromLocal words.txt /words.txt +export HADOOP_USER_CLASSPATH_FIRST=true # finally run the example -hadoop jar $TEZ_HOME/tez-examples-$TEZ_VERSION.jar orderedwordcount /words.txt /words_out +yarn jar $TEZ_HOME/tez-examples-$TEZ_VERSION.jar orderedwordcount /words.txt /words_out # check the output hadoop fs -ls /words_out -hadoop fs -text /words_out/part-v002-o000-r-00000 \ No newline at end of file +hadoop fs -text /words_out/part-v002-o000-r-00000 + + +cat <