Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 148 additions & 0 deletions dev-support/bin/tez_run_example.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@

# This script is used to set up a local Hadoop and Tez environment for running a simple word count example.
# Prerequisites:
# 1. Java is installed and JAVA_HOME is set
# 2. "ssh localhost" works without a password
# 3. curl, wget and tar are available on the PATH

# All parameters are optional environment variables:
# TEZ_VERSION: defaults to the latest version listed on the Apache Tez download page
# HADOOP_VERSION: defaults to the Hadoop version bundled with the chosen Tez release
#                 (read from the hadoop-hdfs-client jar in its lib directory)
# TEZ_EXAMPLE_WORKING_DIR: defaults to the current working directory

# TEZ_VERSION comes from the environment. When it is unset or empty, the
# highest x.y.z version string found on the Apache Tez download page is used.
if [ -z "${TEZ_VERSION:-}" ]; then
  TEZ_VERSION=$(curl -s "https://downloads.apache.org/tez/" \
    | grep --color=never -oE '[0-9]+\.[0-9]+\.[0-9]+' \
    | sort -V \
    | tail -1) # e.g. 0.10.4
fi
# Every later download URL and directory name embeds the version, so stop
# right away when it could not be determined.
if [ -z "$TEZ_VERSION" ]; then
  echo "ERROR: unable to determine TEZ_VERSION from https://downloads.apache.org/tez/; set TEZ_VERSION and rerun" >&2
  exit 1
fi
export TEZ_VERSION

# TEZ_EXAMPLE_WORKING_DIR defaults to the directory the script was started from.
TEZ_EXAMPLE_WORKING_DIR="${TEZ_EXAMPLE_WORKING_DIR:-$PWD}"
cd "$TEZ_EXAMPLE_WORKING_DIR" || exit 1
# Normalize to an absolute path: HADOOP_HOME and TEZ_HOME are built from this
# variable after the cd, so a relative value would point to the wrong place.
TEZ_EXAMPLE_WORKING_DIR=$PWD
export TEZ_EXAMPLE_WORKING_DIR

echo "TEZ_VERSION: $TEZ_VERSION"
tez_tarball="apache-tez-$TEZ_VERSION-bin.tar.gz"
# -nc (no-clobber) skips the download when the tarball is already present.
wget -nc "https://archive.apache.org/dist/tez/$TEZ_VERSION/$tez_tarball"
# Checking for the file covers both a fresh download and a reused tarball.
if [ ! -f "$tez_tarball" ]; then
  echo "ERROR: $tez_tarball is not available after the download attempt" >&2
  exit 1
fi

# Need to extract the Tez tarball early to get hadoop version it depends on
if [ ! -d "apache-tez-$TEZ_VERSION-bin" ]; then
  if ! tar -xzf "$tez_tarball"; then
    echo "ERROR: failed to extract $tez_tarball" >&2
    exit 1
  fi
fi

# HADOOP_VERSION comes from the environment. When it is unset or empty, it is
# read from the name of the hadoop-hdfs-client jar in the Tez lib directory.
if [ -z "${HADOOP_VERSION:-}" ]; then
  for hdfs_client_jar in "apache-tez-$TEZ_VERSION-bin"/lib/hadoop-hdfs-client-*.jar; do
    # sed -n with the p flag prints only on a match, so an unmatched glob or
    # an unexpected jar name yields an empty string instead of a bogus version.
    HADOOP_VERSION=$(basename "$hdfs_client_jar" \
      | sed -nE 's/^hadoop-hdfs-client-([0-9]+\.[0-9]+\.[0-9]+)\.jar$/\1/p')
    if [ -n "$HADOOP_VERSION" ]; then
      break
    fi
  done
fi
if [ -z "$HADOOP_VERSION" ]; then
  echo "ERROR: unable to detect HADOOP_VERSION from apache-tez-$TEZ_VERSION-bin/lib; set HADOOP_VERSION and rerun" >&2
  exit 1
fi
export HADOOP_VERSION # e.g. 3.4.1

# Announce the working directory and the versions this run is going to use.
printf '%s\n' \
  "***" \
  "*** Demo setup script is running in $TEZ_EXAMPLE_WORKING_DIR ***" \
  "*** TEZ version: $TEZ_VERSION" \
  "*** HADOOP version $HADOOP_VERSION" \
  "***"

hadoop_tarball="hadoop-$HADOOP_VERSION.tar.gz"
# -nc (no-clobber) skips the download when the tarball is already present.
wget -nc "https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/$hadoop_tarball"
# Checking for the file covers both a fresh download and a reused tarball.
if [ ! -f "$hadoop_tarball" ]; then
  echo "ERROR: $hadoop_tarball is not available after the download attempt" >&2
  exit 1
fi

# Extract only once; an existing directory from a previous run is reused.
if [ ! -d "hadoop-$HADOOP_VERSION" ]; then
  if ! tar -xzf "$hadoop_tarball"; then
    echo "ERROR: failed to extract $hadoop_tarball" >&2
    exit 1
  fi
fi

# Locations of the unpacked Hadoop and Tez distributions in the working dir.
HADOOP_HOME="${TEZ_EXAMPLE_WORKING_DIR}/hadoop-${HADOOP_VERSION}"
TEZ_HOME="${TEZ_EXAMPLE_WORKING_DIR}/apache-tez-${TEZ_VERSION}-bin"
# Tez jars (top level and lib/) plus the Tez conf dir; the '*' entries are
# stored as literal wildcards, the shell does not expand them here.
HADOOP_CLASSPATH="${TEZ_HOME}/*:${TEZ_HOME}/lib/*:${TEZ_HOME}/conf"
export HADOOP_HOME TEZ_HOME HADOOP_CLASSPATH

# Make the hadoop, hdfs and yarn commands used below callable by name.
PATH="${PATH}:${HADOOP_HOME}/bin"
export PATH

# https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html#Pseudo-Distributed_Operation

#######################################
# Writes a Hadoop *-site.xml file that contains exactly one property.
# Arguments: $1 - file to create or overwrite
#            $2 - property name
#            $3 - property value
# Returns:   exit status of the write (non-zero if the file cannot be written)
#######################################
write_hadoop_site_xml() {
  # The target is quoted so a path containing whitespace is still one word.
  cat <<EOF > "$1"
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
<property>
<name>$2</name>
<value>$3</value>
</property>
</configuration>
EOF
}

# HDFS: keep a single replica of every block.
write_hadoop_site_xml "$HADOOP_HOME/etc/hadoop/hdfs-site.xml" \
  dfs.replication 1

# Default file system: the local HDFS instance on port 9000.
write_hadoop_site_xml "$HADOOP_HOME/etc/hadoop/core-site.xml" \
  fs.defaultFS hdfs://localhost:9000

# YARN NodeManager: register the mapreduce_shuffle auxiliary service.
write_hadoop_site_xml "$HADOOP_HOME/etc/hadoop/yarn-site.xml" \
  yarn.nodemanager.aux-services mapreduce_shuffle

# optionally stop previous clusters if any (best effort: the exit status of
# the stop scripts is deliberately ignored)
"$HADOOP_HOME/sbin/stop-dfs.sh"
"$HADOOP_HOME/sbin/stop-yarn.sh"

# Start from an empty HDFS: remove the old dfs/data directory and reformat
# the NameNode. USER may be unset in some environments, hence the id -un
# fallback; the :? guard aborts instead of running rm -rf on a wrong path.
hadoop_os_user="${USER:-$(id -un)}"
rm -rf -- "/tmp/hadoop-${hadoop_os_user:?}/dfs/data"
if ! hdfs namenode -format -force; then
  echo "ERROR: formatting the NameNode failed" >&2
  exit 1
fi

# Start failures are reported but not fatal; the following HDFS commands
# will fail visibly if a daemon is really down.
"$HADOOP_HOME/sbin/start-dfs.sh" \
  || echo "WARNING: start-dfs.sh exited with a non-zero status" >&2
"$HADOOP_HOME/sbin/start-yarn.sh" \
  || echo "WARNING: start-yarn.sh exited with a non-zero status" >&2

# HDFS was started just before this point; wait until the NameNode accepts
# writes (i.e. has left safe mode) before creating anything.
hdfs dfsadmin -safemode wait

# Publish the Tez tarball on HDFS under /apps/tez-<version>, the location
# that tez.lib.uris in tez-site.xml refers to.
if ! hadoop fs -mkdir -p "/apps/tez-$TEZ_VERSION"; then
  echo "ERROR: could not create /apps/tez-$TEZ_VERSION on HDFS" >&2
  exit 1
fi
if ! hadoop fs -copyFromLocal "$TEZ_HOME/share/tez.tar.gz" "/apps/tez-$TEZ_VERSION"; then
  echo "ERROR: could not upload $TEZ_HOME/share/tez.tar.gz to HDFS" >&2
  exit 1
fi

# create a simple tez-site.xml
# tez.lib.uris is set to the path of the Tez tarball uploaded to HDFS above.
# The target path is quoted so that a working directory containing whitespace
# does not break the redirection, and a failed write stops the script because
# the example cannot run without this file.
cat <<EOF > "$TEZ_HOME/conf/tez-site.xml" || { echo "ERROR: could not write $TEZ_HOME/conf/tez-site.xml" >&2; exit 1; }
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
<property>
<name>tez.lib.uris</name>
<value>/apps/tez-$TEZ_VERSION/tez.tar.gz</value>
</property>
</configuration>
EOF

# Build the sample input: one word per line, with a few words repeated.
printf '%s\n' \
  Apple Banana Car \
  Apple Banana Car \
  Dog Elephant Friend Game \
  > ./words.txt

# Upload the input file to the HDFS root for the example job.
hadoop fs -copyFromLocal words.txt /words.txt

# NOTE(review): presumably makes the Hadoop launchers put HADOOP_CLASSPATH
# (the Tez jars) ahead of Hadoop's own classpath entries - confirm.
export HADOOP_USER_CLASSPATH_FIRST=true
# finally run the example; stop here on failure so that the closing
# "environment is already set up" message is not printed after a failed DAG
if ! yarn jar "$TEZ_HOME/tez-examples-$TEZ_VERSION.jar" orderedwordcount /words.txt /words_out; then
  echo "ERROR: the orderedwordcount example failed" >&2
  exit 1
fi

# check the output: list the output directory, then print the result file
hadoop fs -ls /words_out
hadoop fs -text /words_out/part-v002-o000-r-00000


# Closing message: copy-pasteable commands for rerunning the DAG plus the
# local web UIs. The variables are expanded now, so the printed commands
# contain the concrete paths of this run. The first run has already created
# /words_out, so the instructions remove it before resubmitting the job
# (NOTE(review): the example is expected to reject an existing output
# directory - the cleanup is harmless if it does not).
cat <<EOF
*** Since the environment is already set up, you can rerun the DAG using the commands below.

export HADOOP_USER_CLASSPATH_FIRST=true
export TEZ_HOME=$TEZ_EXAMPLE_WORKING_DIR/apache-tez-$TEZ_VERSION-bin
export HADOOP_CLASSPATH=$TEZ_HOME/*:$TEZ_HOME/lib/*:$TEZ_HOME/conf
$HADOOP_HOME/bin/hadoop fs -rm -r -f /words_out
$HADOOP_HOME/bin/yarn jar $TEZ_HOME/tez-examples-$TEZ_VERSION.jar orderedwordcount /words.txt /words_out

*** You can also visit some of the sites that are set up during the script execution.

Yarn RM: http://localhost:8088
HDFS NN: http://localhost:9870

EOF