Skip to content

Commit 9f97910

Browse files
committed
YARN-4309. Add container launch related debug information to container logs when a container fails. (Varun Vasudev via wangda)
(cherry picked from commit dfcbbdd)
1 parent 0852d35 commit 9f97910

7 files changed

Lines changed: 217 additions & 14 deletions

File tree

hadoop-yarn-project/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -561,6 +561,9 @@ Release 2.8.0 - UNRELEASED
561561
YARN-3946. Update exact reason as to why a submitted app is in ACCEPTED state to
562562
app's diagnostic message. (Naganarasimha G R via wangda)
563563

564+
YARN-4309. Add container launch related debug information to container logs
565+
when a container fails. (Varun Vasudev via wangda)
566+
564567
OPTIMIZATIONS
565568

566569
YARN-3339. TestDockerContainerExecutor should pull a single image and not

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,11 @@ private static void addDeprecatedKeys() {
9292
/** Delay before deleting resource to ease debugging of NM issues */
9393
public static final String DEBUG_NM_DELETE_DELAY_SEC =
9494
YarnConfiguration.NM_PREFIX + "delete.debug-delay-sec";
95+
96+
public static final String NM_LOG_CONTAINER_DEBUG_INFO =
97+
YarnConfiguration.NM_PREFIX + "log-container-debug-info.enabled";
98+
99+
public static final boolean DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO = false;
95100

96101
////////////////////////////////
97102
// IPC Configs

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1127,6 +1127,18 @@
11271127
<value>logs</value>
11281128
</property>
11291129

1130+
<property>
1131+
<description>Generate additional logs about container launches.
1132+
Currently, this creates a copy of the launch script and lists the
1133+
directory contents of the container work dir. When listing directory
1134+
contents, we follow symlinks to a max-depth of 5 (including symlinks
1135+
that point outside the container work dir), which may lead to
1136+
slowness in launching containers.
1137+
</description>
1138+
<name>yarn.nodemanager.log-container-debug-info.enabled</name>
1139+
<value>false</value>
1140+
</property>
1141+
11301142
<property>
11311143
<description>Amount of physical memory, in MB, that can be allocated
11321144
for containers. If set to -1 and

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
3535
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
3636

37+
import com.google.common.annotations.VisibleForTesting;
3738
import org.apache.commons.io.FileUtils;
3839
import org.apache.commons.logging.Log;
3940
import org.apache.commons.logging.LogFactory;
@@ -65,6 +66,8 @@ public abstract class ContainerExecutor implements Configurable {
6566
final public static FsPermission TASK_LAUNCH_SCRIPT_PERMISSION =
6667
FsPermission.createImmutable((short) 0700);
6768

69+
public static final String DIRECTORY_CONTENTS = "directory.info";
70+
6871
private Configuration conf;
6972

7073
private ConcurrentMap<ContainerId, Path> pidFiles =
@@ -241,11 +244,22 @@ public int reacquireContainer(ContainerReacquisitionContext ctx)
241244
* @param resources The resources which have been localized for this container
242245
* Symlinks will be created to these localized resources
243246
* @param command The command that will be run.
247+
* @param logDir The log dir to copy debugging information to
244248
* @throws IOException if any errors happened writing to the OutputStream,
245249
* while creating symlinks
246250
*/
247251
public void writeLaunchEnv(OutputStream out, Map<String, String> environment,
248-
Map<Path, List<String>> resources, List<String> command) throws IOException{
252+
Map<Path, List<String>> resources, List<String> command, Path logDir)
253+
throws IOException {
254+
this.writeLaunchEnv(out, environment, resources, command, logDir,
255+
ContainerLaunch.CONTAINER_SCRIPT);
256+
}
257+
258+
@VisibleForTesting
259+
public void writeLaunchEnv(OutputStream out,
260+
Map<String, String> environment, Map<Path, List<String>> resources,
261+
List<String> command, Path logDir, String outFilename)
262+
throws IOException {
249263
ContainerLaunch.ShellScriptBuilder sb =
250264
ContainerLaunch.ShellScriptBuilder.create();
251265
Set<String> whitelist = new HashSet<String>();
@@ -272,6 +286,14 @@ public void writeLaunchEnv(OutputStream out, Map<String, String> environment,
272286
}
273287
}
274288

289+
// dump debugging information if configured
290+
if (getConf() != null && getConf().getBoolean(
291+
YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO,
292+
YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO)) {
293+
sb.copyDebugInformation(new Path(outFilename), new Path(logDir, outFilename));
294+
sb.listDebugInformation(new Path(logDir, DIRECTORY_CONTENTS));
295+
}
296+
275297
sb.command(command);
276298

277299
PrintStream pout = null;

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DockerContainerExecutor.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,7 @@ public int launchContainer(ContainerStartContext ctx) throws IOException {
329329
* the docker image and write them out to an OutputStream.
330330
*/
331331
public void writeLaunchEnv(OutputStream out, Map<String, String> environment,
332-
Map<Path, List<String>> resources, List<String> command)
332+
Map<Path, List<String>> resources, List<String> command, Path logDir)
333333
throws IOException {
334334
ContainerLaunch.ShellScriptBuilder sb =
335335
ContainerLaunch.ShellScriptBuilder.create();
@@ -358,6 +358,15 @@ public void writeLaunchEnv(OutputStream out, Map<String, String> environment,
358358
}
359359
}
360360

361+
// dump debugging information if configured
362+
if (getConf() != null && getConf().getBoolean(
363+
YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO,
364+
YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO)) {
365+
sb.copyDebugInformation(new Path(ContainerLaunch.CONTAINER_SCRIPT),
366+
new Path(logDir, ContainerLaunch.CONTAINER_SCRIPT));
367+
sb.listDebugInformation(new Path(logDir, DIRECTORY_CONTENTS));
368+
}
369+
361370
sb.command(command);
362371

363372
PrintStream pout = null;

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,8 @@ public Integer call() {
272272

273273
// Write out the environment
274274
exec.writeLaunchEnv(containerScriptOutStream, environment,
275-
localResources, launchContext.getCommands());
275+
localResources, launchContext.getCommands(),
276+
new Path(containerLogDirs.get(0)));
276277

277278
// /////////// End of writing out container-script
278279

@@ -641,6 +642,28 @@ public final void symlink(Path src, Path dst) throws IOException {
641642
link(src, dst);
642643
}
643644

645+
/**
646+
* Method to copy files that are useful for debugging container failures.
647+
* This method will be called by ContainerExecutor when setting up the
648+
* container launch script. The method should take care to make sure files
649+
* are readable by the yarn user if the files are to undergo
650+
* log-aggregation.
651+
* @param src path to the source file
652+
* @param dst path to the destination file - should be absolute
653+
* @throws IOException
654+
*/
655+
public abstract void copyDebugInformation(Path src, Path dst)
656+
throws IOException;
657+
658+
/**
659+
* Method to dump debug information to a target file. This method will
660+
* be called by ContainerExecutor when setting up the container launch
661+
* script.
662+
* @param output the file to which debug information is to be written
663+
* @throws IOException
664+
*/
665+
public abstract void listDebugInformation(Path output) throws IOException;
666+
644667
@Override
645668
public String toString() {
646669
return sb.toString();
@@ -704,6 +727,36 @@ protected void mkdir(Path path) {
704727
line("mkdir -p ", path.toString());
705728
errorCheck();
706729
}
730+
731+
@Override
732+
public void copyDebugInformation(Path src, Path dest) throws IOException {
733+
line("# Creating copy of launch script");
734+
line("cp \"", src.toUri().getPath(), "\" \"", dest.toUri().getPath(),
735+
"\"");
736+
// set permissions to 640 because we need to be able to run
737+
// log aggregation in secure mode as well
738+
if(dest.isAbsolute()) {
739+
line("chmod 640 \"", dest.toUri().getPath(), "\"");
740+
}
741+
}
742+
743+
@Override
744+
public void listDebugInformation(Path output) throws IOException {
745+
line("# Determining directory contents");
746+
line("echo \"ls -l:\" 1>\"", output.toString(), "\"");
747+
line("ls -l 1>>\"", output.toString(), "\"");
748+
749+
// don't run error check because if there are loops
750+
// find will exit with an error causing container launch to fail
751+
// find will follow symlinks outside the work dir if such symlinks exist
752+
// (like public/app local resources)
753+
line("echo \"find -L . -maxdepth 5 -ls:\" 1>>\"", output.toString(),
754+
"\"");
755+
line("find -L . -maxdepth 5 -ls 1>>\"", output.toString(), "\"");
756+
line("echo \"broken symlinks(find -L . -maxdepth 5 -type l -ls):\" 1>>\"",
757+
output.toString(), "\"");
758+
line("find -L . -maxdepth 5 -type l -ls 1>>\"", output.toString(), "\"");
759+
}
707760
}
708761

709762
private static final class WindowsShellScriptBuilder
@@ -757,6 +810,25 @@ protected void mkdir(Path path) throws IOException {
757810
path.toString(), path.toString()));
758811
errorCheck();
759812
}
813+
814+
@Override
815+
public void copyDebugInformation(Path src, Path dest)
816+
throws IOException {
817+
// no need to worry about permissions - in secure mode
818+
// WindowsSecureContainerExecutor will set permissions
819+
// to allow NM to read the file
820+
line("rem Creating copy of launch script");
821+
lineWithLenCheck(String.format("copy \"%s\" \"%s\"", src.toString(),
822+
dest.toString()));
823+
}
824+
825+
@Override
826+
public void listDebugInformation(Path output) throws IOException {
827+
line("rem Determining directory contents");
828+
lineWithLenCheck(
829+
String.format("@echo \"dir:\" > \"%s\"", output.toString()));
830+
lineWithLenCheck(String.format("dir >> \"%s\"", output.toString()));
831+
}
760832
}
761833

762834
private static void putEnvIfNotNull(

0 commit comments

Comments
 (0)