From a86c54c4db3954aca40ef297135a5e875c0a96a8 Mon Sep 17 00:00:00 2001 From: George G Date: Tue, 11 Sep 2018 15:00:00 +0000 Subject: [PATCH] YARN-8470. Fix a NPE in identifyContainersToPreemptOnNode() I encountered this issue while running 3.1.0: ``` 2018-09-10 13:42:39,437 INFO org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler: Container container_1536156801471_0071_01_000055 completed with event FINISHED, but corresponding RMContainer doesn't exist. 2018-09-10 13:42:39,881 ERROR org.apache.hadoop.yarn.server.resourcemanager.ResourceManager: Received RMFatalEvent of type CRITICAL_THREAD_CRASH, caused by a critical thread, FSPreemptionThread, that exited unexpectedly: java.lang.NullPointerException at org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSPreemptionThread.identifyContainersToPreemptOnNode(FSPreemptionThread.java:207) at org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSPreemptionThread.identifyContainersToPreemptForOneContainer(FSPreemptionThread.java:161) at org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSPreemptionThread.identifyContainersToPreempt(FSPreemptionThread.java:121) at org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSPreemptionThread.run(FSPreemptionThread.java:81) 2018-09-10 13:42:39,886 FATAL org.apache.hadoop.yarn.server.resourcemanager.ResourceManager: Shutting down the resource manager. 2018-09-10 13:42:39,891 INFO org.apache.hadoop.util.ExitUtil: Exiting with status 1: a critical thread, FSPreemptionThread, that exited unexpectedly: java.lang.NullPointerException at org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSPreemptionThread.identifyContainersToPreemptOnNode(FSPreemptionThread.java:207) at org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSPreemptionThread.identifyContainersToPreemptForOneContainer(FSPreemptionThread.java:161) at org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSPreemptionThread.identifyContainersToPreempt(FSPreemptionThread.java:121) at org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FSPreemptionThread.run(FSPreemptionThread.java:81) ``` I'm guessing a better fix would be to synchronise the removal of applications, but this simple patch should be an improvement IMO. Signed-off-by: George G --- .../resourcemanager/scheduler/fair/FSPreemptionThread.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSPreemptionThread.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSPreemptionThread.java index c32565f63c313..a8abc36cd18b1 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSPreemptionThread.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FSPreemptionThread.java @@ -204,6 +204,12 @@ private PreemptableContainers identifyContainersToPreemptOnNode( for (RMContainer container : containersToCheck) { FSAppAttempt app = scheduler.getSchedulerApp(container.getApplicationAttemptId()); + if (app == null) { + // e.g. "INFO org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler: Container container_1536156801471_0071_01_000096 completed with event FINISHED, but corresponding RMContainer doesn't exist." + LOG.warn("app == null, giving up in identifyContainersToPreemptOnNode()"); + return null; + } + ApplicationId appId = app.getApplicationId(); if (app.canContainerBePreempted(container,