Skip to content

Commit ac5f21d

Browse files
committed
YARN-4771. Some containers can be skipped during log aggregation after NM
restart. Contributed by Jason Lowe and Jim Brennan.
1 parent e60096c commit ac5f21d

2 files changed

Lines changed: 18 additions & 17 deletions

File tree

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -777,17 +777,20 @@ public void removeVeryOldStoppedContainersFromCache() {
777777
while (i.hasNext()) {
778778
Entry<ContainerId, Long> mapEntry = i.next();
779779
ContainerId cid = mapEntry.getKey();
780-
if (mapEntry.getValue() < currentTime) {
781-
if (!context.getContainers().containsKey(cid)) {
780+
if (mapEntry.getValue() >= currentTime) {
781+
break;
782+
}
783+
if (!context.getContainers().containsKey(cid)) {
784+
ApplicationId appId =
785+
cid.getApplicationAttemptId().getApplicationId();
786+
if (isApplicationStopped(appId)) {
782787
i.remove();
783788
try {
784789
context.getNMStateStore().removeContainer(cid);
785790
} catch (IOException e) {
786791
LOG.error("Unable to remove container " + cid + " in store", e);
787792
}
788793
}
789-
} else {
790-
break;
791794
}
792795
}
793796
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -931,9 +931,8 @@ public void deleteBaseDir() throws IOException {
931931
public void testRecentlyFinishedContainers() throws Exception {
932932
NodeManager nm = new NodeManager();
933933
YarnConfiguration conf = new YarnConfiguration();
934-
conf.set(
935-
NodeStatusUpdaterImpl.YARN_NODEMANAGER_DURATION_TO_TRACK_STOPPED_CONTAINERS,
936-
"10000");
934+
conf.setInt(NodeStatusUpdaterImpl.
935+
YARN_NODEMANAGER_DURATION_TO_TRACK_STOPPED_CONTAINERS, 1);
937936
nm.init(conf);
938937
NodeStatusUpdaterImpl nodeStatusUpdater =
939938
(NodeStatusUpdaterImpl) nm.getNodeStatusUpdater();
@@ -948,18 +947,17 @@ public void testRecentlyFinishedContainers() throws Exception {
948947
nodeStatusUpdater.addCompletedContainer(cId);
949948
Assert.assertTrue(nodeStatusUpdater.isContainerRecentlyStopped(cId));
950949

950+
// verify container remains even after expiration if app
951+
// is still active
951952
nm.getNMContext().getContainers().remove(cId);
952-
long time1 = System.currentTimeMillis();
953-
int waitInterval = 15;
954-
while (waitInterval-- > 0
955-
&& nodeStatusUpdater.isContainerRecentlyStopped(cId)) {
956-
nodeStatusUpdater.removeVeryOldStoppedContainersFromCache();
957-
Thread.sleep(1000);
958-
}
959-
long time2 = System.currentTimeMillis();
960-
// By this time the container will be removed from cache. need to verify.
953+
Thread.sleep(10);
954+
nodeStatusUpdater.removeVeryOldStoppedContainersFromCache();
955+
Assert.assertTrue(nodeStatusUpdater.isContainerRecentlyStopped(cId));
956+
957+
// complete the application and verify container is removed
958+
nm.getNMContext().getApplications().remove(appId);
959+
nodeStatusUpdater.removeVeryOldStoppedContainersFromCache();
961960
Assert.assertFalse(nodeStatusUpdater.isContainerRecentlyStopped(cId));
962-
Assert.assertTrue((time2 - time1) >= 10000 && (time2 - time1) <= 250000);
963961
}
964962

965963
@Test(timeout = 90000)

0 commit comments

Comments
 (0)