Skip to content

Commit 30c95dd

Browse files
prabhujoseph authored and hungj committed
MAPREDUCE-7249. Fix Invalid event TA_TOO_MANY_FETCH_FAILURE at SUCCESS_CONTAINER_CLEANUP causes job failure.
Contributed by Wilfred Spiegelenburg. (cherry picked from commit a97f777)
1 parent f4f00d5 commit 30c95dd

2 files changed

Lines changed: 42 additions & 6 deletions

File tree

  • hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src

hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TaskAttemptImpl.java

Lines changed: 10 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -469,12 +469,16 @@ TaskAttemptEventType.TA_CONTAINER_CLEANED, new KilledTransition())
469469
TaskAttemptStateInternal.COMMIT_PENDING,
470470
TaskAttemptEventType.TA_COMMIT_PENDING)
471471

472-
// Transitions from SUCCESS_CONTAINER_CLEANUP state
473-
// kill and cleanup the container
474-
.addTransition(TaskAttemptStateInternal.SUCCESS_CONTAINER_CLEANUP,
475-
TaskAttemptStateInternal.SUCCEEDED,
476-
TaskAttemptEventType.TA_CONTAINER_CLEANED)
477-
.addTransition(
472+
// Transitions from SUCCESS_CONTAINER_CLEANUP state
473+
// kill and cleanup the container
474+
.addTransition(TaskAttemptStateInternal.SUCCESS_CONTAINER_CLEANUP,
475+
TaskAttemptStateInternal.SUCCEEDED,
476+
TaskAttemptEventType.TA_CONTAINER_CLEANED)
477+
.addTransition(TaskAttemptStateInternal.SUCCESS_CONTAINER_CLEANUP,
478+
TaskAttemptStateInternal.FAILED,
479+
TaskAttemptEventType.TA_TOO_MANY_FETCH_FAILURE,
480+
new TooManyFetchFailureTransition())
481+
.addTransition(
478482
TaskAttemptStateInternal.SUCCESS_CONTAINER_CLEANUP,
479483
TaskAttemptStateInternal.SUCCESS_CONTAINER_CLEANUP,
480484
TaskAttemptEventType.TA_DIAGNOSTICS_UPDATE,

hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/test/java/org/apache/hadoop/mapreduce/v2/app/job/impl/TestTaskAttempt.java

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1773,6 +1773,38 @@ public void testReducerCustomResourceTypeWithInvalidUnit() {
17731773
createReduceTaskAttemptImplForTest(eventHandler, clock, jobConf);
17741774
}
17751775

1776+
@Test
1777+
public void testTooManyFetchFailureWhileContainerCleanup() {
1778+
MockEventHandler eventHandler = new MockEventHandler();
1779+
TaskAttemptImpl taImpl = createTaskAttemptImpl(eventHandler);
1780+
TaskId reducetaskId = MRBuilderUtils.newTaskId(taImpl.getID().getTaskId()
1781+
.getJobId(), 1, TaskType.REDUCE);
1782+
TaskAttemptId reduceTAId =
1783+
MRBuilderUtils.newTaskAttemptId(reducetaskId, 0);
1784+
1785+
// move in two steps to the desired state (cannot get there directly)
1786+
taImpl.handle(new TaskAttemptEvent(taImpl.getID(),
1787+
TaskAttemptEventType.TA_DONE));
1788+
assertEquals("Task attempt's internal state is not " +
1789+
"SUCCESS_FINISHING_CONTAINER",
1790+
TaskAttemptStateInternal.SUCCESS_FINISHING_CONTAINER,
1791+
taImpl.getInternalState());
1792+
1793+
taImpl.handle(new TaskAttemptEvent(taImpl.getID(),
1794+
TaskAttemptEventType.TA_TIMED_OUT));
1795+
assertEquals("Task attempt's internal state is not " +
1796+
"SUCCESS_CONTAINER_CLEANUP",
1797+
TaskAttemptStateInternal.SUCCESS_CONTAINER_CLEANUP,
1798+
taImpl.getInternalState());
1799+
1800+
taImpl.handle(new TaskAttemptTooManyFetchFailureEvent(taImpl.getID(),
1801+
reduceTAId, "Host"));
1802+
assertEquals("Task attempt is not in FAILED state",
1803+
TaskAttemptState.FAILED,
1804+
taImpl.getState());
1805+
assertFalse("InternalError occurred", eventHandler.internalError);
1806+
}
1807+
17761808
private void initResourceTypes() {
17771809
Configuration conf = new Configuration();
17781810
conf.set(YarnConfiguration.RM_CONFIGURATION_PROVIDER_CLASS,

0 commit comments

Comments
 (0)