Skip to content

Commit a602a07

Browse files
authored
TEZ-4349. DAGClient gets stuck with invalid cached DAGStatus (#161) (Ahmed Hussein reviewed by Laszlo Bodor, Jonathan Eagles)
1 parent c9b8e90 commit a602a07

4 files changed

Lines changed: 303 additions & 23 deletions

File tree

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.tez.common;
19+
20+
import java.util.concurrent.TimeUnit;
21+
import java.util.concurrent.atomic.AtomicReference;
22+
23+
import org.apache.hadoop.yarn.util.Clock;
24+
import org.apache.hadoop.yarn.util.MonotonicClock;
25+
26+
/**
27+
* A thread safe implementation used as a container for cacheable entries with Expiration times.
28+
* It supports custom {@link Clock} to control the elapsed time calculation.
29+
* @param <T> the data object type.
30+
*/
31+
public class CachedEntity<T> {
32+
private final AtomicReference<T> entryDataRef;
33+
private final Clock cacheClock;
34+
private final long expiryDurationMS;
35+
private volatile long entryTimeStamp;
36+
37+
public CachedEntity(TimeUnit expiryTimeUnit, long expiryLength, Clock clock) {
38+
entryDataRef = new AtomicReference<>(null);
39+
cacheClock = clock;
40+
expiryDurationMS = TimeUnit.MILLISECONDS.convert(expiryLength, expiryTimeUnit);
41+
entryTimeStamp = 0;
42+
}
43+
44+
public CachedEntity(TimeUnit expiryTimeUnit, long expiryLength) {
45+
this(expiryTimeUnit, expiryLength, new MonotonicClock());
46+
}
47+
48+
/**
49+
*
50+
* @return true if expiration timestamp is 0, or the elapsed time since last update is
51+
* greater than {@link #expiryDurationMS}
52+
*/
53+
public boolean isExpired() {
54+
return (entryTimeStamp == 0)
55+
|| ((cacheClock.getTime() - entryTimeStamp) > expiryDurationMS);
56+
}
57+
58+
/**
59+
* If the entry has expired, it reset the cache reference through {@link #clearExpiredEntry()}.
60+
* @return cached data if the timestamp is valid. Null, if the timestamp has expired.
61+
*/
62+
public T getValue() {
63+
if (isExpired()) { // quick check for expiration
64+
if (clearExpiredEntry()) { // remove reference to the expired entry
65+
return null;
66+
}
67+
}
68+
return entryDataRef.get();
69+
}
70+
71+
/**
72+
* Safely sets the cached data.
73+
* @param newEntry
74+
*/
75+
public void setValue(T newEntry) {
76+
T currentEntry = entryDataRef.get();
77+
while (!entryDataRef.compareAndSet(currentEntry, newEntry)) {
78+
currentEntry = entryDataRef.get();
79+
}
80+
entryTimeStamp = cacheClock.getTime();
81+
}
82+
83+
/**
84+
* Enforces the expiration of the cached entry.
85+
*/
86+
public void enforceExpiration() {
87+
entryTimeStamp = 0;
88+
}
89+
90+
/**
91+
* Safely deletes the reference to the data if it was not null.
92+
* @return true if the reference is set to Null. False indicates that another thread
93+
* updated the cache.
94+
*/
95+
private boolean clearExpiredEntry() {
96+
T currentEntry = entryDataRef.get();
97+
if (currentEntry == null) {
98+
return true;
99+
}
100+
// the current value is not null: try to reset it.
101+
// if the CAS is successful, then we won't override a recent update to the cache.
102+
return (entryDataRef.compareAndSet(currentEntry, null));
103+
}
104+
}

tez-api/src/main/java/org/apache/tez/dag/api/TezConfiguration.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1993,6 +1993,18 @@ static Set<String> getPropertySet() {
19931993
TEZ_PREFIX + "test.minicluster.app.wait.on.shutdown.secs";
19941994
public static final long TEZ_TEST_MINI_CLUSTER_APP_WAIT_ON_SHUTDOWN_SECS_DEFAULT = 30;
19951995

1996+
/**
1997+
* Long value
1998+
* Status Cache timeout window in minutes for the DAGClient.
1999+
*/
2000+
@Private
2001+
@ConfigurationScope(Scope.CLIENT)
2002+
@ConfigurationProperty(type="long")
2003+
public static final String TEZ_CLIENT_DAG_STATUS_CACHE_TIMEOUT_SECS = TEZ_PREFIX
2004+
+ "client.dag.status.cache.timeout-secs";
2005+
// Default timeout is 60 seconds.
2006+
public static final long TEZ_CLIENT_DAG_STATUS_CACHE_TIMEOUT_SECS_DEFAULT = 60;
2007+
19962008
/**
19972009
* Long value
19982010
* Time to wait (in milliseconds) for yarn app's diagnotics is available

tez-api/src/main/java/org/apache/tez/dag/api/client/DAGClientImpl.java

Lines changed: 48 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,11 @@
2727
import java.util.HashMap;
2828
import java.util.Map;
2929
import java.util.Set;
30+
import java.util.concurrent.TimeUnit;
3031

3132
import com.google.common.annotations.VisibleForTesting;
3233
import org.apache.hadoop.security.UserGroupInformation;
34+
import org.apache.tez.common.CachedEntity;
3335
import org.apache.tez.common.Preconditions;
3436

3537
import org.apache.hadoop.yarn.exceptions.ApplicationNotFoundException;
@@ -58,13 +60,15 @@ public class DAGClientImpl extends DAGClient {
5860
private final String dagId;
5961
private final TezConfiguration conf;
6062
private final FrameworkClient frameworkClient;
61-
63+
/**
64+
* Container to cache the last {@link DAGStatus}.
65+
*/
66+
private final CachedEntity<DAGStatus> cachedDAGStatusRef;
6267
@VisibleForTesting
6368
protected DAGClientInternal realClient;
64-
private boolean dagCompleted = false;
69+
private volatile boolean dagCompleted = false;
6570
@VisibleForTesting
6671
protected boolean isATSEnabled = false;
67-
private DAGStatus cachedDagStatus = null;
6872
Map<String, VertexStatus> cachedVertexStatus = new HashMap<String, VertexStatus>();
6973

7074
private static final long SLEEP_FOR_COMPLETION = 500;
@@ -110,6 +114,28 @@ public DAGClientImpl(ApplicationId appId, String dagId, TezConfiguration conf,
110114
this.diagnoticsWaitTimeout = conf.getLong(
111115
TezConfiguration.TEZ_CLIENT_DIAGNOSTICS_WAIT_TIMEOUT_MS,
112116
TezConfiguration.TEZ_CLIENT_DIAGNOSTICS_WAIT_TIMEOUT_MS_DEFAULT);
117+
cachedDAGStatusRef = initCacheDAGRefFromConf(conf);
118+
}
119+
120+
/**
121+
* Constructs a new {@link CachedEntity} for {@link DAGStatus}.
122+
* @param tezConf TEZ configuration parameters.
123+
* @return a caching entry to hold the {@link DAGStatus}.
124+
*/
125+
protected CachedEntity<DAGStatus> initCacheDAGRefFromConf(TezConfiguration tezConf) {
126+
long clientDAGStatusCacheTimeOut = tezConf.getLong(
127+
TezConfiguration.TEZ_CLIENT_DAG_STATUS_CACHE_TIMEOUT_SECS,
128+
TezConfiguration.TEZ_CLIENT_DAG_STATUS_CACHE_TIMEOUT_SECS_DEFAULT);
129+
if (clientDAGStatusCacheTimeOut <= 0) {
130+
LOG.error("DAG Status cache timeout interval should be positive. Enforcing default value.");
131+
clientDAGStatusCacheTimeOut =
132+
TezConfiguration.TEZ_CLIENT_DAG_STATUS_CACHE_TIMEOUT_SECS_DEFAULT;
133+
}
134+
return new CachedEntity<>(TimeUnit.SECONDS, clientDAGStatusCacheTimeOut);
135+
}
136+
137+
protected CachedEntity<DAGStatus> getCachedDAGStatusRef() {
138+
return cachedDAGStatusRef;
113139
}
114140

115141
@Override
@@ -133,13 +159,11 @@ public DAGStatus getDAGStatus(@Nullable Set<StatusGetOpts> statusOptions,
133159
}
134160

135161
long startTime = System.currentTimeMillis();
136-
boolean refreshStatus;
137-
DAGStatus dagStatus;
138-
if(cachedDagStatus != null) {
139-
dagStatus = cachedDagStatus;
140-
refreshStatus = true;
141-
} else {
142-
// For the first lookup only. After this cachedDagStatus should be populated.
162+
163+
DAGStatus dagStatus = cachedDAGStatusRef.getValue();
164+
boolean refreshStatus = true;
165+
if (dagStatus == null) {
166+
// the first lookup only or when the cachedDAG has expired
143167
dagStatus = getDAGStatus(statusOptions);
144168
refreshStatus = false;
145169
}
@@ -221,13 +245,14 @@ protected DAGStatus getDAGStatusInternal(@Nullable Set<StatusGetOpts> statusOpti
221245
final DAGStatus dagStatus = getDAGStatusViaAM(statusOptions, timeout);
222246

223247
if (!dagCompleted) {
224-
if (dagStatus != null) {
225-
cachedDagStatus = dagStatus;
248+
if (dagStatus != null) { // update the cached DAGStatus
249+
cachedDAGStatusRef.setValue(dagStatus);
226250
return dagStatus;
227251
}
228-
if (cachedDagStatus != null) {
252+
DAGStatus cachedDAG = cachedDAGStatusRef.getValue();
253+
if (cachedDAG != null) {
229254
// could not get from AM (not reachable/ was killed). return cached status.
230-
return cachedDagStatus;
255+
return cachedDAG;
231256
}
232257
}
233258

@@ -253,8 +278,11 @@ protected DAGStatus getDAGStatusInternal(@Nullable Set<StatusGetOpts> statusOpti
253278

254279
// dag completed and Timeline service is either not enabled or does not have completion status
255280
// return cached status if completion info is present.
256-
if (dagCompleted && cachedDagStatus != null && cachedDagStatus.isCompleted()) {
257-
return cachedDagStatus;
281+
if (dagCompleted) {
282+
DAGStatus cachedDag = cachedDAGStatusRef.getValue();
283+
if (cachedDag != null && cachedDag.isCompleted()) {
284+
return cachedDag;
285+
}
258286
}
259287

260288
// everything else fails rely on RM.
@@ -377,9 +405,11 @@ private DAGStatus getDAGStatusViaAM(@Nullable Set<StatusGetOpts> statusOptions,
377405
LOG.info("DAG is no longer running - application not found by YARN", e);
378406
dagCompleted = true;
379407
} catch (TezException e) {
380-
// can be either due to a n/w issue of due to AM completed.
408+
// can be either due to a n/w issue or due to AM completed.
409+
LOG.info("Cannot retrieve DAG Status due to TezException: {}", e.getMessage());
381410
} catch (IOException e) {
382-
// can be either due to a n/w issue of due to AM completed.
411+
// can be either due to a n/w issue or due to AM completed.
412+
LOG.info("Cannot retrieve DAG Status due to IOException: {}", e.getMessage());
383413
}
384414

385415
if (dagStatus == null && !dagCompleted) {

0 commit comments

Comments
 (0)