-
Notifications
You must be signed in to change notification settings - Fork 9.2k
YARN-11235. Refactor Policy Code and Define getReservationHomeSubcluster #4656
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
aeea18b
63c3696
5412975
b258c9a
1ce3194
fd81fc1
10ddb9f
22db7c6
28b10e9
62dd3a1
874c9c5
0ea9490
6dd7d05
371801d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,6 +25,7 @@ | |
|
|
||
| import org.apache.hadoop.conf.Configuration; | ||
| import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext; | ||
| import org.apache.hadoop.yarn.api.protocolrecords.ReservationSubmissionRequest; | ||
| import org.apache.hadoop.yarn.conf.YarnConfiguration; | ||
| import org.apache.hadoop.yarn.exceptions.YarnException; | ||
| import org.apache.hadoop.yarn.server.federation.policies.exceptions.FederationPolicyException; | ||
|
|
@@ -136,7 +137,7 @@ public SubClusterId getHomeSubcluster( | |
|
|
||
| if (appSubmissionContext == null) { | ||
| throw new FederationPolicyException( | ||
| "The ApplicationSubmissionContext " + "cannot be null."); | ||
| "The ApplicationSubmissionContext cannot be null."); | ||
| } | ||
|
|
||
| String queue = appSubmissionContext.getQueue(); | ||
|
|
@@ -148,51 +149,7 @@ public SubClusterId getHomeSubcluster( | |
| queue = YarnConfiguration.DEFAULT_QUEUE_NAME; | ||
| } | ||
|
|
||
| // the facade might cache this request, based on its parameterization | ||
| SubClusterPolicyConfiguration configuration = null; | ||
|
|
||
| try { | ||
| configuration = federationFacade.getPolicyConfiguration(queue); | ||
| } catch (YarnException e) { | ||
| String errMsg = "There is no policy configured for the queue: " + queue | ||
| + ", falling back to defaults."; | ||
| LOG.warn(errMsg, e); | ||
| } | ||
|
|
||
| // If there is no policy configured for this queue, fallback to the baseline | ||
| // policy that is configured either in the store or via XML config (and | ||
| // cached) | ||
| if (configuration == null) { | ||
| LOG.warn("There is no policies configured for queue: " + queue + " we" | ||
| + " fallback to default policy for: " | ||
| + YarnConfiguration.DEFAULT_FEDERATION_POLICY_KEY); | ||
|
|
||
| queue = YarnConfiguration.DEFAULT_FEDERATION_POLICY_KEY; | ||
| try { | ||
| configuration = federationFacade.getPolicyConfiguration(queue); | ||
| } catch (YarnException e) { | ||
| String errMsg = "Cannot retrieve policy configured for the queue: " | ||
| + queue + ", falling back to defaults."; | ||
| LOG.warn(errMsg, e); | ||
|
|
||
| } | ||
| } | ||
|
|
||
| // the fallback is not configured via the store, but via XML, using | ||
| // previously loaded configuration. | ||
| if (configuration == null) { | ||
| configuration = | ||
| cachedConfs.get(YarnConfiguration.DEFAULT_FEDERATION_POLICY_KEY); | ||
| } | ||
|
|
||
| // if the configuration has changed since last loaded, reinit the policy | ||
| // based on current configuration | ||
| if (!cachedConfs.containsKey(queue) | ||
| || !cachedConfs.get(queue).equals(configuration)) { | ||
| singlePolicyReinit(policyMap, cachedConfs, queue, configuration); | ||
| } | ||
|
|
||
| FederationRouterPolicy policy = policyMap.get(queue); | ||
| FederationRouterPolicy policy = getFederationRouterPolicy(cachedConfs, policyMap, queue); | ||
| if (policy == null) { | ||
| // this should never happen, as the two maps are updated together | ||
| throw new FederationPolicyException("No FederationRouterPolicy found " | ||
|
|
@@ -262,4 +219,92 @@ public synchronized void reset() { | |
|
|
||
| } | ||
|
|
||
| /** | ||
| * Routes a reservation to its home sub-cluster. The router policy for the | ||
| * reservation's queue is resolved through getFederationRouterPolicy, and the | ||
| * choice of sub-cluster is then delegated to that policy. | ||
| * | ||
| * The queue is read from the request and passed to the policy lookup as-is; | ||
| * no null defaulting is done in this method. | ||
| * NOTE(review): confirm that the policy lookup tolerates a null queue. | ||
| * | ||
| * @param request the reservation to route; must not be null. | ||
| * | ||
| * @return the id of the subcluster that will be the "home" for this | ||
| * reservation. | ||
| * | ||
| * @throws YarnException if the request is null or no policy could be | ||
| * resolved for the queue (both reported as FederationPolicyException), | ||
| * or if policy resolution or the policy's own selection fails. | ||
| */ | ||
| public SubClusterId getReservationHomeSubCluster( | ||
| ReservationSubmissionRequest request) throws YarnException { | ||
|
|
||
| // the maps are concurrent, but we need to protect from reset() | ||
| // reinitialization mid-execution by creating a new reference local to this | ||
| // method. | ||
| Map<String, SubClusterPolicyConfiguration> cachedConfs = globalConfMap; | ||
| Map<String, FederationRouterPolicy> policyMap = globalPolicyMap; | ||
|
|
||
| if (request == null) { | ||
| throw new FederationPolicyException( | ||
| "The ReservationSubmissionRequest cannot be null."); | ||
| } | ||
|
|
||
| String queue = request.getQueue(); | ||
| FederationRouterPolicy policy = getFederationRouterPolicy(cachedConfs, policyMap, queue); | ||
|
|
||
| if (policy == null) { | ||
| // this should never happen, as the two maps are updated together | ||
| throw new FederationPolicyException("No FederationRouterPolicy found " | ||
| + "for queue: " + request.getQueue() + " (while routing " | ||
| + "reservation: " + request.getReservationId() + ") " | ||
| + "and no default specified."); | ||
| } | ||
|
|
||
| return policy.getReservationHomeSubcluster(request); | ||
| } | ||
|
|
||
| private FederationRouterPolicy getFederationRouterPolicy( | ||
| Map<String, SubClusterPolicyConfiguration> cachedConfiguration, | ||
| Map<String, FederationRouterPolicy> policyMap, String queue) | ||
| throws FederationPolicyInitializationException { | ||
|
|
||
| // the facade might cache this request, based on its parameterization | ||
| SubClusterPolicyConfiguration configuration = null; | ||
| String copyQueue = queue; | ||
|
|
||
| try { | ||
| configuration = federationFacade.getPolicyConfiguration(copyQueue); | ||
| } catch (YarnException e) { | ||
| LOG.warn("There is no policy configured for the queue: {}, " + | ||
| "falling back to defaults.", copyQueue, e); | ||
| } | ||
|
|
||
| // If there is no policy configured for this queue, fallback to the baseline | ||
| // policy that is configured either in the store or via XML config (and | ||
| // cached) | ||
| if (configuration == null) { | ||
| final String policyKey = YarnConfiguration.DEFAULT_FEDERATION_POLICY_KEY; | ||
| LOG.warn("There is no policies configured for queue: {} " + | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can rearrange so the string is a single line.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will fix it. |
||
| "we fallback to default policy for: {}. ", copyQueue, policyKey); | ||
| copyQueue = YarnConfiguration.DEFAULT_FEDERATION_POLICY_KEY; | ||
| try { | ||
| configuration = federationFacade.getPolicyConfiguration(copyQueue); | ||
| } catch (YarnException e) { | ||
| LOG.warn("Cannot retrieve policy configured for the queue: {}, " + | ||
| "falling back to defaults.", copyQueue, e); | ||
| } | ||
| } | ||
|
|
||
| // the fallback is not configured via the store, but via XML, using | ||
| // previously loaded configuration. | ||
| if (configuration == null) { | ||
| configuration = cachedConfiguration.get(YarnConfiguration.DEFAULT_FEDERATION_POLICY_KEY); | ||
| } | ||
|
|
||
| // if the configuration has changed since last loaded, reinit the policy | ||
| // based on current configuration | ||
| if (!cachedConfiguration.containsKey(copyQueue) | ||
|
||
| || !cachedConfiguration.get(copyQueue).equals(configuration)) { | ||
|
||
| singlePolicyReinit(policyMap, cachedConfiguration, copyQueue, configuration); | ||
| } | ||
|
|
||
| return policyMap.get(copyQueue); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,15 +18,23 @@ | |
|
|
||
| package org.apache.hadoop.yarn.server.federation.policies.router; | ||
|
|
||
| import java.util.ArrayList; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
|
|
||
| import org.apache.hadoop.yarn.api.protocolrecords.ReservationSubmissionRequest; | ||
| import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext; | ||
| import org.apache.hadoop.yarn.api.records.ReservationId; | ||
| import org.apache.hadoop.yarn.conf.YarnConfiguration; | ||
| import org.apache.hadoop.yarn.exceptions.YarnException; | ||
| import org.apache.hadoop.yarn.server.federation.policies.AbstractConfigurableFederationPolicy; | ||
| import org.apache.hadoop.yarn.server.federation.policies.FederationPolicyUtils; | ||
| import org.apache.hadoop.yarn.server.federation.policies.dao.WeightedPolicyInfo; | ||
| import org.apache.hadoop.yarn.server.federation.policies.exceptions.FederationPolicyException; | ||
| import org.apache.hadoop.yarn.server.federation.policies.exceptions.FederationPolicyInitializationException; | ||
| import org.apache.hadoop.yarn.server.federation.store.records.SubClusterId; | ||
| import org.apache.hadoop.yarn.server.federation.store.records.SubClusterIdInfo; | ||
| import org.apache.hadoop.yarn.server.federation.store.records.SubClusterInfo; | ||
|
|
||
| /** | ||
| * Base abstract class for {@link FederationRouterPolicy} implementations, that | ||
|
|
@@ -63,4 +71,108 @@ public void validate(ApplicationSubmissionContext appSubmissionContext) | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * This method is implemented by the specific policy, and it is used to route | ||
| * both reservations and applications among a given set of | ||
| * sub-clusters. | ||
| * | ||
| * @param queue the queue for this application/reservation | ||
| * @param preSelectSubClusters a pre-filtered set of sub-clusters | ||
| * @return the chosen sub-cluster | ||
| * | ||
| * @throws YarnException if the policy fails to choose a sub-cluster | ||
| */ | ||
| protected abstract SubClusterId chooseSubCluster(String queue, | ||
| Map<SubClusterId, SubClusterInfo> preSelectSubClusters) throws YarnException; | ||
|
|
||
| /** | ||
| * Narrows the candidate sub-clusters according to the reservation, before a | ||
| * sub-cluster is chosen. The current implementation performs no filtering | ||
| * and returns the given map instance unchanged; the reservation-based | ||
| * restriction is still pending (see the TODO below). | ||
| * | ||
| * @param reservationId the globally unique identifier for a reservation. | ||
| * @param activeSubClusters the map of ids to info for all active subclusters. | ||
| * @return the sub-clusters eligible for selection (currently the same map | ||
| * that was passed in). | ||
| * @throws YarnException declared for the filtering step; the current body | ||
| * does not throw. | ||
| */ | ||
| protected Map<SubClusterId, SubClusterInfo> prefilterSubClusters( | ||
| ReservationId reservationId, Map<SubClusterId, SubClusterInfo> activeSubClusters) | ||
| throws YarnException { | ||
|
|
||
| // if a reservation exists limit scope to the sub-cluster this | ||
| // reservation is mapped to | ||
| // TODO: Implemented in YARN-11236 | ||
| return activeSubClusters; | ||
| } | ||
|
|
||
| /** | ||
| * Routes an application to its home sub-cluster. The request is validated, | ||
| * the active sub-clusters are pre-filtered by the application's reservation, | ||
| * availability is checked against the blacklist, blacklisted sub-clusters | ||
| * are removed, and the final pick is delegated to chooseSubCluster, which | ||
| * the concrete policy implements. | ||
| * | ||
| * Blacklisted ids are removed directly from the map returned by | ||
| * prefilterSubClusters. With the implementation in this class, that is the | ||
| * same map instance returned by getActiveSubclusters(). | ||
| * NOTE(review): confirm that map is not shared or cached state. | ||
| * | ||
| * @param appContext the {@link ApplicationSubmissionContext} that | ||
| * has to be routed to an appropriate subCluster for execution. | ||
| * | ||
| * @param blackLists the list of subClusters as identified by | ||
| * {@link SubClusterId} to blackList from the selection of the home | ||
| * subCluster; may be null, in which case nothing is removed. | ||
| * | ||
| * @return the {@link SubClusterId} picked by chooseSubCluster, which will | ||
| * be the "home" for this application. | ||
| * | ||
| * @throws YarnException if one of the validation steps fails or the | ||
| * concrete policy cannot choose a sub-cluster. | ||
| */ | ||
| @Override | ||
| public SubClusterId getHomeSubcluster(ApplicationSubmissionContext appContext, | ||
| List<SubClusterId> blackLists) throws YarnException { | ||
|
|
||
| // null checks and default-queue behavior | ||
| validate(appContext); | ||
|
|
||
| // apply filtering based on reservation location and active sub-clusters | ||
| Map<SubClusterId, SubClusterInfo> filteredSubClusters = prefilterSubClusters( | ||
| appContext.getReservationID(), getActiveSubclusters()); | ||
|
|
||
| FederationPolicyUtils.validateSubClusterAvailability( | ||
|
||
| new ArrayList<>(filteredSubClusters.keySet()), blackLists); | ||
|
|
||
| // drop the blacklisted sub-clusters from the candidates | ||
| if (blackLists != null) { | ||
| blackLists.forEach(filteredSubClusters::remove); | ||
| } | ||
|
|
||
| // pick the chosen subCluster from the active ones | ||
| return chooseSubCluster(appContext.getQueue(), filteredSubClusters); | ||
| } | ||
|
|
||
| /** | ||
| * Routes a reservation to its home sub-cluster. A null request is rejected; | ||
| * a request without a queue has its queue set to | ||
| * {@link YarnConfiguration#DEFAULT_QUEUE_NAME} (the value is written back | ||
| * onto the request). The active sub-clusters are then pre-filtered by the | ||
| * reservation id, and the final pick is delegated to chooseSubCluster, which | ||
| * the concrete policy implements. | ||
| * | ||
| * @param request the reservation to route; must not be null. | ||
| * | ||
| * @return the id of the subcluster that will be the "home" for this | ||
| * reservation. | ||
| * | ||
| * @throws YarnException if the request is null (reported as | ||
| * FederationPolicyException), or if pre-filtering or the concrete | ||
| * policy's selection fails. | ||
| */ | ||
| @Override | ||
| public SubClusterId getReservationHomeSubcluster(ReservationSubmissionRequest request) | ||
| throws YarnException { | ||
| if (request == null) { | ||
| throw new FederationPolicyException("The ReservationSubmissionRequest cannot be null."); | ||
| } | ||
|
|
||
| if (request.getQueue() == null) { | ||
| request.setQueue(YarnConfiguration.DEFAULT_QUEUE_NAME); | ||
| } | ||
|
|
||
| // apply filtering based on reservation location and active sub-clusters | ||
| Map<SubClusterId, SubClusterInfo> filteredSubClusters = prefilterSubClusters( | ||
| request.getReservationId(), getActiveSubclusters()); | ||
|
|
||
| // pick the chosen subCluster from the active ones | ||
| return chooseSubCluster(request.getQueue(), filteredSubClusters); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can fit the string into a single line.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for your help reviewing the code, I will modify the code.