Skip to content

Commit 123596a

Browse files
committed
HBASE-23597 Give high priority for meta assign procedure and ServerCrashProcedure which carry meta.
1 parent c312760 commit 123596a

6 files changed

Lines changed: 250 additions & 42 deletions

File tree

hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/AbstractProcedureScheduler.java

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -140,20 +140,34 @@ protected void push(final Procedure procedure, final boolean addFront, final boo
140140
* NOTE: this method is called with the sched lock held.
141141
* @return the Procedure to execute, or null if nothing is available.
142142
*/
143-
protected abstract Procedure dequeue();
143+
protected abstract Procedure dequeue(boolean highPriority);
144+
145+
@Override
146+
public Procedure pollHighPriority() {
147+
return poll(-1, true);
148+
}
149+
150+
@Override
151+
public Procedure pollHighPriority(long timeout, TimeUnit unit) {
152+
return poll(unit.toNanos(timeout), true);
153+
}
144154

145155
@Override
146156
public Procedure poll() {
147-
return poll(-1);
157+
return poll(-1, false);
148158
}
149159

150160
@Override
151161
public Procedure poll(long timeout, TimeUnit unit) {
152-
return poll(unit.toNanos(timeout));
162+
return poll(unit.toNanos(timeout), false);
153163
}
154164

155-
@edu.umd.cs.findbugs.annotations.SuppressWarnings("WA_AWAIT_NOT_IN_LOOP")
156165
/**
 * Polls for a procedure without restricting the request to high priority work; delegates to
 * {@code poll(nanos, false)}.
 * @param nanos timeout in nanoseconds, forwarded unchanged. Presumably a negative value means
 *   "no deadline" -- TODO confirm against the two-argument poll.
 * @return the result of the two-argument poll.
 */
public Procedure poll(final long nanos) {
166+
return poll(nanos, false);
167+
}
168+
169+
@edu.umd.cs.findbugs.annotations.SuppressWarnings("WA_AWAIT_NOT_IN_LOOP")
170+
private Procedure poll(final long nanos, final boolean highPriority) {
157171
schedLock();
158172
try {
159173
if (!running) {
@@ -174,7 +188,7 @@ public Procedure poll(final long nanos) {
174188
return null;
175189
}
176190
}
177-
final Procedure pollResult = dequeue();
191+
final Procedure pollResult = dequeue(highPriority);
178192

179193
pollCalls++;
180194
nullPollCalls += (pollResult == null) ? 1 : 0;

hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1967,13 +1967,14 @@ protected WorkerThread(ThreadGroup group, String prefix) {
19671967
public void sendStopSignal() {
19681968
scheduler.signalAll();
19691969
}
1970+
19701971
@Override
19711972
public void run() {
19721973
long lastUpdate = EnvironmentEdgeManager.currentTime();
19731974
try {
19741975
while (isRunning() && keepAlive(lastUpdate)) {
19751976
@SuppressWarnings("unchecked")
1976-
Procedure<TEnvironment> proc = scheduler.poll(keepAliveTime, TimeUnit.MILLISECONDS);
1977+
Procedure<TEnvironment> proc = getProcedure();
19771978
if (proc == null) {
19781979
continue;
19791980
}
@@ -2025,6 +2026,10 @@ public long getCurrentRunTime() {
20252026
/**
 * Hook evaluated by {@code run()} as part of its loop condition, together with
 * {@code isRunning()}. This base implementation always answers {@code true}.
 * @param lastUpdate the timestamp that {@code run()} passes in; ignored here.
 * @return {@code true}, always.
 */
protected boolean keepAlive(long lastUpdate) {
20262027
return true;
20272028
}
2029+
2030+
protected Procedure<TEnvironment> getProcedure() {
2031+
return scheduler.poll(keepAliveTime, TimeUnit.MILLISECONDS);
2032+
}
20282033
}
20292034

20302035
// A worker thread which can be added when core workers are stuck. Will timeout after
@@ -2040,6 +2045,25 @@ protected boolean keepAlive(long lastUpdate) {
20402045
}
20412046
}
20422047

2048+
private final class HighPriorityWorkerThread extends WorkerThread {
2049+
private Procedure<TEnvironment> procedure;
2050+
2051+
public HighPriorityWorkerThread(ThreadGroup group, Procedure<TEnvironment> proc) {
2052+
super(group, "HighPriorityPEWorker-");
2053+
this.procedure = proc;
2054+
}
2055+
2056+
@Override
2057+
protected boolean keepAlive(long lastUpdate) {
2058+
return false;
2059+
}
2060+
2061+
@Override
2062+
protected Procedure<TEnvironment> getProcedure() {
2063+
return procedure;
2064+
}
2065+
}
2066+
20432067
// ----------------------------------------------------------------------------
20442068
// TODO-MAYBE: Should we provide a InlineChore to notify the store with the
20452069
// full set of procedures pending and completed to write a compacted
@@ -2051,7 +2075,7 @@ protected boolean keepAlive(long lastUpdate) {
20512075
private final class WorkerMonitor extends InlineChore {
20522076
public static final String WORKER_MONITOR_INTERVAL_CONF_KEY =
20532077
"hbase.procedure.worker.monitor.interval.msec";
2054-
private static final int DEFAULT_WORKER_MONITOR_INTERVAL = 5000; // 5sec
2078+
private static final int DEFAULT_WORKER_MONITOR_INTERVAL = 1000; // 1sec
20552079

20562080
public static final String WORKER_STUCK_THRESHOLD_CONF_KEY =
20572081
"hbase.procedure.worker.stuck.threshold.msec";
@@ -2071,13 +2095,36 @@ public WorkerMonitor() {
20712095

20722096
@Override
20732097
public void run() {
2098+
// accelerate high priority procedure.
2099+
accelerateHighPriority();
2100+
20742101
final int stuckCount = checkForStuckWorkers();
20752102
checkThreadCount(stuckCount);
20762103

20772104
// refresh interval (poor man dynamic conf update)
20782105
refreshConfig();
20792106
}
20802107

2108+
private void accelerateHighPriority() {
2109+
if (!scheduler.hasRunnables()) {
2110+
return;
2111+
}
2112+
while (true) {
2113+
// Poll a high priority procedure and execute it intermediately
2114+
Procedure highPriorityProcedure = scheduler.pollHighPriority(1, TimeUnit.NANOSECONDS);
2115+
if (highPriorityProcedure != null) {
2116+
final HighPriorityWorkerThread worker =
2117+
new HighPriorityWorkerThread(threadGroup, highPriorityProcedure);
2118+
workerThreads.add(worker);
2119+
worker.start();
2120+
LOG.info("Added new HighPriority worker thread {} for highPriorityProcedure {}", worker,
2121+
highPriorityProcedure);
2122+
} else {
2123+
return;
2124+
}
2125+
}
2126+
}
2127+
20812128
private int checkForStuckWorkers() {
20822129
// check if any of the worker is stuck
20832130
int stuckCount = 0;

hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureScheduler.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,20 @@ public interface ProcedureScheduler {
9595
*/
9696
boolean hasRunnables();
9797

98+
/**
99+
* Fetch one high priority Procedure from the queue
100+
* @return the Procedure to execute, or null if nothing present.
101+
*/
102+
Procedure pollHighPriority();
103+
104+
/**
105+
* Fetch one high priority Procedure from the queue, giving up after the given timeout
106+
* @param timeout how long to wait before giving up, in units of unit
107+
* @param unit a TimeUnit determining how to interpret the timeout parameter
108+
* @return the Procedure to execute, or null if nothing present.
109+
*/
110+
Procedure pollHighPriority(long timeout, TimeUnit unit);
111+
98112
/**
99113
* Fetch one Procedure from the queue
100114
* @return the Procedure to execute, or null if nothing present.

hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/SimpleProcedureScheduler.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ protected void enqueue(final Procedure procedure, final boolean addFront) {
4343
}
4444

4545
@Override
46-
protected Procedure dequeue() {
46+
protected Procedure dequeue(boolean highPriority) {
4747
return runnables.poll();
4848
}
4949

hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/MasterProcedureScheduler.java

Lines changed: 86 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -105,12 +105,16 @@ public class MasterProcedureScheduler extends AbstractProcedureScheduler {
105105
(n, k) -> n.compareKey((String) k);
106106
private final static AvlKeyComparator<MetaQueue> META_QUEUE_KEY_COMPARATOR =
107107
(n, k) -> n.compareKey((TableName) k);
108+
private static final AvlKeyComparator<ServerQueue> SERVER_HIGHPRIORITY_QUEUE_KEY_COMPARATOR =
109+
(n, k) -> n.compareKey((ServerName) k);
108110

111+
private final FairQueue<ServerName> serverHighPriorityRunQueue = new FairQueue<>();
109112
private final FairQueue<ServerName> serverRunQueue = new FairQueue<>();
110113
private final FairQueue<TableName> tableRunQueue = new FairQueue<>();
111114
private final FairQueue<String> peerRunQueue = new FairQueue<>();
112115
private final FairQueue<TableName> metaRunQueue = new FairQueue<>();
113116

117+
private final ServerQueue[] serverHighPriorityBuckets = new ServerQueue[4];
114118
private final ServerQueue[] serverBuckets = new ServerQueue[128];
115119
private TableQueue tableMap = null;
116120
private PeerQueue peerMap = null;
@@ -135,7 +139,11 @@ protected void enqueue(final Procedure proc, final boolean addFront) {
135139
doAdd(tableRunQueue, getTableQueue(getTableName(proc)), proc, addFront);
136140
} else if (isServerProcedure(proc)) {
137141
ServerProcedureInterface spi = (ServerProcedureInterface) proc;
138-
doAdd(serverRunQueue, getServerQueue(spi.getServerName(), spi), proc, addFront);
142+
if (spi.hasMetaTableRegion()) {
143+
doAdd(serverHighPriorityRunQueue, getServerQueue(proc), proc, addFront);
144+
} else {
145+
doAdd(serverRunQueue, getServerQueue(proc), proc, addFront);
146+
}
139147
} else if (isPeerProcedure(proc)) {
140148
doAdd(peerRunQueue, getPeerQueue(getPeerId(proc)), proc, addFront);
141149
} else {
@@ -173,25 +181,30 @@ private <T extends Comparable<T>> void doAdd(FairQueue<T> fairq, Queue<T> queue,
173181

174182
@Override
175183
protected boolean queueHasRunnables() {
176-
return metaRunQueue.hasRunnables() || tableRunQueue.hasRunnables() ||
177-
serverRunQueue.hasRunnables() || peerRunQueue.hasRunnables();
184+
return metaRunQueue.hasRunnables() || tableRunQueue.hasRunnables() || serverRunQueue
185+
.hasRunnables() || peerRunQueue.hasRunnables() || serverHighPriorityRunQueue.hasRunnables();
178186
}
179187

180188
@Override
181-
protected Procedure dequeue() {
189+
protected Procedure dequeue(boolean highPriority) {
182190
// meta procedure is always the first priority
183191
Procedure<?> pollResult = doPoll(metaRunQueue);
184192
// For now, let server handling have precedence over table handling; presumption is that it
185193
// is more important handling crashed servers than it is running the
186194
// enabling/disabling tables, etc.
187195
if (pollResult == null) {
188-
pollResult = doPoll(serverRunQueue);
189-
}
190-
if (pollResult == null) {
191-
pollResult = doPoll(peerRunQueue);
196+
pollResult = doPoll(serverHighPriorityRunQueue);
192197
}
193-
if (pollResult == null) {
194-
pollResult = doPoll(tableRunQueue);
198+
if (!highPriority) {
199+
if (pollResult == null) {
200+
pollResult = doPoll(serverRunQueue);
201+
}
202+
if (pollResult == null) {
203+
pollResult = doPoll(peerRunQueue);
204+
}
205+
if (pollResult == null) {
206+
pollResult = doPoll(tableRunQueue);
207+
}
195208
}
196209
return pollResult;
197210
}
@@ -269,6 +282,11 @@ private void clearQueue() {
269282
clear(serverBuckets[i], serverRunQueue, SERVER_QUEUE_KEY_COMPARATOR);
270283
serverBuckets[i] = null;
271284
}
285+
for (int i = 0; i < serverHighPriorityBuckets.length; ++i) {
286+
clear(serverHighPriorityBuckets[i], serverHighPriorityRunQueue,
287+
SERVER_HIGHPRIORITY_QUEUE_KEY_COMPARATOR);
288+
serverHighPriorityBuckets[i] = null;
289+
}
272290

273291
// Remove Tables
274292
clear(tableMap, tableRunQueue, TABLE_QUEUE_KEY_COMPARATOR);
@@ -307,6 +325,9 @@ protected int queueSize() {
307325
for (ServerQueue serverMap : serverBuckets) {
308326
count += queueSize(serverMap);
309327
}
328+
for (ServerQueue serverMap : serverHighPriorityBuckets) {
329+
count += queueSize(serverMap);
330+
}
310331
count += queueSize(tableMap);
311332
count += queueSize(peerMap);
312333
count += queueSize(metaMap);
@@ -338,7 +359,7 @@ public void completionCleanup(final Procedure proc) {
338359
} else if (proc instanceof PeerProcedureInterface) {
339360
tryCleanupPeerQueue(getPeerId(proc), proc);
340361
} else if (proc instanceof ServerProcedureInterface) {
341-
tryCleanupServerQueue(getServerName(proc), proc);
362+
tryCleanupServerQueue(proc);
342363
} else {
343364
// No cleanup for other procedure types, yet.
344365
return;
@@ -391,12 +412,28 @@ private static TableName getTableName(Procedure<?> proc) {
391412
return ((TableProcedureInterface)proc).getTableName();
392413
}
393414

415+
private ServerQueue getServerQueue(Procedure<?> proc) {
416+
if (isServerProcedure(proc)) {
417+
ServerProcedureInterface spi = (ServerProcedureInterface) proc;
418+
if (spi.hasMetaTableRegion()) {
419+
return getServerQueue(serverHighPriorityBuckets, SERVER_HIGHPRIORITY_QUEUE_KEY_COMPARATOR,
420+
spi.getServerName(), spi);
421+
} else {
422+
return getServerQueue(serverBuckets, SERVER_QUEUE_KEY_COMPARATOR, spi.getServerName(), spi);
423+
}
424+
} else {
425+
return null;
426+
}
427+
}
428+
394429
// ============================================================================
395430
// Server Queue Lookup Helpers
396431
// ============================================================================
397-
private ServerQueue getServerQueue(ServerName serverName, ServerProcedureInterface proc) {
432+
private ServerQueue getServerQueue(ServerQueue[] serverBuckets,
433+
AvlKeyComparator<ServerQueue> keyComparator, ServerName serverName,
434+
ServerProcedureInterface proc) {
398435
final int index = getBucketIndex(serverBuckets, serverName.hashCode());
399-
ServerQueue node = AvlTree.get(serverBuckets[index], serverName, SERVER_QUEUE_KEY_COMPARATOR);
436+
ServerQueue node = AvlTree.get(serverBuckets[index], serverName, keyComparator);
400437
if (node != null) {
401438
return node;
402439
}
@@ -411,18 +448,32 @@ private ServerQueue getServerQueue(ServerName serverName, ServerProcedureInterfa
411448
return node;
412449
}
413450

414-
private void removeServerQueue(ServerName serverName) {
451+
private void removeServerQueue(ServerQueue[] serverBuckets,
452+
AvlKeyComparator<ServerQueue> keyComparator, ServerName serverName) {
415453
int index = getBucketIndex(serverBuckets, serverName.hashCode());
416-
serverBuckets[index] =
417-
AvlTree.remove(serverBuckets[index], serverName, SERVER_QUEUE_KEY_COMPARATOR);
454+
serverBuckets[index] = AvlTree.remove(serverBuckets[index], serverName, keyComparator);
418455
locking.removeServerLock(serverName);
419456
}
420457

421-
private void tryCleanupServerQueue(ServerName serverName, Procedure<?> proc) {
458+
private void tryCleanupServerQueue(Procedure<?> proc) {
459+
ServerName serverName = getServerName(proc);
460+
ServerProcedureInterface spi = (ServerProcedureInterface) proc;
461+
if (spi.hasMetaTableRegion()) {
462+
tryCleanupServerQueue(this.serverHighPriorityBuckets, this.serverHighPriorityRunQueue,
463+
SERVER_HIGHPRIORITY_QUEUE_KEY_COMPARATOR, serverName, proc);
464+
} else {
465+
tryCleanupServerQueue(this.serverBuckets, this.serverRunQueue, SERVER_QUEUE_KEY_COMPARATOR,
466+
serverName, proc);
467+
}
468+
}
469+
470+
private void tryCleanupServerQueue(ServerQueue[] serverBuckets,
471+
FairQueue<ServerName> serverRunQueue, AvlKeyComparator<ServerQueue> keyComparator,
472+
ServerName serverName, Procedure<?> proc) {
422473
schedLock();
423474
try {
424475
int index = getBucketIndex(serverBuckets, serverName.hashCode());
425-
ServerQueue node = AvlTree.get(serverBuckets[index], serverName, SERVER_QUEUE_KEY_COMPARATOR);
476+
ServerQueue node = AvlTree.get(serverBuckets[index], serverName, keyComparator);
426477
if (node == null) {
427478
return;
428479
}
@@ -431,7 +482,7 @@ private void tryCleanupServerQueue(ServerName serverName, Procedure<?> proc) {
431482
if (node.isEmpty() && lock.tryExclusiveLock(proc)) {
432483
removeFromRunQueue(serverRunQueue, node,
433484
() -> "clean up server queue after " + proc + " completed");
434-
removeServerQueue(serverName);
485+
removeServerQueue(serverBuckets, keyComparator, serverName);
435486
}
436487
} finally {
437488
schedUnlock();
@@ -873,13 +924,14 @@ public boolean waitServerExclusiveLock(final Procedure<?> procedure,
873924
try {
874925
final LockAndQueue lock = locking.getServerLock(serverName);
875926
if (lock.tryExclusiveLock(procedure)) {
876-
// In tests we may pass procedures other than ServerProcedureInterface, just pass null if
877-
// so.
878-
removeFromRunQueue(serverRunQueue,
879-
getServerQueue(serverName,
880-
procedure instanceof ServerProcedureInterface ? (ServerProcedureInterface) procedure
881-
: null),
882-
() -> procedure + " held exclusive lock");
927+
ServerProcedureInterface spi = (ServerProcedureInterface) procedure;
928+
if (spi.hasMetaTableRegion()) {
929+
removeFromRunQueue(serverHighPriorityRunQueue, getServerQueue(procedure),
930+
() -> procedure + " held exclusive lock");
931+
} else {
932+
removeFromRunQueue(serverRunQueue, getServerQueue(procedure),
933+
() -> procedure + " held exclusive lock");
934+
}
883935
return false;
884936
}
885937
waitProcedure(lock, procedure);
@@ -902,12 +954,14 @@ public void wakeServerExclusiveLock(final Procedure<?> procedure, final ServerNa
902954
final LockAndQueue lock = locking.getServerLock(serverName);
903955
// Only SCP will acquire/release server lock so do not need to check the return value here.
904956
lock.releaseExclusiveLock(procedure);
905-
// In tests we may pass procedures other than ServerProcedureInterface, just pass null if
906-
// so.
907-
addToRunQueue(serverRunQueue,
908-
getServerQueue(serverName,
909-
procedure instanceof ServerProcedureInterface ? (ServerProcedureInterface) procedure
910-
: null), () -> procedure + " released exclusive lock");
957+
ServerProcedureInterface spi = (ServerProcedureInterface) procedure;
958+
if (spi.hasMetaTableRegion()) {
959+
addToRunQueue(serverHighPriorityRunQueue, getServerQueue(procedure),
960+
() -> procedure + " released exclusive lock");
961+
} else {
962+
addToRunQueue(serverRunQueue, getServerQueue(procedure),
963+
() -> procedure + " released exclusive lock");
964+
}
911965
int waitingCount = wakeWaitingProcedures(lock);
912966
wakePollIfNeeded(waitingCount);
913967
} finally {

0 commit comments

Comments
 (0)