Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 34 additions & 3 deletions python/sglang/srt/managers/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,8 @@ def init_metrics(self):
self.spec_num_total_forward_ct = 0
self.cum_spec_accept_length = 0
self.cum_spec_accept_count = 0
# The number of retracted requests
self.retract_count = 0
self.stats = SchedulerStats()
if self.enable_metrics:
engine_type = "unified"
Expand Down Expand Up @@ -1031,6 +1033,14 @@ def log_prefill_stats(
self._largest_prefill_len, adder.log_input_tokens
)

mempool_size = self.token_to_kv_pool_allocator.total_size()
mempool_available_size = self.token_to_kv_pool_allocator.available_size()
tree_cache_size = self.tree_cache.total_size()
tree_cache_evictable_size = self.tree_cache.evictable_size()
cache_hit_rate = adder.log_hit_tokens / (
adder.log_input_tokens + adder.log_hit_tokens
)

f = (
f"Prefill batch. "
f"#new-seq: {len(can_run_list)}, "
Expand All @@ -1039,18 +1049,22 @@ def log_prefill_stats(
f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
f"#running-req: {running_bs}, "
f"#queue-req: {len(self.waiting_queue)}, "
f"mempool size: {mempool_size}, available size: {mempool_available_size}, "
f"tree cache size: {tree_cache_size}, evictable size: {tree_cache_evictable_size}, "
f"cache hit rate: {cache_hit_rate:.2f}, "
)
logger.info(f)

if self.enable_metrics:
cache_hit_rate = adder.log_hit_tokens / (
adder.log_input_tokens + adder.log_hit_tokens
)
self.stats.num_running_reqs = running_bs
self.stats.num_used_tokens = num_used
self.stats.token_usage = round(num_used / self.max_total_num_tokens, 2)
self.stats.num_queue_reqs = len(self.waiting_queue)
self.stats.cache_hit_rate = cache_hit_rate
self.stats.mempool_size = mempool_size
self.stats.mempool_available_size = mempool_available_size
self.stats.tree_cache_size = tree_cache_size
self.stats.tree_cache_evictable_size = tree_cache_evictable_size
self.metrics_collector.log_stats(self.stats)

def log_decode_stats(self):
Expand All @@ -1069,6 +1083,11 @@ def log_decode_stats(self):
gap_latency / self.server_args.decode_log_interval
)

mempool_size = self.token_to_kv_pool_allocator.total_size()
mempool_available_size = self.token_to_kv_pool_allocator.available_size()
tree_cache_size = self.tree_cache.total_size()
tree_cache_evictable_size = self.tree_cache.evictable_size()

if self.spec_algorithm.is_none():
msg = (
f"Decode batch. "
Expand All @@ -1077,6 +1096,9 @@ def log_decode_stats(self):
f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
f"gen throughput (token/s): {self.last_gen_throughput:.2f}, "
f"#queue-req: {len(self.waiting_queue)}, "
f"mempool size: {mempool_size}, available size: {mempool_available_size}, "
f"tree cache size: {tree_cache_size}, evictable size: {tree_cache_evictable_size}, "
f"retract count: {self.retract_count}, "
)
spec_accept_length = 0
else:
Expand All @@ -1094,6 +1116,9 @@ def log_decode_stats(self):
f"accept len: {spec_accept_length:.2f}, "
f"gen throughput (token/s): {self.last_gen_throughput:.2f}, "
f"#queue-req: {len(self.waiting_queue)}, "
f"mempool size: {mempool_size}, available size: {mempool_available_size}, "
f"tree cache size: {tree_cache_size}, evictable size: {tree_cache_evictable_size}, "
f"retract count: {self.retract_count}, "
)

logger.info(msg)
Expand All @@ -1105,6 +1130,11 @@ def log_decode_stats(self):
self.stats.gen_throughput = self.last_gen_throughput
self.stats.num_queue_reqs = len(self.waiting_queue)
self.stats.spec_accept_length = spec_accept_length
self.stats.mempool_size = mempool_size
self.stats.mempool_available_size = mempool_available_size
self.stats.tree_cache_size = tree_cache_size
self.stats.tree_cache_evictable_size = tree_cache_evictable_size
self.stats.retract_count = self.retract_count
self.metrics_collector.log_stats(self.stats)

def check_memory(self):
Expand Down Expand Up @@ -1362,6 +1392,7 @@ def update_running_batch(self, batch: ScheduleBatch) -> Optional[ScheduleBatch]:
f"#new_token_ratio: {old_ratio:.4f} -> {self.new_token_ratio:.4f}"
)
self._extend_requests_to_queue(retracted_reqs)
self.retract_count += len(retracted_reqs)
else:
self.new_token_ratio = max(
self.new_token_ratio - self.new_token_ratio_decay,
Expand Down
3 changes: 3 additions & 0 deletions python/sglang/srt/mem_cache/chunk_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,6 @@ def dec_lock_ref(self, node: Any):

# Pretty-print hook; this implementation always returns an empty string.
def pretty_print(self):
return ""

# Total size reported by this cache; this implementation always returns 0.
def total_size(self):
return 0
3 changes: 3 additions & 0 deletions python/sglang/srt/mem_cache/memory_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,9 @@ def __init__(
# Returns the number of entries currently held in self.free_slots.
def available_size(self):
return len(self.free_slots)

# Returns self.size unchanged.
# NOTE(review): presumably the pool's total capacity, in the same unit as
# available_size() — confirm where self.size is assigned.
def total_size(self):
return self.size

# Accessor for the object stored in self._kvcache.
def get_kvcache(self):
return self._kvcache

Expand Down
3 changes: 3 additions & 0 deletions python/sglang/srt/mem_cache/paged_allocator.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,9 @@ def __init__(
# Returns the number of free pages multiplied by self.page_size.
# NOTE(review): presumably a token count — confirm against callers.
def available_size(self):
return len(self.free_pages) * self.page_size

# Returns self.size unchanged.
# NOTE(review): presumably the allocator's total capacity, in the same unit
# as available_size() — confirm where self.size is assigned.
def total_size(self):
return self.size

# Accessor for the object stored in self._kvcache.
def get_kvcache(self):
return self._kvcache

Expand Down
45 changes: 45 additions & 0 deletions python/sglang/srt/metrics/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ class SchedulerStats:
num_queue_reqs: int = 0
cache_hit_rate: float = 0.0
spec_accept_length: float = 0.0
mempool_size: int = 0
mempool_available_size: int = 0
tree_cache_size: int = 0
tree_cache_evictable_size: int = 0
retract_count: int = 0


class SchedulerMetricsCollector:
Expand Down Expand Up @@ -87,6 +92,41 @@ def __init__(self, labels: Dict[str, str]) -> None:
multiprocess_mode="mostrecent",
)

self.mempool_size = Gauge(
name="sglang:mempool_size",
documentation="The size of the memory pool.",
labelnames=labels.keys(),
multiprocess_mode="mostrecent",
)

self.mempool_available_size = Gauge(
name="sglang:mempool_available_size",
documentation="The available size of the memory pool.",
labelnames=labels.keys(),
multiprocess_mode="mostrecent",
)

self.tree_cache_size = Gauge(
name="sglang:tree_cache_size",
documentation="The size of the tree cache.",
labelnames=labels.keys(),
multiprocess_mode="mostrecent",
)

self.tree_cache_evictable_size = Gauge(
name="sglang:tree_cache_evictable_size",
documentation="The size of the evictable tree cache.",
labelnames=labels.keys(),
multiprocess_mode="mostrecent",
)

self.retract_count = Gauge(
name="sglang:retract_count",
documentation="The number of retracted requests.",
labelnames=labels.keys(),
multiprocess_mode="mostrecent",
)

def _log_gauge(self, gauge, data: Union[int, float]) -> None:
# Convenience function for logging to gauge.
# Selects the gauge's labelled child by passing self.labels as keyword
# arguments, then sets that child to `data`.
gauge.labels(**self.labels).set(data)
Expand All @@ -99,6 +139,11 @@ def log_stats(self, stats: SchedulerStats) -> None:
self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
self._log_gauge(self.mempool_size, stats.mempool_size)
self._log_gauge(self.mempool_available_size, stats.mempool_available_size)
self._log_gauge(self.tree_cache_size, stats.tree_cache_size)
self._log_gauge(self.tree_cache_evictable_size, stats.tree_cache_evictable_size)
self._log_gauge(self.retract_count, stats.retract_count)
self.last_log_time = time.time()


Expand Down