Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions python/sglang/srt/model_executor/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ def model_specific_adjustment(self):
else:
server_args.attention_backend = "triton"
logger.info(
f"Attention backend not set. Use {server_args.attention_backend} backend by default."
f"Attention backend not explicitly specified. Use {server_args.attention_backend} backend by default."
)
elif self.use_mla_backend:
if server_args.device != "cpu":
Expand Down Expand Up @@ -454,7 +454,7 @@ def model_specific_adjustment(self):
if not self.is_multimodal_chunked_prefill_supported:
server_args.chunked_prefill_size = -1
logger.info(
f"Automatically turn of --chunked-prefill-size as it is not supported for "
f"Automatically turn off --chunked-prefill-size as it is not supported for "
f"{self.model_config.hf_config.model_type}"
)

Expand Down
48 changes: 46 additions & 2 deletions python/sglang/srt/server_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,8 +325,52 @@ def __post_init__(self):

# Multimodal models need more memory for the image processor
model_config = ModelConfig.from_server_args(self)
if model_config.is_multimodal:
self.mem_fraction_static *= 0.90

vision_config = getattr(model_config.hf_config, "vision_config", None)

if model_config.is_multimodal and vision_config:
# roughly reduce mem_fraction_static based on the size of the ViT
original_server_arg_mem_fraction = self.mem_fraction_static
# base mem_fraction_static reduction factor for a regular-sized ViT
base_mem_fraction_reduction_ratio = 0.95

vit_num_layers = getattr(vision_config, "num_hidden_layers", 24)
vit_hidden_size = getattr(vision_config, "hidden_size", 1024)

# baseline ViT params (ViT-L/14)
baseline_vit_layers = 24
baseline_vit_hidden_size = 1024

# rough proxy for the weight parameter count: num_layers * hidden_size^2
current_complexity_score = vit_num_layers * (vit_hidden_size**2)
baseline_complexity_score = baseline_vit_layers * (
baseline_vit_hidden_size**2
)
complexity_ratio = (
current_complexity_score / baseline_complexity_score
if baseline_complexity_score > 0
else 1.0
)

# for every 100% increase in complexity over the baseline, lower the adjustment factor by 0.1 (10%)
sensitivity_scale = 0.1
dynamic_adjustment_factor = 1.0 - sensitivity_scale * (
complexity_ratio - 1.0
)
dynamic_adjustment_factor = max(
0.8, min(1.05, dynamic_adjustment_factor)
)

final_overall_factor = (
base_mem_fraction_reduction_ratio * dynamic_adjustment_factor
)
self.mem_fraction_static = (
original_server_arg_mem_fraction * final_overall_factor
)
logger.warning(
f"Multimodal model: Dynamically adjusted --mem-fraction-static "
f"from: {original_server_arg_mem_fraction:.3f} to: {self.mem_fraction_static:.3f}."
)

# Set chunked prefill size, which depends on the gpu memory capacity
if self.chunked_prefill_size is None:
Expand Down
10 changes: 5 additions & 5 deletions test/srt/test_vision_openai_server_a.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def setUpClass(cls):
api_key=cls.api_key,
other_args=[
"--mem-fraction-static",
"0.4",
"0.35",
],
)
cls.base_url += "/v1"
Expand All @@ -49,7 +49,7 @@ def setUpClass(cls):
api_key=cls.api_key,
other_args=[
"--mem-fraction-static",
"0.4",
"0.35",
],
)
cls.base_url += "/v1"
Expand All @@ -69,7 +69,7 @@ def setUpClass(cls):
other_args=[
"--context-length",
"300",
"--mem-fraction-static=0.80",
"--mem-fraction-static=0.75",
],
)
cls.base_url += "/v1"
Expand Down Expand Up @@ -141,7 +141,7 @@ def setUpClass(cls):
other_args=[
"--trust-remote-code",
"--mem-fraction-static",
"0.4",
"0.35",
],
)
cls.base_url += "/v1"
Expand Down Expand Up @@ -175,7 +175,7 @@ def setUpClass(cls):
other_args=[
"--trust-remote-code",
"--mem-fraction-static",
"0.7",
"0.65",
],
)
cls.base_url += "/v1"
Expand Down
8 changes: 4 additions & 4 deletions test/srt/test_vision_openai_server_b.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def setUpClass(cls):
other_args=[
"--trust-remote-code",
"--mem-fraction-static",
"0.73",
"0.70",
],
)
cls.base_url += "/v1"
Expand All @@ -44,7 +44,7 @@ def setUpClass(cls):
other_args=[
"--trust-remote-code",
"--mem-fraction-static",
"0.8",
"0.75",
],
)
cls.base_url += "/v1"
Expand Down Expand Up @@ -88,7 +88,7 @@ def setUpClass(cls):
other_args=[
"--trust-remote-code",
"--mem-fraction-static",
"0.4",
"0.35",
],
)
cls.base_url += "/v1"
Expand Down Expand Up @@ -197,7 +197,7 @@ def setUpClass(cls):
other_args=[
"--trust-remote-code",
"--mem-fraction-static",
"0.75",
"0.70",
"--disable-radix-cache",
"--max-loras-per-batch",
"1",
Expand Down
Loading