Skip to content

Commit 2b37972

Browse files
Wanli-Jiang authored and dominicshanshan committed
[https://nvbugs/5509024][fix] Print full parsed outputs and update keywords for multimodal model (NVIDIA#7670)
Signed-off-by: Wanli Jiang <[email protected]>
1 parent 237b55a commit 2b37972

File tree

1 file changed

+21
-14
lines changed

1 file changed

+21
-14
lines changed

tests/integration/defs/test_e2e.py

Lines changed: 21 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -2457,7 +2457,7 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
24572457
],
24582458
"mixture_text_image":
24592459
[["invention", "person", "scientists", "Lick", "engineers"],
2460-
["landscape", "trees", "road", "natural", "rock"]]
2460+
["landscape", "trees", "road", "depicts", "scenic"]]
24612461
},
24622462
"gemma-3-27b-it": {
24632463
"image": [
@@ -2503,13 +2503,14 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
25032503
if model_name == "qwen2-vl-7b-instruct" and modality == "image":
25042504
match_ratio = 4.0 / 6
25052505

2506+
parsed_outputs = parse_output(output)
25062507
for prompt_output, prompt_keywords in zip(
2507-
parse_output(output), expected_keywords[model_name][modality]):
2508+
parsed_outputs, expected_keywords[model_name][modality]):
25082509
matches = [
25092510
keyword in prompt_output.lower() for keyword in prompt_keywords
25102511
]
25112512
obs_match_ratio = 1. * sum(matches) / len(matches)
2512-
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}"
2513+
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}\n\nParsed output for all prompts: {parsed_outputs}"
25132514

25142515
print("All answers are correct!")
25152516

@@ -2842,13 +2843,14 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality):
28422843
output = llm_venv.run_cmd(cmd, caller=check_output)
28432844

28442845
match_ratio = 0.6
2845-
for prompt_output, prompt_keywords in zip(parse_output(output),
2846+
parsed_outputs = parse_output(output)
2847+
for prompt_output, prompt_keywords in zip(parsed_outputs,
28462848
expected_keywords[modality]):
28472849
matches = [
28482850
keyword in prompt_output.lower() for keyword in prompt_keywords
28492851
]
28502852
obs_match_ratio = 1. * sum(matches) / len(matches)
2851-
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}"
2853+
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}\n\nParsed output for all prompts: {parsed_outputs}"
28522854

28532855
print("All answers are correct!")
28542856

@@ -2953,13 +2955,14 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name,
29532955
match_ratio = 0.6
29542956

29552957
# Check output accuracy
2958+
parsed_outputs = parse_output(output)
29562959
for prompt_output, prompt_keywords in zip(
2957-
parse_output(output), expected_keywords[model_name]["image"]):
2960+
parsed_outputs, expected_keywords[model_name]["image"]):
29582961
matches = [
29592962
keyword in prompt_output.lower() for keyword in prompt_keywords
29602963
]
29612964
obs_match_ratio = 1. * sum(matches) / len(matches)
2962-
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}"
2965+
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}\n\nParsed output for all prompts: {parsed_outputs}"
29632966

29642967
print("All answers are correct!")
29652968

@@ -2996,20 +2999,23 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
29962999
expected_keywords = {
29973000
"gemma-3-27b-it": {
29983001
"image": [
2999-
["half", "dome", "yosemite", "landmark", "rounded"],
3000-
["atmosphere", "peaceful", "majestic", "calm", "quiet"],
3002+
["description", "image", "half", "dome", "park"],
3003+
["atmosphere", "peaceful", "majestic", "scene", "sky"],
30013004
],
30023005
},
30033006
"mistral-small-3.1-24b-instruct": {
30043007
"image": [
3005-
["depicts", "landscape", "rock", "sky", "high", "altitude"],
3006-
["atmosphere", "serene", "majestic", "sense", "tranquility"],
3008+
[
3009+
"depicts", "scenic", "landscape", "rock", "formation",
3010+
"background"
3011+
],
3012+
["atmosphere", "serene", "majestic", "clear", "sky", "trees"],
30073013
],
30083014
},
30093015
"Phi-4-multimodal-instruct": {
30103016
"image": [
30113017
["depicts", "landscape", "mountain", "half", "dome"],
3012-
["atmosphere", "serene", "sense", "tranquility", "peace."],
3018+
["atmosphere", "serene", "sense", "scene", "majestic"],
30133019
],
30143020
},
30153021
}
@@ -3059,8 +3065,9 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
30593065
match_ratio = 0.6
30603066

30613067
# Check output accuracy
3068+
parsed_outputs = parse_output(output)
30623069
for prompt_output, prompt_keywords in zip(
3063-
parse_output(output), expected_keywords[model_name]["image"]):
3070+
parsed_outputs, expected_keywords[model_name]["image"]):
30643071
matches = [
30653072
keyword in prompt_output.lower() for keyword in prompt_keywords
30663073
]
@@ -3069,7 +3076,7 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
30693076
print("prompt_keywords:", prompt_keywords)
30703077
print("matches:", matches)
30713078
print("obs_match_ratio:", obs_match_ratio)
3072-
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}"
3079+
assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}\n\nParsed output for all prompts: {parsed_outputs}"
30733080

30743081
print("All answers are correct!")
30753082

0 commit comments

Comments (0)