@@ -2457,7 +2457,7 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
24572457 ],
24582458 "mixture_text_image" :
24592459 [["invention" , "person" , "scientists" , "Lick" , "engineers" ],
2460- ["landscape" , "trees" , "road" , "natural " , "rock " ]]
2460+ ["landscape" , "trees" , "road" , "depicts " , "scenic " ]]
24612461 },
24622462 "gemma-3-27b-it" : {
24632463 "image" : [
@@ -2503,13 +2503,14 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
25032503 if model_name == "qwen2-vl-7b-instruct" and modality == "image" :
25042504 match_ratio = 4.0 / 6
25052505
2506+ parsed_outputs = parse_output (output )
25062507 for prompt_output , prompt_keywords in zip (
2507- parse_output ( output ) , expected_keywords [model_name ][modality ]):
2508+ parsed_outputs , expected_keywords [model_name ][modality ]):
25082509 matches = [
25092510 keyword in prompt_output .lower () for keyword in prompt_keywords
25102511 ]
25112512 obs_match_ratio = 1. * sum (matches ) / len (matches )
2512- assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } "
2513+ assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } \n \n Parsed output for all prompts: { parsed_outputs } "
25132514
25142515 print ("All answers are correct!" )
25152516
@@ -2842,13 +2843,14 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality):
28422843 output = llm_venv .run_cmd (cmd , caller = check_output )
28432844
28442845 match_ratio = 0.6
2845- for prompt_output , prompt_keywords in zip (parse_output (output ),
2846+ parsed_outputs = parse_output (output )
2847+ for prompt_output , prompt_keywords in zip (parsed_outputs ,
28462848 expected_keywords [modality ]):
28472849 matches = [
28482850 keyword in prompt_output .lower () for keyword in prompt_keywords
28492851 ]
28502852 obs_match_ratio = 1. * sum (matches ) / len (matches )
2851- assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } "
2853+ assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } \n \n Parsed output for all prompts: { parsed_outputs } "
28522854
28532855 print ("All answers are correct!" )
28542856
@@ -2953,13 +2955,14 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name,
29532955 match_ratio = 0.6
29542956
29552957 # Check output accuracy
2958+ parsed_outputs = parse_output (output )
29562959 for prompt_output , prompt_keywords in zip (
2957- parse_output ( output ) , expected_keywords [model_name ]["image" ]):
2960+ parsed_outputs , expected_keywords [model_name ]["image" ]):
29582961 matches = [
29592962 keyword in prompt_output .lower () for keyword in prompt_keywords
29602963 ]
29612964 obs_match_ratio = 1. * sum (matches ) / len (matches )
2962- assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } "
2965+ assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } \n \n Parsed output for all prompts: { parsed_outputs } "
29632966
29642967 print ("All answers are correct!" )
29652968
@@ -2996,20 +2999,23 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
29962999 expected_keywords = {
29973000 "gemma-3-27b-it" : {
29983001 "image" : [
2999- ["half " , "dome " , "yosemite " , "landmark " , "rounded " ],
3000- ["atmosphere" , "peaceful" , "majestic" , "calm " , "quiet " ],
3002+ ["description " , "image " , "half " , "dome " , "park " ],
3003+ ["atmosphere" , "peaceful" , "majestic" , "scene " , "sky " ],
30013004 ],
30023005 },
30033006 "mistral-small-3.1-24b-instruct" : {
30043007 "image" : [
3005- ["depicts" , "landscape" , "rock" , "sky" , "high" , "altitude" ],
3006- ["atmosphere" , "serene" , "majestic" , "sense" , "tranquility" ],
3008+ [
3009+ "depicts" , "scenic" , "landscape" , "rock" , "formation" ,
3010+ "background"
3011+ ],
3012+ ["atmosphere" , "serene" , "majestic" , "clear" , "sky" , "trees" ],
30073013 ],
30083014 },
30093015 "Phi-4-multimodal-instruct" : {
30103016 "image" : [
30113017 ["depicts" , "landscape" , "mountain" , "half" , "dome" ],
3012- ["atmosphere" , "serene" , "sense" , "tranquility " , "peace. " ],
3018+ ["atmosphere" , "serene" , "sense" , "scene " , "majestic " ],
30133019 ],
30143020 },
30153021 }
@@ -3059,8 +3065,9 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
30593065 match_ratio = 0.6
30603066
30613067 # Check output accuracy
3068+ parsed_outputs = parse_output (output )
30623069 for prompt_output , prompt_keywords in zip (
3063- parse_output ( output ) , expected_keywords [model_name ]["image" ]):
3070+ parsed_outputs , expected_keywords [model_name ]["image" ]):
30643071 matches = [
30653072 keyword in prompt_output .lower () for keyword in prompt_keywords
30663073 ]
@@ -3069,7 +3076,7 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
30693076 print ("prompt_keywords:" , prompt_keywords )
30703077 print ("matches:" , matches )
30713078 print ("obs_match_ratio:" , obs_match_ratio )
3072- assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } "
3079+ assert obs_match_ratio >= match_ratio , f"Incorrect output!\n Generated \" { prompt_output } \" \n Expected keywords \" { prompt_keywords } \" \n Matched keywords: { matches } \n Observed match ratio { obs_match_ratio } below threshold { match_ratio } \n \n Parsed output for all prompts: { parsed_outputs } "
30733080
30743081 print ("All answers are correct!" )
30753082
0 commit comments