@@ -27,7 +27,8 @@ use crate::pipeline::isq::IsqModelLoader;
2727use crate :: pipeline:: loaders:: AutoDeviceMapParams ;
2828use crate :: pipeline:: text_models_inputs_processor:: { FlashParams , PagedAttentionInputMetadata } ;
2929use crate :: pipeline:: {
30- EitherCache , IsqModel , MultimodalPromptPrefixer , Processor , ProcessorCreator ,
30+ EitherCache , IsqModel , Modalities , MultimodalPromptPrefixer , Processor , ProcessorCreator ,
31+ SupportedModality ,
3132} ;
3233use crate :: utils:: varbuilder_utils:: DeviceForLoadTensor ;
3334use crate :: vision_models:: clip:: ClipConfig ;
@@ -104,6 +105,7 @@ pub trait VisionModelLoader: IsqModelLoader + Send + Sync + DeviceMappedModelLoa
104105 // Default is false, specific model must override.
105106 false
106107 }
/// Reports which modalities the model accepts as input and produces as output.
/// `config` is presumably the model's serialized configuration — TODO confirm against callers.
108+ fn modalities ( & self , config : & str ) -> Result < Modalities > ;
/// Returns the loader's prompt prefixer as a shared trait object.
/// NOTE(review): how the prefixer rewrites prompts is not visible here — see `MultimodalPromptPrefixer`.
107109 fn prefixer ( & self , config : & str ) -> Arc < dyn MultimodalPromptPrefixer > ;
108110 fn get_device_for_tensor (
109111 & self ,
@@ -311,6 +313,10 @@ impl VisionModelLoader for AutoVisionLoader {
311313 . supports_paged_attention ( config)
312314 }
313315
316+ fn modalities ( & self , config : & str ) -> Result < Modalities > {
317+ Self :: get_loader ( config) ?. modalities ( config)
318+ }
319+
314320 fn supports_prefix_cacher ( & self , config : & str ) -> bool {
315321 Self :: get_loader ( config)
316322 . expect ( "AutoVisionLoader" )
@@ -499,6 +505,12 @@ impl VisionModelLoader for Phi3VLoader {
499505 fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
500506 Arc :: new ( Phi3VPrefixer )
501507 }
508+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
509+ Ok ( Modalities {
510+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
511+ output : vec ! [ SupportedModality :: Text ] ,
512+ } )
513+ }
502514}
503515
504516impl IsqModelLoader for Phi3VLoader {
@@ -771,6 +783,12 @@ impl VisionModelLoader for Idefics2Loader {
771783 fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
772784 Arc :: new ( Idefics2Prefixer )
773785 }
786+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
787+ Ok ( Modalities {
788+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
789+ output : vec ! [ SupportedModality :: Text ] ,
790+ } )
791+ }
774792}
775793
776794impl IsqModelLoader for Idefics2Loader {
@@ -1109,6 +1127,12 @@ impl VisionModelLoader for LLaVANextLoader {
11091127 fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
11101128 Arc :: new ( LLaVANextPrefixer )
11111129 }
1130+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
1131+ Ok ( Modalities {
1132+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
1133+ output : vec ! [ SupportedModality :: Text ] ,
1134+ } )
1135+ }
11121136}
11131137
11141138impl IsqModelLoader for LLaVANextLoader {
@@ -1371,6 +1395,12 @@ impl VisionModelLoader for LLaVALoader {
13711395 fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
13721396 Arc :: new ( LLaVAPrefixer )
13731397 }
1398+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
1399+ Ok ( Modalities {
1400+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
1401+ output : vec ! [ SupportedModality :: Text ] ,
1402+ } )
1403+ }
13741404}
13751405
13761406impl IsqModelLoader for LLaVALoader {
@@ -1625,6 +1655,12 @@ impl VisionModelLoader for VLlamaLoader {
16251655 fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
16261656 Arc :: new ( VLlamaPrefixer )
16271657 }
1658+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
1659+ Ok ( Modalities {
1660+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
1661+ output : vec ! [ SupportedModality :: Text ] ,
1662+ } )
1663+ }
16281664}
16291665
16301666impl IsqModelLoader for VLlamaLoader {
@@ -2009,6 +2045,12 @@ impl VisionModelLoader for Qwen2VLLoader {
20092045 fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
20102046 Arc :: new ( Qwen2VLPrefixer )
20112047 }
2048+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
2049+ Ok ( Modalities {
2050+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
2051+ output : vec ! [ SupportedModality :: Text ] ,
2052+ } )
2053+ }
20122054}
20132055
20142056impl IsqModelLoader for Qwen2VLLoader {
@@ -2297,6 +2339,12 @@ impl VisionModelLoader for Idefics3Loader {
22972339 fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
22982340 Arc :: new ( Idefics3Prefixer )
22992341 }
2342+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
2343+ Ok ( Modalities {
2344+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
2345+ output : vec ! [ SupportedModality :: Text ] ,
2346+ } )
2347+ }
23002348}
23012349
23022350impl IsqModelLoader for Idefics3Loader {
@@ -2606,6 +2654,12 @@ impl VisionModelLoader for MiniCpmOLoader {
26062654 fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
26072655 Arc :: new ( MiniCpmOPrefixer )
26082656 }
2657+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
2658+ Ok ( Modalities {
2659+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
2660+ output : vec ! [ SupportedModality :: Text ] ,
2661+ } )
2662+ }
26092663}
26102664
26112665impl IsqModelLoader for MiniCpmOLoader {
@@ -2892,6 +2946,16 @@ impl VisionModelLoader for Phi4MMLoader {
28922946 fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
28932947 Arc :: new ( Phi4MMPrefixer )
28942948 }
2949+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
2950+ Ok ( Modalities {
2951+ input : vec ! [
2952+ SupportedModality :: Text ,
2953+ SupportedModality :: Vision ,
2954+ SupportedModality :: Audio ,
2955+ ] ,
2956+ output : vec ! [ SupportedModality :: Text ] ,
2957+ } )
2958+ }
28952959}
28962960
28972961impl IsqModelLoader for Phi4MMLoader {
@@ -3213,6 +3277,12 @@ impl VisionModelLoader for Qwen2_5VLLoader {
32133277 fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
32143278 Arc :: new ( Qwen2_5VLPrefixer )
32153279 }
3280+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
3281+ Ok ( Modalities {
3282+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
3283+ output : vec ! [ SupportedModality :: Text ] ,
3284+ } )
3285+ }
32163286}
32173287
32183288impl IsqModelLoader for Qwen2_5VLLoader {
@@ -3500,6 +3570,12 @@ impl VisionModelLoader for Gemma3Loader {
35003570 fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
35013571 Arc :: new ( Gemma3Prefixer )
35023572 }
3573+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
3574+ Ok ( Modalities {
3575+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
3576+ output : vec ! [ SupportedModality :: Text ] ,
3577+ } )
3578+ }
35033579}
35043580
35053581impl IsqModelLoader for Gemma3Loader {
@@ -3827,6 +3903,12 @@ impl VisionModelLoader for Mistral3Loader {
38273903 fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
38283904 Arc :: new ( Mistral3Prefixer )
38293905 }
3906+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
3907+ Ok ( Modalities {
3908+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
3909+ output : vec ! [ SupportedModality :: Text ] ,
3910+ } )
3911+ }
38303912}
38313913
38323914impl IsqModelLoader for Mistral3Loader {
@@ -4143,6 +4225,12 @@ impl VisionModelLoader for VLlama4Loader {
41434225 fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
41444226 Arc :: new ( VLlama4Prefixer )
41454227 }
4228+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
4229+ Ok ( Modalities {
4230+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
4231+ output : vec ! [ SupportedModality :: Text ] ,
4232+ } )
4233+ }
41464234}
41474235
41484236impl IsqModelLoader for VLlama4Loader {
0 commit comments