@@ -4,7 +4,7 @@ use crate::{
44 pipeline:: { text_models_inputs_processor:: PagedAttentionMeta , LayerCaches } ,
55 response:: { ChatCompletionChunkResponse , Choice , ChunkChoice , Response , SYSTEM_FINGERPRINT } ,
66 sampler:: { Logprobs , Sampler } ,
7- ChatCompletionResponse , Usage ,
7+ AudioInput , ChatCompletionResponse , Usage ,
88} ;
99use crate :: {
1010 paged_attention:: { BlockEngineSequence , LogicalTokenBlock } ,
@@ -171,6 +171,53 @@ pub struct SequenceImages {
171171 hashes : Vec < u64 > ,
172172}
173173
174+ #[ derive( Clone ) ]
175+ pub struct SequenceAudios {
176+ audios : Vec < AudioInput > ,
177+ hashes : Vec < u64 > ,
178+ }
179+
180+ impl SequenceAudios {
181+ fn new ( input_audios : Vec < AudioInput > ) -> Self {
182+ let hashes = input_audios. iter ( ) . map ( |a| {
183+ let mut hasher = DefaultHasher :: new ( ) ;
184+ for s in & a. samples {
185+ s. to_bits ( ) . hash ( & mut hasher) ;
186+ }
187+ a. sample_rate . hash ( & mut hasher) ;
188+ hasher. finish ( )
189+ } ) ;
190+ Self {
191+ hashes : hashes. collect ( ) ,
192+ audios : input_audios,
193+ }
194+ }
195+
196+ fn clone_audios ( & self ) -> Vec < AudioInput > {
197+ self . audios . clone ( )
198+ }
199+
200+ fn audios ( & self ) -> & [ AudioInput ] {
201+ & self . audios
202+ }
203+
204+ fn audios_mut ( & mut self ) -> & mut Vec < AudioInput > {
205+ & mut self . audios
206+ }
207+
208+ fn hashes ( & self ) -> & [ u64 ] {
209+ & self . hashes
210+ }
211+
212+ fn keep_num_audios ( & mut self , audios_to_keep : usize ) {
213+ if self . audios . len ( ) > audios_to_keep {
214+ let start = self . audios . len ( ) - audios_to_keep;
215+ self . audios = self . audios [ start..] . to_vec ( ) ;
216+ self . hashes = self . hashes [ start..] . to_vec ( ) ;
217+ }
218+ }
219+ }
220+
174221impl SequenceImages {
175222 fn new ( input_images : Vec < image:: DynamicImage > ) -> Self {
176223 let hashes = input_images. iter ( ) . map ( |x| {
@@ -211,6 +258,7 @@ impl SequenceImages {
211258// Holds all multimodal (vision/diffusion) data for a Sequence.
212259pub struct MultimodalData {
213260 pub input_images : Option < SequenceImages > ,
261+ pub input_audios : Option < SequenceAudios > ,
214262 pub cached_pixel_values : Option < Tensor > ,
215263 pub cached_img_thw : Option < Tensor > ,
216264 pub cached_vid_thw : Option < Tensor > ,
@@ -222,11 +270,13 @@ pub struct MultimodalData {
222270impl MultimodalData {
223271 pub fn new (
224272 input_images : Option < Vec < image:: DynamicImage > > ,
273+ input_audios : Option < Vec < AudioInput > > ,
225274 image_gen_response_format : Option < ImageGenerationResponseFormat > ,
226275 diffusion_params : Option < DiffusionGenerationParams > ,
227276 ) -> Self {
228277 MultimodalData {
229278 input_images : input_images. map ( SequenceImages :: new) ,
279+ input_audios : input_audios. map ( SequenceAudios :: new) ,
230280 cached_pixel_values : None ,
231281 cached_img_thw : None ,
232282 cached_vid_thw : None ,
@@ -268,6 +318,40 @@ impl MultimodalData {
268318 . is_some_and ( |imgs| !imgs. images ( ) . is_empty ( ) )
269319 }
270320
321+ pub fn take_audios ( & mut self ) -> Option < Vec < AudioInput > > {
322+ if let Some ( input_audios) = self . input_audios . as_mut ( ) {
323+ let mut audios = Vec :: new ( ) ;
324+ std:: mem:: swap ( & mut audios, input_audios. audios_mut ( ) ) ;
325+ Some ( audios)
326+ } else {
327+ None
328+ }
329+ }
330+
331+ pub fn clone_audios ( & self ) -> Option < Vec < AudioInput > > {
332+ self . input_audios . as_ref ( ) . map ( |a| a. clone_audios ( ) )
333+ }
334+
335+ pub fn audios ( & self ) -> Option < & [ AudioInput ] > {
336+ self . input_audios . as_ref ( ) . map ( |a| a. audios ( ) )
337+ }
338+
339+ pub fn audio_hashes ( & self ) -> Option < & [ u64 ] > {
340+ self . input_audios . as_ref ( ) . map ( |a| a. hashes ( ) )
341+ }
342+
343+ pub fn has_audios ( & self ) -> bool {
344+ self . input_audios
345+ . as_ref ( )
346+ . is_some_and ( |a| !a. audios ( ) . is_empty ( ) )
347+ }
348+
349+ pub fn keep_num_audios ( & mut self , audios_to_keep : usize ) {
350+ if let Some ( auds) = self . input_audios . as_mut ( ) {
351+ auds. keep_num_audios ( audios_to_keep)
352+ }
353+ }
354+
271355 pub fn keep_num_images ( & mut self , images_to_keep : usize ) {
272356 if let Some ( imgs) = self . input_images . as_mut ( ) {
273357 imgs. keep_num_images ( images_to_keep)
@@ -422,6 +506,7 @@ impl Sequence {
422506 suffix : Option < String > ,
423507 prefix : Option < String > ,
424508 input_images : Option < Vec < image:: DynamicImage > > ,
509+ input_audios : Option < Vec < AudioInput > > ,
425510 // Paged attention
426511 block_size : Option < usize > ,
427512 //
@@ -492,6 +577,7 @@ impl Sequence {
492577 // Multimodal data
493578 multimodal : MultimodalData :: new (
494579 input_images,
580+ input_audios,
495581 image_gen_response_format,
496582 diffusion_params,
497583 ) ,
@@ -967,6 +1053,30 @@ impl Sequence {
9671053 self . multimodal . has_images ( )
9681054 }
9691055
1056+ pub fn take_audios ( & mut self ) -> Option < Vec < AudioInput > > {
1057+ self . multimodal . take_audios ( )
1058+ }
1059+
1060+ pub fn clone_audios ( & self ) -> Option < Vec < AudioInput > > {
1061+ self . multimodal . clone_audios ( )
1062+ }
1063+
1064+ pub fn audios ( & self ) -> Option < & [ AudioInput ] > {
1065+ self . multimodal . audios ( )
1066+ }
1067+
1068+ pub fn audio_hashes ( & self ) -> Option < & [ u64 ] > {
1069+ self . multimodal . audio_hashes ( )
1070+ }
1071+
1072+ pub fn has_audios ( & self ) -> bool {
1073+ self . multimodal . has_audios ( )
1074+ }
1075+
1076+ pub fn keep_num_audios ( & mut self , audios_to_keep : usize ) {
1077+ self . multimodal . keep_num_audios ( audios_to_keep)
1078+ }
1079+
9701080 /// Keep these last n images
9711081 pub fn keep_num_images ( & mut self , images_to_keep : usize ) {
9721082 self . multimodal . keep_num_images ( images_to_keep)
0 commit comments