@@ -125,6 +125,8 @@ def __init__(self,
125125 on_wakeword_timeout = None ,
126126 on_wakeword_detection_start = None ,
127127 on_wakeword_detection_end = None ,
128+ on_recorded_chunk = None ,
129+ debug_mode = False
128130 ):
129131 """
130132 Initializes an audio recorder and transcription
@@ -246,6 +248,11 @@ def __init__(self,
246248 - on_wakeword_detection_end (callable, default=None): Callback
247249 function to be called when the system stops listening for
248250 wake words (e.g. because of timeout or wake word detected)
251+ - on_recorded_chunk (callable, default=None): Callback function to be
252+ called when a chunk of audio is recorded. The function is called
253+ with the recorded audio chunk as its argument.
254+ - debug_mode (bool, default=False): If set to True, the system will
255+ print additional debug information to the console.
249256
250257 Raises:
251258 Exception: Errors related to initializing transcription
@@ -278,6 +285,7 @@ def __init__(self,
278285 self .on_vad_detect_stop = on_vad_detect_stop
279286 self .on_wakeword_detection_start = on_wakeword_detection_start
280287 self .on_wakeword_detection_end = on_wakeword_detection_end
288+ self .on_recorded_chunk = on_recorded_chunk
281289 self .on_transcription_start = on_transcription_start
282290 self .enable_realtime_transcription = enable_realtime_transcription
283291 self .realtime_model_type = realtime_model_type
@@ -288,6 +296,7 @@ def __init__(self,
288296 self .on_realtime_transcription_stabilized = (
289297 on_realtime_transcription_stabilized
290298 )
299+ self .debug_mode = debug_mode
291300 self .allowed_latency_limit = ALLOWED_LATENCY_LIMIT
292301
293302 self .level = level
@@ -578,9 +587,6 @@ def _transcription_worker(conn,
578587 transcription = " " .join (seg .text for seg in segments )
579588 transcription = transcription .strip ()
580589 conn .send (('success' , transcription ))
581- except faster_whisper .WhisperError as e :
582- logging .error (f"Whisper transcription error: { e } " )
583- conn .send (('error' , str (e )))
584590 except Exception as e :
585591 logging .error (f"General transcription error: { e } " )
586592 conn .send (('error' , str (e )))
@@ -633,13 +639,14 @@ def _audio_data_worker(audio_queue,
633639
634640 try :
635641 audio_interface = pyaudio .PyAudio ()
636- stream = audio_interface .open (rate = sample_rate ,
637- format = pyaudio .paInt16 ,
638- channels = 1 ,
639- input = True ,
640- frames_per_buffer = buffer_size ,
641- input_device_index = input_device_index ,
642- )
642+ stream = audio_interface .open (
643+ rate = sample_rate ,
644+ format = pyaudio .paInt16 ,
645+ channels = 1 ,
646+ input = True ,
647+ frames_per_buffer = buffer_size ,
648+ input_device_index = input_device_index ,
649+ )
643650
644651 except Exception as e :
645652 logging .exception ("Error initializing pyaudio "
@@ -978,6 +985,8 @@ def _recording_worker(self):
978985 try :
979986
980987 data = self .audio_queue .get ()
988+ if self .on_recorded_chunk :
989+ self .on_recorded_chunk (data )
981990
982991 # Handle queue overflow
983992 queue_overflow_logged = False
@@ -1326,10 +1335,20 @@ def _is_webrtc_speech(self, data, all_frames_must_be_true=False):
13261335 if self .webrtc_vad_model .is_speech (frame , self .sample_rate ):
13271336 speech_frames += 1
13281337 if not all_frames_must_be_true :
1338+ if self .debug_mode :
1339+ print (f"Speech detected in frame { i + 1 } "
1340+ f" of { num_frames } " )
13291341 return True
13301342 if all_frames_must_be_true :
1343+ if self .debug_mode and speech_frames == num_frames :
1344+ print (f"Speech detected in { speech_frames } of "
1345+ f"{ num_frames } frames" )
1346+ elif self .debug_mode :
1347+ print (f"Speech not detected in all { num_frames } frames" )
13311348 return speech_frames == num_frames
13321349 else :
1350+ if self .debug_mode :
1351+ print (f"Speech not detected in any of { num_frames } frames" )
13331352 return False
13341353
13351354 def _check_voice_activity (self , data ):
0 commit comments