diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini index 8ca4e47..470eccf 100644 --- a/data/locale/en-US.ini +++ b/data/locale/en-US.ini @@ -1,19 +1,39 @@ CleanStreamAudioFilter="Clean stream audio filter" CleanStreamFilterPlugin="Clean stream filter plugin" +Language="Language" +None="None" +Silence="Silence" +Beep="Beep" +Random="Random" +Horn="Horn" +External="External" +WavFilesFilter="WAV files (*.wav);;All files (*.*)" +AdvancedSettings="Advanced Settings" +BeamSearch="Beam search" +Greedy="Greedy" +initial_prompt="Initial prompt" +speed_up="Speed up" +suppress_blank="Suppress blank" +suppress_nst="Suppress non-speech tokens" +temperature="Temperature" +max_initial_ts="Max initial timestamp" +length_penalty="Length penalty" +CPUOnly="CPU only" +GPUName="GPU: " detect_regex="Detect regex" advanced_settings="Advanced settings" -filler_p_threshold="Filler p threshold" -do_silence="Do silence" vad_enabled="VAD enabled" log_level="Log level" log_words="Log words" whisper_model="Whisper model" Whisper_Parameters="Whisper Parameters" +whisper_sampling_method_tooltip="Greedy: Fastest method. Picks the most likely word at each step. Good for speed.\nBeam Search: More accurate but slower. Explores multiple possibilities to find the best overall sentence." whisper_sampling_method="Whisper sampling method" n_threads="Number of threads" n_max_text_ctx="Number of max text context" no_context="No context" replace_sound_path="Replace Sound Path" +replace_sound_random_folder="Random Sounds Folder" replace_sound="Replace Sound" backend_group="Whisper Backend Configuration" backend_device="GPU device" diff --git a/data/locale/pt-BR.ini b/data/locale/pt-BR.ini new file mode 100644 index 0000000..9d2832c --- /dev/null +++ b/data/locale/pt-BR.ini @@ -0,0 +1,41 @@ +CleanStreamAudioFilter="Filtro de áudio CleanStream" +CleanStreamFilterPlugin="Plugin de filtro CleanStream" +Language="Idioma" +None="Nenhum" +Silence="Silêncio" +Beep="Beep" +Random="Aleatório" +Horn="Buzina" +External="Externo" +WavFilesFilter="Arquivos WAV (*.wav);;Todos os arquivos (*.*)" +AdvancedSettings="Configurações Avançadas" +BeamSearch="Busca em feixe" +Greedy="Busca Relaxada" +initial_prompt="Prompt inicial" +speed_up="Acelerar" +suppress_blank="Suprimir espaços em branco" +suppress_nst="Suprimir tokens não-falados" +temperature="Temperatura" +max_initial_ts="Timestamp inicial máx." +length_penalty="Penalidade de comprimento" +CPUOnly="Apenas CPU" +GPUName="GPU: " +detect_regex="Detectar com Regex" +advanced_settings="Configurações avançadas" +vad_enabled="VAD ativado" +log_level="Nível de log" +log_words="Registrar palavras" +whisper_model="Modelo Whisper" +Whisper_Parameters="Parâmetros do Whisper" +whisper_sampling_method_tooltip="Busca Relaxada: Método mais rápido. Escolhe a palavra mais provável a cada passo. Bom para velocidade.\nBusca em Feixe: Mais preciso, porém mais lento. Explora múltiplas possibilidades para encontrar a melhor frase." +whisper_sampling_method="Método de amostragem Whisper" +n_threads="Número de threads" +n_max_text_ctx="Número máximo de contexto de texto" +no_context="Sem contexto" +replace_sound_path="Caminho do Som de Substituição" +replace_sound_random_folder="Pasta de Sons Aleatórios" +replace_sound="Som de Substituição" +backend_group="Configuração do Backend Whisper" +backend_device="Dispositivo GPU" +enable_flash_attn="Ativar Flash Attention" +enable_flash_attn_tooltip="Melhora a velocidade de transcrição em algumas GPUs (NVidia: Ampere ou mais recente, AMD: RDNA ou mais recente). Pode diminuir a velocidade em outros casos" diff --git a/src/cleanstream-filter-data.h b/src/cleanstream-filter-data.h index 1910368..1737f3d 100644 --- a/src/cleanstream-filter-data.h +++ b/src/cleanstream-filter-data.h @@ -24,6 +24,7 @@ enum ReplaceSounds { REPLACE_SOUNDS_SILENCE = 2, REPLACE_SOUNDS_HORN = 3, REPLACE_SOUNDS_EXTERNAL = 4, + REPLACE_SOUNDS_RANDOM = 5, }; // Audio packet info @@ -90,7 +91,10 @@ struct cleanstream_data { bool log_words; bool active; long long replace_sound; + std::string current_random_audio; + std::vector random_audio_files; std::string replace_sound_external; + std::string replace_sound_random_folder; }; #endif diff --git a/src/cleanstream-filter.cpp b/src/cleanstream-filter.cpp index 1cf124b..eea1328 100644 --- a/src/cleanstream-filter.cpp +++ b/src/cleanstream-filter.cpp @@ -23,6 +23,7 @@ #include "whisper-utils/whisper-language.h" #include "whisper-utils/whisper-processing.h" #include "whisper-utils/whisper-utils.h" +#include "audio-utils/read-audio-file.h" #include "cleanstream-filter-data.h" #include "plugin-support.h" @@ -104,13 +105,38 @@ struct obs_audio_data *cleanstream_filter_audio(void *data, struct obs_audio_dat struct cleanstream_data *gf = static_cast(data); - if (!gf->active) { + bool muted = obs_source_muted(obs_filter_get_parent(gf->context)); + + if (!gf->active || muted) { + if (gf->whisper_context != nullptr) { + obs_log(LOG_INFO, "Source is muted or filter is inactive, shutting down whisper thread"); + shutdown_whisper_thread(gf); + + // Clear audio buffers to prevent leftover sound on unmute + std::lock_guard lock(gf->whisper_buf_mutex); + for (size_t c = 0; c < gf->channels; c++) { + deque_free(&gf->input_buffers[c]); + deque_init(&gf->input_buffers[c]); + } + deque_free(&gf->info_buffer); + deque_init(&gf->info_buffer); + gf->audioFilePointer = 0; + gf->current_result = DETECTION_RESULT_UNKNOWN; + gf->current_result_start_timestamp = 0; + gf->current_result_end_timestamp = 0; + } return audio; } if (gf->whisper_context == nullptr) { - // Whisper not initialized, just pass through - return audio; + // Whisper not initialized, try to start it + obs_log(LOG_INFO, "Whisper context is null, attempting to start whisper thread"); + obs_data_t *settings = obs_source_get_settings(gf->context); + update_whisper_model(gf, settings); + obs_data_release(settings); + // If it's still null, pass through audio + if (gf->whisper_context == nullptr) + return audio; } size_t input_buffer_size = 0; @@ -191,6 +217,7 @@ struct obs_audio_data *cleanstream_filter_audio(void *data, struct obs_audio_dat temporary_buffers[i].resize(num_frames, 0.0f); } } else if (gf->replace_sound == REPLACE_SOUNDS_HORN || + gf->replace_sound == REPLACE_SOUNDS_RANDOM || gf->replace_sound == REPLACE_SOUNDS_BEEP || gf->replace_sound == REPLACE_SOUNDS_EXTERNAL) { @@ -198,9 +225,17 @@ struct obs_audio_data *cleanstream_filter_audio(void *data, struct obs_audio_dat gf->replace_sound == REPLACE_SOUNDS_HORN ? "horn.wav" : gf->replace_sound == REPLACE_SOUNDS_BEEP ? "beep.wav" : gf->replace_sound == REPLACE_SOUNDS_EXTERNAL - ? gf->replace_sound_external + ? gf->replace_sound_external + : gf->replace_sound == REPLACE_SOUNDS_RANDOM + ? gf->current_random_audio : ""; + if (gf->replace_sound == REPLACE_SOUNDS_RANDOM && gf->audioFilePointer == 0 && !gf->random_audio_files.empty()) { + size_t random_index = rand() % gf->random_audio_files.size(); + gf->current_random_audio = gf->random_audio_files[random_index]; + replace_audio_name = gf->current_random_audio; + } + if (replace_audio_name != "") { // replace the audio with beep or horn sound const AudioDataFloat &replace_audio = @@ -288,6 +323,50 @@ void cleanstream_update(void *data, obs_data_t *s) gf->log_words = obs_data_get_bool(s, "log_words"); gf->delay_ms = BUFFER_SIZE_MSEC + INITIAL_DELAY_MSEC; gf->current_result = DetectionResult::DETECTION_RESULT_UNKNOWN; + +#if defined(_WIN32) || defined(__APPLE__) + // Load external sound file if configured + if (gf->replace_sound == REPLACE_SOUNDS_EXTERNAL) { + std::string replace_sound_path_ = + obs_data_get_string(s, "replace_sound_path"); + if (!replace_sound_path_.empty() && + gf->audioFileCache.find(replace_sound_path_) == + gf->audioFileCache.end()) { + AudioDataFloat audioFile = + read_audio_file(replace_sound_path_.c_str(), gf->sample_rate); + if (!audioFile.empty()) { + gf->audioFileCache[replace_sound_path_] = audioFile; + gf->replace_sound_external = replace_sound_path_; + } + } + } + + // Load random sound files if folder is configured + if (gf->replace_sound == REPLACE_SOUNDS_RANDOM) { + std::string random_folder_path = + obs_data_get_string(s, "replace_sound_random_folder"); + if (!random_folder_path.empty()) { + gf->replace_sound_random_folder = random_folder_path; + gf->random_audio_files.clear(); + for (const auto &entry : + std::filesystem::directory_iterator(random_folder_path)) { + if (entry.path().extension() == ".wav") { + std::string file_path = entry.path().string(); + gf->random_audio_files.push_back(file_path); + if (gf->audioFileCache.find(file_path) == + gf->audioFileCache.end()) { + AudioDataFloat audioFile = read_audio_file( + file_path.c_str(), gf->sample_rate); + if (!audioFile.empty()) { + gf->audioFileCache[file_path] = audioFile; + } + } + } + } + } + } +#endif + gf->current_result_start_timestamp = 0; gf->current_result_end_timestamp = 0; @@ -381,6 +460,7 @@ void *cleanstream_create(obs_data_t *settings, obs_source_t *filter) gf->detect_regex = nullptr; gf->replace_sound = REPLACE_SOUNDS_SILENCE; gf->replace_sound_external = ""; + gf->replace_sound_random_folder = ""; // get absolute path of the audio files char *module_data_sounds_folder_path = obs_module_file("sounds"); @@ -481,15 +561,14 @@ void add_whisper_backend_group_properties(obs_properties_t *ppts, struct cleanst obs_property_t *backend_device = obs_properties_add_list(backend_group, "backend_device", MT_("backend_device"), OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT); - - obs_property_list_add_int(backend_device, "CPU only", -1); + + obs_property_list_add_int(backend_device, MT_("CPUOnly"), -1); for (size_t i = 0; i < gf->gpu_devices.size(); i++) { auto name = gf->gpu_devices.at(i).device_name; auto description = gf->gpu_devices.at(i).device_description; obs_property_list_add_int( - backend_device, - std::string("GPU: ").append(name).append(" - ").append(description).c_str(), - i); + backend_device, std::string(MT_("GPUName")) + .append(name).append(" - ").append(description).c_str(), i); } obs_property_t *enable_flash_attn = obs_properties_add_bool( @@ -509,22 +588,31 @@ obs_properties_t *cleanstream_properties(void *data) obs_property_t *replace_sounds_list = obs_properties_add_list(ppts, "replace_sound", MT_("replace_sound"), OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT); - obs_property_list_add_int(replace_sounds_list, "None", REPLACE_SOUNDS_NONE); - obs_property_list_add_int(replace_sounds_list, "Silence", REPLACE_SOUNDS_SILENCE); + obs_property_list_add_int(replace_sounds_list, MT_("None"), REPLACE_SOUNDS_NONE); + obs_property_list_add_int(replace_sounds_list, MT_("Silence"), REPLACE_SOUNDS_SILENCE); // on windows and mac, add external file path for replace sound #if defined(_WIN32) || defined(__APPLE__) if (!gf->audioFileCache["beep.wav"].empty()) { - obs_property_list_add_int(replace_sounds_list, "Beep", REPLACE_SOUNDS_BEEP); + obs_property_list_add_int(replace_sounds_list, MT_("Beep"), REPLACE_SOUNDS_BEEP); } + if (!gf->audioFileCache["horn.wav"].empty()) { - obs_property_list_add_int(replace_sounds_list, "Horn", REPLACE_SOUNDS_HORN); + obs_property_list_add_int(replace_sounds_list, MT_("Random"), REPLACE_SOUNDS_RANDOM); + obs_property_list_add_int(replace_sounds_list, MT_("Horn"), REPLACE_SOUNDS_HORN); } - obs_property_list_add_int(replace_sounds_list, "External", REPLACE_SOUNDS_EXTERNAL); + + obs_property_list_add_int(replace_sounds_list, MT_("External"), REPLACE_SOUNDS_EXTERNAL); // add external file path for replace sound + obs_property_t *random_sound_path = nullptr; obs_property_t *replace_sound_path = obs_properties_add_path( ppts, "replace_sound_path", MT_("replace_sound_path"), OBS_PATH_FILE, - "WAV files (*.wav);;All files (*.*)", nullptr); + MT_("WavFilesFilter"), nullptr); + + // add folder path for random sounds + random_sound_path = obs_properties_add_path( + ppts, "replace_sound_random_folder", MT_("replace_sound_random_folder"), + OBS_PATH_DIRECTORY, nullptr, nullptr); // show/hide external file path based on the selected replace sound obs_property_set_modified_callback(replace_sounds_list, [](obs_properties_t *props, @@ -532,6 +620,9 @@ obs_properties_t *cleanstream_properties(void *data) obs_data_t *settings) { UNUSED_PARAMETER(property); const long long replace_sound = obs_data_get_int(settings, "replace_sound"); + obs_property_set_visible( + obs_properties_get(props, "replace_sound_random_folder"), + replace_sound == REPLACE_SOUNDS_RANDOM); obs_property_set_visible(obs_properties_get(props, "replace_sound_path"), replace_sound == REPLACE_SOUNDS_EXTERNAL); return true; @@ -566,6 +657,43 @@ obs_properties_t *cleanstream_properties(void *data) return true; }, gf); + + obs_property_set_modified_callback2( + random_sound_path, + [](void *data_, obs_properties_t *props, obs_property_t *property, + obs_data_t *settings) { + UNUSED_PARAMETER(property); + UNUSED_PARAMETER(props); + struct cleanstream_data *gf_ = + static_cast(data_); + gf_->random_audio_files.clear(); + std::string random_folder_path = + obs_data_get_string(settings, "replace_sound_random_folder"); + if (random_folder_path.empty()) { + return true; + } + gf_->replace_sound_random_folder = random_folder_path; + for (const auto &entry : + std::filesystem::directory_iterator(random_folder_path)) { + if (entry.path().extension() == ".wav") { + std::string file_path = entry.path().string(); + gf_->random_audio_files.push_back(file_path); + if (gf_->audioFileCache.find(file_path) == + gf_->audioFileCache.end()) { + AudioDataFloat audioFile = read_audio_file( + file_path.c_str(), gf_->sample_rate); + if (audioFile.empty()) { + obs_log(LOG_ERROR, "Failed to load audio file: %s", + file_path.c_str()); + } else { + gf_->audioFileCache[file_path] = audioFile; + } + } + } + } + return true; + }, + gf); #endif // Add a list of available whisper models to download @@ -583,7 +711,7 @@ obs_properties_t *cleanstream_properties(void *data) // Add language selector obs_property_t *whisper_language_select_list = - obs_properties_add_list(ppts, "whisper_language_select", "Language", + obs_properties_add_list(ppts, "whisper_language_select", MT_("Language"), OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING); // get a sorted list of available languages std::vector whisper_available_lang_keys; @@ -617,7 +745,7 @@ obs_properties_t *cleanstream_properties(void *data) add_whisper_backend_group_properties(ppts, gf); obs_properties_t *advanced_settings_group = obs_properties_create(); - obs_properties_add_group(ppts, "advanced_settings_group", MT_("Advanced_Settings"), + obs_properties_add_group(ppts, "advanced_settings_group", MT_("AdvancedSettings"), OBS_GROUP_NORMAL, advanced_settings_group); obs_properties_add_bool(advanced_settings_group, "vad_enabled", MT_("vad_enabled")); @@ -636,9 +764,10 @@ obs_properties_t *cleanstream_properties(void *data) obs_property_t *whisper_sampling_method_list = obs_properties_add_list( whisper_params_group, "whisper_sampling_method", MT_("whisper_sampling_method"), OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_INT); - obs_property_list_add_int(whisper_sampling_method_list, "Beam search", + obs_property_list_add_int(whisper_sampling_method_list, MT_("BeamSearch"), WHISPER_SAMPLING_BEAM_SEARCH); - obs_property_list_add_int(whisper_sampling_method_list, "Greedy", WHISPER_SAMPLING_GREEDY); + obs_property_set_long_description(whisper_sampling_method_list, MT_("whisper_sampling_method_tooltip")); + obs_property_list_add_int(whisper_sampling_method_list, MT_("Greedy"), WHISPER_SAMPLING_GREEDY); // int n_threads; obs_properties_add_int_slider(whisper_params_group, "n_threads", MT_("n_threads"), 1, 8, 1); diff --git a/src/whisper-utils/silero-vad-onnx.cpp b/src/whisper-utils/silero-vad-onnx.cpp index b3d4487..539c56e 100644 --- a/src/whisper-utils/silero-vad-onnx.cpp +++ b/src/whisper-utils/silero-vad-onnx.cpp @@ -107,11 +107,11 @@ void VadIterator::reset_states() current_speech = timestamp_t(); }; -void VadIterator::predict(const std::vector &data) +void VadIterator::predict(const float *data) { // Infer // Create ort tensors - input.assign(data.begin(), data.end()); + std::memcpy(input.data(), data, window_size_samples * sizeof(float)); Ort::Value input_ort = Ort::Value::CreateTensor(memory_info, input.data(), input.size(), input_node_dims, 2); Ort::Value sr_ort = Ort::Value::CreateTensor(memory_info, sr.data(), sr.size(), @@ -256,17 +256,17 @@ void VadIterator::predict(const std::vector &data) } }; -void VadIterator::process(const std::vector &input_wav) +void VadIterator::process(const float *input_wav, size_t num_samples) { reset_states(); - audio_length_samples = (int)input_wav.size(); + audio_length_samples = (int)num_samples; for (int j = 0; j < audio_length_samples; j += (int)window_size_samples) { if (j + (int)window_size_samples > audio_length_samples) break; - std::vector r{&input_wav[0] + j, &input_wav[0] + j + window_size_samples}; - predict(r); + const float *chunk = input_wav + j; + predict(chunk); } if (current_speech.start >= 0) { @@ -282,7 +282,7 @@ void VadIterator::process(const std::vector &input_wav) void VadIterator::process(const std::vector &input_wav, std::vector &output_wav) { - process(input_wav); + process(input_wav.data(), input_wav.size()); collect_chunks(input_wav, output_wav); } diff --git a/src/whisper-utils/silero-vad-onnx.h b/src/whisper-utils/silero-vad-onnx.h index 88a1e4f..c49b395 100644 --- a/src/whisper-utils/silero-vad-onnx.h +++ b/src/whisper-utils/silero-vad-onnx.h @@ -41,13 +41,11 @@ class VadIterator { Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeCPU); private: - void init_engine_threads(int inter_threads, int intra_threads); - void init_onnx_model(const SileroString &model_path); - void reset_states(); - void predict(const std::vector &data); + void init_engine_threads(int inter_threads, int intra_threads); void init_onnx_model(const SileroString &model_path); + void reset_states(); void predict(const float *data); public: - void process(const std::vector &input_wav); + void process(const float *input_wav, size_t num_samples); void process(const std::vector &input_wav, std::vector &output_wav); void collect_chunks(const std::vector &input_wav, std::vector &output_wav); const std::vector get_speech_timestamps() const; diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp index c11c4b4..dc1a3e9 100644 --- a/src/whisper-utils/whisper-processing.cpp +++ b/src/whisper-utils/whisper-processing.cpp @@ -274,9 +274,7 @@ long long process_audio_from_buffer(struct cleanstream_data *gf) bool skipped_inference = false; if (gf->vad_enabled && gf->vad != nullptr) { - std::vector vad_input(whisper_buffer_16khz[0], - whisper_buffer_16khz[0] + whisper_frames); - gf->vad->process(vad_input); + gf->vad->process(whisper_buffer_16khz[0], whisper_frames); std::vector stamps = gf->vad->get_speech_timestamps(); if (stamps.size() == 0) {