feat: accept approximate rates

dhdaines · dhdaines · commit 70d848d92327 · 2022-08-21T12:27:49.000-04:00
diff --git a/include/pocketsphinx/ps_vad.h b/include/pocketsphinx/ps_vad.h
@@ -80,9 +80,12 @@ typedef enum ps_vad_class_e {
  *             misclassify non-speech as speech.
  * @param sample_rate Sampling rate of input, or 0 for default (which can
  *                    be obtained with ps_vad_sample_rate()).  Only 8000,
- *                    16000, 32000, 48000 currently supported.
+ *                    16000, 32000, 48000 are directly supported.  See
+ *                    ps_vad_set_input_params() for more information.
  * @param frame_length Frame length in seconds, or 0.0 for the default.  Only
- *                     0.01, 0.02, 0.03 currently supported.
+ *                     0.01, 0.02, 0.03 currently supported.  **Actual** value
+ *                     may differ; you must use ps_vad_frame_length() to
+ *                     obtain it.
  * @return VAD object or NULL on failure (invalid parameter for instance).
  */
 POCKETSPHINX_EXPORT
@@ -111,9 +114,16 @@ int ps_vad_free(ps_vad_t *vad);
  *
  * @param sample_rate Sampling rate of input, or 0 for default (which can
  *                    be obtained with ps_vad_sample_rate()).  Only 8000,
- *                    16000, 32000, 48000 currently supported.
- * @param frame_length Frame length in seconds, or 0.0 for the default.  Only
- *                     0.01, 0.02, 0.03 currently supported.
+ *                    16000, 32000, 48000 are directly supported; others
+ *                    will use the closest supported rate (within reason).
+ *                    Note that this means that the actual frame length
+ *                    may not be exactly the one requested, so you must
+ *                    always use the one returned by ps_vad_frame_size()
+ *                    (in samples) or ps_vad_frame_length() (in seconds).
+ * @param frame_length Requested frame length in seconds, or 0.0 for the
+ *                     default.  Only 0.01, 0.02, 0.03 currently supported.
+ *                     **Actual frame length may be different; you must
+ *                     always use ps_vad_frame_length() to obtain it.**
  * @return 0 for success or -1 on error.
  */
 POCKETSPHINX_EXPORT
@@ -131,27 +141,29 @@ int ps_vad_sample_rate(ps_vad_t *vad);
 /**
  * Get the number of samples expected by voice activity detection.
  *
+ * You **must** always ensure that the buffers passed to
+ * ps_vad_classify() contain this number of samples (zero-pad them if
+ * necessary).
+ *
  * @param vad Voice activity detector.
  * @return Size, in samples, of the frames passed to ps_vad_classify().
  */
 POCKETSPHINX_EXPORT
 size_t ps_vad_frame_size(ps_vad_t *vad);
 
 /**
- * Get the number of seconds of audio in each frame.
+ * Get the *actual* length of a frame in seconds.
  *
- * @param vad Voice activity detector.
- * @return Length, in seconds, of the frames passed to ps_vad_classify().
+ * This may differ from the value requested in ps_vad_set_input_params().
  */
-POCKETSPHINX_EXPORT
-float ps_vad_frame_length(ps_vad_t *vad);
+#define ps_vad_frame_length(vad) ((float)ps_vad_frame_size(vad) / ps_vad_sample_rate(vad))
 
 /**
  * Classify a frame as speech or not speech.
  *
  * @param vad Voice activity detector.
- * @param frame Frame of input, must contain the number of samples
- *              returned by ps_vad_frame_size().
+ * @param frame Frame of input, **must** contain the number of
+ *              samples returned by ps_vad_frame_size().
  * @return PS_VAD_SPEECH, PS_VAD_NOT_SPEECH, or PS_VAD_ERROR (see
  *         ps_vad_class_t).
  */
diff --git a/src/ps_vad.c b/src/ps_vad.c
@@ -29,6 +29,7 @@
  */
 
 #include <stdlib.h>
+#include <math.h>
 
 #include <sphinxbase/ckd_alloc.h>
 #include <sphinxbase/err.h>
@@ -41,8 +42,8 @@ struct ps_vad_s {
     VadInstT v;
     int refcount;
     int sample_rate;
+    int closest_sample_rate;
     int frame_size;
-    float frame_length;
 };
 
 ps_vad_t *
@@ -81,25 +82,46 @@ ps_vad_free(ps_vad_t *vad)
     return 0;
 }
 
+static const int sample_rates[] = {
+    8000, 16000, 32000, 48000
+};
+static const int n_sample_rates = sizeof(sample_rates)/sizeof(sample_rates[0]);
+
 int
 ps_vad_set_input_params(ps_vad_t *vad, int sample_rate, float frame_length)
 {
     size_t frame_size;
-    int rv;
+    int i, rv;
+    int closest_sample_rate = 0;
+    float best_diff = 0.5;
 
     if (sample_rate == 0)
         sample_rate = PS_VAD_DEFAULT_SAMPLE_RATE;
     if (frame_length == 0)
         frame_length = PS_VAD_DEFAULT_FRAME_LENGTH;
-    frame_size = (size_t)(sample_rate * frame_length);
-    if ((rv = WebRtcVad_ValidRateAndFrameLength(sample_rate, frame_size)) < 0) {
-        E_ERROR("Invalid sampling rate %d or frame length %f\n",
-                sample_rate, frame_length);
+    for (i = 0; i < n_sample_rates; ++i) {
+        float diff = fabs(1.0 - (float)sample_rates[i] / sample_rate);
+        if (diff < best_diff) {
+            closest_sample_rate = sample_rates[i];
+            best_diff = diff;
+        }
+    }
+    if (closest_sample_rate == 0) {
+        E_ERROR("No suitable sampling rate found for %d\n", sample_rate);
+        return -1;
+    }
+    frame_size = (size_t)(closest_sample_rate * frame_length);
+    if (closest_sample_rate != sample_rate) {
+        E_INFO("Closest supported sampling rate to %d is %d, frame size %d (%.3fs)\n",
+               sample_rate, closest_sample_rate, frame_size, (float)frame_size / sample_rate);
+    }
+    if ((rv = WebRtcVad_ValidRateAndFrameLength(closest_sample_rate, frame_size)) < 0) {
+        E_WARN("Unsupported frame length %f\n", frame_length);
         return rv;
     }
     vad->sample_rate = sample_rate;
+    vad->closest_sample_rate = closest_sample_rate;
     vad->frame_size = frame_size;
-    vad->frame_length = frame_length;
     return rv;
 }
 
@@ -119,17 +141,10 @@ ps_vad_frame_size(ps_vad_t *vad)
     return vad->frame_size;
 }
 
-float
-ps_vad_frame_length(ps_vad_t *vad)
-{
-    if (vad == NULL)
-        return 0.0;
-    return vad->frame_length;
-}
-
 ps_vad_class_t
 ps_vad_classify(ps_vad_t *vad, const short *frame)
 {
-    return WebRtcVad_Process((VadInst *)vad, vad->sample_rate,
+    return WebRtcVad_Process((VadInst *)vad,
+                             vad->closest_sample_rate,
                              frame, vad->frame_size);
 }