@@ -80,9 +80,12 @@ typedef enum ps_vad_class_e {
8080 * misclassify non-speech as speech.
8181 * @param sample_rate Sampling rate of input, or 0 for default (which can
8282 * be obtained with ps_vad_sample_rate()). Only 8000,
83- * 16000, 32000, 48000 currently supported.
83+ * 16000, 32000, 48000 are directly supported. See
84+ * ps_vad_set_input_params() for more information.
8485 * @param frame_length Frame length in seconds, or 0.0 for the default. Only
85- * 0.01, 0.02, 0.03 currently supported.
86+ * 0.01, 0.02, 0.03 currently supported. **Actual** value
87+ * may differ; you must use ps_vad_frame_length() to
88+ * obtain it.
8689 * @return VAD object or NULL on failure (invalid parameter for instance).
8790 */
8891POCKETSPHINX_EXPORT
@@ -111,9 +114,16 @@ int ps_vad_free(ps_vad_t *vad);
111114 *
112115 * @param sample_rate Sampling rate of input, or 0 for default (which can
113116 * be obtained with ps_vad_sample_rate()). Only 8000,
114- * 16000, 32000, 48000 currently supported.
115- * @param frame_length Frame length in seconds, or 0.0 for the default. Only
116- * 0.01, 0.02, 0.03 currently supported.
117+ * 16000, 32000, 48000 are directly supported, others
118+ * will use the closest supported rate (within reason).
119+ * Note that this means that the actual frame length
120+ * may not be exactly the one requested, so you must
121+ * always use the one returned by ps_vad_frame_size()
122+ * (in samples) or ps_vad_frame_length() (in seconds).
123+ * @param frame_length Requested frame length in seconds, or 0.0 for the
124+ * default. Only 0.01, 0.02, 0.03 currently supported.
125+ * **Actual frame length may be different; you must
126+ * always use ps_vad_frame_length() to obtain it.**
117127 * @return 0 for success or -1 on error.
118128 */
119129POCKETSPHINX_EXPORT
@@ -131,27 +141,29 @@ int ps_vad_sample_rate(ps_vad_t *vad);
131141/**
132142 * Get the number of samples expected by voice activity detection.
133143 *
144+ * You **must** always ensure that the buffers passed to
145+ * ps_vad_classify() contain this number of samples (zero-pad them if
146+ * necessary).
147+ *
134148 * @param vad Voice activity detector.
135149 * @return Size, in samples, of the frames passed to ps_vad_classify().
136150 */
137151POCKETSPHINX_EXPORT
138152size_t ps_vad_frame_size (ps_vad_t * vad );
139153
140154/**
141- * Get the number of seconds of audio in each frame .
155+ * Get the *actual* length of a frame in seconds.
142156 *
143- * @param vad Voice activity detector.
144- * @return Length, in seconds, of the frames passed to ps_vad_classify().
157+ * This may differ from the value requested in ps_vad_set_input_params().
145158 */
146- POCKETSPHINX_EXPORT
147- float ps_vad_frame_length (ps_vad_t * vad );
159+ #define ps_vad_frame_length (vad ) ((float)ps_vad_frame_size(vad) / ps_vad_sample_rate(vad))
148160
149161/**
150162 * Classify a frame as speech or not speech.
151163 *
152164 * @param vad Voice activity detector.
153- * @param frame Frame of input, must contain the number of samples
154- * returned by ps_vad_frame_size().
165+ * @param frame Frame of input, **must** contain the number of
166+ * samples returned by ps_vad_frame_size().
155167 * @return PS_VAD_SPEECH, PS_VAD_NOT_SPEECH, or PS_VAD_ERROR (see
156168 * ps_vad_class_t).
157169 */
0 commit comments