Skip to content

Commit 70d848d

Browse files
committed
feat: accept approximate rates
1 parent 93d02d5 commit 70d848d

File tree

2 files changed

+55
-28
lines changed

2 files changed

+55
-28
lines changed

include/pocketsphinx/ps_vad.h

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -80,9 +80,12 @@ typedef enum ps_vad_class_e {
8080
* misclassify non-speech as speech.
8181
* @param sample_rate Sampling rate of input, or 0 for default (which can
8282
* be obtained with ps_vad_sample_rate()). Only 8000,
83-
* 16000, 32000, 48000 currently supported.
83+
* 16000, 32000, 48000 are directly supported. See
84+
* ps_vad_set_input_params() for more information.
8485
* @param frame_length Frame length in seconds, or 0.0 for the default. Only
85-
* 0.01, 0.02, 0.03 currently supported.
86+
* 0.01, 0.02, 0.03 currently supported. **Actual** value
87+
* may differ; you must use ps_vad_frame_length() to
88+
* obtain it.
8689
* @return VAD object or NULL on failure (invalid parameter for instance).
8790
*/
8891
POCKETSPHINX_EXPORT
@@ -111,9 +114,16 @@ int ps_vad_free(ps_vad_t *vad);
111114
*
112115
* @param sample_rate Sampling rate of input, or 0 for default (which can
113116
* be obtained with ps_vad_sample_rate()). Only 8000,
114-
* 16000, 32000, 48000 currently supported.
115-
* @param frame_length Frame length in seconds, or 0.0 for the default. Only
116-
* 0.01, 0.02, 0.03 currently supported.
117+
* 16000, 32000, 48000 are directly supported; others
118+
* will use the closest supported rate (within reason).
119+
* Note that this means that the actual frame length
120+
* may not be exactly the one requested, so you must
121+
* always use the one returned by ps_vad_frame_size()
122+
* (in samples) or ps_vad_frame_length() (in seconds).
123+
* @param frame_length Requested frame length in seconds, or 0.0 for the
124+
* default. Only 0.01, 0.02, 0.03 currently supported.
125+
* **Actual frame length may be different; you must
126+
* always use ps_vad_frame_length() to obtain it.**
117127
* @return 0 for success or -1 on error.
118128
*/
119129
POCKETSPHINX_EXPORT
@@ -131,27 +141,29 @@ int ps_vad_sample_rate(ps_vad_t *vad);
131141
/**
132142
* Get the number of samples expected by voice activity detection.
133143
*
144+
* You **must** always ensure that the buffers passed to
145+
* ps_vad_classify() contain this number of samples (zero-pad them if
146+
* necessary).
147+
*
134148
* @param vad Voice activity detector.
135149
* @return Size, in samples, of the frames passed to ps_vad_classify().
136150
*/
137151
POCKETSPHINX_EXPORT
138152
size_t ps_vad_frame_size(ps_vad_t *vad);
139153

140154
/**
141-
* Get the number of seconds of audio in each frame.
155+
* Get the *actual* length of a frame in seconds.
142156
*
143-
* @param vad Voice activity detector.
144-
* @return Length, in seconds, of the frames passed to ps_vad_classify().
157+
* This may differ from the value requested in ps_vad_set_input_params().
145158
*/
146-
POCKETSPHINX_EXPORT
147-
float ps_vad_frame_length(ps_vad_t *vad);
159+
#define ps_vad_frame_length(vad) ((float)ps_vad_frame_size(vad) / ps_vad_sample_rate(vad))
148160

149161
/**
150162
* Classify a frame as speech or not speech.
151163
*
152164
* @param vad Voice activity detector.
153-
* @param frame Frame of input, must contain the number of samples
154-
* returned by ps_vad_frame_size().
165+
* @param frame Frame of input, **must** contain the number of
166+
* samples returned by ps_vad_frame_size().
155167
* @return PS_VAD_SPEECH, PS_VAD_NOT_SPEECH, or PS_VAD_ERROR (see
156168
* ps_vad_class_t).
157169
*/

src/ps_vad.c

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
*/
3030

3131
#include <stdlib.h>
32+
#include <math.h>
3233

3334
#include <sphinxbase/ckd_alloc.h>
3435
#include <sphinxbase/err.h>
@@ -41,8 +42,8 @@ struct ps_vad_s {
4142
VadInstT v;
4243
int refcount;
4344
int sample_rate;
45+
int closest_sample_rate;
4446
int frame_size;
45-
float frame_length;
4647
};
4748

4849
ps_vad_t *
@@ -81,25 +82,46 @@ ps_vad_free(ps_vad_t *vad)
8182
return 0;
8283
}
8384

85+
static const int sample_rates[] = {
86+
8000, 16000, 32000, 48000
87+
};
88+
static const int n_sample_rates = sizeof(sample_rates)/sizeof(sample_rates[0]);
89+
8490
int
8591
ps_vad_set_input_params(ps_vad_t *vad, int sample_rate, float frame_length)
8692
{
8793
size_t frame_size;
88-
int rv;
94+
int i, rv;
95+
int closest_sample_rate = 0;
96+
float best_diff = 0.5;
8997

9098
if (sample_rate == 0)
9199
sample_rate = PS_VAD_DEFAULT_SAMPLE_RATE;
92100
if (frame_length == 0)
93101
frame_length = PS_VAD_DEFAULT_FRAME_LENGTH;
94-
frame_size = (size_t)(sample_rate * frame_length);
95-
if ((rv = WebRtcVad_ValidRateAndFrameLength(sample_rate, frame_size)) < 0) {
96-
E_ERROR("Invalid sampling rate %d or frame length %f\n",
97-
sample_rate, frame_length);
102+
for (i = 0; i < n_sample_rates; ++i) {
103+
float diff = fabs(1.0 - (float)sample_rates[i] / sample_rate);
104+
if (diff < best_diff) {
105+
closest_sample_rate = sample_rates[i];
106+
best_diff = diff;
107+
}
108+
}
109+
if (closest_sample_rate == 0) {
110+
E_ERROR("No suitable sampling rate found for %d\n", sample_rate);
111+
return -1;
112+
}
113+
frame_size = (size_t)(closest_sample_rate * frame_length);
114+
if (closest_sample_rate != sample_rate) {
115+
E_INFO("Closest supported sampling rate to %d is %d, frame size %d (%.3fs)\n",
116+
sample_rate, closest_sample_rate, frame_size, (float)frame_size / sample_rate);
117+
}
118+
if ((rv = WebRtcVad_ValidRateAndFrameLength(closest_sample_rate, frame_size)) < 0) {
119+
E_WARN("Unsupported frame length %f\n", frame_length);
98120
return rv;
99121
}
100122
vad->sample_rate = sample_rate;
123+
vad->closest_sample_rate = closest_sample_rate;
101124
vad->frame_size = frame_size;
102-
vad->frame_length = frame_length;
103125
return rv;
104126
}
105127

@@ -119,17 +141,10 @@ ps_vad_frame_size(ps_vad_t *vad)
119141
return vad->frame_size;
120142
}
121143

122-
float
123-
ps_vad_frame_length(ps_vad_t *vad)
124-
{
125-
if (vad == NULL)
126-
return 0.0;
127-
return vad->frame_length;
128-
}
129-
130144
ps_vad_class_t
131145
ps_vad_classify(ps_vad_t *vad, const short *frame)
132146
{
133-
return WebRtcVad_Process((VadInst *)vad, vad->sample_rate,
147+
return WebRtcVad_Process((VadInst *)vad,
148+
vad->closest_sample_rate,
134149
frame, vad->frame_size);
135150
}

0 commit comments

Comments
 (0)