165 #pragma warning (disable: 4305)
175 #define _ABS(x) ((x) >= 0 ? (x) : -(x))
181 #define CONT_AD_ADFRMSIZE 256
183 #define CONT_AD_POWHISTSIZE 98
186 #define CONT_AD_CALIB_FRAMES (CONT_AD_POWHISTSIZE * 2)
188 #define CONT_AD_THRESH_UPDATE 100
191 #define CONT_AD_ADAPT_RATE 0.2
193 #define CONT_AD_SPS 16000
195 #define CONT_AD_DEFAULT_NOISE 30
196 #define CONT_AD_DELTA_SIL 10
197 #define CONT_AD_DELTA_SPEECH 17
198 #define CONT_AD_MIN_NOISE 2
199 #define CONT_AD_MAX_NOISE 70
201 #define CONT_AD_HIST_INERTIA 3
203 #define CONT_AD_WINSIZE 21
206 #define CONT_AD_SPEECH_ONSET 9
213 #define CONT_AD_SIL_ONSET 18
220 #define CONT_AD_LEADER 5
224 #define CONT_AD_TRAILER 10
235 fprintf(fp,
"PowHist:\n");
236 for (i = 0, j = 0; i < CONT_AD_POWHISTSIZE; i++) {
238 fprintf(fp,
"\t%3d %6d\n", i, r->
pow_hist[i]);
243 fprintf(fp,
"PH[%7.2f]:",
245 for (i = 0; i <= j; i++)
246 fprintf(fp,
" %2d", r->
pow_hist[i]);
258 cont_ad_frame_pow(int16 * buf, int32 * prev, int32 spf)
266 for (i = 0; i < spf; i++) {
268 v = (double) (buf[i] - p);
298 i = (int32) ((10.0 * (log10(sumsq) - log10((
double) spf))) + 0.5);
312 compute_frame_pow(
cont_ad_t * r, int32 frm)
335 for (i = 0; i < CONT_AD_POWHISTSIZE; i++)
347 int32 old_noise_level, old_thresh_sil, old_thresh_speech;
358 (i < CONT_AD_POWHISTSIZE) && (r->
pow_hist[i] == 0); i++);
367 for (j = i, th = i; (j < CONT_AD_POWHISTSIZE) && (j < i + 20); j++) {
368 if (max < r->pow_hist[j]) {
389 "%7.2fs %8df: NoisePeak: %d, Noiselevel: %d -> %d, Th-Sil: %d -> %d, Th-Sp: %d -> %d\n",
392 old_thresh_sil, r->
thresh_sil, old_thresh_speech,
413 sil2speech_transition(
cont_ad_t *r,
int frm)
418 seg = malloc(
sizeof(*seg));
421 if (seg->startfrm < 0)
422 seg->startfrm += CONT_AD_ADFRMSIZE;
438 n = frm - seg->startfrm;
440 n += CONT_AD_ADFRMSIZE;
444 "%7.2fs %8d[%3d]f: Sil -> Sp detect; seg start: %7.2fs %8d\n",
449 (
double) (n * r->
spf) / (
double) (r->
sps), n);
464 speech2sil_transition(
cont_ad_t *r,
int frm)
478 if (n >= CONT_AD_ADFRMSIZE)
479 n -= CONT_AD_ADFRMSIZE;
482 n += CONT_AD_ADFRMSIZE;
486 "%7.2fs %8d[%3d]f: Sp -> Sil detect; seg end: %7.2fs %8d\n",
489 (
double) (n * r->
spf) / (
double) (r->
sps), n);
508 if (f >= CONT_AD_ADFRMSIZE)
521 boundary_detect(
cont_ad_t * r, int32 frm)
537 "%7.2fs %8d[%3d]f: P: %2d, N: %2d, T+: %2d, T-: %2d, #O: %2d, %s\n",
541 (r->
tail_state == CONT_AD_STATE_SIL) ?
"--" :
"Sp");
551 sil2speech_transition(r, frm);
556 speech2sil_transition(r, frm);
592 max_siglvl(
cont_ad_t * r, int32 startfrm, int32 nfrm)
598 for (i = 0, f = startfrm; i < nfrm; i++, f++) {
599 if (f >= CONT_AD_ADFRMSIZE)
600 f -= CONT_AD_ADFRMSIZE;
615 get_audio_data(
cont_ad_t * r, int16 * buf, int32 max)
622 cont_ad_read_log(
cont_ad_t * r, int32 retval)
626 fprintf(r->
logfp,
"return from cont_ad_read() -> %d:\n", retval);
628 fprintf(r->
logfp,
"\tread_ts: %d (%.2fs)\n",
630 fprintf(r->
logfp,
"\tseglen: %d (%.2fs)\n",
645 fprintf(r->
logfp,
"\tspseg:");
646 for (seg = r->
spseg_head; seg; seg = seg->next)
647 fprintf(r->
logfp,
" %d[%d]", seg->startfrm, seg->nfrm);
648 fprintf(r->
logfp,
"\n");
663 buf_copy(
cont_ad_t * r, int32 sf, int32 nf, int16 * buf)
667 assert((sf >= 0) && (sf < CONT_AD_ADFRMSIZE));
670 if (sf + nf > CONT_AD_ADFRMSIZE) {
672 f = CONT_AD_ADFRMSIZE - sf;
674 memcpy(buf, r->
adbuf + (sf * r->
spf), l *
sizeof(int16));
678 "return %d speech frames [%d..%d]; %d samples\n",
679 f, sf, sf + f - 1, l);
689 memcpy(buf, r->
adbuf + (sf * r->
spf), l *
sizeof(int16));
693 "return %d speech frames [%d..%d]; %d samples\n",
694 nf, sf, sf + nf - 1, l);
698 if ((sf + nf) >= CONT_AD_ADFRMSIZE) {
699 assert((sf + nf) == CONT_AD_ADFRMSIZE);
716 cont_ad_read_internal(
cont_ad_t *r, int16 *buf, int32 max)
718 int32 head, tail, len, l;
728 assert((len >= 0) && (len < r->spf));
730 if ((tail < r->adbufsize) && (!r->
eof)) {
733 (*(r->adfunc)) (r->
ad, r->
adbuf + tail,
748 memcpy(r->
adbuf + tail, buf, l *
sizeof(int16));
751 if ((l > 0) && r->
rawfp) {
752 fwrite(r->
adbuf + tail,
sizeof(int16), l, r->
rawfp);
765 (*(r->adfunc)) (r->
ad,
766 r->
adbuf + tail, head - tail)) < 0) {
775 memcpy(r->
adbuf + tail, buf, l *
sizeof(int16));
777 if ((l > 0) && r->
rawfp) {
778 fwrite(r->
adbuf + tail,
sizeof(int16), l, r->
rawfp);
795 cont_ad_classify(
cont_ad_t *r, int32 len)
800 if (tailfrm >= CONT_AD_ADFRMSIZE)
801 tailfrm -= CONT_AD_ADFRMSIZE;
803 for (; len >= r->
spf; len -= r->
spf) {
804 compute_frame_pow(r, tailfrm);
812 boundary_detect(r, tailfrm);
814 if (++tailfrm >= CONT_AD_ADFRMSIZE)
836 if (f >= CONT_AD_ADFRMSIZE)
846 if (f >= CONT_AD_ADFRMSIZE)
865 int32 flen, len, retval, newstate;
868 if ((r == NULL) || (buf == NULL))
873 (
"cont_ad_read requires buffer of at least %d samples\n",
879 fprintf(r->
logfp,
"cont_ad_read(,, %d)\n", max);
884 len = cont_ad_read_internal(r, buf, max);
887 cont_ad_classify(r, len);
923 if ((seg == NULL) || (r->
headfrm != seg->startfrm)) {
938 flen = seg->startfrm - r->
headfrm;
940 flen += CONT_AD_ADFRMSIZE;
945 int32 f = max / r->
spf;
950 newstate = CONT_AD_STATE_SIL;
954 if (flen > seg->nfrm)
957 newstate = CONT_AD_STATE_SPEECH;
964 if ((newstate == CONT_AD_STATE_SIL) && (!r->
rawmode)) {
967 if (r->
headfrm >= CONT_AD_ADFRMSIZE)
968 r->
headfrm -= CONT_AD_ADFRMSIZE;
984 if (r->
state == newstate)
990 if (newstate == CONT_AD_STATE_SPEECH) {
992 assert(seg->startfrm >= 0);
997 && (seg->next || (r->
tail_state == CONT_AD_STATE_SIL))) {
999 if (seg->next == NULL)
1012 cont_ad_read_log(r, retval);
1024 int32 i, s, k, len, tailfrm;
1030 for (i = 0; i < CONT_AD_POWHISTSIZE; i++)
1033 if (tailfrm >= CONT_AD_ADFRMSIZE)
1034 tailfrm -= CONT_AD_ADFRMSIZE;
1035 s = (tailfrm * r->
spf);
1043 if ((k = (*(r->adfunc)) (r->
ad, r->
adbuf + s, len)) < 0)
1050 compute_frame_pow(r, tailfrm);
1054 return find_thresh(r);
1060 return r->
spf * CONT_AD_CALIB_FRAMES;
1066 int32 i, s, len, tailfrm;
1073 for (i = 0; i < CONT_AD_POWHISTSIZE; i++)
1078 if (tailfrm >= CONT_AD_ADFRMSIZE)
1079 tailfrm -= CONT_AD_ADFRMSIZE;
1080 s = (tailfrm * r->
spf);
1087 memcpy(r->
adbuf + s, buf, len *
sizeof(int16));
1090 compute_frame_pow(r, tailfrm);
1094 return find_thresh(r);
1105 if ((sil < 0) || (speech < 0)) {
1107 "cont_ad_set_thresh: invalid threshold arguments: %d, %d\n",
1127 int32 delta_speech, int32 min_noise,
1128 int32 max_noise, int32 winsize,
1129 int32 speech_onset, int32 sil_onset, int32 leader,
1130 int32 trailer, float32 adapt_rate)
1132 if ((delta_sil < 0) || (delta_speech < 0) || (min_noise < 0)
1133 || (max_noise < 0)) {
1134 E_ERROR(
"threshold arguments: "
1135 "%d, %d, %d, %d must all be >=0\n", delta_sil,
1136 delta_speech, min_noise, max_noise);
1140 if ((speech_onset > winsize) || (speech_onset <= 0)
1141 || (winsize <= 0)) {
1143 (
"speech_onset, %d, must be <= winsize, %d, and both >0\n",
1144 speech_onset, winsize);
1148 if ((sil_onset > winsize) || (sil_onset <= 0) || (winsize <= 0)) {
1150 (
"sil_onset, %d, must be <= winsize, %d, and both >0\n",
1151 sil_onset, winsize);
1155 if (((leader + trailer) > winsize) || (leader <= 0)
1156 || (trailer <= 0)) {
1158 (
"leader, %d, plus trailer, %d, must be <= winsize, %d, and both >0\n",
1159 leader, trailer, winsize);
1163 if ((adapt_rate < 0.0) || (adapt_rate > 1.0)) {
1164 E_ERROR(
"adapt_rate, %e; must be in range 0..1\n", adapt_rate);
1200 int32 * delta_speech, int32 * min_noise,
1201 int32 * max_noise, int32 * winsize,
1202 int32 * speech_onset, int32 * sil_onset,
1203 int32 * leader, int32 * trailer, float32 * adapt_rate)
1205 if (!delta_sil || !delta_speech || !min_noise || !max_noise
1206 || !winsize || !speech_onset || !sil_onset || !leader
1207 || !trailer || !adapt_rate) {
1208 fprintf(stderr,
"cont_ad_get_params: some param slots are NULL\n");
1294 int32(*func) (
ad_rec_t *, int16 *, int32))
1323 if (f >= CONT_AD_ADFRMSIZE)
1327 else if (r->
tail_state == CONT_AD_STATE_SPEECH) {
1333 if (f >= CONT_AD_ADFRMSIZE)
1378 if ((r = malloc(
sizeof(*r))) == NULL) {
1391 r->
sps = CONT_AD_SPS;
1394 r->
spf = (r->
sps * 256) / CONT_AD_SPS;
1403 calloc(CONT_AD_POWHISTSIZE,
sizeof(*r->
pow_hist))) == NULL) {
1410 calloc(CONT_AD_ADFRMSIZE,
sizeof(*r->
frm_pow))) == NULL) {
1418 r->
state = CONT_AD_STATE_SIL;
1434 r->
leader = CONT_AD_LEADER;
1460 int32(*func) (
ad_rec_t *, int16 *, int32))
SPHINXBASE_EXPORT cont_ad_t * cont_ad_init_rawmode(ad_rec_t *ad, int32(*adfunc)(ad_rec_t *ad, int16 *buf, int32 max))
Initializes a continuous listening object which simply passes data through (!)
spseg_t * spseg_tail
Last of unconsumed speech segments.
int32 tot_frm
Total number of frames of A/D data read, including consumed ones.
int32 max_noise
noise higher than this signals an error
int32 sps
Samples/sec; moved from ad->sps to break dependence on ad by N.
FILE * rawfp
If non-NULL, raw audio input data processed by cont_ad is dumped to this file.
SPHINXBASE_EXPORT int32 cont_ad_reset(cont_ad_t *cont)
Reset, discarding any accumulated speech segments.
int16 * adbuf
Circular buffer for maintaining A/D data read until consumed.
SPHINXBASE_EXPORT int32 cont_ad_read(cont_ad_t *r, int16 *buf, int32 max)
Read raw audio data into the silence filter.
int32 state
State of data returned by most recent cont_ad_read call; CONT_AD_STATE_SIL or CONT_AD_STATE_SPEECH.
SPHINXBASE_EXPORT int32 cont_ad_set_logfp(cont_ad_t *c, FILE *fp)
Set the file to which cont_ad logs its progress.
int32 * pow_hist
Histogram of frame power, moving window, decayed.
int32 eof
Whether the source ad device has encountered EOF.
Continuous A/D listening and silence filtering module.
int32 leader
Pad beginning of speech with this many extra frames.
(FOR INTERNAL USE ) Data structure for maintaining speech (non-silence) segments not yet consumed by ...
FILE * logfp
If non-NULL, write detailed logs of this object's progress to the file.
Basic type definitions used in Sphinx.
char * frm_pow
Frame power.
int32 speech_onset
start speech on >= these many frames out of winsize, of >= delta_speech
int32 headfrm
Frame number in adbuf with unconsumed A/D data.
SPHINXBASE_EXPORT int32 cont_ad_close(cont_ad_t *cont)
Close the continuous listening object.
int32 auto_thresh
Do automatic threshold adjustment or not.
spseg_t * spseg_head
First of unconsumed speech segments.
int32 n_calib_frame
Number of frames of calibration data seen so far.
int32 win_startfrm
Where next analysis window begins.
int32 n_sample
Number of samples of unconsumed data in adbuf.
SPHINXBASE_EXPORT cont_ad_t * cont_ad_init(ad_rec_t *ad, int32(*adfunc)(ad_rec_t *ad, int16 *buf, int32 max))
Initialize a continuous listening/silence filtering object.
int32 win_validfrm
Number of frames currently available from win_startfrm for analysis.
int32 n_other
If in SILENCE state, number of frames in analysis window considered to be speech; otherwise number of...
SPHINXBASE_EXPORT void cont_ad_powhist_dump(FILE *fp, cont_ad_t *cont)
Dump the power histogram.
SPHINXBASE_EXPORT int32 cont_set_thresh(cont_ad_t *r, int32 silence, int32 speech)
Set the silence and speech thresholds.
int32 delta_sil
Max silence power/frame ABOVE noise level.
SPHINXBASE_EXPORT int32 cont_ad_calib(cont_ad_t *cont)
Calibrate the silence filter.
Implementation of logging routines.
SPHINXBASE_EXPORT int32 cont_ad_detach(cont_ad_t *c)
Detach the given continuous listening module from the associated audio device.
SPHINXBASE_EXPORT int32 cont_ad_calib_size(cont_ad_t *r)
Get the number of samples required to calibrate the silence filter.
generic live audio interface for recording and playback
int32 tail_state
State at the end of its internal buffer (internal use): CONT_AD_STATE_SIL or CONT_AD_STATE_SPEECH.
int32 min_noise
noise lower than this we ignore
SPHINXBASE_EXPORT int32 cont_ad_attach(cont_ad_t *c, ad_rec_t *a, int32(*func)(ad_rec_t *, int16 *, int32))
Attach the continuous listening module to the given audio device/function.
SPHINXBASE_EXPORT int32 cont_ad_set_rawfp(cont_ad_t *c, FILE *fp)
Set a file for dumping raw audio input.
SPHINXBASE_EXPORT int32 cont_ad_get_params(cont_ad_t *r, int32 *delta_sil, int32 *delta_speech, int32 *min_noise, int32 *max_noise, int32 *winsize, int32 *speech_onset, int32 *sil_onset, int32 *leader, int32 *trailer, float32 *adapt_rate)
PWP 1/14/98 – get the changeable params.
int32 noise_level
PWP: what we claim as the "current" noise level.
Continuous listening module or object.
int32 n_frm
Number of complete frames of unconsumed A/D data in adbuf.
SPHINXBASE_EXPORT int32 cont_ad_buffer_space(cont_ad_t *r)
Get the maximum number of samples which can be passed into cont_ad_read().
float32 adapt_rate
Linear interpolation constant for rate at which noise level adapted to each estimate; range: 0-1; 0=>...
int32 delta_speech
Min speech power/frame ABOVE noise level.
#define E_ERROR
Print error message to standard error stream.
SPHINXBASE_EXPORT int32 cont_ad_calib_loop(cont_ad_t *r, int16 *buf, int32 max)
Calibrate the silence filter without an audio device.
int32 prev_sample
For pre-emphasis filter.
int32 spf
Samples/frame; audio level is analyzed within frames.
int32 read_ts
Absolute timestamp (total no.
ad_rec_t * ad
A/D device argument for adfunc.
#define E_ERROR_SYSTEM
Print error text, then call perror("").
int32 rawmode
Pass all input data through, without filtering silence.
int32 siglvl
Max signal level for the data consumed by the most recent cont_ad_read call (dB range: 0-99)...
int32 winsize
How many frames to look at for speech detection.
int32 sil_onset
end speech on >= these many frames out of winsize, of <= delta_sil
int32 thresh_update
Number of frames before next update to pow_hist/thresholds.
SPHINXBASE_EXPORT int32 cont_ad_set_thresh(cont_ad_t *cont, int32 sil, int32 sp)
Set silence and speech threshold parameters.
int32 trailer
Pad end of speech with this many extra frames.
int32 adbufsize
Buffer size (Number of samples)
int32 thresh_speech
Frame considered to be speech if power >= thresh_speech (for transitioning from SILENCE to SPEECH sta...
int32 thresh_sil
Frame considered to be silence if power <= thresh_sil (for transitioning from SPEECH to SILENCE state...
SPHINXBASE_EXPORT int32 cont_ad_set_params(cont_ad_t *r, int32 delta_sil, int32 delta_speech, int32 min_noise, int32 max_noise, int32 winsize, int32 speech_onset, int32 sil_onset, int32 leader, int32 trailer, float32 adapt_rate)
Set the changeable parameters.