SphinxBase  0.6
cont_ad_base.c
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1999-2001 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 /*
38  * cont_ad.c -- Continuous A/D listening and silence filtering module.
39  *
40  * HISTORY
41  *
42  * $Log: cont_ad_base.c,v $
43  * Revision 1.14 2005/07/02 03:51:32 rkm
44  * Slowed down power histogram decay rate
45  *
46  * Revision 1.13 2005/06/30 00:27:17 rkm
47  * Fixed silence handling in rawmode; added extra state variables
48  *
49  *
50  * 28-Jun-2005 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University.
51  * - Changed rawmode handling to simply copy data even for silence
52  * segments.
53  * - Moved definitions of CONT_AD_STATE_{SIL,SPEECH} from .c to .h.
54  *
55  * Revision 1.12 2005/06/29 23:48:04 egouvea
56  * Revert changes: variables defined in cont_ad_base.c should not be accessible by the application
57  *
58  * Revision 1.10 2005/02/13 01:29:48 rkm
59  * Fixed cont_ad_read to never cross sil/speech boundary, and rawmode
60  *
61  * Revision 1.9 2005/02/01 22:21:19 rkm
62  * Added raw data logging, and raw data pass-through mode to cont_ad
63  *
64  * Revision 1.8 2004/07/23 23:36:34 egouvea
65  * Ravi's merge, with the latest fixes in the FSG code, and making the log files generated by FSG, LM, and allphone have the same 'look and feel', with the backtrace information presented consistently
66  *
67  * 23-Jul-2004 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
68  * Changed default adapt_rate from 0.5 to 0.2.
69  *
70  * Revision 1.7 2004/07/16 00:57:12 egouvea
71  * Added Ravi's implementation of FSG support.
72  *
73  * Revision 1.2 2004/06/23 20:31:18 rkm
74  * Added adapt_rate parameter; restructured frame processing to include threshold update
75  *
76  *
77  * 23-Oct-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
78  * Small change in the way the noiselevel is updated in find_thresh().
79  *
80  * 26-Aug-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
81  * Separated computation of "frame power" into a separate low-level
82  * function.
83  *
84  * 13-Jul-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
85  * Modified to allow frame size to depend on audio sampling rate.
86  *
87  * 01-Jul-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
88  * Changed CONT_AD_DELTA_SPEECH back to 20.
89  *
90  * 30-Jun-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
91  * Changed CONT_AD_DELTA_SPEECH from 10 to 15.
92  * Added FILE* argument to cont_ad_powhist_dump().
93  *
94  * 19-Jun-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
95  * Changed CONT_AD_DELTA_SPEECH from 20 to 10, to increase sensitivity
96  * to very short utterances.
97  *
98  * 16-Jan-98 Paul Placeway (pwp@cs.cmu.edu) at Carnegie Mellon University
99  * Changed to use dB instead of the weird power measure.
100  * Changed analysis window size, tuned default settings of most
101  * parameters to make the system less sensitive to noise, changed
102  * the histogram update frequency and decay to make the system
103  * adapt more rapidly to changes in the environment.
104  * Added cont_ad_set_params() and cont_ad_get_params().
105  *
106  * 28-Jul-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
107  * Added FRMPOW2SIGLVL, max_siglvl(), and cont_ad_t.siglvl.
108  * Changed min signal energy/frame to CONT_AD_SPF.
109  *
110  * 27-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
111  * Added the option for cont_ad_read to return -1 on EOF.
112  *
113  * 21-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
114  * Added cont_ad_set_thresh().
115  * Bugfix: n_other is recomputed after updating thresholds.
116  *
117  * 20-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
118  * Separated thresholds for speech and silence.
119  * Fixed bug in moving analysis window upon transition to speech state.
120  *
121  * 17-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
122  * Created, based loosely on Steve Reed's original implementation.
123  */
124 
125 /*
126  * This module is intended to be interposed as a filter between any raw A/D source and the
127  * application to remove silence regions. It is initialized with a raw A/D source function
128  * (during the cont_ad_init call). Filtered A/D data can be read by the application using
129  * the cont_ad_read function. This module assumes that the A/D source function supplies an
130  * endless stream of data. The application is responsible for setting up the A/D source,
131  * turning recording on and off as it desires. It is also responsible for invoking the
132  * cont_ad_read function frequently enough to avoid buffer overruns and dropping A/D data.
133  * This continuous listening module has an internal buffer of about 4 sec.
134  *
135  * This module must be initialized and calibrated at first (cont_ad_init and cont_ad_calib
136  * functions). Raw samples are grouped into frames, the signal power in each frame is
137  * computed and accumulated in a histogram. The module is always in one of two states:
138  * SILENCE or SPEECH. Transitions between the two states are detected by looking for a
139  * contiguous window of several frames that is predominantly of the other type. The type
140  * is determined by comparing frame power to either of two thresholds, thresh_sil and
141  * thresh_speech, as appropriate for the current state. These thresholds are set from the
142  * first peak in the low-end of the power histogram, and are updated every few seconds.
143  * Separate thresholds are used to provide some hysteresis.
144  *
145  * The module maintains a linked list of speech (non-silence) segments not yet read by the
146  * application. The cont_ad_read function returns speech data, if any available, by
147  * following this list. It also updates an "absolute" timestamp at the end of the
148  * cont_ad_read operation. The timestamp indicates the total #samples of A/D data read
149  * until this point, including data discarded as silence frames. The application is
150  * responsible for using this timestamp to make any policy decisions regarding utterance
151  * boundaries or whatever.
152  */
153 
154 #include <stdio.h>
155 #include <stdlib.h>
156 #include <string.h>
157 #include <assert.h>
158 #include <math.h>
159 
160 #ifdef HAVE_CONFIG_H
161 #include <config.h>
162 #endif
163 
164 #ifdef _MSC_VER
165 #pragma warning (disable: 4305)
166 #endif
167 
168 #include "sphinxbase/prim_type.h"
169 #include "sphinxbase/ad.h"
170 #include "sphinxbase/cont_ad.h"
171 #include "sphinxbase/err.h"
172 
173 
/* Absolute value of x. NOTE: the argument is evaluated twice. */
#ifndef _ABS
#define _ABS(x) ((x) >= 0 ? (x) : -(x))
#endif


/*
 * Various parameters, including defaults for many cont_ad_t member variables.
 */

/* #Frames of internal A/D buffer maintained */
#define CONT_AD_ADFRMSIZE       256

/*
 * #Powhist bins: ~ FRMPOW(65536^2*CONT_AD_SPF).
 * Maximum level is 96.3 dB full-scale; 97 for safety, plus 1 for zero-based.
 */
#define CONT_AD_POWHISTSIZE     98

/* #Frames consumed by one calibration pass */
#define CONT_AD_CALIB_FRAMES    (CONT_AD_POWHISTSIZE * 2)

/*
 * Update thresholds approx every so many frames.
 * PWP: update was 200 frames, or 3.2 seconds.  Now about every 1.6 sec.
 */
#define CONT_AD_THRESH_UPDATE   100

/* Interpolation of new and old noiselevel */
#define CONT_AD_ADAPT_RATE      0.2

/* NOTE(review): presumably the default sampling rate in samples/sec -- confirm */
#define CONT_AD_SPS             16000

#define CONT_AD_DEFAULT_NOISE   30      /* Default background noise power level */
#define CONT_AD_DELTA_SIL       10      /* Initial default for cont_ad_t.delta_sil */
#define CONT_AD_DELTA_SPEECH    17      /* Initial default for cont_ad_t.delta_speech */
#define CONT_AD_MIN_NOISE       2       /* Expected minimum background noise level */
#define CONT_AD_MAX_NOISE       70      /* Maximum background noise level */

/* Used in decaying the power histogram (right-shift amount) */
#define CONT_AD_HIST_INERTIA    3

/* Analysis window for state transitions (rkm had 16) */
#define CONT_AD_WINSIZE         21

/*
 * Min #speech frames in analysis window for SILENCE -> SPEECH state transition.
 * SReed had 100 ms == 6.25 fr contiguous; rkm had 9 (out of 16+10) with a
 * lower threshold.
 */
#define CONT_AD_SPEECH_ONSET    9

/*
 * Min #silence frames in analysis window for SPEECH -> SILENCE state
 * transition.  MUST BE <= CONT_AD_WINSIZE.
 * SReed had 400 ms == 25 fr contiguous; rkm had 14 out of 16.
 */
#define CONT_AD_SIL_ONSET       18

/*
 * On transition to SPEECH state, so many frames BEFORE window included in
 * speech data (>0).  SReed had 200 ms == 12.5 fr; rkm had 5.
 */
#define CONT_AD_LEADER          5

/*
 * On transition to SILENCE state, so many frames of silence included in
 * speech data (>0).  SReed had 100 ms == 6.25 fr; rkm had 10.
 * NOTE: Ensure (0 < TRAILER+LEADER <= WINSIZE).
 */
#define CONT_AD_TRAILER         10

/* Enforce the constraints documented above at compile time. */
#if CONT_AD_SIL_ONSET > CONT_AD_WINSIZE
#error CONT_AD_SIL_ONSET must not exceed CONT_AD_WINSIZE
#endif
#if (CONT_AD_LEADER <= 0) || (CONT_AD_TRAILER <= 0)
#error CONT_AD_LEADER and CONT_AD_TRAILER must both be positive
#endif
#if (CONT_AD_LEADER + CONT_AD_TRAILER) > CONT_AD_WINSIZE
#error CONT_AD_LEADER + CONT_AD_TRAILER must not exceed CONT_AD_WINSIZE
#endif
228 
229 
230 void
232 {
233  int32 i, j;
234 
235  fprintf(fp, "PowHist:\n");
236  for (i = 0, j = 0; i < CONT_AD_POWHISTSIZE; i++) {
237  if (r->pow_hist[i] > 0) {
238  fprintf(fp, "\t%3d %6d\n", i, r->pow_hist[i]);
239  j = i;
240  }
241  }
242 
243  fprintf(fp, "PH[%7.2f]:",
244  (double) (r->tot_frm * r->spf) / (double) (r->sps));
245  for (i = 0; i <= j; i++)
246  fprintf(fp, " %2d", r->pow_hist[i]);
247  fprintf(fp, "\n");
248 
249  fflush(fp);
250 }
251 
252 
253 /*
254  * Compute frame power. Interface deliberately kept low level to allow arbitrary
255  * users to call this function with appropriate data.
256  */
257 int32
258 cont_ad_frame_pow(int16 * buf, int32 * prev, int32 spf)
259 {
260  double sumsq, v;
261  int32 i;
262  int32 p;
263 
264  sumsq = 0.0;
265  p = *prev;
266  for (i = 0; i < spf; i++) {
267  /* Note: pre-emphasis done to remove low-frequency noise. */
268  v = (double) (buf[i] - p);
269  sumsq += v * v;
270  p = buf[i];
271  }
272  *prev = p;
273 
274  if (sumsq < spf) /* Make sure FRMPOW(sumsq) >= 0 */
275  sumsq = spf;
276 
277  /*
278  * PWP: Units changed to dB
279  *
280  * Now the units of measurement of an input sample are volts (really!),
281  * so the power in dB is p = 20*log10(samp). Further, we want the RMS
282  * (root-mean-squared) average power across the frame.
283  *
284  * "sumsq" is the sum of the sum of the squares, so we want
285  *
286  * i = 20 * log10( sqrt ( sumsq / n_samps) )
287  *
288  * (Stephen Reed's code actually had
289  * i = 20 * log10( sqrt (sumsq) / n_samps )
290  * but this only produced an additive error.)
291  *
292  * i = 20 * log10( sqrt ( sumsq / n_samps) )
293  * = 20 * log10( ( sumsq / n_samps) ^ 0.5 )
294  * = 20 * log10( ( sumsq / n_samps) ) * 0.5 )
295  * = 10 * log10( ( sumsq / n_samps) )
296  * = 10 * ( log10( sumsq) - log10(n_samps) )
297  */
298  i = (int32) ((10.0 * (log10(sumsq) - log10((double) spf))) + 0.5);
299  if (i < 0)
300  i = 0; /* trim lower bound again to be safe. */
301  assert(i < 97);
302 
303  return (i);
304 }
305 
306 
307 /*
308  * Classify frame (id=frm, starting at sample position s) as sil/nonsil. Classification
309  * done in isolation, independent of any other frame, based only on power histogram.
310  */
311 static void
312 compute_frame_pow(cont_ad_t * r, int32 frm)
313 {
314  int32 i;
315 
316  i = cont_ad_frame_pow(r->adbuf + (frm * r->spf), &(r->prev_sample),
317  r->spf);
318 
319  r->frm_pow[frm] = (char) i;
320  (r->pow_hist[i])++;
321  r->thresh_update--;
322 }
323 
324 
325 /* PWP: $$$ check this */
326 /*
327  * PWP: in SReed's code, decay was done by zeroing the histogram,
328  * i.e. no history.
329  */
330 static void
331 decay_hist(cont_ad_t * r)
332 {
333  int32 i;
334 
335  for (i = 0; i < CONT_AD_POWHISTSIZE; i++)
336  r->pow_hist[i] -= (r->pow_hist[i] >> CONT_AD_HIST_INERTIA);
337 }
338 
339 
340 /*
341  * Find silence threshold from power histogram.
342  */
343 static int32
344 find_thresh(cont_ad_t * r)
345 {
346  int32 i, j, max, th;
347  int32 old_noise_level, old_thresh_sil, old_thresh_speech;
348 
349  if (!r->auto_thresh)
350  return 0;
351 
352  /*
353  * Find smallest non-zero histogram entry, but starting at some minimum power.
354  * Power lower than CONT_AD_MIN_NOISE indicates bad A/D input (eg, mic off...).
355  * Too high a minimum power is also bad.
356  */
357  for (i = r->min_noise;
358  (i < CONT_AD_POWHISTSIZE) && (r->pow_hist[i] == 0); i++);
359  if (i > r->max_noise) /* Bad signal? */
360  return -1;
361 
362  /* PWP: Hmmmmm.... SReed's code looks over the lower 20 dB */
363  /* PWP: 1/14/98 Made to work like Stephen Reed's code */
364 
365  /* This method of detecting the noise level is VERY unsatisfactory */
366  max = 0;
367  for (j = i, th = i; (j < CONT_AD_POWHISTSIZE) && (j < i + 20); j++) { /* PWP: was i+6, which was 9 dB */
368  if (max < r->pow_hist[j]) {
369  max = r->pow_hist[j];
370  th = j;
371  }
372  }
373 
374  /* "Don't change the threshold too fast" */
375  old_noise_level = r->noise_level;
376  old_thresh_sil = r->thresh_sil;
377  old_thresh_speech = r->thresh_speech;
378  /* r->noise_level = (int32) (th * r->adapt_rate + r->noise_level * (1.0 - r->adapt_rate)); */
379  r->noise_level =
380  (int32) (r->noise_level +
381  r->adapt_rate * (th - r->noise_level) + 0.5);
382 
383  /* update thresholds */
384  r->thresh_sil = r->noise_level + r->delta_sil;
386 
387  if (r->logfp) {
388  fprintf(r->logfp,
389  "%7.2fs %8df: NoisePeak: %d, Noiselevel: %d -> %d, Th-Sil: %d -> %d, Th-Sp: %d -> %d\n",
390  (double) (r->tot_frm * r->spf) / (double) (r->sps),
391  r->tot_frm, th, old_noise_level, r->noise_level,
392  old_thresh_sil, r->thresh_sil, old_thresh_speech,
393  r->thresh_speech);
394 
396 
397  fflush(r->logfp);
398  }
399 
400  /*
401  * PWP: in SReed's original, he cleared the histogram here.
402  * I can't fathom why.
403  */
404 
405  return 0;
406 }
407 
408 
409 /*
410  * Silence to speech transition
411  */
412 static void
413 sil2speech_transition(cont_ad_t *r, int frm)
414 {
415  spseg_t *seg;
416 
417  /* Speech detected; create speech segment description */
418  seg = malloc(sizeof(*seg));
419 
420  seg->startfrm = r->win_startfrm - r->leader;
421  if (seg->startfrm < 0)
422  seg->startfrm += CONT_AD_ADFRMSIZE;
423  seg->nfrm = r->leader + r->winsize;
424  seg->next = NULL;
425 
426  if (!r->spseg_head)
427  r->spseg_head = seg;
428  else
429  r->spseg_tail->next = seg;
430  r->spseg_tail = seg;
431 
432  r->tail_state = CONT_AD_STATE_SPEECH;
433 
434  if (r->logfp) {
435  int32 n;
436 
437  /* Where (in absolute time) this speech segment starts */
438  n = frm - seg->startfrm;
439  if (n < 0)
440  n += CONT_AD_ADFRMSIZE;
441  n = r->tot_frm - n - 1;
442 
443  fprintf(r->logfp,
444  "%7.2fs %8d[%3d]f: Sil -> Sp detect; seg start: %7.2fs %8d\n",
445  (double) (r->tot_frm *
446  r->spf) /
447  (double) (r->sps),
448  r->tot_frm, frm,
449  (double) (n * r->spf) / (double) (r->sps), n);
450  }
451 
452  /* Now in SPEECH state; want to look for silence from end of this window */
453  r->win_validfrm = 1;
454  r->win_startfrm = frm;
455 
456  /* Count #sil frames remaining in reduced window (of 1 frame) */
457  r->n_other = (r->frm_pow[frm] <= r->thresh_sil) ? 1 : 0;
458 }
459 
460 /*
461  * Speech to silence transition
462  */
463 static void
464 speech2sil_transition(cont_ad_t *r, int frm)
465 {
466  int f;
467 
468  /* End of speech detected; speech->sil transition */
469  r->spseg_tail->nfrm += r->trailer;
470 
471  r->tail_state = CONT_AD_STATE_SIL;
472 
473  if (r->logfp) {
474  int32 n;
475 
476  /* Where (in absolute time) this speech segment ends */
477  n = r->spseg_tail->startfrm + r->spseg_tail->nfrm - 1;
478  if (n >= CONT_AD_ADFRMSIZE)
479  n -= CONT_AD_ADFRMSIZE;
480  n = frm - n;
481  if (n < 0)
482  n += CONT_AD_ADFRMSIZE;
483  n = r->tot_frm - n;
484 
485  fprintf(r->logfp,
486  "%7.2fs %8d[%3d]f: Sp -> Sil detect; seg end: %7.2fs %8d\n",
487  (double) (r->tot_frm * r->spf) /
488  (double) (r->sps), r->tot_frm, frm,
489  (double) (n * r->spf) / (double) (r->sps), n);
490  }
491 
492  /* Now in SILENCE state; start looking for speech trailer+leader frames later */
493  r->win_validfrm -= (r->trailer + r->leader - 1);
494  r->win_startfrm += (r->trailer + r->leader - 1);
495  if (r->win_startfrm >= CONT_AD_ADFRMSIZE)
496  r->win_startfrm -= CONT_AD_ADFRMSIZE;
497 
498  /* Count #speech frames remaining in reduced window */
499  r->n_other = 0;
500  for (f = r->win_startfrm;;) {
501  if (r->frm_pow[f] >= r->thresh_speech)
502  r->n_other++;
503 
504  if (f == frm)
505  break;
506 
507  f++;
508  if (f >= CONT_AD_ADFRMSIZE)
509  f = 0;
510  }
511 }
512 
513 
514 /*
515  * Main silence/speech region detection routine. If currently in
516  * SILENCE state, switch to SPEECH state if a window (r->winsize)
517  * of frames is mostly non-silence. If in SPEECH state, switch to
518  * SILENCE state if the window is mostly silence.
519  */
520 static void
521 boundary_detect(cont_ad_t * r, int32 frm)
522 {
523  assert(r->n_other >= 0);
524 
525  r->win_validfrm++;
526  if (r->tail_state == CONT_AD_STATE_SIL) {
527  if (r->frm_pow[frm] >= r->thresh_speech)
528  r->n_other++;
529  }
530  else {
531  if (r->frm_pow[frm] <= r->thresh_sil)
532  r->n_other++;
533  }
534 
535  if (r->logfp) {
536  fprintf(r->logfp,
537  "%7.2fs %8d[%3d]f: P: %2d, N: %2d, T+: %2d, T-: %2d, #O: %2d, %s\n",
538  (double) (r->tot_frm * r->spf) / (double) (r->sps),
539  r->tot_frm, frm, r->frm_pow[frm], r->noise_level,
540  r->thresh_speech, r->thresh_sil, r->n_other,
541  (r->tail_state == CONT_AD_STATE_SIL) ? "--" : "Sp");
542  }
543 
544  if (r->win_validfrm < r->winsize) /* Not reached full analysis window size */
545  return;
546  assert(r->win_validfrm == r->winsize);
547 
548  if (r->tail_state == CONT_AD_STATE_SIL) { /* Currently in SILENCE state */
549  if (r->n_frm >= r->winsize + r->leader
550  && r->n_other >= r->speech_onset) {
551  sil2speech_transition(r, frm);
552  }
553  }
554  else {
555  if (r->n_other >= r->sil_onset) {
556  speech2sil_transition(r, frm);
557  }
558  else {
559  /* In speech state, and staying there; add this frame to segment */
560  r->spseg_tail->nfrm++;
561  }
562  }
563 
564  /*
565  * Get rid of oldest frame in analysis window. Not quite correct;
566  * thresholds could have changed over the window; should preserve
567  * the original speech/silence label for the frame and undo it. Later..
568  */
569  if (r->tail_state == CONT_AD_STATE_SIL) {
570  if (r->frm_pow[r->win_startfrm] >= r->thresh_speech) {
571  if (r->n_other > 0)
572  r->n_other--;
573  }
574  }
575  else {
576  if (r->frm_pow[r->win_startfrm] <= r->thresh_sil) {
577  if (r->n_other > 0)
578  r->n_other--;
579  }
580  }
581  r->win_validfrm--;
582  r->win_startfrm++;
583  if (r->win_startfrm >= CONT_AD_ADFRMSIZE)
584  r->win_startfrm = 0;
585 
586  if (r->logfp)
587  fflush(r->logfp);
588 }
589 
590 
591 static int32
592 max_siglvl(cont_ad_t * r, int32 startfrm, int32 nfrm)
593 {
594  int32 siglvl, i, f;
595 
596  siglvl = 0;
597  if (nfrm > 0) {
598  for (i = 0, f = startfrm; i < nfrm; i++, f++) {
599  if (f >= CONT_AD_ADFRMSIZE)
600  f -= CONT_AD_ADFRMSIZE;
601  if (r->frm_pow[f] > siglvl)
602  siglvl = r->frm_pow[f];
603  }
604  }
605  return siglvl;
606 }
607 
608 
#if 0
/*
 * Disabled stub, kept out of the build by the enclosing #if 0.
 *
 * RKM(2005/01/31): Where did this come from?  If needed, it should be called
 * cont_ad_get_audio_data.
 */
void
get_audio_data(cont_ad_t * r, int16 * buf, int32 max)
{
}
#endif
619 
620 
621 static void
622 cont_ad_read_log(cont_ad_t * r, int32 retval)
623 {
624  spseg_t *seg;
625 
626  fprintf(r->logfp, "return from cont_ad_read() -> %d:\n", retval);
627  fprintf(r->logfp, "\tstate: %d\n", r->state);
628  fprintf(r->logfp, "\tread_ts: %d (%.2fs)\n",
629  r->read_ts, (float32) r->read_ts / (float32) r->sps);
630  fprintf(r->logfp, "\tseglen: %d (%.2fs)\n",
631  r->seglen, (float32) r->seglen / (float32) r->sps);
632  fprintf(r->logfp, "\tsiglvl: %d\n", r->siglvl);
633  fprintf(r->logfp, "\theadfrm: %d\n", r->headfrm);
634  fprintf(r->logfp, "\tn_frm: %d\n", r->n_frm);
635  fprintf(r->logfp, "\tn_sample: %d\n", r->n_sample);
636  fprintf(r->logfp, "\twin_startfrm: %d\n", r->win_startfrm);
637  fprintf(r->logfp, "\twin_validfrm: %d\n", r->win_validfrm);
638  fprintf(r->logfp, "\tnoise_level: %d\n", r->noise_level);
639  fprintf(r->logfp, "\tthresh_sil: %d\n", r->thresh_sil);
640  fprintf(r->logfp, "\tthresh_speech: %d\n", r->thresh_speech);
641  fprintf(r->logfp, "\tn_other: %d\n", r->n_other);
642  fprintf(r->logfp, "\ttail_state: %d\n", r->tail_state);
643  fprintf(r->logfp, "\ttot_frm: %d\n", r->tot_frm);
644 
645  fprintf(r->logfp, "\tspseg:");
646  for (seg = r->spseg_head; seg; seg = seg->next)
647  fprintf(r->logfp, " %d[%d]", seg->startfrm, seg->nfrm);
648  fprintf(r->logfp, "\n");
649 
650  fflush(r->logfp);
651 }
652 
653 
654 /*
655  * Copy data from r->adbuf[sf], for nf frames, into buf.
656  * All length checks must have been completed before this call; hence, this
657  * function will copy exactly the specified number of frames.
658  *
659  * Return value: Index of frame just after the segment copied, possibly wrapped
660  * around to 0.
661  */
662 static int32
663 buf_copy(cont_ad_t * r, int32 sf, int32 nf, int16 * buf)
664 {
665  int32 f, l;
666 
667  assert((sf >= 0) && (sf < CONT_AD_ADFRMSIZE));
668  assert(nf >= 0);
669 
670  if (sf + nf > CONT_AD_ADFRMSIZE) {
671  /* Amount to be copied wraps around adbuf; copy in two stages */
672  f = CONT_AD_ADFRMSIZE - sf;
673  l = (f * r->spf);
674  memcpy(buf, r->adbuf + (sf * r->spf), l * sizeof(int16));
675 
676  if (r->logfp) {
677  fprintf(r->logfp,
678  "return %d speech frames [%d..%d]; %d samples\n",
679  f, sf, sf + f - 1, l);
680  }
681 
682  buf += l;
683  sf = 0;
684  nf -= f;
685  }
686 
687  if (nf > 0) {
688  l = (nf * r->spf);
689  memcpy(buf, r->adbuf + (sf * r->spf), l * sizeof(int16));
690 
691  if (r->logfp) {
692  fprintf(r->logfp,
693  "return %d speech frames [%d..%d]; %d samples\n",
694  nf, sf, sf + nf - 1, l);
695  }
696  }
697 
698  if ((sf + nf) >= CONT_AD_ADFRMSIZE) {
699  assert((sf + nf) == CONT_AD_ADFRMSIZE);
700  return 0;
701  }
702  else
703  return (sf + nf);
704 }
705 
706 int32
708 {
709  return r->adbufsize - r->n_sample;
710 }
711 
712 /*
713  * Read as much data as possible from r->adfunc into r->adbuf.
714  */
715 static int32
716 cont_ad_read_internal(cont_ad_t *r, int16 *buf, int32 max)
717 {
718  int32 head, tail, len, l;
719 
720  /*
721  * First read as much of raw A/D as possible and available. adbuf is not
722  * really a circular buffer, so may have to read in two steps for wrapping
723  * around.
724  */
725  head = r->headfrm * r->spf;
726  tail = head + r->n_sample;
727  len = r->n_sample - (r->n_frm * r->spf); /* #partial frame samples at the tail */
728  assert((len >= 0) && (len < r->spf));
729 
730  if ((tail < r->adbufsize) && (!r->eof)) {
731  if (r->adfunc) {
732  if ((l =
733  (*(r->adfunc)) (r->ad, r->adbuf + tail,
734  r->adbufsize - tail)) < 0) {
735  r->eof = 1;
736  l = 0;
737  }
738  }
739  else {
740  l = r->adbufsize - tail;
741  if (l > max) {
742  l = max;
743  max = 0;
744  }
745  else {
746  max -= l;
747  }
748  memcpy(r->adbuf + tail, buf, l * sizeof(int16));
749  buf += l;
750  }
751  if ((l > 0) && r->rawfp) {
752  fwrite(r->adbuf + tail, sizeof(int16), l, r->rawfp);
753  fflush(r->rawfp);
754  }
755 
756  tail += l;
757  len += l;
758  r->n_sample += l;
759  }
760  if ((tail >= r->adbufsize) && (!r->eof)) {
761  tail -= r->adbufsize;
762  if (tail < head) {
763  if (r->adfunc) {
764  if ((l =
765  (*(r->adfunc)) (r->ad,
766  r->adbuf + tail, head - tail)) < 0) {
767  r->eof = 1;
768  l = 0;
769  }
770  }
771  else {
772  l = head - tail;
773  if (l > max)
774  l = max;
775  memcpy(r->adbuf + tail, buf, l * sizeof(int16));
776  }
777  if ((l > 0) && r->rawfp) {
778  fwrite(r->adbuf + tail, sizeof(int16), l, r->rawfp);
779  fflush(r->rawfp);
780  }
781 
782  tail += l;
783  len += l;
784  r->n_sample += l;
785  }
786  }
787 
788  return len;
789 }
790 
791 /*
792  * Classify incoming frames as silence or speech.
793  */
794 int32
795 cont_ad_classify(cont_ad_t *r, int32 len)
796 {
797  int32 tailfrm;
798 
799  tailfrm = (r->headfrm + r->n_frm); /* Next free frame slot to be filled */
800  if (tailfrm >= CONT_AD_ADFRMSIZE)
801  tailfrm -= CONT_AD_ADFRMSIZE;
802 
803  for (; len >= r->spf; len -= r->spf) {
804  compute_frame_pow(r, tailfrm);
805  r->n_frm++;
806  r->tot_frm++;
807 
808  /*
809  * Find speech/sil state change, if any. Also, if staying in speech state
810  * add this frame to current speech segment.
811  */
812  boundary_detect(r, tailfrm);
813 
814  if (++tailfrm >= CONT_AD_ADFRMSIZE)
815  tailfrm = 0;
816 
817  /* Update thresholds if time to do so */
818  if (r->thresh_update <= 0) {
819  int32 i, f;
820  find_thresh(r);
821  decay_hist(r);
822  r->thresh_update = CONT_AD_THRESH_UPDATE;
823 
824 #if 1
825  /*
826  * Since threshold has been updated, recompute r->n_other.
827  * (RKM: Is this really necessary? Comment out??)
828  */
829  r->n_other = 0;
830  if (r->tail_state == CONT_AD_STATE_SIL) {
831  for (i = r->win_validfrm, f = r->win_startfrm; i > 0; --i) {
832  if (r->frm_pow[f] >= r->thresh_speech)
833  r->n_other++;
834 
835  f++;
836  if (f >= CONT_AD_ADFRMSIZE)
837  f = 0;
838  }
839  }
840  else {
841  for (i = r->win_validfrm, f = r->win_startfrm; i > 0; --i) {
842  if (r->frm_pow[f] <= r->thresh_sil)
843  r->n_other++;
844 
845  f++;
846  if (f >= CONT_AD_ADFRMSIZE)
847  f = 0;
848  }
849  }
850 #endif
851  }
852  }
853 
854  return r->tail_state;
855 }
856 
857 /*
858  * Main function called by the application to filter out silence regions.
859  * Maintains a linked list of speech segments pointing into r->adbuf and feeds
860  * data to application from them.
861  */
862 int32
863 cont_ad_read(cont_ad_t * r, int16 * buf, int32 max)
864 {
865  int32 flen, len, retval, newstate;
866  spseg_t *seg;
867 
868  if ((r == NULL) || (buf == NULL))
869  return -1;
870 
871  if (max < r->spf) {
872  E_ERROR
873  ("cont_ad_read requires buffer of at least %d samples\n",
874  r->spf);
875  return -1;
876  }
877 
878  if (r->logfp) {
879  fprintf(r->logfp, "cont_ad_read(,, %d)\n", max);
880  fflush(r->logfp);
881  }
882 
883  /* Read data from adfunc or from buf. */
884  len = cont_ad_read_internal(r, buf, max);
885 
886  /* Compute frame power for unprocessed+new data and find speech/silence boundaries */
887  cont_ad_classify(r, len);
888 
889  /*
890  * If eof on input data source, cleanup the final segment.
891  */
892  if (r->eof) {
893  if (r->tail_state == CONT_AD_STATE_SPEECH) {
894  /*
895  * Still inside a speech segment when input data got over. Absort any
896  * remaining frames into the final speech segment.
897  */
898  assert(r->spseg_tail != NULL);
899 
900  /* Absorb frames still in analysis window into final speech seg */
901  assert((r->win_validfrm >= 0)
902  && (r->win_validfrm < r->winsize));
903  r->spseg_tail->nfrm += r->win_validfrm;
904 
905  r->tail_state = CONT_AD_STATE_SIL;
906  }
907 
908  r->win_startfrm += r->win_validfrm;
909  if (r->win_startfrm >= CONT_AD_ADFRMSIZE)
910  r->win_startfrm -= CONT_AD_ADFRMSIZE;
911  r->win_validfrm = 0;
912  r->n_other = 0;
913  }
914 
915  /*
916  * At last ready to copy speech data, if any, into caller's buffer. Raw
917  * speech data is segmented into alternating speech and silence segments.
918  * But any single call to cont_ad_read will never cross a speech/silence
919  * boundary.
920  */
921  seg = r->spseg_head; /* first speech segment available, if any */
922 
923  if ((seg == NULL) || (r->headfrm != seg->startfrm)) {
924  /*
925  * Either no speech data available, or inside a silence segment. Find
926  * length of silence segment.
927  */
928  if (seg == NULL) {
929  assert(r->tail_state == CONT_AD_STATE_SIL);
930 
931  flen =
932  (r->eof) ? r->n_frm : r->n_frm - (r->winsize +
933  r->leader - 1);
934  if (flen < 0)
935  flen = 0;
936  }
937  else {
938  flen = seg->startfrm - r->headfrm;
939  if (flen < 0)
940  flen += CONT_AD_ADFRMSIZE;
941  }
942 
943  if (r->rawmode) {
944  /* Restrict silence segment to user buffer size, integral #frames */
945  int32 f = max / r->spf;
946  if (flen > f)
947  flen = f;
948  }
949 
950  newstate = CONT_AD_STATE_SIL;
951  }
952  else {
953  flen = max / r->spf; /* truncate read-size to integral #frames */
954  if (flen > seg->nfrm)
955  flen = seg->nfrm; /* truncate further to this segment size */
956 
957  newstate = CONT_AD_STATE_SPEECH;
958  }
959 
960  len = flen * r->spf; /* #samples being consumed */
961 
962  r->siglvl = max_siglvl(r, r->headfrm, flen);
963 
964  if ((newstate == CONT_AD_STATE_SIL) && (!r->rawmode)) {
965  /* Skip silence data */
966  r->headfrm += flen;
967  if (r->headfrm >= CONT_AD_ADFRMSIZE)
968  r->headfrm -= CONT_AD_ADFRMSIZE;
969 
970  retval = 0; /* #samples being copied/returned */
971  }
972  else {
973  /* Copy speech/silence(in rawmode) data */
974  r->headfrm = buf_copy(r, r->headfrm, flen, buf);
975 
976  retval = len; /* #samples being copied/returned */
977  }
978 
979  r->n_frm -= flen;
980  r->n_sample -= len;
981  assert((r->n_frm >= 0) && (r->n_sample >= 0));
982  assert(r->win_validfrm <= r->n_frm);
983 
984  if (r->state == newstate)
985  r->seglen += len;
986  else
987  r->seglen = len;
988  r->state = newstate;
989 
990  if (newstate == CONT_AD_STATE_SPEECH) {
991  seg->startfrm = r->headfrm;
992  assert(seg->startfrm >= 0);
993  seg->nfrm -= flen;
994 
995  /* Free seg if empty and not recording into it */
996  if ((seg->nfrm == 0)
997  && (seg->next || (r->tail_state == CONT_AD_STATE_SIL))) {
998  r->spseg_head = seg->next;
999  if (seg->next == NULL)
1000  r->spseg_tail = NULL;
1001  free(seg);
1002  }
1003  }
1004 
1005  /* Update timestamp. Total raw A/D read - those remaining to be consumed */
1006  r->read_ts = (r->tot_frm - r->n_frm) * r->spf;
1007 
1008  if (retval == 0)
1009  retval = (r->eof && (r->spseg_head == NULL)) ? -1 : 0;
1010 
1011  if (r->logfp)
1012  cont_ad_read_log(r, retval);
1013 
1014  return retval;
1015 }
1016 
1017 
1018 /*
1019  * Calibrate input channel for silence threshold.
1020  */
1021 int32
1023 {
1024  int32 i, s, k, len, tailfrm;
1025 
1026  if (r == NULL)
1027  return -1;
1028 
1029  /* clear histogram */
1030  for (i = 0; i < CONT_AD_POWHISTSIZE; i++)
1031  r->pow_hist[i] = 0;
1032  tailfrm = r->headfrm + r->n_frm;
1033  if (tailfrm >= CONT_AD_ADFRMSIZE)
1034  tailfrm -= CONT_AD_ADFRMSIZE;
1035  s = (tailfrm * r->spf);
1036 
1037  for (r->n_calib_frame = 0;
1038  r->n_calib_frame < CONT_AD_CALIB_FRAMES;
1039  ++r->n_calib_frame) {
1040  len = r->spf;
1041  while (len > 0) {
1042  /*Trouble */
1043  if ((k = (*(r->adfunc)) (r->ad, r->adbuf + s, len)) < 0)
1044  return -1;
1045  len -= k;
1046  s += k;
1047  }
1048  s -= r->spf;
1049 
1050  compute_frame_pow(r, tailfrm);
1051  }
1052 
1053  r->thresh_update = CONT_AD_THRESH_UPDATE;
1054  return find_thresh(r);
1055 }
1056 
1057 int32
1059 {
1060  return r->spf * CONT_AD_CALIB_FRAMES;
1061 }
1062 
1063 int32
1064 cont_ad_calib_loop(cont_ad_t * r, int16 * buf, int32 max)
1065 {
1066  int32 i, s, len, tailfrm;
1067 
1068  if (r->n_calib_frame == CONT_AD_CALIB_FRAMES) {
1069  /* If calibration previously succeeded, then this is a
1070  * recalibration, so start again. */
1071  r->n_calib_frame = 0;
1072  /* clear histogram */
1073  for (i = 0; i < CONT_AD_POWHISTSIZE; i++)
1074  r->pow_hist[i] = 0;
1075  }
1076 
1077  tailfrm = r->headfrm + r->n_frm;
1078  if (tailfrm >= CONT_AD_ADFRMSIZE)
1079  tailfrm -= CONT_AD_ADFRMSIZE;
1080  s = (tailfrm * r->spf);
1081 
1082  len = r->spf;
1083  for (; r->n_calib_frame < CONT_AD_CALIB_FRAMES;
1084  ++r->n_calib_frame) {
1085  if (max < len)
1086  return 1;
1087  memcpy(r->adbuf + s, buf, len * sizeof(int16));
1088  max -= len;
1089  buf += len;
1090  compute_frame_pow(r, tailfrm);
1091  }
1092 
1093  r->thresh_update = CONT_AD_THRESH_UPDATE;
1094  return find_thresh(r);
1095 }
1096 
1097 
1098 /* PWP 1/14/98 -- modified for compatibility with old code */
1099 int32
1100 cont_ad_set_thresh(cont_ad_t * r, int32 sil, int32 speech)
1101 {
1102  if (r == NULL)
1103  return -1;
1104 
1105  if ((sil < 0) || (speech < 0)) {
1106  fprintf(stderr,
1107  "cont_ad_set_thresh: invalid threshold arguments: %d, %d\n",
1108  sil, speech);
1109  return -1;
1110  }
1111  r->delta_sil = (3 * sil) / 2;
1112  r->delta_speech = (3 * speech) / 2;
1113 
1114  return 0;
1115 }
1116 
1117 
1118 /*
1119  * PWP 1/14/98 -- set the changable params.
1120  *
1121  * delta_sil, delta_speech, min_noise, and max_noise are in dB,
1122  * winsize, speech_onset, sil_onset, leader and trailer are in frames of
1123  * 16 ms length (256 samples @ 16kHz sampling).
1124  */
1125 int32
1126 cont_ad_set_params(cont_ad_t * r, int32 delta_sil,
1127  int32 delta_speech, int32 min_noise,
1128  int32 max_noise, int32 winsize,
1129  int32 speech_onset, int32 sil_onset, int32 leader,
1130  int32 trailer, float32 adapt_rate)
1131 {
1132  if ((delta_sil < 0) || (delta_speech < 0) || (min_noise < 0)
1133  || (max_noise < 0)) {
1134  E_ERROR("threshold arguments: "
1135  "%d, %d, %d, %d must all be >=0\n", delta_sil,
1136  delta_speech, min_noise, max_noise);
1137  return -1;
1138  }
1139 
1140  if ((speech_onset > winsize) || (speech_onset <= 0)
1141  || (winsize <= 0)) {
1142  E_ERROR
1143  ("speech_onset, %d, must be <= winsize, %d, and both >0\n",
1144  speech_onset, winsize);
1145  return -1;
1146  }
1147 
1148  if ((sil_onset > winsize) || (sil_onset <= 0) || (winsize <= 0)) {
1149  E_ERROR
1150  ("sil_onset, %d, must be <= winsize, %d, and both >0\n",
1151  sil_onset, winsize);
1152  return -1;
1153  }
1154 
1155  if (((leader + trailer) > winsize) || (leader <= 0)
1156  || (trailer <= 0)) {
1157  E_ERROR
1158  ("leader, %d, plus trailer, %d, must be <= winsize, %d, and both >0\n",
1159  leader, trailer, winsize);
1160  return -1;
1161  }
1162 
1163  if ((adapt_rate < 0.0) || (adapt_rate > 1.0)) {
1164  E_ERROR("adapt_rate, %e; must be in range 0..1\n", adapt_rate);
1165  return -1;
1166  }
1167 
1168  if (r == NULL)
1169  return -1;
1170 
1171  r->delta_sil = delta_sil;
1172  r->delta_speech = delta_speech;
1173  r->min_noise = min_noise;
1174  r->max_noise = max_noise;
1175 
1176  r->winsize = winsize;
1177  r->speech_onset = speech_onset;
1178  r->sil_onset = sil_onset;
1179  r->leader = leader;
1180  r->trailer = trailer;
1181 
1182  r->adapt_rate = adapt_rate;
1183 
1184  if (r->win_validfrm >= r->winsize)
1185  r->win_validfrm = r->winsize - 1;
1186 
1187  return 0;
1188 }
1189 
1190 
1191 /*
1192  * PWP 1/14/98 -- get the changable params.
1193  *
1194  * delta_sil, delta_speech, min_noise, and max_noise are in dB,
1195  * winsize, speech_onset, sil_onset, leader and trailer are in frames of
1196  * 16 ms length (256 samples @ 16kHz sampling).
1197  */
1198 int32
1199 cont_ad_get_params(cont_ad_t * r, int32 * delta_sil,
1200  int32 * delta_speech, int32 * min_noise,
1201  int32 * max_noise, int32 * winsize,
1202  int32 * speech_onset, int32 * sil_onset,
1203  int32 * leader, int32 * trailer, float32 * adapt_rate)
1204 {
1205  if (!delta_sil || !delta_speech || !min_noise || !max_noise
1206  || !winsize || !speech_onset || !sil_onset || !leader
1207  || !trailer || !adapt_rate) {
1208  fprintf(stderr, "cont_ad_get_params: some param slots are NULL\n");
1209  return (-1);
1210  }
1211 
1212  if (r == NULL)
1213  return -1;
1214 
1215  *delta_sil = r->delta_sil;
1216  *delta_speech = r->delta_speech;
1217  *min_noise = r->min_noise;
1218  *max_noise = r->max_noise;
1219 
1220  *winsize = r->winsize;
1221  *speech_onset = r->speech_onset;
1222  *sil_onset = r->sil_onset;
1223  *leader = r->leader;
1224  *trailer = r->trailer;
1225 
1226  *adapt_rate = r->adapt_rate;
1227 
1228  return 0;
1229 }
1230 
1231 
1232 /*
1233  * Reset, discarded any accumulated speech.
1234  */
1235 int32
1237 {
1238  spseg_t *seg;
1239 
1240  if (r == NULL)
1241  return -1;
1242 
1243  while (r->spseg_head) {
1244  seg = r->spseg_head;
1245  r->spseg_head = seg->next;
1246  free(seg);
1247  }
1248  r->spseg_tail = NULL;
1249 
1250  r->headfrm = 0;
1251  r->n_frm = 0;
1252  r->n_sample = 0;
1253  r->win_startfrm = 0;
1254  r->win_validfrm = 0;
1255  r->n_other = 0;
1256 
1257  r->tail_state = CONT_AD_STATE_SIL;
1258 
1259  return 0;
1260 }
1261 
1262 
1263 int32
1265 {
1266  if (cont == NULL)
1267  return -1;
1268 
1269  cont_ad_reset(cont); /* Frees any remaining speech segments */
1270 
1271  free(cont->adbuf);
1272  free(cont->pow_hist);
1273  free(cont->frm_pow);
1274  free(cont);
1275 
1276  return 0;
1277 }
1278 
1279 
1280 int32
1282 {
1283  if (c == NULL)
1284  return -1;
1285 
1286  c->ad = NULL;
1287  c->adfunc = NULL;
1288  return 0;
1289 }
1290 
1291 
1292 int32
1294  int32(*func) (ad_rec_t *, int16 *, int32))
1295 {
1296  if (c == NULL)
1297  return -1;
1298 
1299  c->ad = a;
1300  c->adfunc = func;
1301  c->eof = 0;
1302 
1303  return 0;
1304 }
1305 
1306 
1307 int32
1308 cont_set_thresh(cont_ad_t * r, int32 silence, int32 speech)
1309 {
1310  int32 i, f;
1311 
1312  r->thresh_speech = speech;
1313  r->thresh_sil = silence;
1314 
1315  /* Since threshold has been updated, recompute r->n_other */
1316  r->n_other = 0;
1317  if (r->tail_state == CONT_AD_STATE_SIL) {
1318  for (i = r->win_validfrm, f = r->win_startfrm; i > 0; --i) {
1319  if (r->frm_pow[f] >= r->thresh_speech)
1320  r->n_other++;
1321 
1322  f++;
1323  if (f >= CONT_AD_ADFRMSIZE)
1324  f = 0;
1325  }
1326  }
1327  else if (r->tail_state == CONT_AD_STATE_SPEECH) {
1328  for (i = r->win_validfrm, f = r->win_startfrm; i > 0; --i) {
1329  if (r->frm_pow[f] <= r->thresh_sil)
1330  r->n_other++;
1331 
1332  f++;
1333  if (f >= CONT_AD_ADFRMSIZE)
1334  f = 0;
1335  }
1336  }
1337 
1338  return 0;
1339 }
1340 
1341 
1342 /*
1343  * Set the file pointer for dumping the raw input audio stream.
1344  */
1345 int32
1347 {
1348  if (r == NULL)
1349  return -1;
1350 
1351  r->rawfp = fp;
1352  return 0;
1353 }
1354 
1355 
1356 /*
1357  * Set the file pointer for logging cont_ad progress.
1358  */
1359 int32
1361 {
1362  if (r == NULL)
1363  return -1;
1364 
1365  r->logfp = fp;
1366  return 0;
1367 }
1368 
1369 
1370 /*
1371  * One-time initialization.
1372  */
1373 cont_ad_t *
1374 cont_ad_init(ad_rec_t * a, int32(*func) (ad_rec_t *, int16 *, int32))
1375 {
1376  cont_ad_t *r;
1377 
1378  if ((r = malloc(sizeof(*r))) == NULL) {
1379  E_ERROR_SYSTEM("allocation of cont_ad_t failed");
1380  return NULL;
1381  }
1382 
1383  r->ad = a;
1384  r->adfunc = func;
1385  r->eof = 0;
1386  r->rawmode = 0;
1387 
1388  if (a != NULL)
1389  r->sps = a->sps;
1390  else
1391  r->sps = CONT_AD_SPS;
1392 
1393  /* Set samples/frame such that when sps=16000, spf=256 */
1394  r->spf = (r->sps * 256) / CONT_AD_SPS;
1395  r->adbufsize = CONT_AD_ADFRMSIZE * r->spf;
1396 
1397  if ((r->adbuf = malloc(r->adbufsize * sizeof(*r->adbuf))) == NULL) {
1398  E_ERROR_SYSTEM("allocation of audio buffer failed");
1399  free(r);
1400  return NULL;
1401  }
1402  if ((r->pow_hist =
1403  calloc(CONT_AD_POWHISTSIZE, sizeof(*r->pow_hist))) == NULL) {
1404  E_ERROR_SYSTEM("allocation of power history buffer failed");
1405  free(r->adbuf);
1406  free(r);
1407  return NULL;
1408  }
1409  if ((r->frm_pow =
1410  calloc(CONT_AD_ADFRMSIZE, sizeof(*r->frm_pow))) == NULL) {
1411  E_ERROR_SYSTEM("allocation of frame power buffer failed");
1412  free(r->pow_hist);
1413  free(r->adbuf);
1414  free(r);
1415  return NULL;
1416  }
1417 
1418  r->state = CONT_AD_STATE_SIL;
1419  r->read_ts = 0;
1420  r->seglen = 0;
1421  r->siglvl = 0;
1422  r->prev_sample = 0;
1423  r->tot_frm = 0;
1424  r->noise_level = CONT_AD_DEFAULT_NOISE;
1425 
1426  r->auto_thresh = 1;
1427  r->delta_sil = CONT_AD_DELTA_SIL;
1428  r->delta_speech = CONT_AD_DELTA_SPEECH;
1429  r->min_noise = CONT_AD_MIN_NOISE;
1430  r->max_noise = CONT_AD_MAX_NOISE;
1431  r->winsize = CONT_AD_WINSIZE;
1432  r->speech_onset = CONT_AD_SPEECH_ONSET;
1433  r->sil_onset = CONT_AD_SIL_ONSET;
1434  r->leader = CONT_AD_LEADER;
1435  r->trailer = CONT_AD_TRAILER;
1436 
1437  r->thresh_sil = r->noise_level + r->delta_sil;
1438  r->thresh_speech = r->noise_level + r->delta_speech;
1439  r->thresh_update = CONT_AD_THRESH_UPDATE;
1440  r->adapt_rate = CONT_AD_ADAPT_RATE;
1441 
1442  r->tail_state = CONT_AD_STATE_SIL;
1443 
1444  r->spseg_head = NULL;
1445  r->spseg_tail = NULL;
1446 
1447  r->rawfp = NULL;
1448  r->logfp = NULL;
1449 
1450  r->n_calib_frame = 0;
1451 
1452  cont_ad_reset(r);
1453 
1454  return r;
1455 }
1456 
1457 
1458 cont_ad_t *
1460  int32(*func) (ad_rec_t *, int16 *, int32))
1461 {
1462  cont_ad_t *r;
1463 
1464  r = cont_ad_init(a, func);
1465  r->rawmode = 1;
1466 
1467  return r;
1468 }
SPHINXBASE_EXPORT cont_ad_t * cont_ad_init_rawmode(ad_rec_t *ad, int32(*adfunc)(ad_rec_t *ad, int16 *buf, int32 max))
Initializes a continuous listening object which simply passes data through (!)
spseg_t * spseg_tail
Last of unconsumed speech segments.
Definition: cont_ad.h:225
int32 tot_frm
Total number of frames of A/D data read, including consumed ones.
Definition: cont_ad.h:191
int32 max_noise
noise higher than this signals an error
Definition: cont_ad.h:201
int32 sps
Samples/sec; moved from ad->sps to break dependence on ad by N.
Definition: cont_ad.h:180
FILE * rawfp
If non-NULL, raw audio input data processed by cont_ad is dumped to this file.
Definition: cont_ad.h:227
int32 seglen
Total no.
Definition: cont_ad.h:171
SPHINXBASE_EXPORT int32 cont_ad_reset(cont_ad_t *cont)
Reset, discarding any accumulated speech segments.
Definition: ad.h:255
int16 * adbuf
Circular buffer for maintaining A/D data read until consumed.
Definition: cont_ad.h:158
SPHINXBASE_EXPORT int32 cont_ad_read(cont_ad_t *r, int16 *buf, int32 max)
Read raw audio data into the silence filter.
Definition: cont_ad_base.c:863
int32 state
State of data returned by most recent cont_ad_read call; CONT_AD_STATE_SIL or CONT_AD_STATE_SPEECH.
Definition: cont_ad.h:165
SPHINXBASE_EXPORT int32 cont_ad_set_logfp(cont_ad_t *c, FILE *fp)
Set the file to which cont_ad logs its progress.
int32 * pow_hist
Histogram of frame power, moving window, decayed.
Definition: cont_ad.h:194
int32 eof
Whether the source ad device has encountered EOF.
Definition: cont_ad.h:183
int32 sps
Samples/sec.
Definition: ad.h:256
Continuous A/D listening and silence filtering module.
int32 leader
Pad the beginning of speech with this many extra frames.
Definition: cont_ad.h:205
(FOR INTERNAL USE ) Data structure for maintaining speech (non-silence) segments not yet consumed by ...
FILE * logfp
If non-NULL, write detailed logs of this object's progress to the file.
Definition: cont_ad.h:231
Basic type definitions used in Sphinx.
char * frm_pow
Frame power.
Definition: cont_ad.h:195
int32 speech_onset
start speech on >= these many frames out of winsize, of >= delta_speech
Definition: cont_ad.h:203
int32 headfrm
Frame number in adbuf with unconsumed A/D data.
Definition: cont_ad.h:188
SPHINXBASE_EXPORT int32 cont_ad_close(cont_ad_t *cont)
Close the continuous listening object.
int32 auto_thresh
Do automatic threshold adjustment or not.
Definition: cont_ad.h:197
spseg_t * spseg_head
First of unconsumed speech segments.
Definition: cont_ad.h:224
int32 n_calib_frame
Number of frames of calibration data seen so far.
Definition: cont_ad.h:236
int32 win_startfrm
Where next analysis window begins.
Definition: cont_ad.h:220
int32 n_sample
Number of samples of unconsumed data in adbuf.
Definition: cont_ad.h:190
SPHINXBASE_EXPORT cont_ad_t * cont_ad_init(ad_rec_t *ad, int32(*adfunc)(ad_rec_t *ad, int16 *buf, int32 max))
Initialize a continuous listening/silence filtering object.
int32 win_validfrm
Number of frames currently available from win_startfrm for analysis.
Definition: cont_ad.h:221
int32 n_other
If in SILENCE state, number of frames in analysis window considered to be speech; otherwise number of...
Definition: cont_ad.h:222
SPHINXBASE_EXPORT void cont_ad_powhist_dump(FILE *fp, cont_ad_t *cont)
Dump the power histogram.
Definition: cont_ad_base.c:231
SPHINXBASE_EXPORT int32 cont_set_thresh(cont_ad_t *r, int32 silence, int32 speech)
Set the silence and speech thresholds.
int32 delta_sil
Max silence power/frame ABOVE noise level.
Definition: cont_ad.h:198
SPHINXBASE_EXPORT int32 cont_ad_calib(cont_ad_t *cont)
Calibrate the silence filter.
Implementation of logging routines.
SPHINXBASE_EXPORT int32 cont_ad_detach(cont_ad_t *c)
Detach the given continuous listening module from the associated audio device.
SPHINXBASE_EXPORT int32 cont_ad_calib_size(cont_ad_t *r)
Get the number of samples required to calibrate the silence filter.
generic live audio interface for recording and playback
int32 tail_state
State at the end of its internal buffer (internal use): CONT_AD_STATE_SIL or CONT_AD_STATE_SPEECH.
Definition: cont_ad.h:217
int32 min_noise
noise lower than this we ignore
Definition: cont_ad.h:200
SPHINXBASE_EXPORT int32 cont_ad_attach(cont_ad_t *c, ad_rec_t *a, int32(*func)(ad_rec_t *, int16 *, int32))
Attach the continuous listening module to the given audio device/function.
SPHINXBASE_EXPORT int32 cont_ad_set_rawfp(cont_ad_t *c, FILE *fp)
Set a file for dumping raw audio input.
SPHINXBASE_EXPORT int32 cont_ad_get_params(cont_ad_t *r, int32 *delta_sil, int32 *delta_speech, int32 *min_noise, int32 *max_noise, int32 *winsize, int32 *speech_onset, int32 *sil_onset, int32 *leader, int32 *trailer, float32 *adapt_rate)
PWP 1/14/98 – get the changeable params.
int32 noise_level
PWP: what we claim as the "current" noise level.
Definition: cont_ad.h:192
Continuous listening module or object.
Definition: cont_ad.h:151
int32 n_frm
Number of complete frames of unconsumed A/D data in adbuf.
Definition: cont_ad.h:189
SPHINXBASE_EXPORT int32 cont_ad_buffer_space(cont_ad_t *r)
Get the maximum number of samples which can be passed into cont_ad_read().
Definition: cont_ad_base.c:707
float32 adapt_rate
Linear interpolation constant for rate at which noise level adapted to each estimate; range: 0-1; 0=>...
Definition: cont_ad.h:213
int32 delta_speech
Min speech power/frame ABOVE noise level.
Definition: cont_ad.h:199
#define E_ERROR
Print error message to standard error stream.
Definition: err.h:169
SPHINXBASE_EXPORT int32 cont_ad_calib_loop(cont_ad_t *r, int16 *buf, int32 max)
Calibrate the silence filter without an audio device.
int32 prev_sample
For pre-emphasis filter.
Definition: cont_ad.h:187
int32 spf
Samples/frame; audio level is analyzed within frames.
Definition: cont_ad.h:185
int32 read_ts
Absolute timestamp (total no.
Definition: cont_ad.h:167
ad_rec_t * ad
A/D device argument for adfunc.
Definition: cont_ad.h:154
#define E_ERROR_SYSTEM
Print error text; Call perror("");.
Definition: err.h:142
int32 rawmode
Pass all input data through, without filtering silence.
Definition: cont_ad.h:156
int32 siglvl
Max signal level for the data consumed by the most recent cont_ad_read call (dB range: 0-99)...
Definition: cont_ad.h:175
int32 winsize
how many frames to look at for speech det
Definition: cont_ad.h:202
int32 sil_onset
end speech on >= these many frames out of winsize, of <= delta_sil
Definition: cont_ad.h:204
int32 thresh_update
Number of frames before next update to pow_hist/thresholds.
Definition: cont_ad.h:212
SPHINXBASE_EXPORT int32 cont_ad_set_thresh(cont_ad_t *cont, int32 sil, int32 sp)
Set silence and speech threshold parameters.
int32 trailer
pad end of speech with this many extra frms
Definition: cont_ad.h:206
int32 adbufsize
Buffer size (Number of samples)
Definition: cont_ad.h:186
int32 thresh_speech
Frame considered to be speech if power >= thresh_speech (for transitioning from SILENCE to SPEECH sta...
Definition: cont_ad.h:208
int32 thresh_sil
Frame considered to be silence if power <= thresh_sil (for transitioning from SPEECH to SILENCE state...
Definition: cont_ad.h:210
SPHINXBASE_EXPORT int32 cont_ad_set_params(cont_ad_t *r, int32 delta_sil, int32 delta_speech, int32 min_noise, int32 max_noise, int32 winsize, int32 speech_onset, int32 sil_onset, int32 leader, int32 trailer, float32 adapt_rate)
Set the changable parameters.