SphinxBase  0.6
cont_fileseg.c
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1999-2001 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 /*
38  * cont_fileseg.c -- Read input file, filter silence regions, and segment into utterances.
39  *
40  * HISTORY
41  *
42  * $Log: cont_fileseg.c,v $
43  * Revision 1.1.1.1 2006/05/23 18:45:02 dhuggins
44  * re-importation
45  *
46  * Revision 1.13 2005/06/30 00:28:46 rkm
47  * Kept within-utterance silences in rawmode
48  *
49  *
50  * 28-Jun-2005 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
51  * Modified to use new state variables in cont_ad_t.
52  *
53  * Revision 1.12 2005/05/31 15:54:38 rkm
54  * *** empty log message ***
55  *
56  * Revision 1.11 2005/05/24 20:56:58 rkm
57  * Added min/max-noise parameters to cont_fileseg
58  *
59  * Revision 1.10 2005/05/13 23:28:43 egouvea
60  * Changed null device to system dependent one: NUL for windows, /dev/null for everything else
61  *
62  * $Log: cont_fileseg.c,v $
63  * Revision 1.1.1.1 2006/05/23 18:45:02 dhuggins
64  * re-importation
65  *
66  * Revision 1.13 2005/06/30 00:28:46 rkm
67  * Kept within-utterance silences in rawmode
68  *
69  * Revision 1.12 2005/05/31 15:54:38 rkm
70  * *** empty log message ***
71  *
72  * Revision 1.11 2005/05/24 20:56:58 rkm
73  * Added min/max-noise parameters to cont_fileseg
74  *
75  * Revision 1.9 2005/02/13 01:29:48 rkm
76  * Fixed cont_ad_read to never cross sil/speech boundary, and rawmode
77  *
78  * Revision 1.8 2005/02/01 22:21:13 rkm
79  * Added raw data logging, and raw data pass-through mode to cont_ad
80  *
81  * Revision 1.7 2004/07/16 00:57:11 egouvea
82  * Added Ravi's implementation of FSG support.
83  *
84  * Revision 1.3 2004/06/25 14:58:05 rkm
85  * *** empty log message ***
86  *
87  * Revision 1.2 2004/06/23 20:32:08 rkm
88  * Exposed several cont_ad config parameters
89  *
90  *
91  * 27-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
92  * Created.
93  */
94 
95 #include <stdio.h>
96 #include <stdlib.h>
97 #include <string.h>
98 #include <assert.h>
99 #include <math.h>
100 
101 #include <sphinxbase/prim_type.h>
102 #include <sphinxbase/ad.h>
103 #include <sphinxbase/cont_ad.h>
104 #include <sphinxbase/err.h>
105 
106 static FILE *infp; /* File being segmented */
107 static int32 swap;
108 
109 /* Max size read by file_ad_read function on each invocation, for debugging */
110 static int32 max_ad_read_size;
111 
112 #if defined(WIN32) && !defined(GNUWINCE)
113 #define NULL_DEVICE "NUL"
114 #else
115 #define NULL_DEVICE "/dev/null"
116 #endif
117 
118 
119 /*
120  * Need to provide cont_ad_init with a read function to read the input file.
121  * This is it. The ad_rec_t *r argument is ignored since there is no A/D
122  * device involved.
123  */
124 static int32
125 file_ad_read(ad_rec_t * r, int16 * buf, int32 max)
126 {
127  int32 i, k;
128 
129  if (max > max_ad_read_size)
130  max = max_ad_read_size;
131 
132  k = fread(buf, sizeof(int16), max, infp);
133  if (swap) {
134  for (i = 0; i < k; i++) {
135  buf[i] = ((buf[i] >> 8) & 0x00ff) | ((buf[i] << 8) & 0xff00);
136  }
137  }
138 
139  return ((k > 0) ? k : -1);
140 }
141 
142 
/*
 * Print the command-line synopsis through the logging macros and terminate
 * the process. This function never returns.
 *
 * pgm  Program name to show in the synopsis (callers pass argv[0]).
 *
 * NOTE(review): the exit status is 0 even when this is reached because of an
 * invalid argument; kept as-is so existing callers/scripts see no change.
 */
static void
usagemsg(char *pgm)
{
    E_INFO("Usage: %s \\\n", pgm);
    E_INFOCONT("\t[-? | -h | -help] \\\n");
    E_INFOCONT("\t[-d | -debug] \\\n");
    E_INFOCONT("\t[-sps <sampling-rate> (16000)] \\\n");
    E_INFOCONT("\t[-b | -byteswap] \\\n");
    E_INFOCONT
        ("\t[{-s | -silsep} <length-silence-separator(sec)> (0.5)] \\\n");
    E_INFOCONT("\t[-w | -writeseg] \\\n");
    E_INFOCONT("\t[-min-noise <min-noise>] \\\n");
    E_INFOCONT("\t[-max-noise <max-noise>] \\\n");
    E_INFOCONT("\t[-delta-sil <delta-sil>] \\\n");
    E_INFOCONT("\t[-delta-speech <delta-speech>] \\\n");
    E_INFOCONT("\t[-sil-onset <sil-onset>] \\\n");
    E_INFOCONT("\t[-speech-onset <speech-onset>] \\\n");
    E_INFOCONT("\t[-adapt-rate <adapt-rate>] \\\n");
    E_INFOCONT("\t[-max-adreadsize <ad_read_blksize>] \\\n");
    E_INFOCONT("\t[-c <copy-input-file>] \\\n");
    E_INFOCONT("\t[-r | -rawmode] \\\n");
    E_INFOCONT("\t-i <input-file>\n");

    exit(0);
}
168 
169 /*
170  * Read specified input file, segment it into utterances wherever a silence segment of
171  * a given minimum duration is encountered. Filter out long silences.
172  * Utterances are written to files named 00000000.raw, 00000001.raw, 00000002.raw, etc.
173  */
174 int
175 main(int32 argc, char **argv)
176 {
177  cont_ad_t *cont;
178  int32 uttid, uttlen, starttime, siltime, sps, debug, writeseg, rawmode;
179  int16 buf[4096];
180  char *infile, *copyfile, segfile[1024];
181  FILE *fp;
182  float endsil;
183  ad_rec_t ad;
184  int32 i, k;
185  int32 winsize, leader, trailer;
186  int32 orig_min_noise, orig_max_noise;
187  int32 orig_delta_sil, orig_delta_speech;
188  int32 orig_speech_onset, orig_sil_onset;
189  int32 min_noise, max_noise;
190  int32 delta_sil, delta_speech;
191  int32 sil_onset, speech_onset;
192  float32 orig_adapt_rate;
193  float32 adapt_rate;
194  int32 total_speech_samples;
195  float32 total_speech_sec;
196  FILE *rawfp;
197 
198  /* Set argument defaults */
199  cont = NULL;
200  sps = 16000;
201  swap = 0;
202  endsil = 0.5;
203  writeseg = 0;
204  min_noise = max_noise = -1;
205  delta_sil = delta_speech = -1;
206  sil_onset = speech_onset = -1;
207  adapt_rate = -1.0;
208  max_ad_read_size = (int32) 0x7ffffff0;
209  debug = 0;
210  infile = NULL;
211  copyfile = NULL;
212  rawfp = NULL;
213  rawmode = 0;
214 
215  /* Parse arguments */
216  for (i = 1; i < argc; i++) {
217  if ((strcmp(argv[i], "-help") == 0)
218  || (strcmp(argv[i], "-h") == 0)
219  || (strcmp(argv[i], "-?") == 0)) {
220  usagemsg(argv[0]);
221  }
222  else if ((strcmp(argv[i], "-debug") == 0)
223  || (strcmp(argv[i], "-d") == 0)) {
224  debug = 1;
225  }
226  else if (strcmp(argv[i], "-sps") == 0) {
227  i++;
228  if ((i == argc)
229  || (sscanf(argv[i], "%d", &sps) != 1)
230  || (sps <= 0)) {
231  E_ERROR("Invalid -sps argument\n");
232  usagemsg(argv[0]);
233  }
234  }
235  else if ((strcmp(argv[i], "-byteswap") == 0)
236  || (strcmp(argv[i], "-b") == 0)) {
237  swap = 1;
238  }
239  else if ((strcmp(argv[i], "-silsep") == 0)
240  || (strcmp(argv[i], "-s") == 0)) {
241  i++;
242  if ((i == argc)
243  || (sscanf(argv[i], "%f", &endsil) != 1)
244  || (endsil <= 0.0)) {
245  E_ERROR("Invalid -silsep argument\n");
246  usagemsg(argv[0]);
247  }
248  }
249  else if ((strcmp(argv[i], "-writeseg") == 0)
250  || (strcmp(argv[i], "-w") == 0)) {
251  writeseg = 1;
252  }
253  else if (strcmp(argv[i], "-min-noise") == 0) {
254  i++;
255  if ((i == argc) ||
256  (sscanf(argv[i], "%d", &min_noise) != 1) ||
257  (min_noise < 0)) {
258  E_ERROR("Invalid -min-noise argument\n");
259  usagemsg(argv[0]);
260  }
261  }
262  else if (strcmp(argv[i], "-max-noise") == 0) {
263  i++;
264  if ((i == argc) ||
265  (sscanf(argv[i], "%d", &max_noise) != 1) ||
266  (max_noise < 0)) {
267  E_ERROR("Invalid -max-noise argument\n");
268  usagemsg(argv[0]);
269  }
270  }
271  else if (strcmp(argv[i], "-delta-sil") == 0) {
272  i++;
273  if ((i == argc) ||
274  (sscanf(argv[i], "%d", &delta_sil) != 1) ||
275  (delta_sil < 0)) {
276  E_ERROR("Invalid -delta-sil argument\n");
277  usagemsg(argv[0]);
278  }
279  }
280  else if (strcmp(argv[i], "-delta-speech") == 0) {
281  i++;
282  if ((i == argc) ||
283  (sscanf(argv[i], "%d", &delta_speech) != 1) ||
284  (delta_speech < 0)) {
285  E_ERROR("Invalid -delta-speech argument\n");
286  usagemsg(argv[0]);
287  }
288  }
289  else if (strcmp(argv[i], "-sil-onset") == 0) {
290  i++;
291  if ((i == argc) ||
292  (sscanf(argv[i], "%d", &sil_onset) != 1) ||
293  (sil_onset < 1)) {
294  E_ERROR("Invalid -sil-onset argument\n");
295  usagemsg(argv[0]);
296  }
297  }
298  else if (strcmp(argv[i], "-speech-onset") == 0) {
299  i++;
300  if ((i == argc) ||
301  (sscanf(argv[i], "%d", &speech_onset) != 1) ||
302  (speech_onset < 1)) {
303  E_ERROR("Invalid -speech-onset argument\n");
304  usagemsg(argv[0]);
305  }
306  }
307  else if (strcmp(argv[i], "-adapt-rate") == 0) {
308  i++;
309  if ((i == argc) ||
310  (sscanf(argv[i], "%f", &adapt_rate) != 1) ||
311  (adapt_rate < 0.0) || (adapt_rate > 1.0)) {
312  E_ERROR("Invalid -adapt-rate argument\n");
313  usagemsg(argv[0]);
314  }
315  }
316  else if (strcmp(argv[i], "-max-adreadsize") == 0) {
317  i++;
318  if ((i == argc) ||
319  (sscanf(argv[i], "%d", &max_ad_read_size) != 1) ||
320  (max_ad_read_size < 1)) {
321  E_ERROR("Invalid -max-adreadsize argument\n");
322  usagemsg(argv[0]);
323  }
324  }
325  else if (strcmp(argv[i], "-c") == 0) {
326  i++;
327  if (i == argc) {
328  E_ERROR("Invalid -c argument\n");
329  usagemsg(argv[0]);
330  }
331  copyfile = argv[i];
332  }
333  else if ((strcmp(argv[i], "-rawmode") == 0)
334  || (strcmp(argv[i], "-r") == 0)) {
335  rawmode = 1;
336  }
337  else if (strcmp(argv[i], "-i") == 0) {
338  i++;
339  if (i == argc) {
340  E_ERROR("Invalid -i argument\n");
341  usagemsg(argv[0]);
342  }
343  infile = argv[i];
344  }
345  else {
346  usagemsg(argv[0]);
347  }
348  }
349 
350  if (infile == NULL) {
351  E_ERROR("No input file specified\n");
352  usagemsg(argv[0]);
353  }
354 
355  if ((infp = fopen(infile, "rb")) == NULL)
356  E_FATAL_SYSTEM("Failed to open '%s' for reading", infile);
357 
358  /*
359  * Associate continuous listening module with opened input file and read function.
360  * No A/D device is involved, but need to fill in ad->sps.
361  * Calibrate input data using first few seconds of file, but then rewind it!!
362  */
363  ad.sps = sps;
364  ad.bps = sizeof(int16);
365  if (!rawmode)
366  cont = cont_ad_init(&ad, file_ad_read);
367  else
368  cont = cont_ad_init_rawmode(&ad, file_ad_read);
369 
370  printf("Calibrating ...");
371  fflush(stdout);
372  if (cont_ad_calib(cont) < 0)
373  printf(" failed; file too short?\n");
374  else
375  printf(" done\n");
376  rewind(infp);
377 
378  /* Convert desired min. inter-utterance silence duration to #samples */
379  siltime = (int32) (endsil * sps);
380 
381  /* Enable writing raw input to output by the cont module if specified */
382  if (copyfile) {
383  if ((rawfp = fopen(copyfile, "wb")) == NULL)
384  E_ERROR_SYSTEM("Failed to open raw output file '%s' for writing");
385  else
386  cont_ad_set_rawfp(cont, rawfp);
387  }
388 
389  cont_ad_get_params(cont,
390  &orig_delta_sil, &orig_delta_speech,
391  &orig_min_noise, &orig_max_noise,
392  &winsize,
393  &orig_speech_onset, &orig_sil_onset,
394  &leader, &trailer, &orig_adapt_rate);
395 
396  E_INFO("Default parameters:\n");
397  E_INFOCONT("\tmin-noise = %d, max-noise = %d\n",
398  orig_min_noise, orig_max_noise);
399  E_INFOCONT("\tdelta-sil = %d, delta-speech = %d\n",
400  orig_delta_sil, orig_delta_speech);
401  E_INFOCONT("\tsil-onset = %d, speech-onset = %d\n",
402  orig_sil_onset, orig_speech_onset);
403  E_INFOCONT("\tadapt_rate = %.3f\n", orig_adapt_rate);
404 
405  if (min_noise < 0)
406  min_noise = orig_min_noise;
407  if (max_noise < 0)
408  max_noise = orig_max_noise;
409  if (delta_sil < 0)
410  delta_sil = orig_delta_sil;
411  if (delta_speech < 0)
412  delta_speech = orig_delta_speech;
413  if (sil_onset < 0)
414  sil_onset = orig_sil_onset;
415  if (speech_onset < 0)
416  speech_onset = orig_speech_onset;
417  if (adapt_rate < 0.0)
418  adapt_rate = orig_adapt_rate;
419 
420  cont_ad_set_params(cont,
421  delta_sil, delta_speech,
422  min_noise, max_noise,
423  winsize,
424  speech_onset, sil_onset,
425  leader, trailer, adapt_rate);
426 
427  E_INFO("Current parameters:\n");
428  E_INFOCONT("\tmin-noise = %d, max-noise = %d\n", min_noise, max_noise);
429  E_INFOCONT("\tdelta-sil = %d, delta-speech = %d\n", delta_sil,
430  delta_speech);
431  E_INFOCONT("\tsil-onset = %d, speech-onset = %d\n", sil_onset,
432  speech_onset);
433  E_INFOCONT("\tadapt_rate = %.3f\n", adapt_rate);
434 
435  E_INFO("Sampling rate: %d", sps);
436  E_INFOCONT("; Byteswap: %s", swap ? "Yes" : "No");
437  E_INFOCONT("; Max ad-read size: %d\n", max_ad_read_size);
438 
439  if (debug)
440  cont_ad_set_logfp(cont, stdout);
441 
442  total_speech_samples = 0;
443  total_speech_sec = 0.0;
444 
445  uttid = 0;
446  uttlen = 0;
447  starttime = 0;
448  fp = NULL;
449 
450  /* Process data */
451  for (;;) {
452  /* Get audio data from continuous listening module */
453  k = cont_ad_read(cont, buf, 4096);
454 
455  if (k < 0) { /* End of input audio file; close any open output file and exit */
456  if (fp != NULL) {
457  fclose(fp);
458  fp = NULL;
459 
460  printf
461  ("Utt %08d, st= %8.2fs, et= %8.2fs, seg= %7.2fs (#samp= %10d)\n",
462  uttid, (double) starttime / (double) sps,
463  (double) (starttime + uttlen) / (double) sps,
464  (double) uttlen / (double) sps, uttlen);
465  fflush(stdout);
466 
467  total_speech_samples += uttlen;
468  total_speech_sec += (double) uttlen / (double) sps;
469 
470  uttid++;
471  }
472 
473  break;
474  }
475 
476  if (cont->state == CONT_AD_STATE_SIL) { /* Silence data got */
477  if (fp != NULL) { /* Currently in an utterance */
478  if (cont->seglen > siltime) { /* Long enough silence detected; end the utterance */
479  fclose(fp);
480  fp = NULL;
481 
482  printf
483  ("Utt %08d, st= %8.2fs, et= %8.2fs, seg= %7.2fs (#samp= %10d)\n",
484  uttid, (double) starttime / (double) sps,
485  (double) (starttime + uttlen) / (double) sps,
486  (double) uttlen / (double) sps, uttlen);
487  fflush(stdout);
488 
489  total_speech_samples += uttlen;
490  total_speech_sec += (double) uttlen / (double) sps;
491 
492  uttid++;
493  }
494  else {
495  /*
496  * Short silence within utt; write it to output. (Some extra trailing silence
497  * is included in the utterance, as a result. Not to worry about it.)
498  */
499  if (k > 0) {
500  fwrite(buf, sizeof(int16), k, fp);
501  uttlen += k;
502  }
503  }
504  }
505  }
506  else {
507  assert(cont->state == CONT_AD_STATE_SPEECH);
508 
509  if (fp == NULL) { /* Not in an utt; open a new output file */
510  if (writeseg)
511  sprintf(segfile, "%08d.raw", uttid);
512  else
513  strcpy(segfile, NULL_DEVICE);
514  if ((fp = fopen(segfile, "wb")) == NULL)
515  E_FATAL_SYSTEM("Failed to open segmentation file '%s' for writing", segfile);
516 
517  starttime = cont->read_ts - k;
518  uttlen = 0;
519  }
520 
521  /* Write data obtained to output file */
522  if (k > 0) {
523  fwrite(buf, sizeof(int16), k, fp);
524  uttlen += k;
525  }
526  }
527  }
528 
529  if (rawfp)
530  fclose(rawfp);
531 
532  E_INFO("Total raw input speech = %d frames, %d samples, %.2f sec\n",
533  cont->tot_frm, cont->tot_frm * cont->spf,
534  (cont->tot_frm * cont->spf) / (float32) cont->sps);
535  E_INFO("Total speech detected = %d samples, %.2f sec\n",
536  total_speech_samples, total_speech_sec);
537 
538  cont_ad_close(cont);
539 
540  return 0;
541 }
SPHINXBASE_EXPORT cont_ad_t * cont_ad_init_rawmode(ad_rec_t *ad, int32(*adfunc)(ad_rec_t *ad, int16 *buf, int32 max))
Initializes a continuous listening object which simply passes data through (!)
int32 tot_frm
Total number of frames of A/D data read, including consumed ones.
Definition: cont_ad.h:191
int32 sps
Samples/sec; moved from ad->sps to break dependence on ad by N.
Definition: cont_ad.h:180
int32 seglen
Total no.
Definition: cont_ad.h:171
#define E_INFO
Print logging information to standard error stream.
Definition: err.h:147
Definition: ad.h:255
SPHINXBASE_EXPORT int32 cont_ad_read(cont_ad_t *r, int16 *buf, int32 max)
Read raw audio data into the silence filter.
Definition: cont_ad_base.c:863
int32 state
State of data returned by most recent cont_ad_read call; CONT_AD_STATE_SIL or CONT_AD_STATE_SPEECH.
Definition: cont_ad.h:165
SPHINXBASE_EXPORT int32 cont_ad_set_logfp(cont_ad_t *c, FILE *fp)
Set the file to which cont_ad logs its progress.
int32 sps
Samples/sec.
Definition: ad.h:256
Continuous A/D listening and silence filtering module.
Basic type definitions used in Sphinx.
#define E_FATAL_SYSTEM
Print error text; Call perror(""); exit(errno);.
Definition: err.h:132
SPHINXBASE_EXPORT int32 cont_ad_close(cont_ad_t *cont)
Close the continuous listening object.
int32 bps
Bytes/sample.
Definition: ad.h:257
SPHINXBASE_EXPORT cont_ad_t * cont_ad_init(ad_rec_t *ad, int32(*adfunc)(ad_rec_t *ad, int16 *buf, int32 max))
Initialize a continuous listening/silence filtering object.
#define E_INFOCONT
Print logging information without header, to standard error stream.
Definition: err.h:153
SPHINXBASE_EXPORT int32 cont_ad_calib(cont_ad_t *cont)
Calibrate the silence filter.
Implementation of logging routines.
generic live audio interface for recording and playback
SPHINXBASE_EXPORT int32 cont_ad_set_rawfp(cont_ad_t *c, FILE *fp)
Set a file for dumping raw audio input.
SPHINXBASE_EXPORT int32 cont_ad_get_params(cont_ad_t *r, int32 *delta_sil, int32 *delta_speech, int32 *min_noise, int32 *max_noise, int32 *winsize, int32 *speech_onset, int32 *sil_onset, int32 *leader, int32 *trailer, float32 *adapt_rate)
PWP 1/14/98 – get the changeable params.
Continuous listening module or object.
Definition: cont_ad.h:151
#define E_ERROR
Print error message to standard error stream.
Definition: err.h:169
int32 spf
Samples/frame; audio level is analyzed within frames.
Definition: cont_ad.h:185
int32 read_ts
Absolute timestamp (total no.
Definition: cont_ad.h:167
#define E_ERROR_SYSTEM
Print error text; Call perror("");.
Definition: err.h:142
SPHINXBASE_EXPORT int32 cont_ad_set_params(cont_ad_t *r, int32 delta_sil, int32 delta_speech, int32 min_noise, int32 max_noise, int32 winsize, int32 speech_onset, int32 sil_onset, int32 leader, int32 trailer, float32 adapt_rate)
Set the changeable parameters.