SphinxBase  0.6
sphinx_fe.c
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1996-2004 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <time.h>
41 #include <assert.h>
42 
43 #ifdef HAVE_CONFIG_H
44 #include <config.h>
45 #endif
46 
47 #ifdef HAVE_SNDFILE_H
48 #include <sndfile.h>
49 #endif
50 
51 #include <sphinxbase/fe.h>
52 #include <sphinxbase/strfuncs.h>
53 #include <sphinxbase/pio.h>
54 #include <sphinxbase/filename.h>
55 #include <sphinxbase/cmd_ln.h>
56 #include <sphinxbase/err.h>
57 #include <sphinxbase/ckd_alloc.h>
58 #include <sphinxbase/byteorder.h>
59 #include <sphinxbase/hash_table.h>
60 
61 #include "sphinx_wave2feat.h"
62 #include "cmd_ln_defn.h"
63 
64 typedef struct audio_type_s {
65  char const *name;
66  int (*detect)(sphinx_wave2feat_t *wtf);
67  int (*decode)(sphinx_wave2feat_t *wtf);
68 } audio_type_t;
69 
70 typedef struct output_type_s {
71  char const *name;
72  int (*output_header)(sphinx_wave2feat_t *wtf, int nfloat);
73  int (*output_frames)(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr);
75 
77  int refcount;
79  fe_t *fe;
80  char *infile;
81  char *outfile;
82  FILE *infh;
83  FILE *outfh;
84  short *audio;
85  mfcc_t **feat;
86  int blocksize;
87  int featsize;
88  int veclen;
89  int in_veclen;
90  int byteswap;
91 #ifdef HAVE_SNDFILE_H
92  SNDFILE *insfh;
93 #endif
94  output_type_t const *ot;
95 };
96 
98 typedef struct RIFFHeader{
99  char rifftag[4]; /* "RIFF" string */
100  int32 TotalLength; /* Total length */
101  char wavefmttag[8]; /* "WAVEfmt " string (note space after 't') */
102  int32 RemainingLength; /* Remaining length */
103  int16 data_format; /* data format tag, 1 = PCM */
104  int16 numchannels; /* Number of channels in file */
105  int32 SamplingFreq; /* Sampling frequency */
106  int32 BytesPerSec; /* Average bytes/sec */
107  int16 BlockAlign; /* Block align */
108  int16 BitsPerSample; /* 8 or 16 bit */
109  char datatag[4]; /* "data" string */
110  int32 datalength; /* Raw data length */
111 } MSWAV_hdr;
112 
118 static int
119 detect_riff(sphinx_wave2feat_t *wtf)
120 {
121  FILE *fh;
122  MSWAV_hdr hdr;
123 
124  if ((fh = fopen(wtf->infile, "rb")) == NULL) {
125  E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
126  return -1;
127  }
128  if (fread(&hdr, sizeof(hdr), 1, fh) != 1) {
129  E_ERROR_SYSTEM("Failed to read RIFF header");
130  fclose(fh);
131  return -1;
132  }
133  /* Make sure it is actually a RIFF file. */
134  if (0 != memcmp(hdr.rifftag, "RIFF", 4)) {
135  fclose(fh);
136  return FALSE;
137  }
138 
139  /* Get relevant information. */
140  cmd_ln_set_int32_r(wtf->config, "-nchans", hdr.numchannels);
141  cmd_ln_set_float32_r(wtf->config, "-samprate", hdr.SamplingFreq);
142  wtf->infh = fh;
143 
144  return TRUE;
145 }
146 
147 static int
148 open_nist_file(sphinx_wave2feat_t *wtf, char const *infile, FILE **out_fh, int detect_endian)
149 {
150  char nist[7];
151  lineiter_t *li;
152  FILE *fh;
153 
154  if ((fh = fopen(infile, "rb")) == NULL) {
155  E_ERROR_SYSTEM("Failed to open %s", infile);
156  return -1;
157  }
158  if (fread(&nist, 1, 7, fh) != 7) {
159  E_ERROR_SYSTEM("Failed to read NIST header");
160  fclose(fh);
161  return -1;
162  }
163  /* Is this actually a NIST file? */
164  if (0 != strncmp(nist, "NIST_1A", 7)) {
165  fclose(fh);
166  return FALSE;
167  }
168  /* Rewind, parse lines. */
169  fseek(fh, 0, SEEK_SET);
170  for (li = lineiter_start(fh); li; li = lineiter_next(li)) {
171  char **words;
172  int nword;
173 
174  string_trim(li->buf, STRING_BOTH);
175  if (strlen(li->buf) == 0) {
176  lineiter_free(li);
177  break;
178  }
179  nword = str2words(li->buf, NULL, 0);
180  if (nword != 3)
181  continue;
182  words = ckd_calloc(nword, sizeof(*words));
183  str2words(li->buf, words, nword);
184  if (0 == strcmp(words[0], "sample_rate")) {
185  cmd_ln_set_float32_r(wtf->config, "-samprate", atof_c(words[2]));
186  }
187  if (0 == strcmp(words[0], "channel_count")) {
188  cmd_ln_set_int32_r(wtf->config, "-nchans", atoi(words[2]));
189  }
190  if (detect_endian && 0 == strcmp(words[0], "sample_byte_format")) {
191  cmd_ln_set_str_r(wtf->config, "-input_endian",
192  (0 == strcmp(words[2], "10")) ? "big" : "little");
193  }
194  ckd_free(words);
195  }
196 
197  fseek(fh, 1024, SEEK_SET);
198  if (out_fh)
199  *out_fh = fh;
200  else
201  fclose(fh);
202  return TRUE;
203 }
204 
205 #ifdef HAVE_POPEN
206 static int
207 detect_sph2pipe(sphinx_wave2feat_t *wtf)
208 {
209  FILE *fh;
210  char *cmdline;
211  int rv;
212 
213  /* Determine if it's NIST file and get parameters. */
214  if ((rv = open_nist_file(wtf, wtf->infile, NULL, FALSE)) != TRUE)
215  return rv;
216 
217  /* Now popen it with sph2pipe. */
218  cmdline = string_join("sph2pipe -f raw '", wtf->infile, "'", NULL);
219  if ((fh = popen(cmdline, "r")) == NULL) {
220  E_ERROR_SYSTEM("Failed to popen(\"sph2pipe -f raw '%s'\")", wtf->infile);
221  ckd_free(cmdline);
222  return -1;
223  }
224 
225  wtf->infh = fh;
226  return TRUE;
227 }
228 #else /* !HAVE_POPEN */
229 static int
230 detect_sph2pipe(sphinx_wave2feat_t *wtf)
231 {
232  E_ERROR("popen() not available, cannot run sph2pipe\n");
233  return -1;
234 }
235 #endif /* !HAVE_POPEN */
236 
242 static int
243 detect_nist(sphinx_wave2feat_t *wtf)
244 {
245  FILE *fh;
246  int rv;
247 
248  if ((rv = open_nist_file(wtf, wtf->infile, &fh, TRUE)) != TRUE)
249  return rv;
250  wtf->infh = fh;
251 
252  return TRUE;
253 }
254 
255 
262 static int
263 detect_raw(sphinx_wave2feat_t *wtf)
264 {
265  FILE *fh;
266 
267  if ((fh = fopen(wtf->infile, "rb")) == NULL) {
268  E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
269  return -1;
270  }
271  wtf->infh = fh;
272  return TRUE;
273 }
274 
281 static int
282 detect_sphinx_mfc(sphinx_wave2feat_t *wtf)
283 {
284  FILE *fh;
285  int32 len;
286  long flen;
287 
288  if ((fh = fopen(wtf->infile, "rb")) == NULL) {
289  E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
290  return -1;
291  }
292  if (fread(&len, 4, 1, fh) != 1) {
293  E_ERROR_SYSTEM("Failed to read header from %s\n", wtf->infile);
294  fclose(fh);
295  return -1;
296  }
297  fseek(fh, 0, SEEK_END);
298  flen = ftell(fh);
299 
300  /* figure out whether to byteswap */
301  flen = (flen / 4) - 1;
302  if (flen != len) {
303  /* First make sure this is an endianness problem, otherwise fail. */
304  SWAP_INT32(&len);
305  if (flen != len) {
306  SWAP_INT32(&len);
307  E_ERROR("Mismatch in header/file lengths: 0x%08x vs 0x%08x\n",
308  len, flen);
309  return -1;
310  }
311  /* Set the input endianness to the opposite of the machine endianness... */
312  cmd_ln_set_str_r(wtf->config, "-input_endian",
313  (0 == strcmp("big", cmd_ln_str_r(wtf->config, "-mach_endian"))
314  ? "little" : "big"));
315  }
316 
317  fseek(fh, 4, SEEK_SET);
318  wtf->infh = fh;
319  if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
320  wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
321  }
322  else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
323  wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-ncep");
324  wtf->veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
325  }
326  else {
327  /* Should not happen. */
328  E_ERROR("Sphinx MFCC file reading requested but -spec2cep/-cep2spec not given\n");
329  assert(FALSE);
330  }
331 
332  return TRUE;
333 }
334 
335 int
336 mixnpick_channels(int16 *buf, int32 nsamp, int32 nchans, int32 whichchan)
337 {
338  int i, j;
339 
340  if (whichchan > 0) {
341  for (i = whichchan - 1; i < nsamp; i += nchans)
342  buf[i/nchans] = buf[i];
343  }
344  else {
345  for (i = 0; i < nsamp; i += nchans) {
346  float64 tmp = 0.0;
347  for (j = 0; j < nchans && i + j < nsamp; ++j) {
348  tmp += buf[i + j];
349  }
350  buf[i/nchans] = (int16)(tmp / nchans);
351  }
352  }
353  return i/nchans;
354 }
355 
356 #ifdef HAVE_SNDFILE_H
357 
362 static int
363 detect_sndfile(sphinx_wave2feat_t *wtf)
364 {
365  SNDFILE *sf;
366  SF_INFO sfinfo;
367 
368  memset(&sfinfo, 0, sizeof(sfinfo));
369  /* We let other detectors catch I/O errors, since there is
370  no way to tell them from format errors when opening :( */
371  if ((sf = sf_open(wtf->infile, SFM_READ, &sfinfo)) == NULL) {
372  return FALSE;
373  }
374  /* Get relevant information. */
375  cmd_ln_set_int32_r(wtf->config, "-nchans", sfinfo.channels);
376  cmd_ln_set_float32_r(wtf->config, "-samprate", sfinfo.samplerate);
377  wtf->insfh = sf;
378  wtf->infh = NULL;
379 
380  return TRUE;
381 }
382 
387 static int
388 decode_sndfile(sphinx_wave2feat_t *wtf)
389 {
390  size_t nsamp;
391  int32 nfr, nchans, whichchan;
392  int nfloat, n;
393 
394  nchans = cmd_ln_int32_r(wtf->config, "-nchans");
395  whichchan = cmd_ln_int32_r(wtf->config, "-whichchan");
396  fe_start_utt(wtf->fe);
397  nfloat = 0;
398  while ((nsamp = sf_read_short(wtf->insfh,
399  wtf->audio,
400  wtf->blocksize)) != 0) {
401  int16 const *inspeech;
402  size_t nvec;
403 
404  /* Mix or pick channels. */
405  if (nchans > 1)
406  nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan);
407 
408  inspeech = wtf->audio;
409  nvec = wtf->featsize;
410  /* Consume all samples. */
411  while (nsamp) {
412  nfr = nvec;
413  fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr);
414  if (nfr) {
415  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
416  return -1;
417  nfloat += n;
418  }
419  }
420  inspeech = wtf->audio;
421  }
422  /* Now process any leftover audio frames. */
423  fe_end_utt(wtf->fe, wtf->feat[0], &nfr);
424  if (nfr) {
425  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
426  return -1;
427  nfloat += n;
428  }
429 
430  sf_close(wtf->insfh);
431  wtf->insfh = NULL;
432  return nfloat;
433 }
434 #endif /* HAVE_SNDFILE_H */
435 
440 static int
441 decode_pcm(sphinx_wave2feat_t *wtf)
442 {
443  size_t nsamp;
444  int32 nfr, nchans, whichchan;
445  int nfloat, n;
446 
447  nchans = cmd_ln_int32_r(wtf->config, "-nchans");
448  whichchan = cmd_ln_int32_r(wtf->config, "-whichchan");
449  fe_start_utt(wtf->fe);
450  nfloat = 0;
451  while ((nsamp = fread(wtf->audio, 2, wtf->blocksize, wtf->infh)) != 0) {
452  size_t nvec;
453  int16 const *inspeech;
454 
455  /* Byteswap stuff here if necessary. */
456  if (wtf->byteswap) {
457  for (n = 0; n < nsamp; ++n)
458  SWAP_INT16(wtf->audio + n);
459  }
460 
461  /* Mix or pick channels. */
462  if (nchans > 1)
463  nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan);
464 
465  inspeech = wtf->audio;
466  nvec = wtf->featsize;
467  /* Consume all samples. */
468  while (nsamp) {
469  nfr = nvec;
470  fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr);
471  if (nfr) {
472  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
473  return -1;
474  nfloat += n;
475  }
476  }
477  inspeech = wtf->audio;
478  }
479  /* Now process any leftover audio frames. */
480  fe_end_utt(wtf->fe, wtf->feat[0], &nfr);
481  if (nfr) {
482  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
483  return -1;
484  nfloat += n;
485  }
486 
487  if (fclose(wtf->infh) == EOF)
488  E_ERROR_SYSTEM("Failed to close input file");
489  wtf->infh = NULL;
490  return nfloat;
491 }
492 
497 static int
498 decode_sphinx_mfc(sphinx_wave2feat_t *wtf)
499 {
500  int nfloat = 0, n;
501  int featsize = wtf->featsize;
502 
503  /* If the input vector length is less than the output length, we
504  * need to do this one frame at a time, because there's empty
505  * space at the end of each vector in wtf->feat. */
506  if (wtf->in_veclen < wtf->veclen)
507  featsize = 1;
508  while ((n = fread(wtf->feat[0], sizeof(**wtf->feat),
509  featsize * wtf->in_veclen, wtf->infh)) != 0) {
510  int i, nfr = n / wtf->in_veclen;
511  if (n % wtf->in_veclen) {
512  E_ERROR("Size of file %d not a multiple of veclen %d\n",
513  n, wtf->in_veclen);
514  return -1;
515  }
516  /* Byteswap stuff here if necessary. */
517  if (wtf->byteswap) {
518  for (i = 0; i < n; ++i)
519  SWAP_FLOAT32(wtf->feat[0] + i);
520  }
521  fe_float_to_mfcc(wtf->fe, (float32 **)wtf->feat, wtf->feat, nfr);
522  for (i = 0; i < nfr; ++i) {
523  if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
524  if (0 == strcmp(cmd_ln_str_r(wtf->config, "-transform"), "legacy"))
525  fe_logspec_to_mfcc(wtf->fe, wtf->feat[i], wtf->feat[i]);
526  else
527  fe_logspec_dct2(wtf->fe, wtf->feat[i], wtf->feat[i]);
528  }
529  else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
530  fe_mfcc_dct3(wtf->fe, wtf->feat[i], wtf->feat[i]);
531  }
532  }
533  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
534  return -1;
535  nfloat += n;
536  }
537 
538  if (fclose(wtf->infh) == EOF)
539  E_ERROR_SYSTEM("Failed to close input file");
540  wtf->infh = NULL;
541  return nfloat;
542 }
543 
544 static const audio_type_t types[] = {
545 #ifdef HAVE_SNDFILE_H
546  { "-sndfile", &detect_sndfile, &decode_sndfile },
547 #endif
548  { "-mswav", &detect_riff, &decode_pcm },
549  { "-nist", &detect_nist, &decode_pcm },
550  { "-raw", &detect_raw, &decode_pcm },
551  { "-sph2pipe", &detect_sph2pipe, &decode_pcm }
552 };
553 static const int ntypes = sizeof(types)/sizeof(types[0]);
554 static const audio_type_t mfcc_type = {
555  "sphinx_mfc", &detect_sphinx_mfc, &decode_sphinx_mfc
556 };
557 
563 static int
564 output_header_sphinx(sphinx_wave2feat_t *wtf, int32 nfloat)
565 {
566  if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1) {
567  E_ERROR_SYSTEM("Failed to write to %s", wtf->outfile);
568  return -1;
569  }
570  return 0;
571 }
572 
578 static int
579 output_frames_sphinx(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
580 {
581  int i, nfloat = 0;
582 
583  fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
584  for (i = 0; i < nfr; ++i) {
585  if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
586  E_ERROR_SYSTEM("Writing %d values to %s failed",
587  wtf->veclen, wtf->outfile);
588  return -1;
589  }
590  nfloat += wtf->veclen;
591  }
592  return nfloat;
593 }
594 
/**
 * HTK base parameter kinds (the low bits of the parameter-kind field).
 */
typedef enum htk_feature_kind_e {
    WAVEFORM  = 0,  /* PCM audio (rarely used) */
    LPC       = 1,  /* LPC filter coefficients */
    LPCREFC   = 2,  /* LPC reflection coefficients */
    LPCEPSTRA = 3,  /* LPC-based cepstral coefficients */
    LPCDELCEP = 4,  /* LPCC plus deltas */
    IREFC     = 5,  /* 16-bit integer LPC reflection coefficients */
    MFCC      = 6,  /* MFCCs */
    FBANK     = 7,  /* Log mel spectrum */
    MELSPEC   = 8,  /* Linear mel spectrum */
    USER      = 9,  /* User defined */
    DISCRETE  = 10, /* Vector quantized data */
    PLP       = 11  /* PLP coefficients */
} htk_feature_kind_t;
609 
/**
 * HTK parameter-kind qualifier flags, OR-ed with a htk_feature_kind_t.
 * Written as bit shifts; the values are the octal constants 0000100
 * through 0100000.
 */
typedef enum htk_feature_flag_e {
    _E = 1 << 6,  /* has energy */
    _N = 1 << 7,  /* absolute energy suppressed */
    _D = 1 << 8,  /* has delta coefficients */
    _A = 1 << 9,  /* has acceleration (delta-delta) coefficients */
    _C = 1 << 10, /* is compressed */
    _Z = 1 << 11, /* has zero mean static coefficients (i.e. CMN) */
    _K = 1 << 12, /* has CRC checksum */
    _O = 1 << 13, /* has 0th cepstral coefficient */
    _V = 1 << 14, /* has VQ data */
    _T = 1 << 15  /* has third differential coefficients */
} htk_feature_flag_t;
622 
626 static int
627 output_header_htk(sphinx_wave2feat_t *wtf, int32 nfloat)
628 {
629  int32 samp_period;
630  int16 samp_size;
631  int16 param_kind;
632  int swap = FALSE;
633 
634  /* HTK files are big-endian. */
635  if (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")))
636  swap = TRUE;
637  /* Same file size thing as in Sphinx files (I think) */
638  if (swap) SWAP_INT32(&nfloat);
639  if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1)
640  return -1;
641  /* Sample period in 100ns units. */
642  samp_period = (int32)(1e+7 / cmd_ln_float32_r(wtf->config, "-frate"));
643  if (swap) SWAP_INT32(&samp_period);
644  if (fwrite(&samp_period, 4, 1, wtf->outfh) != 1)
645  return -1;
646  /* Sample size - veclen * sizeof each sample. */
647  samp_size = wtf->veclen * 4;
648  if (swap) SWAP_INT16(&samp_size);
649  if (fwrite(&samp_size, 2, 1, wtf->outfh) != 1)
650  return -1;
651  /* Format and flags. */
652  if (cmd_ln_boolean_r(wtf->config, "-logspec")
653  || cmd_ln_boolean_r(wtf->config, "-cep2spec"))
654  param_kind = FBANK; /* log mel-filter bank outputs */
655  else
656  param_kind = MFCC | _O; /* MFCC + CEP0 (note reordering...) */
657  if (swap) SWAP_INT16(&param_kind);
658  if (fwrite(&param_kind, 2, 1, wtf->outfh) != 1)
659  return -1;
660 
661  return 0;
662 }
663 
667 static int
668 output_frames_htk(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
669 {
670  int i, j, swap, htk_reorder, nfloat = 0;
671 
672  fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
673  /* This is possibly inefficient, but probably not a big deal. */
674  swap = (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")));
675  htk_reorder = (0 == strcmp("htk", wtf->ot->name)
676  && !(cmd_ln_boolean_r(wtf->config, "-logspec")
677  || cmd_ln_boolean_r(wtf->config, "-cep2spec")));
678  for (i = 0; i < nfr; ++i) {
679  if (htk_reorder) {
680  mfcc_t c0 = frames[i][0];
681  memmove(frames[i] + 1, frames[i], (wtf->veclen - 1) * 4);
682  frames[i][wtf->veclen - 1] = c0;
683  }
684  if (swap)
685  for (j = 0; j < wtf->veclen; ++j)
686  SWAP_FLOAT32(frames[i] + j);
687  if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
688  E_ERROR_SYSTEM("Writing %d values to %s failed",
689  wtf->veclen, wtf->outfile);
690  return -1;
691  }
692  nfloat += wtf->veclen;
693  }
694  return nfloat;
695 }
696 
700 static int
701 output_frames_text(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
702 {
703  int i, j, nfloat = 0;
704 
705  fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
706  for (i = 0; i < nfr; ++i) {
707  for (j = 0; j < wtf->veclen; ++j) {
708  fprintf(wtf->outfh, "%.5g", frames[i][j]);
709  if (j == wtf->veclen - 1)
710  fprintf(wtf->outfh, "\n");
711  else
712  fprintf(wtf->outfh, " ");
713  }
714  nfloat += wtf->veclen;
715  }
716  return nfloat;
717 }
718 
719 static const output_type_t outtypes[] = {
720  { "sphinx", &output_header_sphinx, &output_frames_sphinx },
721  { "htk", &output_header_htk, &output_frames_htk },
722  { "text", NULL, &output_frames_text }
723 };
724 static const int nouttypes = sizeof(outtypes)/sizeof(outtypes[0]);
725 
727 sphinx_wave2feat_init(cmd_ln_t *config)
728 {
729  sphinx_wave2feat_t *wtf;
730  int i;
731 
732  wtf = ckd_calloc(1, sizeof(*wtf));
733  wtf->refcount = 1;
734  wtf->config = cmd_ln_retain(config);
735  wtf->fe = fe_init_auto_r(wtf->config);
736  wtf->ot = outtypes; /* Default (sphinx) type. */
737  for (i = 0; i < nouttypes; ++i) {
738  output_type_t const *otype = &outtypes[i];
739  if (0 == strcmp(cmd_ln_str_r(config, "-ofmt"), otype->name)) {
740  wtf->ot = otype;
741  break;
742  }
743  }
744  if (i == nouttypes) {
745  E_ERROR("Unknown output type: '%s'\n",
746  cmd_ln_str_r(config, "-ofmt"));
747  sphinx_wave2feat_free(wtf);
748  return NULL;
749  }
750 
751  return wtf;
752 }
753 
754 int
755 sphinx_wave2feat_free(sphinx_wave2feat_t *wtf)
756 {
757  if (wtf == NULL)
758  return 0;
759  if (--wtf->refcount > 0)
760  return wtf->refcount;
761 
762  if (wtf->audio)
763  ckd_free(wtf->audio);
764  if (wtf->feat)
765  ckd_free_2d(wtf->feat);
766  if (wtf->infile)
767  ckd_free(wtf->infile);
768  if (wtf->outfile)
769  ckd_free(wtf->outfile);
770  if (wtf->infh) {
771  if (fclose(wtf->infh) == EOF)
772  E_ERROR_SYSTEM("Failed to close input file");
773  }
774  if (wtf->outfh) {
775  if (fclose(wtf->outfh) == EOF)
776  E_ERROR_SYSTEM("Failed to close output file");
777  }
778  cmd_ln_free_r(wtf->config);
779  fe_free(wtf->fe);
780  ckd_free(wtf);
781 
782  return 0;
783 }
784 
786 sphinx_wave2feat_retain(sphinx_wave2feat_t *wtf)
787 {
788  ++wtf->refcount;
789  return wtf;
790 }
791 
792 static audio_type_t const *
793 detect_audio_type(sphinx_wave2feat_t *wtf)
794 {
795  audio_type_t const *atype;
796  int i;
797 
798  /* Special case audio type for Sphinx MFCC inputs. */
799  if (cmd_ln_boolean_r(wtf->config, "-spec2cep")
800  || cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
801  int rv = mfcc_type.detect(wtf);
802  if (rv == -1)
803  goto error_out;
804  return &mfcc_type;
805  }
806 
807  /* Try to use the type of infile given on the command line. */
808  for (i = 0; i < ntypes; ++i) {
809  int rv;
810  atype = &types[i];
811  if (cmd_ln_boolean_r(wtf->config, atype->name)) {
812  rv = (*atype->detect)(wtf);
813  if (rv == -1)
814  goto error_out;
815  else if (rv == TRUE)
816  break;
817  }
818  }
819  if (i == ntypes) {
820  /* Detect file type of infile and get parameters. */
821  for (i = 0; i < ntypes; ++i) {
822  int rv;
823  atype = &types[i];
824  rv = (*atype->detect)(wtf);
825  if (rv == -1)
826  goto error_out;
827  else if (rv == TRUE)
828  break;
829  }
830  if (i == ntypes)
831  goto error_out;
832  }
833  return atype;
834  error_out:
835  if (wtf->infh)
836  fclose(wtf->infh);
837  wtf->infh = NULL;
838  return NULL;
839 }
840 
841 int
842 sphinx_wave2feat_convert_file(sphinx_wave2feat_t *wtf,
843  char const *infile, char const *outfile)
844 {
845  int nchans, minfft, nfft, nfloat, veclen;
846  audio_type_t const *atype;
847  int fshift, fsize;
848 
849  if (cmd_ln_boolean_r(wtf->config, "-verbose"))
850  E_INFO("Converting %s to %s\n", infile, outfile);
851 
852  wtf->infile = ckd_salloc(infile);
853 
854  /* Detect input file type. */
855  if ((atype = detect_audio_type(wtf)) == NULL)
856  return -1;
857 
858  /* Determine whether to byteswap input. */
859  wtf->byteswap = strcmp(cmd_ln_str_r(wtf->config, "-mach_endian"),
860  cmd_ln_str_r(wtf->config, "-input_endian"));
861 
862  /* Make sure the FFT size is sufficiently large. */
863  minfft = (int)(cmd_ln_float32_r(wtf->config, "-samprate")
864  * cmd_ln_float32_r(wtf->config, "-wlen") + 0.5);
865  for (nfft = 1; nfft < minfft; nfft <<= 1)
866  ;
867  if (nfft > cmd_ln_int32_r(wtf->config, "-nfft")) {
868  E_WARN("Value of -nfft = %d is too small, increasing to %d\n",
869  cmd_ln_int32_r(wtf->config, "-nfft"), nfft);
870  cmd_ln_set_int32_r(wtf->config, "-nfft", nfft);
871  fe_free(wtf->fe);
872  wtf->fe = fe_init_auto_r(wtf->config);
873  }
874 
875  /* Get the output frame size (if not already set). */
876  if (wtf->veclen == 0)
877  wtf->veclen = fe_get_output_size(wtf->fe);
878 
879  /* Set up the input and output buffers. */
880  fe_get_input_size(wtf->fe, &fshift, &fsize);
881  /* Want to get at least a whole frame plus shift in here. Also we
882  will either pick or mix multiple channels so we need to read
883  them all at once. */
884  nchans = cmd_ln_int32_r(wtf->config, "-nchans");
885  wtf->blocksize = cmd_ln_int32_r(wtf->config, "-blocksize") * nchans;
886  if (wtf->blocksize < (fsize + fshift) * nchans) {
887  E_INFO("Block size of %d too small, increasing to %d\n",
888  wtf->blocksize,
889  (fsize + fshift) * nchans);
890  wtf->blocksize = (fsize + fshift) * nchans;
891  }
892  wtf->audio = ckd_calloc(wtf->blocksize, sizeof(*wtf->audio));
893  wtf->featsize = (wtf->blocksize / nchans - fsize) / fshift;
894 
895  /* Use the maximum of the input and output frame sizes to allocate this. */
896  veclen = wtf->veclen;
897  if (wtf->in_veclen > veclen) veclen = wtf->in_veclen;
898 
899  wtf->feat = ckd_calloc_2d(wtf->featsize, veclen, sizeof(**wtf->feat));
900 
901  /* Let's go! */
902  if ((wtf->outfh = fopen(outfile, "wb")) == NULL) {
903  E_ERROR_SYSTEM("Failed to open %s for writing", outfile);
904  return -1;
905  }
906  /* Write an empty header, which we'll fill in later. */
907  if (wtf->ot->output_header &&
908  (*wtf->ot->output_header)(wtf, 0) < 0) {
909  E_ERROR_SYSTEM("Failed to write empty header to %s\n", outfile);
910  goto error_out;
911  }
912  wtf->outfile = ckd_salloc(outfile);
913 
914  if ((nfloat = (*atype->decode)(wtf)) < 0) {
915  E_ERROR("Failed to convert");
916  goto error_out;
917  }
918 
919  if (wtf->ot->output_header) {
920  if (fseek(wtf->outfh, 0, SEEK_SET) < 0) {
921  E_ERROR_SYSTEM("Failed to seek to beginning of %s\n", outfile);
922  goto error_out;
923  }
924  if ((*wtf->ot->output_header)(wtf, nfloat) < 0) {
925  E_ERROR_SYSTEM("Failed to write header to %s\n", outfile);
926  goto error_out;
927  }
928  }
929 
930 
931  if (wtf->audio)
932  ckd_free(wtf->audio);
933  if (wtf->feat)
934  ckd_free_2d(wtf->feat);
935  if (wtf->infile)
936  ckd_free(wtf->infile);
937  if (wtf->outfile)
938  ckd_free(wtf->outfile);
939 
940  wtf->audio = NULL;
941  wtf->infile = NULL;
942  wtf->feat = NULL;
943  wtf->outfile = NULL;
944 
945  if (wtf->outfh)
946  if (fclose(wtf->outfh) == EOF)
947  E_ERROR_SYSTEM("Failed to close output file");
948  wtf->outfh = NULL;
949 
950  return 0;
951 
952 error_out:
953 
954  if (wtf->audio)
955  ckd_free(wtf->audio);
956  if (wtf->feat)
957  ckd_free_2d(wtf->feat);
958  if (wtf->infile)
959  ckd_free(wtf->infile);
960  if (wtf->outfile)
961  ckd_free(wtf->outfile);
962 
963  wtf->audio = NULL;
964  wtf->infile = NULL;
965  wtf->feat = NULL;
966  wtf->outfile = NULL;
967 
968  if (wtf->outfh)
969  if (fclose(wtf->outfh) == EOF)
970  E_ERROR_SYSTEM("Failed to close output file");
971  wtf->outfh = NULL;
972 
973  return -1;
974 }
975 
976 void
977 build_filenames(cmd_ln_t *config, char const *basename,
978  char **out_infile, char **out_outfile)
979 {
980  char const *di, *do_, *ei, *eo;
981 
982  di = cmd_ln_str_r(config, "-di");
983  do_ = cmd_ln_str_r(config, "-do");
984  ei = cmd_ln_str_r(config, "-ei");
985  eo = cmd_ln_str_r(config, "-eo");
986 
987  *out_infile = string_join(di ? di : "",
988  di ? "/" : "",
989  basename,
990  ei ? "." : "",
991  ei ? ei : "",
992  NULL);
993  *out_outfile = string_join(do_ ? do_ : "",
994  do_ ? "/" : "",
995  basename,
996  eo ? "." : "",
997  eo ? eo : "",
998  NULL);
999  /* Build output directory structure if possible/requested (it is
1000  * by default). */
1001  if (cmd_ln_boolean_r(config, "-build_outdirs")) {
1002  char *dirname = ckd_salloc(*out_outfile);
1003  path2dirname(*out_outfile, dirname);
1004  build_directory(dirname);
1005  ckd_free(dirname);
1006  }
1007 }
1008 
1009 static int
1010 run_control_file(sphinx_wave2feat_t *wtf, char const *ctlfile)
1011 {
1012  hash_table_t *files;
1013  hash_iter_t *itor;
1014  lineiter_t *li;
1015  FILE *ctlfh;
1016  int nskip, runlen, npart, rv = 0;
1017 
1018  if ((ctlfh = fopen(ctlfile, "r")) == NULL) {
1019  E_ERROR_SYSTEM("Failed to open control file %s", ctlfile);
1020  return -1;
1021  }
1022  nskip = cmd_ln_int32_r(wtf->config, "-nskip");
1023  runlen = cmd_ln_int32_r(wtf->config, "-runlen");
1024  if ((npart = cmd_ln_int32_r(wtf->config, "-npart"))) {
1025  /* Count lines in the file. */
1026  int partlen, part, nlines = 0;
1027  part = cmd_ln_int32_r(wtf->config, "-part");
1028  for (li = lineiter_start(ctlfh); li; li = lineiter_next(li))
1029  ++nlines;
1030  fseek(ctlfh, 0, SEEK_SET);
1031  partlen = nlines / npart;
1032  nskip = partlen * (part - 1);
1033  if (part == npart)
1034  runlen = -1;
1035  else
1036  runlen = partlen;
1037  }
1038  if (runlen != -1){
1039  E_INFO("Processing %d utterances at position %d\n", runlen, nskip);
1040  files = hash_table_new(runlen, HASH_CASE_YES);
1041  }
1042  else {
1043  E_INFO("Processing all remaining utterances at position %d\n", nskip);
1044  files = hash_table_new(1000, HASH_CASE_YES);
1045  }
1046  for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) {
1047  char *c, *infile, *outfile;
1048 
1049  if (nskip-- > 0)
1050  continue;
1051  if (runlen == 0) {
1052  lineiter_free(li);
1053  break;
1054  }
1055  --runlen;
1056 
1057  string_trim(li->buf, STRING_BOTH);
1058  /* Extract the file ID from the control line. */
1059  if ((c = strchr(li->buf, ' ')) != NULL)
1060  *c = '\0';
1061  if (strlen(li->buf) == 0) {
1062  E_WARN("Empty line %d in control file, skipping\n", li->lineno);
1063  continue;
1064  }
1065  build_filenames(wtf->config, li->buf, &infile, &outfile);
1066  if (hash_table_lookup(files, infile, NULL) == 0)
1067  continue;
1068  rv = sphinx_wave2feat_convert_file(wtf, infile, outfile);
1069  hash_table_enter(files, infile, outfile);
1070  if (rv != 0) {
1071  lineiter_free(li);
1072  break;
1073  }
1074  }
1075  for (itor = hash_table_iter(files); itor;
1076  itor = hash_table_iter_next(itor)) {
1077  ckd_free((void *)hash_entry_key(itor->ent));
1078  ckd_free(hash_entry_val(itor->ent));
1079  }
1080  hash_table_free(files);
1081 
1082  if (fclose(ctlfh) == EOF)
1083  E_ERROR_SYSTEM("Failed to close control file");
1084  return rv;
1085 }
1086 
1087 int
1088 main(int argc, char *argv[])
1089 {
1090  sphinx_wave2feat_t *wtf;
1091  cmd_ln_t *config;
1092  int rv;
1093 
1094  /* Initialize config. */
1095  if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
1096  return 2;
1097 
1098  /* Parse an argument file if there's one in there. */
1099  if (cmd_ln_str_r(config, "-argfile"))
1100  config = cmd_ln_parse_file_r(config, defn,
1101  cmd_ln_str_r(config, "-argfile"), FALSE);
1102  if (config == NULL) {
1103  E_ERROR("Command line parsing failed\n");
1104  return 1;
1105  }
1106  if ((wtf = sphinx_wave2feat_init(config)) == NULL) {
1107  E_ERROR("Failed to initialize wave2feat object\n");
1108  return 1;
1109  }
1110 
1111  /* If there's a control file run through it, otherwise we will do
1112  * a single file (which is what run_control_file will do
1113  * internally too) */
1114  if (cmd_ln_str_r(config, "-c"))
1115  rv = run_control_file(wtf, cmd_ln_str_r(config, "-c"));
1116  else
1117  rv = sphinx_wave2feat_convert_file(wtf, cmd_ln_str_r(config, "-i"),
1118  cmd_ln_str_r(config, "-o"));
1119 
1120  sphinx_wave2feat_free(wtf);
1121  cmd_ln_free_r(config);
1122  return rv;
1123 }
Command-line and other configuration parsing and handling.
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_retain(cmd_ln_t *cmdln)
Retain ownership of a command-line argument set.
Definition: cmd_ln.c:1029
Miscellaneous useful string functions.
hash_entry_t * ent
Current entry in that table.
Definition: hash_table.h:170
SPHINXBASE_EXPORT int32 hash_table_lookup(hash_table_t *h, const char *key, void **val)
Look up a key in a hash table and optionally return the associated value.
Definition: hash_table.c:309
int veclen
Length of each output vector.
Definition: sphinx_fe.c:88
#define ckd_calloc_2d(d1, d2, sz)
Macro for ckd_calloc_2d
Definition: ckd_alloc.h:270
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition: ckd_alloc.h:248
#define E_INFO
Print logging information to standard error stream.
Definition: err.h:147
output_type_t const * ot
Output type object.
Definition: sphinx_fe.c:94
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT int cmd_ln_free_r(cmd_ln_t *cmdln)
Release a command-line argument set and all associated strings.
Definition: cmd_ln.c:1036
File names related operation.
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_r(cmd_ln_t *inout_cmdln, arg_t const *defn, int32 argc, char *argv[], int32 strict)
Parse a list of strings into arguments.
Definition: cmd_ln.c:551
SPHINXBASE_EXPORT hash_iter_t * hash_table_iter(hash_table_t *h)
Start iterating over key-value pairs in a hash table.
Definition: hash_table.c:653
Line iterator for files.
Definition: pio.h:177
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition: ckd_alloc.h:264
#define hash_entry_val(e)
Access macros.
Definition: hash_table.h:175
SPHINXBASE_EXPORT char const * cmd_ln_str_r(cmd_ln_t *cmdln, char const *name)
Retrieve a string from a command-line object.
Definition: cmd_ln.c:949
SPHINXBASE_EXPORT hash_table_t * hash_table_new(int32 size, int32 casearg)
Allocate a new hash table for a given expected size.
Definition: hash_table.c:158
FILE * infh
Input file handle.
Definition: sphinx_fe.c:82
int refcount
Reference count.
Definition: sphinx_fe.c:77
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition: ckd_alloc.c:241
#define E_WARN
Print warning information to standard error stream.
Definition: err.h:164
char * outfile
Path to output file.
Definition: sphinx_fe.c:81
SPHINXBASE_EXPORT void hash_table_free(hash_table_t *h)
Free the specified hash table; the caller is responsible for freeing the key strings pointed to by th...
Definition: hash_table.c:695
SPHINXBASE_EXPORT int build_directory(const char *path)
Create a directory and all of its parent directories, as needed.
Definition: pio.c:653
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
Definition: strfuncs.c:56
int featsize
Size of feature buffer.
Definition: sphinx_fe.c:87
SPHINXBASE_EXPORT void lineiter_free(lineiter_t *li)
Stop reading lines from a file.
Definition: pio.c:358
RIFF 44-byte header structure for MS wav files.
Definition: sphinx_fe.c:98
FILE * outfh
Output file handle.
Definition: sphinx_fe.c:83
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
Definition: pio.c:338
int byteswap
Whether byteswapping is necessary.
Definition: sphinx_fe.c:90
mfcc_t ** feat
Feature buffer.
Definition: sphinx_fe.c:85
SPHINXBASE_EXPORT lineiter_t * lineiter_start(FILE *fh)
Start reading lines from a file.
Definition: pio.c:255
int in_veclen
Length of each input vector (for cep<->spec).
Definition: sphinx_fe.c:89
Implementation of logging routines.
Both ends of string.
Definition: strfuncs.h:73
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
Definition: hash_table.c:508
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_file_r(cmd_ln_t *inout_cmdln, arg_t const *defn, char const *filename, int32 strict)
Parse an arguments file by delimiting on " \r\t\n" and putting each tokens into an argv[] for cmd_l...
Definition: cmd_ln.c:773
short * audio
Audio buffer.
Definition: sphinx_fe.c:84
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Definition: strfuncs.c:115
Opaque structure used to hold the results of command-line parsing.
char * infile
Path to input file.
Definition: sphinx_fe.c:80
SPHINXBASE_EXPORT hash_iter_t * hash_table_iter_next(hash_iter_t *itor)
Get the next key-value pair in iteration.
Definition: hash_table.c:663
SPHINXBASE_EXPORT char * string_join(const char *base,...)
Concatenate a NULL-terminated argument list of strings, returning a newly allocated string...
Definition: strfuncs.c:62
#define E_ERROR
Print error message to standard error stream.
Definition: err.h:169
SPHINXBASE_EXPORT void ckd_free_2d(void *ptr)
Free a 2-D array (ptr) previously allocated by ckd_calloc_2d.
Definition: ckd_alloc.c:252
int blocksize
Size of audio buffer.
Definition: sphinx_fe.c:86
#define cmd_ln_boolean_r(c, n)
Retrieve a boolean value from a command-line object.
Definition: cmd_ln.h:334
cmd_ln_t * config
Configuration parameters.
Definition: sphinx_fe.c:78
Hash table implementation.
Structure for the front-end computation.
Definition: fe_internal.h:124
SPHINXBASE_EXPORT void cmd_ln_set_str_r(cmd_ln_t *cmdln, char const *name, char const *str)
Set a string in a command-line object.
Definition: cmd_ln.c:989
SPHINXBASE_EXPORT char * string_trim(char *string, enum string_edge_e which)
Remove whitespace from a string, modifying it in-place.
Definition: strfuncs.c:89
fe_t * fe
Front end object.
Definition: sphinx_fe.c:79
#define E_ERROR_SYSTEM
Print error text, then call perror("").
Definition: err.h:142
SPHINXBASE_EXPORT void path2dirname(const char *path, char *dir)
Strip off filename from the given path and copy the directory name into dir. Caller must have allocate...
Definition: filename.c:90
file IO related operations.