SphinxBase  0.6
sphinx_lm_eval.c
Go to the documentation of this file.
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 2008 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
41 #include <sphinxbase/logmath.h>
42 #include <sphinxbase/ngram_model.h>
43 #include <sphinxbase/cmd_ln.h>
44 #include <sphinxbase/ckd_alloc.h>
45 #include <sphinxbase/err.h>
46 #include <sphinxbase/pio.h>
47 #include <sphinxbase/strfuncs.h>
48 
49 #include <stdio.h>
50 #include <string.h>
51 #include <math.h>
52 
53 static const arg_t defn[] = {
54  { "-help",
56  "no",
57  "Shows the usage of the tool"},
58 
59  { "-logbase",
61  "1.0001",
62  "Base in which all log-likelihoods calculated" },
63 
64  { "-lm",
65  ARG_STRING,
66  NULL,
67  "Language model file"},
68 
69  { "-probdef",
70  ARG_STRING,
71  NULL,
72  "Probability definition file for classes in LM"},
73 
74  { "-lmctlfn",
75  ARG_STRING,
76  NULL,
77  "Control file listing a set of language models"},
78 
79  { "-lmname",
80  ARG_STRING,
81  NULL,
82  "Name of language model in -lmctlfn to use for all utterances" },
83 
84  { "-lsn",
85  ARG_STRING,
86  NULL,
87  "Transcription file to evaluate"},
88 
89  { "-text",
90  ARG_STRING,
91  "Text string to evaluate"},
92 
93  { "-mmap",
95  "no",
96  "Use memory-mapped I/O for reading binary LM files"},
97 
98  { "-lw",
100  "1.0",
101  "Language model weight" },
102 
103  { "-wip",
104  ARG_FLOAT32,
105  "1.0",
106  "Word insertion probability" },
107 
108  { "-uw",
109  ARG_FLOAT32,
110  "1.0",
111  "Unigram probability weight (interpolated with uniform distribution)"},
112 
113  { "-verbose",
114  ARG_BOOLEAN,
115  "no",
116  "Print details of perplexity calculation" },
117 
118  /* FIXME: Support -lmstartsym, -lmendsym, -lmctlfn, -ctl_lm */
119  { NULL, 0, NULL, NULL }
120 };
121 
122 static int verbose;
123 
124 static int
125 calc_entropy(ngram_model_t *lm, char **words, int32 n,
126  int32 *out_n_ccs, int32 *out_n_oovs, int32 *out_lm_score)
127 {
128  int32 *wids;
129  int32 startwid;
130  int32 i, ch, nccs, noovs, unk;
131 
132  if (n == 0)
133  return 0;
134 
135  unk = ngram_unknown_wid(lm);
136 
137  /* Reverse this array into an array of word IDs. */
138  wids = ckd_calloc(n, sizeof(*wids));
139  for (i = 0; i < n; ++i)
140  wids[n-i-1] = ngram_wid(lm, words[i]);
141  /* Skip <s> as it's a context cue (HACK, this should be configurable). */
142  startwid = ngram_wid(lm, "<s>");
143 
144  /* Now evaluate the list of words in reverse using the
145  * remainder of the array as the history. */
146  ch = noovs = nccs = 0;
147  for (i = 0; i < n; ++i) {
148  int32 n_used;
149  int32 prob;
150 
151  /* Skip <s> as it's a context cue (HACK, this should be configurable). */
152  if (wids[i] == startwid) {
153  ++nccs;
154  continue;
155  }
156  /* Skip and count OOVs. */
157  if (wids[i] == NGRAM_INVALID_WID || wids[i] == unk) {
158  ++noovs;
159  continue;
160  }
161  /* Sum up information for each N-gram */
162  prob = ngram_ng_score(lm,
163  wids[i], wids + i + 1,
164  n - i - 1, &n_used);
165  if (verbose) {
166  int m;
167  printf("log P(%s|", ngram_word(lm, wids[i]));
168  m = i + ngram_model_get_size(lm) - 1;
169  if (m >= n)
170  m = n - 1;
171  while (m > i) {
172  printf("%s ", ngram_word(lm, wids[m--]));
173  }
174  printf(") = %d\n", prob);
175  }
176  ch -= prob;
177  }
178 
179  if (out_n_ccs) *out_n_ccs = nccs;
180  if (out_n_oovs) *out_n_oovs = noovs;
181 
182  /* Calculate cross-entropy CH = - 1/N sum log P(W|H) */
183  n -= (nccs + noovs);
184  if (n <= 0)
185  return 0;
186  if (out_lm_score)
187  *out_lm_score = -ch;
188  return ch / n;
189 }
190 
191 static void
192 evaluate_file(ngram_model_t *lm, logmath_t *lmath, const char *lsnfn)
193 {
194  FILE *fh;
195  lineiter_t *litor;
196  int32 nccs, noovs, nwords, lscr;
197  float64 ch, log_to_log2;;
198 
199  if ((fh = fopen(lsnfn, "r")) == NULL)
200  E_FATAL_SYSTEM("failed to open transcript file %s", lsnfn);
201 
202  /* We have to keep ch in floating-point to avoid overflows, so
203  * we might as well use log2. */
204  log_to_log2 = log(logmath_get_base(lmath)) / log(2);
205  nccs = noovs = nwords = 0;
206  ch = 0.0;
207  for (litor = lineiter_start(fh); litor; litor = lineiter_next(litor)) {
208  char **words;
209  int32 n, tmp_ch, tmp_noovs, tmp_nccs, tmp_lscr;
210 
211  n = str2words(litor->buf, NULL, 0);
212  if (n < 0)
213  E_FATAL("str2words(line, NULL, 0) = %d, should not happen\n", n);
214  if (n == 0) /* Do nothing! */
215  continue;
216  words = ckd_calloc(n, sizeof(*words));
217  str2words(litor->buf, words, n);
218 
219  /* Remove any utterance ID (FIXME: has to be a single "word") */
220  if (words[n-1][0] == '('
221  && words[n-1][strlen(words[n-1])-1] == ')')
222  n = n - 1;
223 
224  tmp_ch = calc_entropy(lm, words, n, &tmp_nccs,
225  &tmp_noovs, &tmp_lscr);
226 
227  ch += (float64) tmp_ch * (n - tmp_nccs - tmp_noovs) * log_to_log2;
228  nccs += tmp_nccs;
229  noovs += tmp_noovs;
230  lscr += tmp_lscr;
231  nwords += n;
232 
233  ckd_free(words);
234  }
235 
236  ch /= (nwords - nccs - noovs);
237  printf("cross-entropy: %f bits\n", ch);
238 
239  /* Calculate perplexity pplx = exp CH */
240  printf("perplexity: %f\n", pow(2.0, ch));
241  printf("lm score: %d\n", lscr);
242 
243  /* Report OOVs and CCs */
244  printf("%d words evaluated\n", nwords);
245  printf("%d OOVs (%.2f%%), %d context cues removed\n",
246  noovs, (double)noovs / nwords * 100, nccs);
247 }
248 
249 static void
250 evaluate_string(ngram_model_t *lm, logmath_t *lmath, const char *text)
251 {
252  char *textfoo;
253  char **words;
254  int32 n, ch, noovs, nccs, lscr;
255 
256  /* Split it into an array of strings. */
257  textfoo = ckd_salloc(text);
258  n = str2words(textfoo, NULL, 0);
259  if (n < 0)
260  E_FATAL("str2words(textfoo, NULL, 0) = %d, should not happen\n", n);
261  if (n == 0) /* Do nothing! */
262  return;
263  words = ckd_calloc(n, sizeof(*words));
264  str2words(textfoo, words, n);
265 
266  ch = calc_entropy(lm, words, n, &nccs, &noovs, &lscr);
267 
268  printf("input: %s\n", text);
269  printf("cross-entropy: %f bits\n",
270  ch * log(logmath_get_base(lmath)) / log(2));
271 
272  /* Calculate perplexity pplx = exp CH */
273  printf("perplexity: %f\n", logmath_exp(lmath, ch));
274  printf("lm score: %d\n", lscr);
275 
276  /* Report OOVs and CCs */
277  printf("%d words evaluated\n", n);
278  printf("%d OOVs, %d context cues removed\n",
279  noovs, nccs);
280 
281  ckd_free(textfoo);
282  ckd_free(words);
283 }
284 
285 int
286 main(int argc, char *argv[])
287 {
288  cmd_ln_t *config;
289  ngram_model_t *lm = NULL;
290  logmath_t *lmath;
291  const char *lmfn, *probdefn, *lsnfn, *text;
292 
293  if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
294  return 1;
295 
296  verbose = cmd_ln_boolean_r(config, "-verbose");
297 
298  /* Create log math object. */
299  if ((lmath = logmath_init
300  (cmd_ln_float64_r(config, "-logbase"), 0, 0)) == NULL) {
301  E_FATAL("Failed to initialize log math\n");
302  }
303 
304  /* Load the language model. */
305  lmfn = cmd_ln_str_r(config, "-lm");
306  if (lmfn == NULL
307  || (lm = ngram_model_read(config, lmfn,
308  NGRAM_AUTO, lmath)) == NULL) {
309  E_FATAL("Failed to load language model from %s\n",
310  cmd_ln_str_r(config, "-lm"));
311  }
312  if ((probdefn = cmd_ln_str_r(config, "-probdef")) != NULL)
313  ngram_model_read_classdef(lm, probdefn);
315  cmd_ln_float32_r(config, "-lw"),
316  cmd_ln_float32_r(config, "-wip"),
317  cmd_ln_float32_r(config, "-uw"));
318 
319  /* Now evaluate some text. */
320  lsnfn = cmd_ln_str_r(config, "-lsn");
321  text = cmd_ln_str_r(config, "-text");
322  if (lsnfn) {
323  evaluate_file(lm, lmath, lsnfn);
324  }
325  else if (text) {
326  evaluate_string(lm, lmath, text);
327  }
328 
329  return 0;
330 }
Command-line and other configuration parsing and handling.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_read(cmd_ln_t *config, const char *file_name, ngram_file_type_t file_type, logmath_t *lmath)
Read an N-Gram model from a file on disk.
Definition: ngram_model.c:121
Miscellaneous useful string functions.
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition: ckd_alloc.h:248
SPHINXBASE_EXPORT int32 ngram_unknown_wid(ngram_model_t *model)
Get the unknown word ID for a language model.
Definition: ngram_model.c:644
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT int32 ngram_wid(ngram_model_t *model, const char *word)
Look up numerical word ID.
Definition: ngram_model.c:771
#define NGRAM_INVALID_WID
Impossible word ID.
Definition: ngram_model.h:84
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_r(cmd_ln_t *inout_cmdln, arg_t const *defn, int32 argc, char *argv[], int32 strict)
Parse a list of strings into arguments.
Definition: cmd_ln.c:551
#define ARG_STRING
String argument (optional).
Definition: cmd_ln.h:114
Line iterator for files.
Definition: pio.h:177
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition: ckd_alloc.h:264
SPHINXBASE_EXPORT char const * cmd_ln_str_r(cmd_ln_t *cmdln, char const *name)
Retrieve a string from a command-line object.
Definition: cmd_ln.c:949
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition: ckd_alloc.c:241
#define E_FATAL_SYSTEM
Print error text; Call perror(""); exit(errno);.
Definition: err.h:132
SPHINXBASE_EXPORT logmath_t * logmath_init(float64 base, int shift, int use_table)
Initialize a log math computation table.
Definition: logmath.c:62
#define ARG_FLOAT32
Definition: cmd_ln.h:148
#define ARG_FLOAT64
Definition: cmd_ln.h:152
N-Gram language models.
SPHINXBASE_EXPORT int ngram_model_apply_weights(ngram_model_t *model, float32 lw, float32 wip, float32 uw)
Apply a language weight, insertion penalty, and unigram weight to a language model.
Definition: ngram_model.c:477
SPHINXBASE_EXPORT float64 logmath_get_base(logmath_t *lmath)
Get the log base.
Definition: logmath.c:368
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
Definition: pio.c:338
SPHINXBASE_EXPORT lineiter_t * lineiter_start(FILE *fh)
Start reading lines from a file.
Definition: pio.c:255
SPHINXBASE_EXPORT int32 ngram_model_get_size(ngram_model_t *model)
Get the order of the N-gram model (i.e. the "N" in "N-gram").
Definition: ngram_model.c:663
Implementation of logging routines.
#define ARG_BOOLEAN
Boolean (true/false) argument (optional).
Definition: cmd_ln.h:118
Argument definition structure.
SPHINXBASE_EXPORT int32 ngram_model_read_classdef(ngram_model_t *model, const char *file_name)
Read a class definition file and add classes to a language model.
Definition: ngram_model.c:1199
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Definition: strfuncs.c:115
Opaque structure used to hold the results of command-line parsing.
#define E_FATAL
Exit with non-zero status after error message.
Definition: err.h:127
#define cmd_ln_boolean_r(c, n)
Retrieve a boolean value from a command-line object.
Definition: cmd_ln.h:334
cmd_ln_t * config
Configuration parameters.
Definition: sphinx_fe.c:78
Common implementation of ngram_model_t.
SPHINXBASE_EXPORT const char * ngram_word(ngram_model_t *model, int32 wid)
Look up word string for numerical word ID.
Definition: ngram_model.c:782
Fast integer logarithmic addition operations.
SPHINXBASE_EXPORT float64 logmath_exp(logmath_t *lmath, int logb_p)
Convert integer log in base B to linear floating point.
Definition: logmath.c:456
Determine file type automatically.
Definition: ngram_model.h:78
SPHINXBASE_EXPORT int32 ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Quick general N-Gram score lookup.
Definition: ngram_model.c:494
file IO related operations.