SphinxBase  0.6
ngram_model_set.c
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 2008 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
42 #include <string.h>
43 #include <stdlib.h>
44 
45 #include "sphinxbase/err.h"
46 #include "sphinxbase/ckd_alloc.h"
47 #include "sphinxbase/strfuncs.h"
48 #include "sphinxbase/filename.h"
49 
50 #include "ngram_model_set.h"
51 
52 static ngram_funcs_t ngram_model_set_funcs;
53 
54 static int
55 my_compare(const void *a, const void *b)
56 {
57  /* Make sure <UNK> floats to the beginning. */
58  if (strcmp(*(char * const *)a, "<UNK>") == 0)
59  return -1;
60  else if (strcmp(*(char * const *)b, "<UNK>") == 0)
61  return 1;
62  else
63  return strcmp(*(char * const *)a, *(char * const *)b);
64 }
65 
66 static void
67 build_widmap(ngram_model_t *base, logmath_t *lmath, int32 n)
68 {
69  ngram_model_set_t *set = (ngram_model_set_t *)base;
70  ngram_model_t **models = set->lms;
71  hash_table_t *vocab;
72  glist_t hlist;
73  gnode_t *gn;
74  int32 i;
75 
76  /* Construct a merged vocabulary and a set of word-ID mappings. */
77  vocab = hash_table_new(models[0]->n_words, FALSE);
78  /* Create the set of merged words. */
79  for (i = 0; i < set->n_models; ++i) {
80  int32 j;
81  for (j = 0; j < models[i]->n_words; ++j) {
82  /* Ignore collisions. */
83  (void)hash_table_enter_int32(vocab, models[i]->word_str[j], j);
84  }
85  }
86  /* Create the array of words, then sort it. */
87  if (hash_table_lookup(vocab, "<UNK>", NULL) != 0)
88  (void)hash_table_enter_int32(vocab, "<UNK>", 0);
89  /* Now we know the number of unigrams, initialize the base model. */
90  ngram_model_init(base, &ngram_model_set_funcs, lmath, n, hash_table_inuse(vocab));
91  base->writable = FALSE; /* We will reuse the pointers from the submodels. */
92  i = 0;
93  hlist = hash_table_tolist(vocab, NULL);
94  for (gn = hlist; gn; gn = gnode_next(gn)) {
95  hash_entry_t *ent = gnode_ptr(gn);
96  base->word_str[i++] = (char *)ent->key;
97  }
98  glist_free(hlist);
99  qsort(base->word_str, base->n_words, sizeof(*base->word_str), my_compare);
100 
101  /* Now create the word ID mappings. */
102  if (set->widmap)
103  ckd_free_2d((void **)set->widmap);
104  set->widmap = (int32 **) ckd_calloc_2d(base->n_words, set->n_models,
105  sizeof(**set->widmap));
106  for (i = 0; i < base->n_words; ++i) {
107  int32 j;
108  /* Also create the master wid mapping. */
109  (void)hash_table_enter_int32(base->wid, base->word_str[i], i);
110  /* printf("%s: %d => ", base->word_str[i], i); */
111  for (j = 0; j < set->n_models; ++j) {
112  set->widmap[i][j] = ngram_wid(models[j], base->word_str[i]);
113  /* printf("%d ", set->widmap[i][j]); */
114  }
115  /* printf("\n"); */
116  }
117  hash_table_free(vocab);
118 }
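The table built here is the heart of the set: widmap has one row per merged word and one column per submodel, so a merged word ID can be translated into any submodel's ID space with a single array lookup. A sketch of the invariant this function establishes (illustrative comment only):

    /*
     * For every merged word i and every submodel j:
     *
     *     set->widmap[i][j] == ngram_wid(set->lms[j], base->word_str[i])
     *
     * Words absent from submodel j map to that submodel's unknown-word
     * ID, since that is what ngram_wid() returns for unknown strings.
     */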
119 
120 ngram_model_t *
121 ngram_model_set_init(cmd_ln_t *config,
122  ngram_model_t **models,
123  char **names,
124  const float32 *weights,
125  int32 n_models)
126 {
127  ngram_model_set_t *model;
128  ngram_model_t *base;
129  logmath_t *lmath;
130  int32 i, n;
131 
132  if (n_models == 0) /* WTF */
133  return NULL;
134 
135  /* Do consistency checking on the models. They must all use the
136  * same logbase and shift. */
137  lmath = models[0]->lmath;
138  for (i = 1; i < n_models; ++i) {
139  if (logmath_get_base(models[i]->lmath) != logmath_get_base(lmath)
140  || logmath_get_shift(models[i]->lmath) != logmath_get_shift(lmath)) {
141  E_ERROR("Log-math parameters don't match, will not create LM set\n");
142  return NULL;
143  }
144  }
145 
146  /* Allocate the combined model, initialize it. */
147  model = ckd_calloc(1, sizeof(*model));
148  base = &model->base;
149  model->n_models = n_models;
150  model->lms = ckd_calloc(n_models, sizeof(*model->lms));
151  model->names = ckd_calloc(n_models, sizeof(*model->names));
152  /* Initialize weights to a uniform distribution */
153  model->lweights = ckd_calloc(n_models, sizeof(*model->lweights));
154  {
155  int32 uniform = logmath_log(lmath, 1.0/n_models);
156  for (i = 0; i < n_models; ++i)
157  model->lweights[i] = uniform;
158  }
159  /* Default to interpolate if weights were given. */
160  if (weights)
161  model->cur = -1;
162 
163  n = 0;
164  for (i = 0; i < n_models; ++i) {
165  model->lms[i] = models[i];
166  model->names[i] = ckd_salloc(names[i]);
167  if (weights)
168  model->lweights[i] = logmath_log(lmath, weights[i]);
169  /* N is the maximum of all merged models. */
170  if (models[i]->n > n)
171  n = models[i]->n;
172  }
173  /* Allocate the history mapping table. */
174  model->maphist = ckd_calloc(n - 1, sizeof(*model->maphist));
175 
176  /* Now build the word-ID mapping and merged vocabulary. */
177  build_widmap(base, lmath, n);
178  return base;
179 }
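A minimal usage sketch for this constructor, assuming config and lmath already exist; the file names, model names, and weights are hypothetical, and error handling is omitted:

    /* Hypothetical example: combine a general LM with a domain LM. */
    ngram_model_t *lms[2];
    char *names[2] = { "general", "domain" };
    float32 weights[2] = { 0.7f, 0.3f };
    ngram_model_t *set;

    lms[0] = ngram_model_read(config, "general.arpa", NGRAM_AUTO, lmath);
    lms[1] = ngram_model_read(config, "domain.arpa", NGRAM_AUTO, lmath);
    set = ngram_model_set_init(config, lms, names, weights, 2);

    /* Weights were given, so the set starts in interpolation mode. */
    ngram_model_set_select(set, "domain");   /* score with one submodel */
    ngram_model_set_interp(set, NULL, NULL); /* back to interpolation   */

    /* Freeing the set also frees the submodels (see
     * ngram_model_set_free() below). */
    ngram_model_free(set);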
180 
181 ngram_model_t *
182 ngram_model_set_read(cmd_ln_t *config,
183  const char *lmctlfile,
184  logmath_t *lmath)
185 {
186  FILE *ctlfp;
187  glist_t lms = NULL;
188  glist_t lmnames = NULL;
189  __BIGSTACKVARIABLE__ char str[1024];
190  ngram_model_t *set = NULL;
191  hash_table_t *classes;
192  char *basedir, *c;
193 
194  /* Read all the class definition files to accumulate a mapping of
195  * classnames to definitions. */
196  classes = hash_table_new(0, FALSE);
197  if ((ctlfp = fopen(lmctlfile, "r")) == NULL) {
198  E_ERROR_SYSTEM("Failed to open %s", lmctlfile);
199  return NULL;
200  }
201 
202  /* Try to find the base directory to append to relative paths in
203  * the lmctl file. */
204  if ((c = strrchr(lmctlfile, '/')) || (c = strrchr(lmctlfile, '\\'))) {
205  /* Include the trailing slash. */
206  basedir = ckd_calloc(c - lmctlfile + 2, 1);
207  memcpy(basedir, lmctlfile, c - lmctlfile + 1);
208  }
209  else {
210  basedir = NULL;
211  }
212  E_INFO("Reading LM control file '%s'\n", lmctlfile);
213  if (basedir)
214  E_INFO("Will prepend '%s' to unqualified paths\n", basedir);
215 
216  if (fscanf(ctlfp, "%1023s", str) == 1) {
217  if (strcmp(str, "{") == 0) {
218  /* Load LMclass files */
219  while ((fscanf(ctlfp, "%1023s", str) == 1)
220  && (strcmp(str, "}") != 0)) {
221  char *deffile;
222  if (basedir && !path_is_absolute(str))
223  deffile = string_join(basedir, str, NULL);
224  else
225  deffile = ckd_salloc(str);
226  E_INFO("Reading classdef from '%s'\n", deffile);
227  if (read_classdef_file(classes, deffile) < 0) {
228  ckd_free(deffile);
229  goto error_out;
230  }
231  ckd_free(deffile);
232  }
233 
234  if (strcmp(str, "}") != 0) {
235  E_ERROR("Unexpected EOF in %s\n", lmctlfile);
236  goto error_out;
237  }
238 
239  /* This might be the first LM name. */
240  if (fscanf(ctlfp, "%1023s", str) != 1)
241  str[0] = '\0';
242  }
243  }
244  else
245  str[0] = '\0';
246 
247  /* Read in one LM at a time and add classes to them as necessary. */
248  while (str[0] != '\0') {
249  char *lmfile;
250  ngram_model_t *lm;
251 
252  if (basedir && str[0] != '/' && str[0] != '\\')
253  lmfile = string_join(basedir, str, NULL);
254  else
255  lmfile = ckd_salloc(str);
256  E_INFO("Reading lm from '%s'\n", lmfile);
257  lm = ngram_model_read(config, lmfile, NGRAM_AUTO, lmath);
258  if (lm == NULL) {
259  ckd_free(lmfile);
260  goto error_out;
261  }
262  if (fscanf(ctlfp, "%1023s", str) != 1) {
263  E_ERROR("LMname missing after LMFileName '%s'\n", lmfile);
264  ckd_free(lmfile);
265  goto error_out;
266  }
267  ckd_free(lmfile);
268  lms = glist_add_ptr(lms, lm);
269  lmnames = glist_add_ptr(lmnames, ckd_salloc(str));
270 
271  if (fscanf(ctlfp, "%1023s", str) == 1) {
272  if (strcmp(str, "{") == 0) {
273  /* LM uses classes; read their names */
274  while ((fscanf(ctlfp, "%1023s", str) == 1) &&
275  (strcmp(str, "}") != 0)) {
276  void *val;
277  classdef_t *classdef;
278 
279  if (hash_table_lookup(classes, str, &val) == -1) {
280  E_ERROR("Unknown class %s in control file\n", str);
281  goto error_out;
282  }
283  classdef = val;
284  if (ngram_model_add_class(lm, str, 1.0,
285  classdef->words, classdef->weights,
286  classdef->n_words) < 0) {
287  goto error_out;
288  }
289  E_INFO("Added class %s containing %d words\n",
290  str, classdef->n_words);
291  }
292  if (strcmp(str, "}") != 0) {
293  E_ERROR("Unexpected EOF in %s\n", lmctlfile);
294  goto error_out;
295  }
296  if (fscanf(ctlfp, "%1023s", str) != 1)
297  str[0] = '\0';
298  }
299  }
300  else
301  str[0] = '\0';
302  }
303  fclose(ctlfp);
304 
305  /* Now construct arrays out of lms and lmnames, and build an
306  * ngram_model_set. */
307  lms = glist_reverse(lms);
308  lmnames = glist_reverse(lmnames);
309  {
310  int32 n_models;
311  ngram_model_t **lm_array;
312  char **name_array;
313  gnode_t *lm_node, *name_node;
314  int32 i;
315 
316  n_models = glist_count(lms);
317  lm_array = ckd_calloc(n_models, sizeof(*lm_array));
318  name_array = ckd_calloc(n_models, sizeof(*name_array));
319  lm_node = lms;
320  name_node = lmnames;
321  for (i = 0; i < n_models; ++i) {
322  lm_array[i] = gnode_ptr(lm_node);
323  name_array[i] = gnode_ptr(name_node);
324  lm_node = gnode_next(lm_node);
325  name_node = gnode_next(name_node);
326  }
327  set = ngram_model_set_init(config, lm_array, name_array,
328  NULL, n_models);
329  ckd_free(lm_array);
330  ckd_free(name_array);
331  }
332 error_out:
333  {
334  gnode_t *gn;
335  glist_t hlist;
336 
337  if (set == NULL) {
338  for (gn = lms; gn; gn = gnode_next(gn)) {
339  ngram_model_free(gnode_ptr(gn));
340  }
341  }
342  glist_free(lms);
343  for (gn = lmnames; gn; gn = gnode_next(gn)) {
344  ckd_free(gnode_ptr(gn));
345  }
346  glist_free(lmnames);
347  hlist = hash_table_tolist(classes, NULL);
348  for (gn = hlist; gn; gn = gnode_next(gn)) {
349  hash_entry_t *he = gnode_ptr(gn);
350  ckd_free((char *)he->key);
351  classdef_free(he->val);
352  }
353  glist_free(hlist);
354  hash_table_free(classes);
355  ckd_free(basedir);
356  }
357  return set;
358 }
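For reference, the control-file syntax accepted by this parser is: an optional brace-delimited list of class-definition files, followed by any number of LMFileName LMName pairs, each optionally followed by a brace-delimited list of class names to attach to that LM. Tokens are whitespace-separated, so the braces must stand alone as tokens and line breaks are cosmetic; relative paths are resolved against the control file's directory. A hypothetical example (all file and class names invented):

    { digits.classdef places.classdef }
    general.arpa  general  { digits places }
    digits.arpa   digits_only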
359 
360 int32
361 ngram_model_set_count(ngram_model_t *base)
362 {
363  ngram_model_set_t *set = (ngram_model_set_t *)base;
364  return set->n_models;
365 }
366 
367 ngram_model_set_iter_t *
368 ngram_model_set_iter(ngram_model_t *base)
369 {
370  ngram_model_set_t *set = (ngram_model_set_t *)base;
371  ngram_model_set_iter_t *itor;
372 
373  if (set == NULL || set->n_models == 0)
374  return NULL;
375  itor = ckd_calloc(1, sizeof(*itor));
376  itor->set = set;
377  return itor;
378 }
379 
380 ngram_model_set_iter_t *
381 ngram_model_set_iter_next(ngram_model_set_iter_t *itor)
382 {
383  if (++itor->cur == itor->set->n_models) {
384  ngram_model_set_iter_free(itor);
385  return NULL;
386  }
387  return itor;
388 }
389 
390 void
391 ngram_model_set_iter_free(ngram_model_set_iter_t *itor)
392 {
393  ckd_free(itor);
394 }
395 
396 ngram_model_t *
397 ngram_model_set_iter_model(ngram_model_set_iter_t *itor,
398  char const **lmname)
399 {
400  if (lmname) *lmname = itor->set->names[itor->cur];
401  return itor->set->lms[itor->cur];
402 }
403 
404 ngram_model_t *
405 ngram_model_set_lookup(ngram_model_t *base,
406  const char *name)
407 {
408  ngram_model_set_t *set = (ngram_model_set_t *)base;
409  int32 i;
410 
411  if (name == NULL) {
412  if (set->cur == -1)
413  return NULL;
414  else
415  return set->lms[set->cur];
416  }
417 
418  /* There probably won't be very many submodels. */
419  for (i = 0; i < set->n_models; ++i)
420  if (0 == strcmp(set->names[i], name))
421  break;
422  if (i == set->n_models)
423  return NULL;
424  return set->lms[i];
425 }
426 
427 ngram_model_t *
428 ngram_model_set_select(ngram_model_t *base,
429  const char *name)
430 {
431  ngram_model_set_t *set = (ngram_model_set_t *)base;
432  int32 i;
433 
434  /* There probably won't be very many submodels. */
435  for (i = 0; i < set->n_models; ++i)
436  if (0 == strcmp(set->names[i], name))
437  break;
438  if (i == set->n_models)
439  return NULL;
440  set->cur = i;
441  return set->lms[set->cur];
442 }
443 
444 const char *
445 ngram_model_set_current(ngram_model_t *base)
446 {
447  ngram_model_set_t *set = (ngram_model_set_t *)base;
448 
449  if (set->cur == -1)
450  return NULL;
451  else
452  return set->names[set->cur];
453 }
454 
455 int32
456 ngram_model_set_current_wid(ngram_model_t *base,
457  int32 set_wid)
458 {
459  ngram_model_set_t *set = (ngram_model_set_t *)base;
460 
461  if (set->cur == -1 || set_wid >= base->n_words)
462  return NGRAM_INVALID_WID;
463  else
464  return set->widmap[set_wid][set->cur];
465 }
466 
467 int32
468 ngram_model_set_known_wid(ngram_model_t *base,
469  int32 set_wid)
470 {
471  ngram_model_set_t *set = (ngram_model_set_t *)base;
472 
473  if (set_wid >= base->n_words)
474  return FALSE;
475  else if (set->cur == -1) {
476  int32 i;
477  for (i = 0; i < set->n_models; ++i) {
478  if (set->widmap[set_wid][i] != ngram_unknown_wid(set->lms[i]))
479  return TRUE;
480  }
481  return FALSE;
482  }
483  else
484  return (set->widmap[set_wid][set->cur]
485  != ngram_unknown_wid(set->lms[set->cur]));
486 }
487 
488 ngram_model_t *
489 ngram_model_set_interp(ngram_model_t *base,
490  const char **names,
491  const float32 *weights)
492 {
493  ngram_model_set_t *set = (ngram_model_set_t *)base;
494 
495  /* If we have a set of weights here, then set them. */
496  if (names && weights) {
497  int32 i, j;
498 
499  /* We hope there aren't many models. */
500  for (i = 0; i < set->n_models; ++i) {
501  for (j = 0; j < set->n_models; ++j)
502  if (0 == strcmp(names[i], set->names[j]))
503  break;
504  if (j == set->n_models) {
505  E_ERROR("Unknown LM name %s\n", names[i]);
506  return NULL;
507  }
508  set->lweights[j] = logmath_log(base->lmath, weights[i]);
509  }
510  }
511  else if (weights) {
512  memcpy(set->lweights, weights, set->n_models * sizeof(*set->lweights));
513  }
514  /* Otherwise just enable existing weights. */
515  set->cur = -1;
516  return base;
517 }
518 
519 ngram_model_t *
520 ngram_model_set_add(ngram_model_t *base,
521  ngram_model_t *model,
522  const char *name,
523  float32 weight,
524  int reuse_widmap)
525 
526 {
527  ngram_model_set_t *set = (ngram_model_set_t *)base;
528  float32 fprob;
529  int32 scale, i;
530 
531  /* Add it to the array of lms. */
532  ++set->n_models;
533  set->lms = ckd_realloc(set->lms, set->n_models * sizeof(*set->lms));
534  set->lms[set->n_models - 1] = model;
535  set->names = ckd_realloc(set->names, set->n_models * sizeof(*set->names));
536  set->names[set->n_models - 1] = ckd_salloc(name);
537  /* Expand the history mapping table if necessary. */
538  if (model->n > base->n) {
539  base->n = model->n;
540  set->maphist = ckd_realloc(set->maphist,
541  (model->n - 1) * sizeof(*set->maphist));
542  }
543 
544  /* Renormalize the interpolation weights. */
545  fprob = weight * 1.0 / set->n_models;
546  set->lweights = ckd_realloc(set->lweights,
547  set->n_models * sizeof(*set->lweights));
548  set->lweights[set->n_models - 1] = logmath_log(base->lmath, fprob);
549  /* Now normalize everything else to fit it in. This is
550  * accomplished by simply scaling all the other probabilities
551  * by (1-fprob). */
552  scale = logmath_log(base->lmath, 1.0 - fprob);
553  for (i = 0; i < set->n_models - 1; ++i)
554  set->lweights[i] += scale;
555 
556  /* Reuse the old word ID mapping if requested. */
557  if (reuse_widmap) {
558  int32 **new_widmap;
559 
560  /* Tack another column onto the widmap array. */
561  new_widmap = (int32 **)ckd_calloc_2d(base->n_words, set->n_models,
562  sizeof (**new_widmap));
563  for (i = 0; i < base->n_words; ++i) {
564  /* Copy all the existing mappings. */
565  memcpy(new_widmap[i], set->widmap[i],
566  (set->n_models - 1) * sizeof(**new_widmap));
567  /* Create the new mapping. */
568  new_widmap[i][set->n_models-1] = ngram_wid(model, base->word_str[i]);
569  }
570  ckd_free_2d((void **)set->widmap);
571  set->widmap = new_widmap;
572  }
573  else {
574  build_widmap(base, base->lmath, base->n);
575  }
576  return model;
577 }
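To make the renormalization above concrete, this is the arithmetic in linear probability space (the code stores the log equivalents):

    fprob         = weight / n_models     (n_models already counts the new LM)
    new weight    = fprob
    old weight_i *= (1 - fprob)

    e.g. two models at 1/2 each, weight = 1.0:
         fprob = 1/3, old weights become 1/2 * 2/3 = 1/3  ->  {1/3, 1/3, 1/3}
    e.g. two models at 1/2 each, weight = 0.6:
         fprob = 0.2, old weights become 1/2 * 0.8 = 0.4  ->  {0.4, 0.4, 0.2}

ngram_model_set_remove() below applies the inverse scaling, dividing the remaining weights by (1 - fprob) of the model it removes.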
578 
579 ngram_model_t *
580 ngram_model_set_remove(ngram_model_t *base,
581  const char *name,
582  int reuse_widmap)
583 {
584  ngram_model_set_t *set = (ngram_model_set_t *)base;
585  ngram_model_t *submodel;
586  int32 lmidx, scale, n, i;
587  float32 fprob;
588 
589  for (lmidx = 0; lmidx < set->n_models; ++lmidx)
590  if (0 == strcmp(name, set->names[lmidx]))
591  break;
592  if (lmidx == set->n_models)
593  return NULL;
594  submodel = set->lms[lmidx];
595 
596  /* Renormalize the interpolation weights by scaling them by
597  * 1/(1-fprob) */
598  fprob = logmath_exp(base->lmath, set->lweights[lmidx]);
599  scale = logmath_log(base->lmath, 1.0 - fprob);
600 
601  /* Remove it from the array of lms, renormalize remaining weights,
602  * and recalculate n. */
603  --set->n_models;
604  n = 0;
605  ckd_free(set->names[lmidx]);
606  set->names[lmidx] = NULL;
607  for (i = 0; i < set->n_models; ++i) {
608  if (i >= lmidx) {
609  set->lms[i] = set->lms[i+1];
610  set->names[i] = set->names[i+1];
611  set->lweights[i] = set->lweights[i+1];
612  }
613  set->lweights[i] -= scale;
614  if (set->lms[i]->n > n)
615  n = set->lms[i]->n;
616  }
617  /* There's no need to shrink these arrays. */
618  set->lms[set->n_models] = NULL;
619  set->lweights[set->n_models] = base->log_zero;
620  /* No need to shrink maphist either. */
621 
622  /* Reuse the existing word ID mapping if requested. */
623  if (reuse_widmap) {
624  /* Just go through and shrink each row. */
625  for (i = 0; i < base->n_words; ++i) {
626  memmove(set->widmap[i] + lmidx, set->widmap[i] + lmidx + 1,
627  (set->n_models - lmidx) * sizeof(**set->widmap));
628  }
629  }
630  else {
631  build_widmap(base, base->lmath, n);
632  }
633  return submodel;
634 }
635 
636 void
637 ngram_model_set_map_words(ngram_model_t *base,
638  const char **words,
639  int32 n_words)
640 {
641  ngram_model_set_t *set = (ngram_model_set_t *)base;
642  int32 i;
643 
644  /* Recreate the word mapping. */
645  if (base->writable) {
646  for (i = 0; i < base->n_words; ++i) {
647  ckd_free(base->word_str[i]);
648  }
649  }
650  ckd_free(base->word_str);
651  ckd_free_2d((void **)set->widmap);
652  base->writable = TRUE;
653  base->n_words = base->n_1g_alloc = n_words;
654  base->word_str = ckd_calloc(n_words, sizeof(*base->word_str));
655  set->widmap = (int32 **)ckd_calloc_2d(n_words, set->n_models, sizeof(**set->widmap));
656  hash_table_empty(base->wid);
657  for (i = 0; i < n_words; ++i) {
658  int32 j;
659  base->word_str[i] = ckd_salloc(words[i]);
660  (void)hash_table_enter_int32(base->wid, base->word_str[i], i);
661  for (j = 0; j < set->n_models; ++j) {
662  set->widmap[i][j] = ngram_wid(set->lms[j], base->word_str[i]);
663  }
664  }
665 }
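A short sketch of how this remapping might be used, for example to force the set's word IDs to follow an externally supplied word list; the list here is purely hypothetical:

    const char *words[] = { "<s>", "</s>", "hello", "world" };
    ngram_model_set_map_words(set, words, 4);
    /* Now ngram_wid(set, "hello") == 2, and widmap row 2 holds the
     * corresponding word ID in every submodel (or that submodel's
     * unknown-word ID if "hello" is not in its vocabulary). */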
666 
667 static int
668 ngram_model_set_apply_weights(ngram_model_t *base, float32 lw,
669  float32 wip, float32 uw)
670 {
671  ngram_model_set_t *set = (ngram_model_set_t *)base;
672  int32 i;
673 
674  /* Apply weights to each sub-model. */
675  for (i = 0; i < set->n_models; ++i)
676  ngram_model_apply_weights(set->lms[i], lw, wip, uw);
677  return 0;
678 }
679 
680 static int32
681 ngram_model_set_score(ngram_model_t *base, int32 wid,
682  int32 *history, int32 n_hist,
683  int32 *n_used)
684 {
685  ngram_model_set_t *set = (ngram_model_set_t *)base;
686  int32 mapwid;
687  int32 score;
688  int32 i;
689 
690  /* Truncate the history. */
691  if (n_hist > base->n - 1)
692  n_hist = base->n - 1;
693 
694  /* Interpolate if there is no current. */
695  if (set->cur == -1) {
696  score = base->log_zero;
697  for (i = 0; i < set->n_models; ++i) {
698  int32 j;
699  /* Map word and history IDs for each model. */
700  mapwid = set->widmap[wid][i];
701  for (j = 0; j < n_hist; ++j) {
702  if (history[j] == NGRAM_INVALID_WID)
703  set->maphist[j] = NGRAM_INVALID_WID;
704  else
705  set->maphist[j] = set->widmap[history[j]][i];
706  }
707  score = logmath_add(base->lmath, score,
708  set->lweights[i] +
709  ngram_ng_score(set->lms[i],
710  mapwid, set->maphist, n_hist, n_used));
711  }
712  }
713  else {
714  int32 j;
715  /* Map word and history IDs (FIXME: do this in a function?) */
716  mapwid = set->widmap[wid][set->cur];
717  for (j = 0; j < n_hist; ++j) {
718  if (history[j] == NGRAM_INVALID_WID)
719  set->maphist[j] = NGRAM_INVALID_WID;
720  else
721  set->maphist[j] = set->widmap[history[j]][set->cur];
722  }
723  score = ngram_ng_score(set->lms[set->cur],
724  mapwid, set->maphist, n_hist, n_used);
725  }
726 
727  return score;
728 }
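In interpolation mode (cur == -1) the loop above computes, entirely in the quantized log domain via logmath_add(),

    score(w, h) = log( sum_i  lambda_i * P_i(w | h) ),   with log(lambda_i) = set->lweights[i]

where the word and history IDs are translated through widmap for each submodel. ngram_model_set_raw_score() below is the same computation using ngram_ng_prob(), i.e. the probabilities before the language weight and word insertion penalty are applied.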
729 
730 static int32
731 ngram_model_set_raw_score(ngram_model_t *base, int32 wid,
732  int32 *history, int32 n_hist,
733  int32 *n_used)
734 {
735  ngram_model_set_t *set = (ngram_model_set_t *)base;
736  int32 mapwid;
737  int32 score;
738  int32 i;
739 
740  /* Truncate the history. */
741  if (n_hist > base->n - 1)
742  n_hist = base->n - 1;
743 
744  /* Interpolate if there is no current. */
745  if (set->cur == -1) {
746  score = base->log_zero;
747  for (i = 0; i < set->n_models; ++i) {
748  int32 j;
749  /* Map word and history IDs for each model. */
750  mapwid = set->widmap[wid][i];
751  for (j = 0; j < n_hist; ++j) {
752  if (history[j] == NGRAM_INVALID_WID)
753  set->maphist[j] = NGRAM_INVALID_WID;
754  else
755  set->maphist[j] = set->widmap[history[j]][i];
756  }
757  score = logmath_add(base->lmath, score,
758  set->lweights[i] +
759  ngram_ng_prob(set->lms[i],
760  mapwid, set->maphist, n_hist, n_used));
761  }
762  }
763  else {
764  int32 j;
765  /* Map word and history IDs (FIXME: do this in a function?) */
766  mapwid = set->widmap[wid][set->cur];
767  for (j = 0; j < n_hist; ++j) {
768  if (history[j] == NGRAM_INVALID_WID)
769  set->maphist[j] = NGRAM_INVALID_WID;
770  else
771  set->maphist[j] = set->widmap[history[j]][set->cur];
772  }
773  score = ngram_ng_prob(set->lms[set->cur],
774  mapwid, set->maphist, n_hist, n_used);
775  }
776 
777  return score;
778 }
779 
780 static int32
781 ngram_model_set_add_ug(ngram_model_t *base,
782  int32 wid, int32 lweight)
783 {
784  ngram_model_set_t *set = (ngram_model_set_t *)base;
785  int32 *newwid;
786  int32 i, prob;
787 
788  /* At this point the word has already been added to the master
789  model and we have a new word ID for it. Add it to active
790  submodels and track the word IDs. */
791  newwid = ckd_calloc(set->n_models, sizeof(*newwid));
792  prob = base->log_zero;
793  for (i = 0; i < set->n_models; ++i) {
794  int32 wprob, n_hist;
795 
796  /* Only add to active models. */
797  if (set->cur == -1 || set->cur == i) {
798  /* Did this word already exist? */
799  newwid[i] = ngram_wid(set->lms[i], base->word_str[wid]);
800  if (newwid[i] == NGRAM_INVALID_WID) {
801  /* Add it to the submodel. */
802  newwid[i] = ngram_model_add_word(set->lms[i], base->word_str[wid],
803  logmath_exp(base->lmath, lweight));
804  if (newwid[i] == NGRAM_INVALID_WID) {
805  ckd_free(newwid);
806  return base->log_zero;
807  }
808  }
809  /* Now get the unigram probability for the new word and either
810  * interpolate it or use it (if this is the current model). */
811  wprob = ngram_ng_prob(set->lms[i], newwid[i], NULL, 0, &n_hist);
812  if (set->cur == i)
813  prob = wprob;
814  else if (set->cur == -1)
815  prob = logmath_add(base->lmath, prob, set->lweights[i] + wprob);
816  }
817  else {
818  newwid[i] = NGRAM_INVALID_WID;
819  }
820  }
821  /* Okay we have the word IDs for this in all the submodels. Now
822  do some complicated memory mangling to add this to the
823  widmap. */
824  set->widmap = ckd_realloc(set->widmap, base->n_words * sizeof(*set->widmap));
825  set->widmap[0] = ckd_realloc(set->widmap[0],
826  base->n_words
827  * set->n_models
828  * sizeof(**set->widmap));
829  for (i = 0; i < base->n_words; ++i)
830  set->widmap[i] = set->widmap[0] + i * set->n_models;
831  memcpy(set->widmap[wid], newwid, set->n_models * sizeof(*newwid));
832  ckd_free(newwid);
833  return prob;
834 }
835 
836 static void
837 ngram_model_set_free(ngram_model_t *base)
838 {
839  ngram_model_set_t *set = (ngram_model_set_t *)base;
840  int32 i;
841 
842  for (i = 0; i < set->n_models; ++i)
843  ngram_model_free(set->lms[i]);
844  ckd_free(set->lms);
845  for (i = 0; i < set->n_models; ++i)
846  ckd_free(set->names[i]);
847  ckd_free(set->names);
848  ckd_free(set->lweights);
849  ckd_free(set->maphist);
850  ckd_free_2d((void **)set->widmap);
851 }
852 
853 static void
854 ngram_model_set_flush(ngram_model_t *base)
855 {
856  ngram_model_set_t *set = (ngram_model_set_t *)base;
857  int32 i;
858 
859  for (i = 0; i < set->n_models; ++i)
860  ngram_model_flush(set->lms[i]);
861 }
862 
863 static ngram_funcs_t ngram_model_set_funcs = {
864  ngram_model_set_free, /* free */
865  ngram_model_set_apply_weights, /* apply_weights */
866  ngram_model_set_score, /* score */
867  ngram_model_set_raw_score, /* raw_score */
868  ngram_model_set_add_ug, /* add_ug */
869  ngram_model_set_flush /* flush */
870 };