SphinxBase 0.6
ngram_model.c
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1999-2007 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 /*
38  * \file ngram_model.c N-Gram language models.
39  *
40  * Author: David Huggins-Daines, much code taken from sphinx3/src/libs3decoder/liblm
41  */
42 
43 #include <config.h>
44 
45 #include <string.h>
46 #include <assert.h>
47 
48 #ifdef HAVE_ICONV
49 #include <iconv.h>
50 #endif
51 
52 #include "sphinxbase/ngram_model.h"
53 #include "sphinxbase/ckd_alloc.h"
54 #include "sphinxbase/filename.h"
55 #include "sphinxbase/pio.h"
56 #include "sphinxbase/err.h"
57 #include "sphinxbase/logmath.h"
58 #include "sphinxbase/strfuncs.h"
59 #include "sphinxbase/case.h"
60 
61 #include "ngram_model_internal.h"
62 
63 ngram_file_type_t
64 ngram_file_name_to_type(const char *file_name)
65 {
66  const char *ext;
67 
68  ext = strrchr(file_name, '.');
69  if (ext == NULL) {
70  return NGRAM_INVALID;
71  }
72  if (0 == strcmp_nocase(ext, ".gz")) {
73  while (--ext >= file_name) {
74  if (*ext == '.') break;
75  }
76  if (ext < file_name) {
77  return NGRAM_INVALID;
78  }
79  }
80  else if (0 == strcmp_nocase(ext, ".bz2")) {
81  while (--ext >= file_name) {
82  if (*ext == '.') break;
83  }
84  if (ext < file_name) {
85  return NGRAM_INVALID;
86  }
87  }
88  /* We use strncmp because there might be a .gz on the end. */
89  if (0 == strncmp_nocase(ext, ".ARPA", 5))
90  return NGRAM_ARPA;
91  if (0 == strncmp_nocase(ext, ".DMP", 4))
92  return NGRAM_DMP;
93  return NGRAM_INVALID;
94  }
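Note on the logic above: a trailing .gz or .bz2 is skipped so that the inner extension decides the type, and strncmp_nocase() with an explicit length lets ".arpa.gz" match ".ARPA". A small sketch of the resulting mapping (file names are made up for illustration):

    #include <assert.h>
    #include "sphinxbase/ngram_model.h"

    static void
    check_type_guessing(void)
    {
        /* Direct extensions match case-insensitively. */
        assert(ngram_file_name_to_type("turtle.arpa") == NGRAM_ARPA);
        assert(ngram_file_name_to_type("turtle.DMP") == NGRAM_DMP);
        /* Compressed models are recognized by the inner extension. */
        assert(ngram_file_name_to_type("turtle.arpa.gz") == NGRAM_ARPA);
        /* ".lm" is not a known extension, so it maps to NGRAM_INVALID. */
        assert(ngram_file_name_to_type("turtle.lm") == NGRAM_INVALID);
    }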
95 
96 ngram_file_type_t
97 ngram_str_to_type(const char *str_name)
98 {
99  if (0 == strcmp_nocase(str_name, "arpa"))
100  return NGRAM_ARPA;
101  if (0 == strcmp_nocase(str_name, "dmp"))
102  return NGRAM_DMP;
103  return NGRAM_INVALID;
104 }
105 
106 char const *
107 ngram_type_to_str(int type)
108 {
109  switch (type) {
110  case NGRAM_ARPA:
111  return "arpa";
112  case NGRAM_DMP:
113  return "dmp";
114  default:
115  return NULL;
116  }
117 }
118 
119 
120  ngram_model_t *
121 ngram_model_read(cmd_ln_t *config,
122  const char *file_name,
123  ngram_file_type_t file_type,
124  logmath_t *lmath)
125  {
126  ngram_model_t *model = NULL;
127 
128  switch (file_type) {
129  case NGRAM_AUTO: {
130  if ((model = ngram_model_arpa_read(config, file_name, lmath)) != NULL)
131  break;
132  if ((model = ngram_model_dmp_read(config, file_name, lmath)) != NULL)
133  break;
134  return NULL;
135  }
136  case NGRAM_ARPA:
137  model = ngram_model_arpa_read(config, file_name, lmath);
138  break;
139  case NGRAM_DMP:
140  model = ngram_model_dmp_read(config, file_name, lmath);
141  break;
142  default:
143  E_ERROR("language model file type not supported\n");
144  return NULL;
145  }
146 
147  /* Now set weights based on config if present. */
148  if (config) {
149  float32 lw = 1.0;
150  float32 wip = 1.0;
151  float32 uw = 1.0;
152 
153  if (cmd_ln_exists_r(config, "-lw"))
154  lw = cmd_ln_float32_r(config, "-lw");
155  if (cmd_ln_exists_r(config, "-wip"))
156  wip = cmd_ln_float32_r(config, "-wip");
157  if (cmd_ln_exists_r(config, "-uw"))
158  uw = cmd_ln_float32_r(config, "-uw");
159 
160  ngram_model_apply_weights(model, lw, wip, uw);
161  }
162 
163  return model;
164  }
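For context, a typical caller pairs this function with a logmath_t and, in the absence of a cmd_ln_t, applies weights by hand; a minimal loading sketch (the file name is hypothetical, and logmath_init(1.0001, 0, 0) mirrors the usual decoder defaults):

    #include "sphinxbase/ngram_model.h"
    #include "sphinxbase/logmath.h"

    static int
    load_lm_example(void)
    {
        logmath_t *lmath = logmath_init(1.0001, 0, 0);
        /* NGRAM_AUTO tries the ARPA reader first, then DMP, as above. */
        ngram_model_t *lm = ngram_model_read(NULL, "en.lm.gz",
                                             NGRAM_AUTO, lmath);
        if (lm == NULL)
            return -1;
        /* With a NULL config, weights stay at 1.0; set them explicitly. */
        ngram_model_apply_weights(lm, 9.5f, 0.5f, 1.0f);
        ngram_model_free(lm);
        logmath_free(lmath);
        return 0;
    }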
165 
166  int
167  ngram_model_write(ngram_model_t *model, const char *file_name,
168  ngram_file_type_t file_type)
169  {
170  switch (file_type) {
171  case NGRAM_AUTO: {
172  file_type = ngram_file_name_to_type(file_name);
173  /* Default to ARPA (catches .lm and other things) */
174  if (file_type == NGRAM_INVALID)
175  file_type = NGRAM_ARPA;
176  return ngram_model_write(model, file_name, file_type);
177  }
178  case NGRAM_ARPA:
179  return ngram_model_arpa_write(model, file_name);
180  case NGRAM_DMP:
181  return ngram_model_dmp_write(model, file_name);
182  default:
183  E_ERROR("language model file type not supported\n");
184  return -1;
185  }
186  E_ERROR("language model file type not supported\n");
187  return -1;
188  }
189 
190  int32
191  ngram_model_init(ngram_model_t *base,
192  ngram_funcs_t *funcs,
193  logmath_t *lmath,
194  int32 n, int32 n_unigram)
195  {
196  base->refcount = 1;
197  base->funcs = funcs;
198  base->n = n;
199  /* If this was previously initialized... */
200  if (base->n_counts == NULL)
201  base->n_counts = ckd_calloc(3, sizeof(*base->n_counts));
202  /* Don't reset weights if logmath object hasn't changed. */
203  if (base->lmath != lmath) {
204  /* Set default values for weights. */
205  base->lw = 1.0;
206  base->log_wip = 0; /* i.e. 1.0 */
207  base->log_uw = 0; /* i.e. 1.0 */
208  base->log_uniform = logmath_log(lmath, 1.0 / n_unigram);
209  base->log_uniform_weight = logmath_get_zero(lmath);
210  base->log_zero = logmath_get_zero(lmath);
211  base->lmath = lmath;
212  }
213  /* Allocate or reallocate space for word strings. */
214  if (base->word_str) {
215  /* Free all previous word strings if they were allocated. */
216  if (base->writable) {
217  int32 i;
218  for (i = 0; i < base->n_words; ++i) {
219  ckd_free(base->word_str[i]);
220  base->word_str[i] = NULL;
221  }
222  }
223  base->word_str = ckd_realloc(base->word_str, n_unigram * sizeof(char *));
224  }
225  else
226  base->word_str = ckd_calloc(n_unigram, sizeof(char *));
227  /* NOTE: They are no longer case-insensitive since we are allowing
228  * other encodings for word strings. Beware. */
229  if (base->wid)
230  hash_table_empty(base->wid);
231  else
232  base->wid = hash_table_new(n_unigram, FALSE);
233  base->n_counts[0] = base->n_1g_alloc = base->n_words = n_unigram;
234 
235  return 0;
236 }
237 
238 ngram_model_t *
239 ngram_model_retain(ngram_model_t *model)
240 {
241  ++model->refcount;
242  return model;
243 }
244 
245 
246 void
247 ngram_model_flush(ngram_model_t *lm)
248 {
249  if (model->funcs && model->funcs->flush)
250  (*model->funcs->flush)(model);
251 }
252 
253 int
254 ngram_model_free(ngram_model_t *model)
255 {
256  int i;
257 
258  if (model == NULL)
259  return 0;
260  if (--model->refcount > 0)
261  return model->refcount;
262  if (model->funcs && model->funcs->free)
263  (*model->funcs->free)(model);
264  if (model->writable) {
265  /* Free all words. */
266  for (i = 0; i < model->n_words; ++i) {
267  ckd_free(model->word_str[i]);
268  }
269  }
270  else {
271  /* Free all class words. */
272  for (i = 0; i < model->n_classes; ++i) {
273  ngram_class_t *lmclass;
274  int32 j;
275 
276  lmclass = model->classes[i];
277  for (j = 0; j < lmclass->n_words; ++j) {
278  ckd_free(model->word_str[lmclass->start_wid + j]);
279  }
280  for (j = 0; j < lmclass->n_hash; ++j) {
281  if (lmclass->nword_hash[j].wid != -1) {
282  ckd_free(model->word_str[lmclass->nword_hash[j].wid]);
283  }
284  }
285  }
286  }
287  for (i = 0; i < model->n_classes; ++i) {
288  ngram_class_free(model->classes[i]);
289  }
290  ckd_free(model->classes);
291  hash_table_free(model->wid);
292  ckd_free(model->word_str);
293  ckd_free(model->n_counts);
294  ckd_free(model);
295  return 0;
296 }
297 
298 int
299 ngram_model_casefold(ngram_model_t *model, int kase)
300 {
301  int writable, i;
302  hash_table_t *new_wid;
303 
304  /* Were word strings already allocated? */
305  writable = model->writable;
306  /* Either way, we are going to allocate some word strings. */
307  model->writable = TRUE;
308 
309  /* And, don't forget, we need to rebuild the word to unigram ID
310  * mapping. */
311  new_wid = hash_table_new(model->n_words, FALSE);
312  for (i = 0; i < model->n_words; ++i) {
313  char *outstr;
314  if (writable) {
315  outstr = model->word_str[i];
316  }
317  else {
318  outstr = ckd_salloc(model->word_str[i]);
319  }
320  /* Don't case-fold <tags> or [classes] */
321  if (outstr[0] == '<' || outstr[0] == '[') {
322  }
323  else {
324  switch (kase) {
325  case NGRAM_UPPER:
326  ucase(outstr);
327  break;
328  case NGRAM_LOWER:
329  lcase(outstr);
330  break;
331  default:
332  ;
333  }
334  }
335  model->word_str[i] = outstr;
336 
337  /* Now update the hash table. We might have terrible
338  * collisions here, so warn about them. */
339  if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
340  E_WARN("Duplicate word in dictionary after conversion: %s\n",
341  model->word_str[i]);
342  }
343  }
344  /* Swap out the hash table. */
345  hash_table_free(model->wid);
346  model->wid = new_wid;
347  return 0;
348 }
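Since word lookup is an exact string match (see the NOTE above), folding the vocabulary is sometimes needed before querying a model with mixed-case entries. A hedged usage fragment, assuming lm is a loaded model:

    /* Fold all word strings to upper case; <tags> and [classes]
     * keep their original spelling, as handled above. */
    if (ngram_model_casefold(lm, NGRAM_UPPER) < 0)
        E_ERROR("case folding failed\n");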
349 
350 #ifdef HAVE_ICONV
351 int
352 ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
353 {
354  iconv_t ic;
355  char *outbuf;
356  size_t maxlen;
357  int i, writable;
358  hash_table_t *new_wid;
359 
360  /* FIXME: Need to do a special case thing for the GB-HEX encoding
361  * used in Sphinx3 Mandarin models. */
362  if ((ic = iconv_open(to, from)) == (iconv_t)-1) {
363  E_ERROR_SYSTEM("iconv_open() failed");
364  return -1;
365  }
366  /* iconv(3) is a piece of crap and won't accept a NULL out buffer,
367  * unlike wcstombs(3). So we have to either call it over and over
368  * again until our buffer is big enough, or call it with a huge
369  * buffer and then copy things back to the output. We will use a
370  * mix of these two approaches here. We'll keep a single big
371  * buffer around, and expand it as necessary.
372  */
373  maxlen = 0;
374  for (i = 0; i < model->n_words; ++i) {
375  if (strlen(model->word_str[i]) > maxlen)
376  maxlen = strlen(model->word_str[i]);
377  }
378  /* Were word strings already allocated? */
379  writable = model->writable;
380  /* Either way, we are going to allocate some word strings. */
381  model->writable = TRUE;
382  /* Really should be big enough except for pathological cases. */
383  maxlen = maxlen * sizeof(int) + 15;
384  outbuf = ckd_calloc(maxlen, 1);
385  /* And, don't forget, we need to rebuild the word to unigram ID
386  * mapping. */
387  new_wid = hash_table_new(model->n_words, FALSE);
388  for (i = 0; i < model->n_words; ++i) {
389  ICONV_CONST char *in;
390  char *out;
391  size_t inleft, outleft, result;
392 
393  start_conversion:
394  in = (ICONV_CONST char *)model->word_str[i];
395  /* Yes, this assumes that we don't have any NUL bytes. */
396  inleft = strlen(in);
397  out = outbuf;
398  outleft = maxlen;
399 
400  while ((result = iconv(ic, &in, &inleft, &out, &outleft)) == (size_t)-1) {
401  if (errno != E2BIG) {
402  /* FIXME: if we already converted any words, then they
403  * are going to be in an inconsistent state. */
404  E_ERROR_SYSTEM("iconv() failed");
405  ckd_free(outbuf);
406  hash_table_free(new_wid);
407  return -1;
408  }
409  /* Reset the internal state of conversion. */
410  iconv(ic, NULL, NULL, NULL, NULL);
411  /* Make everything bigger. */
412  maxlen *= 2;
413  out = outbuf = ckd_realloc(outbuf, maxlen);
414  /* Reset the input pointers. */
415  in = (ICONV_CONST char *)model->word_str[i];
416  inleft = strlen(in);
417  }
418 
419  /* Now flush a shift-out sequence, if any. */
420  if ((result = iconv(ic, NULL, NULL, &out, &outleft)) == (size_t)-1) {
421  if (errno != E2BIG) {
422  /* FIXME: if we already converted any words, then they
423  * are going to be in an inconsistent state. */
424  E_ERROR_SYSTEM("iconv() failed (state reset sequence)");
425  ckd_free(outbuf);
426  hash_table_free(new_wid);
427  return -1;
428  }
429  /* Reset the internal state of conversion. */
430  iconv(ic, NULL, NULL, NULL, NULL);
431  /* Make everything bigger. */
432  maxlen *= 2;
433  outbuf = ckd_realloc(outbuf, maxlen);
434  /* Be very evil. */
435  goto start_conversion;
436  }
437 
438  result = maxlen - outleft;
439  /* Okay, that was hard, now let's go shopping. */
440  if (writable) {
441  /* Grow or shrink the output string as necessary. */
442  model->word_str[i] = ckd_realloc(model->word_str[i], result + 1);
443  model->word_str[i][result] = '\0';
444  }
445  else {
446  /* It actually was not allocated previously, so do that now. */
447  model->word_str[i] = ckd_calloc(result + 1, 1);
448  }
449  /* Copy the new thing in. */
450  memcpy(model->word_str[i], outbuf, result);
451 
452  /* Now update the hash table. We might have terrible
453  * collisions if a non-reversible conversion was requested,
454  * so warn about them. */
455  if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
456  E_WARN("Duplicate word in dictionary after conversion: %s\n",
457  model->word_str[i]);
458  }
459  }
460  ckd_free(outbuf);
461  iconv_close(ic);
462  /* Swap out the hash table. */
463  hash_table_free(model->wid);
464  model->wid = new_wid;
465 
466  return 0;
467 }
468 #else /* !HAVE_ICONV */
469 int
470 ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
471 {
472  return -1;
473 }
474 #endif /* !HAVE_ICONV */
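When iconv support is compiled in, re-encoding the whole vocabulary is a single call; the encoding names below are illustrative and must be names the system iconv accepts:

    /* Convert word strings in place, e.g. for a legacy Mandarin model
     * stored in GB2312. Returns -1 if iconv is unavailable or fails. */
    if (ngram_model_recode(lm, "gb2312", "utf-8") < 0)
        E_ERROR("recoding failed\n");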
475 
476 int
477 ngram_model_apply_weights(ngram_model_t *model,
478  float32 lw, float32 wip, float32 uw)
479 {
480  return (*model->funcs->apply_weights)(model, lw, wip, uw);
481 }
482 
483 float32
484 ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip,
485  int32 *out_log_uw)
486 {
487  if (out_log_wip) *out_log_wip = model->log_wip;
488  if (out_log_uw) *out_log_uw = model->log_uw;
489  return model->lw;
490 }
491 
492 
493 int32
494 ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history,
495  int32 n_hist, int32 *n_used)
496 {
497  int32 score, class_weight = 0;
498  int i;
499 
500  /* Closed vocabulary, OOV word probability is zero */
501  if (wid == NGRAM_INVALID_WID)
502  return model->log_zero;
503 
504  /* "Declassify" wid and history */
505  if (NGRAM_IS_CLASSWID(wid)) {
506  ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];
507 
508  class_weight = ngram_class_prob(lmclass, wid);
509  if (class_weight == 1) /* Meaning, not found in class. */
510  return model->log_zero;
511  wid = lmclass->tag_wid;
512  }
513  for (i = 0; i < n_hist; ++i) {
514  if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
515  history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
516  }
517  score = (*model->funcs->score)(model, wid, history, n_hist, n_used);
518 
519  /* Multiply by unigram in-class weight. */
520  return score + class_weight;
521 }
522 
523 int32
524 ngram_score(ngram_model_t *model, const char *word, ...)
525 {
526  va_list history;
527  const char *hword;
528  int32 *histid;
529  int32 n_hist;
530  int32 n_used;
531  int32 prob;
532 
533  va_start(history, word);
534  n_hist = 0;
535  while ((hword = va_arg(history, const char *)) != NULL)
536  ++n_hist;
537  va_end(history);
538 
539  histid = ckd_calloc(n_hist, sizeof(*histid));
540  va_start(history, word);
541  n_hist = 0;
542  while ((hword = va_arg(history, const char *)) != NULL) {
543  histid[n_hist] = ngram_wid(model, hword);
544  ++n_hist;
545  }
546  va_end(history);
547 
548  prob = ngram_ng_score(model, ngram_wid(model, word),
549  histid, n_hist, &n_used);
550  ckd_free(histid);
551  return prob;
552 }
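The varargs history runs in reverse order (most recent predecessor first) and must be NULL-terminated, matching how ngram_tg_score() below fills hist[0] with w2. Scoring P(joy | a whole) for the trigram "a whole joy" looks like:

    int32 score, raw;

    /* "whole" immediately precedes "joy"; "a" precedes "whole". */
    score = ngram_score(lm, "joy", "whole", "a", NULL);
    /* The same query without language weight / insertion penalty: */
    raw = ngram_prob(lm, "joy", "whole", "a", NULL);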
553 
554 int32
555 ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used)
556 {
557  int32 hist[2];
558  hist[0] = w2;
559  hist[1] = w1;
560  return ngram_ng_score(model, w3, hist, 2, n_used);
561 }
562 
563 int32
564 ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used)
565 {
566  return ngram_ng_score(model, w2, &w1, 1, n_used);
567 }
568 
569 int32
570 ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history,
571  int32 n_hist, int32 *n_used)
572 {
573  int32 prob, class_weight = 0;
574  int i;
575 
576  /* Closed vocabulary, OOV word probability is zero */
577  if (wid == NGRAM_INVALID_WID)
578  return model->log_zero;
579 
580  /* "Declassify" wid and history */
581  if (NGRAM_IS_CLASSWID(wid)) {
582  ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];
583 
584  class_weight = ngram_class_prob(lmclass, wid);
585  if (class_weight == 1) /* Meaning, not found in class. */
586  return class_weight;
587  wid = lmclass->tag_wid;
588  }
589  for (i = 0; i < n_hist; ++i) {
590  if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
591  history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
592  }
593  prob = (*model->funcs->raw_score)(model, wid, history,
594  n_hist, n_used);
595  /* Multiply by unigram in-class weight. */
596  return prob + class_weight;
597 }
598 
599 int32
600 ngram_prob(ngram_model_t *model, const char *word, ...)
601 {
602  va_list history;
603  const char *hword;
604  int32 *histid;
605  int32 n_hist;
606  int32 n_used;
607  int32 prob;
608 
609  va_start(history, word);
610  n_hist = 0;
611  while ((hword = va_arg(history, const char *)) != NULL)
612  ++n_hist;
613  va_end(history);
614 
615  histid = ckd_calloc(n_hist, sizeof(*histid));
616  va_start(history, word);
617  n_hist = 0;
618  while ((hword = va_arg(history, const char *)) != NULL) {
619  histid[n_hist] = ngram_wid(model, hword);
620  ++n_hist;
621  }
622  va_end(history);
623 
624  prob = ngram_ng_prob(model, ngram_wid(model, word),
625  histid, n_hist, &n_used);
626  ckd_free(histid);
627  return prob;
628 }
629 
630 int32
631 ngram_score_to_prob(ngram_model_t *base, int32 score)
632 {
633  int32 prob;
634 
635  /* Undo insertion penalty. */
636  prob = score - base->log_wip;
637  /* Undo language weight. */
638  prob = (int32)(prob / base->lw);
639 
640  return prob;
641 }
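In other words, a score is score = lw * log(P) + log(wip) in logmath units, and the function above inverts that. Recovering a linear probability then goes through logmath_exp(), assuming lmath is the logmath_t the model was created with:

    int32 score, logp;
    float64 p;

    score = ngram_score(lm, "joy", "whole", "a", NULL);
    logp = ngram_score_to_prob(lm, score);
    p = logmath_exp(lmath, logp);   /* back to a linear probability */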
642 
643 int32
644 ngram_unknown_wid(ngram_model_t *model)
645 {
646  int32 val;
647 
648  /* FIXME: This could be memoized for speed if necessary. */
649  /* Look up <UNK>, if not found return NGRAM_INVALID_WID. */
650  if (hash_table_lookup_int32(model->wid, "<UNK>", &val) == -1)
651  return NGRAM_INVALID_WID;
652  else
653  return val;
654 }
655 
656 int32
657 ngram_zero(ngram_model_t *model)
658 {
659  return model->log_zero;
660 }
661 
662 int32
663 ngram_model_get_size(ngram_model_t *model)
664 {
665  if (model != NULL)
666  return model->n;
667  return 0;
668 }
669 
670 int32 const *
671 ngram_model_get_counts(ngram_model_t *model)
672 {
673  if (model != NULL)
674  return model->n_counts;
675  return NULL;
676 }
677 
678 void
679 ngram_iter_init(ngram_iter_t *itor, ngram_model_t *model,
680  int m, int successor)
681 {
682  itor->model = model;
683  itor->wids = ckd_calloc(model->n, sizeof(*itor->wids));
684  itor->m = m;
685  itor->successor = successor;
686 }
687 
688 ngram_iter_t *
689 ngram_model_mgrams(ngram_model_t *model, int m)
690 {
691  ngram_iter_t *itor;
692  /* The fact that m=n-1 is not exactly obvious. Prevent accidents. */
693  if (m >= model->n)
694  return NULL;
695  if (model->funcs->mgrams == NULL)
696  return NULL;
697  itor = (*model->funcs->mgrams)(model, m);
698  return itor;
699 }
700 
701 ngram_iter_t *
702 ngram_iter(ngram_model_t *model, const char *word, ...)
703 {
704  va_list history;
705  const char *hword;
706  int32 *histid;
707  int32 n_hist;
708  ngram_iter_t *itor;
709 
710  va_start(history, word);
711  n_hist = 0;
712  while ((hword = va_arg(history, const char *)) != NULL)
713  ++n_hist;
714  va_end(history);
715 
716  histid = ckd_calloc(n_hist, sizeof(*histid));
717  va_start(history, word);
718  n_hist = 0;
719  while ((hword = va_arg(history, const char *)) != NULL) {
720  histid[n_hist] = ngram_wid(model, hword);
721  ++n_hist;
722  }
723  va_end(history);
724 
725  itor = ngram_ng_iter(model, ngram_wid(model, word), histid, n_hist);
726  ckd_free(histid);
727  return itor;
728 }
729 
730 ngram_iter_t *
731 ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
732 {
733  if (n_hist >= model->n)
734  return NULL;
735  if (model->funcs->iter == NULL)
736  return NULL;
737  return (*model->funcs->iter)(model, wid, history, n_hist);
738 }
739 
740 ngram_iter_t *
741 ngram_iter_successors(ngram_iter_t *itor)
742 {
743  /* Stop when we are at the highest order N-Gram. */
744  if (itor->m == itor->model->n - 1)
745  return NULL;
746  return (*itor->model->funcs->successors)(itor);
747 }
748 
749 int32 const *
750 ngram_iter_get(ngram_iter_t *itor,
751  int32 *out_score,
752  int32 *out_bowt)
753 {
754  return (*itor->model->funcs->iter_get)(itor, out_score, out_bowt);
755 }
756 
757 ngram_iter_t *
758 ngram_iter_next(ngram_iter_t *itor)
759 {
760  return (*itor->model->funcs->iter_next)(itor);
761 }
762 
763 void
764 ngram_iter_free(ngram_iter_t *itor)
765 {
766  ckd_free(itor->wids);
767  (*itor->model->funcs->iter_free)(itor);
768 }
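Taken together, the iterator API supports a full dump of the model. A sketch enumerating all unigrams with their scores and backoff weights, assuming (as the implementations are expected to guarantee) that ngram_iter_next() returns NULL when exhausted:

    #include <stdio.h>
    #include "sphinxbase/ngram_model.h"

    static void
    dump_unigrams(ngram_model_t *lm)
    {
        ngram_iter_t *itor;

        /* m = 0 requests unigrams (M-grams of order m + 1). */
        for (itor = ngram_model_mgrams(lm, 0); itor;
             itor = ngram_iter_next(itor)) {
            int32 score, bowt;
            int32 const *wids = ngram_iter_get(itor, &score, &bowt);
            printf("%s %d %d\n", ngram_word(lm, wids[0]), score, bowt);
        }
    }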
769 
770 int32
771 ngram_wid(ngram_model_t *model, const char *word)
772 {
773  int32 val;
774 
775  if (hash_table_lookup_int32(model->wid, word, &val) == -1)
776  return ngram_unknown_wid(model);
777  else
778  return val;
779 }
780 
781 const char *
782 ngram_word(ngram_model_t *model, int32 wid)
783 {
784  /* Remove any class tag */
785  wid = NGRAM_BASEWID(wid);
786  if (wid >= model->n_words)
787  return NULL;
788  return model->word_str[wid];
789 }
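ngram_wid() and ngram_word() are inverses up to the class tag that NGRAM_BASEWID() strips; unknown words map to <UNK> when the model has one, else to NGRAM_INVALID_WID. For instance:

    int32 wid = ngram_wid(lm, "hello");
    if (wid == NGRAM_INVALID_WID)
        E_INFO("OOV word in a closed-vocabulary model\n");
    else
        E_INFO("%d => %s\n", wid, ngram_word(lm, wid));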
790 
791 /**
792  * Add a word to the word string and ID mapping.
793  */
794 int32
795 ngram_add_word_internal(ngram_model_t *model,
796  const char *word,
797  int32 classid)
798 {
799  void *dummy;
800  int32 wid;
801 
802  /* Take the next available word ID */
803  wid = model->n_words;
804  if (classid >= 0) {
805  wid = NGRAM_CLASSWID(wid, classid);
806  }
807  /* Check for hash collisions. */
808  if (hash_table_lookup(model->wid, word, &dummy) == 0) {
809  E_ERROR("Duplicate definition of word %s\n", word);
810  return NGRAM_INVALID_WID;
811  }
812  /* Reallocate word_str if necessary. */
813  if (model->n_words >= model->n_1g_alloc) {
814  model->n_1g_alloc += UG_ALLOC_STEP;
815  model->word_str = ckd_realloc(model->word_str,
816  sizeof(*model->word_str) * model->n_1g_alloc);
817  }
818  /* Add the word string in the appropriate manner. */
819  /* Class words are always dynamically allocated. */
820  model->word_str[model->n_words] = ckd_salloc(word);
821  /* Now enter it into the hash table. */
822  if (hash_table_enter_int32(model->wid, model->word_str[model->n_words], wid) != wid) {
823  E_ERROR("Hash insertion failed for word %s => %p (should not happen)\n",
824  model->word_str[model->n_words], (void *)(long)(wid));
825  }
826  /* Increment number of words. */
827  ++model->n_words;
828  return wid;
829 }
830 
831 int32
832 ngram_model_add_word(ngram_model_t *model,
833  const char *word, float32 weight)
834 {
835  int32 wid, prob = model->log_zero;
836 
837  /* If we add word to unwritable model, we need to make it writable */
838  if (!model->writable) {
839  E_WARN("Can't add word '%s' to read-only language model. "
840  "Disable mmap with '-mmap no' to make it writable\n", word);
841  return -1;
842  }
843 
844  wid = ngram_add_word_internal(model, word, -1);
845  if (wid == NGRAM_INVALID_WID)
846  return wid;
847 
848  /* Do what needs to be done to add the word to the unigram. */
849  if (model->funcs && model->funcs->add_ug)
850  prob = (*model->funcs->add_ug)(model, wid, logmath_log(model->lmath, weight));
851  if (prob == 0) {
852  return -1;
853  }
854  return wid;
855 }
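Adding a word requires a writable (non-mmap'ed) model, and the weight is a linear probability that the implementation folds into the unigram distribution. A minimal fragment:

    /* Add a new unigram with weight 1.0 (no extra scaling). */
    int32 wid = ngram_model_add_word(lm, "sphinxbase", 1.0f);
    if (wid == NGRAM_INVALID_WID)
        E_ERROR("could not add word\n");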
856 
857 ngram_class_t *
858 ngram_class_new(ngram_model_t *model, int32 tag_wid, int32 start_wid, glist_t classwords)
859 {
860  ngram_class_t *lmclass;
861  gnode_t *gn;
862  float32 tprob;
863  int i;
864 
865  lmclass = ckd_calloc(1, sizeof(*lmclass));
866  lmclass->tag_wid = tag_wid;
867  /* wid_base is the wid (minus class tag) of the first word in the list. */
868  lmclass->start_wid = start_wid;
869  lmclass->n_words = glist_count(classwords);
870  lmclass->prob1 = ckd_calloc(lmclass->n_words, sizeof(*lmclass->prob1));
871  lmclass->nword_hash = NULL;
872  lmclass->n_hash = 0;
873  tprob = 0.0;
874  for (gn = classwords; gn; gn = gnode_next(gn)) {
875  tprob += gnode_float32(gn);
876  }
877  if (tprob > 1.1 || tprob < 0.9) {
878  E_WARN("Total class probability is %f, will normalize\n", tprob);
879  for (gn = classwords; gn; gn = gnode_next(gn)) {
880  gn->data.fl /= tprob;
881  }
882  }
883  for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
884  lmclass->prob1[i] = logmath_log(model->lmath, gnode_float32(gn));
885  }
886 
887  return lmclass;
888 }
889 
890 int32
891 ngram_class_add_word(ngram_class_t *lmclass, int32 wid, int32 lweight)
892 {
893  int32 hash;
894 
895  if (lmclass->nword_hash == NULL) {
896  /* Initialize everything in it to -1 */
897  lmclass->nword_hash = ckd_malloc(NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
898  memset(lmclass->nword_hash, 0xff, NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
899  lmclass->n_hash = NGRAM_HASH_SIZE;
900  lmclass->n_hash_inuse = 0;
901  }
902  /* Stupidest possible hash function. This will work pretty well
903  * when this function is called repeatedly with contiguous word
904  * IDs, though... */
905  hash = wid & (lmclass->n_hash - 1);
906  if (lmclass->nword_hash[hash].wid == -1) {
907  /* Good, no collision. */
908  lmclass->nword_hash[hash].wid = wid;
909  lmclass->nword_hash[hash].prob1 = lweight;
910  ++lmclass->n_hash_inuse;
911  return hash;
912  }
913  else {
914  int32 next;
915  /* Collision... Find the end of the hash chain. */
916  while (lmclass->nword_hash[hash].next != -1)
917  hash = lmclass->nword_hash[hash].next;
918  assert(hash != -1);
919  /* Does we has any more bukkit? */
920  if (lmclass->n_hash_inuse == lmclass->n_hash) {
921  /* Oh noes! Ok, we makes more. */
922  lmclass->nword_hash = ckd_realloc(lmclass->nword_hash,
923  lmclass->n_hash * 2 * sizeof(*lmclass->nword_hash));
924  memset(lmclass->nword_hash + lmclass->n_hash,
925  0xff, lmclass->n_hash * sizeof(*lmclass->nword_hash));
926  /* Just use the next allocated one (easy) */
927  next = lmclass->n_hash;
928  lmclass->n_hash *= 2;
929  }
930  else {
931  /* Look for any available bucket. We hope this doesn't happen. */
932  for (next = 0; next < lmclass->n_hash; ++next)
933  if (lmclass->nword_hash[next].wid == -1)
934  break;
935  /* This should absolutely not happen. */
936  assert(next != lmclass->n_hash);
937  }
938  lmclass->nword_hash[next].wid = wid;
939  lmclass->nword_hash[next].prob1 = lweight;
940  lmclass->nword_hash[hash].next = next;
941  ++lmclass->n_hash_inuse;
942  return next;
943  }
944 }
945 
946 void
947 ngram_class_free(ngram_class_t *lmclass)
948 {
949  ckd_free(lmclass->nword_hash);
950  ckd_free(lmclass->prob1);
951  ckd_free(lmclass);
952 }
953 
954 int32
955 ngram_model_add_class_word(ngram_model_t *model,
956  const char *classname,
957  const char *word,
958  float32 weight)
959 {
960  ngram_class_t *lmclass;
961  int32 classid, tag_wid, wid, i, scale;
962  float32 fprob;
963 
964  /* Find the class corresponding to classname. Linear search
965  * probably okay here since there won't be very many classes, and
966  * this doesn't have to be fast. */
967  tag_wid = ngram_wid(model, classname);
968  if (tag_wid == NGRAM_INVALID_WID) {
969  E_ERROR("No such word or class tag: %s\n", classname);
970  return tag_wid;
971  }
972  for (classid = 0; classid < model->n_classes; ++classid) {
973  if (model->classes[classid]->tag_wid == tag_wid)
974  break;
975  }
976  /* Hmm, no such class. It's probably not a good idea to create one. */
977  if (classid == model->n_classes) {
978  E_ERROR("Word %s is not a class tag (call ngram_model_add_class() first)\n", classname);
979  return NGRAM_INVALID_WID;
980  }
981  lmclass = model->classes[classid];
982 
983  /* Add this word to the model's set of words. */
984  wid = ngram_add_word_internal(model, word, classid);
985  if (wid == NGRAM_INVALID_WID)
986  return wid;
987 
988  /* This is the fixed probability of the new word. */
989  fprob = weight * 1.0f / (lmclass->n_words + lmclass->n_hash_inuse + 1);
990  /* Now normalize everything else to fit it in. This is
991  * accomplished by simply scaling all the other probabilities
992  * by (1-fprob). */
993  scale = logmath_log(model->lmath, 1.0 - fprob);
994  for (i = 0; i < lmclass->n_words; ++i)
995  lmclass->prob1[i] += scale;
996  for (i = 0; i < lmclass->n_hash; ++i)
997  if (lmclass->nword_hash[i].wid != -1)
998  lmclass->nword_hash[i].prob1 += scale;
999 
1000  /* Now add it to the class hash table. */
1001  return ngram_class_add_word(lmclass, wid, logmath_log(model->lmath, fprob));
1002 }
1003 
1004 int32
1005 ngram_model_add_class(ngram_model_t *model,
1006  const char *classname,
1007  float32 classweight,
1008  char **words,
1009  const float32 *weights,
1010  int32 n_words)
1011 {
1012  ngram_class_t *lmclass;
1013  glist_t classwords = NULL;
1014  int32 i, start_wid = -1;
1015  int32 classid, tag_wid;
1016 
1017  /* Check if classname already exists in model. If not, add it.*/
1018  if ((tag_wid = ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
1019  tag_wid = ngram_model_add_word(model, classname, classweight);
1020  if (tag_wid == NGRAM_INVALID_WID)
1021  return -1;
1022  }
1023 
1024  if (model->n_classes == 128) {
1025  E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
1026  return -1;
1027  }
1028  classid = model->n_classes;
1029  for (i = 0; i < n_words; ++i) {
1030  int32 wid;
1031 
1032  wid = ngram_add_word_internal(model, words[i], classid);
1033  if (wid == NGRAM_INVALID_WID)
1034  return -1;
1035  if (start_wid == -1)
1036  start_wid = NGRAM_BASEWID(wid);
1037  classwords = glist_add_float32(classwords, weights[i]);
1038  }
1039  classwords = glist_reverse(classwords);
1040  lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
1041  glist_free(classwords);
1042  if (lmclass == NULL)
1043  return -1;
1044 
1045  ++model->n_classes;
1046  if (model->classes == NULL)
1047  model->classes = ckd_calloc(1, sizeof(*model->classes));
1048  else
1049  model->classes = ckd_realloc(model->classes,
1050  model->n_classes * sizeof(*model->classes));
1051  model->classes[classid] = lmclass;
1052  return classid;
1053 }
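A class acts as a single unigram (the tag word) whose probability mass is divided among its members according to the given weights. Defining a two-word class might look like this (names are hypothetical):

    const char *words[] = { "alice", "bob" };
    const float32 weights[] = { 0.5f, 0.5f };

    if (ngram_model_add_class(lm, "[name]", 1.0f,
                              (char **)words, weights, 2) < 0)
        E_ERROR("could not add class\n");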
1054 
1055 int32
1056 ngram_class_prob(ngram_class_t *lmclass, int32 wid)
1057 {
1058  int32 base_wid = NGRAM_BASEWID(wid);
1059 
1060  if (base_wid < lmclass->start_wid
1061  || base_wid > lmclass->start_wid + lmclass->n_words) {
1062  int32 hash;
1063 
1064  /* Look it up in the hash table. */
1065  hash = wid & (lmclass->n_hash - 1);
1066  while (hash != -1 && lmclass->nword_hash[hash].wid != wid)
1067  hash = lmclass->nword_hash[hash].next;
1068  if (hash == -1)
1069  return 1;
1070  return lmclass->nword_hash[hash].prob1;
1071  }
1072  else {
1073  return lmclass->prob1[base_wid - lmclass->start_wid];
1074  }
1075 }
1076 
1077 int32
1078 read_classdef_file(hash_table_t *classes, const char *file_name)
1079 {
1080  FILE *fp;
1081  int32 is_pipe;
1082  int inclass;
1083  int32 rv = -1;
1084  gnode_t *gn;
1085  glist_t classwords = NULL;
1086  glist_t classprobs = NULL;
1087  char *classname = NULL;
1088 
1089  if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
1090  E_ERROR("File %s not found\n", file_name);
1091  return -1;
1092  }
1093 
1094  inclass = FALSE;
1095  while (!feof(fp)) {
1096  char line[512];
1097  char *wptr[2];
1098  int n_words;
1099 
1100  if (fgets(line, sizeof(line), fp) == NULL)
1101  break;
1102 
1103  n_words = str2words(line, wptr, 2);
1104  if (n_words <= 0)
1105  continue;
1106 
1107  if (inclass) {
1108  /* Look for an end of class marker. */
1109  if (n_words == 2 && 0 == strcmp(wptr[0], "END")) {
1110  classdef_t *classdef;
1111  gnode_t *word, *weight;
1112  int32 i;
1113 
1114  if (classname == NULL || 0 != strcmp(wptr[1], classname))
1115  goto error_out;
1116  inclass = FALSE;
1117 
1118  /* Construct a class from the list of words collected. */
1119  classdef = ckd_calloc(1, sizeof(*classdef));
1120  classwords = glist_reverse(classwords);
1121  classprobs = glist_reverse(classprobs);
1122  classdef->n_words = glist_count(classwords);
1123  classdef->words = ckd_calloc(classdef->n_words,
1124  sizeof(*classdef->words));
1125  classdef->weights = ckd_calloc(classdef->n_words,
1126  sizeof(*classdef->weights));
1127  word = classwords;
1128  weight = classprobs;
1129  for (i = 0; i < classdef->n_words; ++i) {
1130  classdef->words[i] = gnode_ptr(word);
1131  classdef->weights[i] = gnode_float32(weight);
1132  word = gnode_next(word);
1133  weight = gnode_next(weight);
1134  }
1135 
1136  /* Add this class to the hash table. */
1137  if (hash_table_enter(classes, classname, classdef) != classdef) {
1138  classdef_free(classdef);
1139  goto error_out;
1140  }
1141 
1142  /* Reset everything. */
1143  glist_free(classwords);
1144  glist_free(classprobs);
1145  classwords = NULL;
1146  classprobs = NULL;
1147  classname = NULL;
1148  }
1149  else {
1150  float32 fprob;
1151 
1152  if (n_words == 2)
1153  fprob = (float32)atof_c(wptr[1]);
1154  else
1155  fprob = 1.0f;
1156  /* Add it to the list of words for this class. */
1157  classwords = glist_add_ptr(classwords, ckd_salloc(wptr[0]));
1158  classprobs = glist_add_float32(classprobs, fprob);
1159  }
1160  }
1161  else {
1162  /* Start a new LM class if the LMCLASS marker is seen */
1163  if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) {
1164  if (inclass)
1165  goto error_out;
1166  inclass = TRUE;
1167  classname = ckd_salloc(wptr[1]);
1168  }
1169  /* Otherwise, just ignore whatever junk we got */
1170  }
1171  }
1172  rv = 0; /* Success. */
1173 
1174 error_out:
1175  /* Free all the stuff we might have allocated. */
1176  fclose_comp(fp, is_pipe);
1177  for (gn = classwords; gn; gn = gnode_next(gn))
1178  ckd_free(gnode_ptr(gn));
1179  glist_free(classwords);
1180  glist_free(classprobs);
1181  ckd_free(classname);
1182 
1183  return rv;
1184 }
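The classdef syntax accepted by this parser is line-oriented: an LMCLASS line followed by a class name opens a class, each member line gives a word and an optional probability (defaulting to 1.0), and a matching END line closes it. A small example file:

    LMCLASS [name]
    alice 0.5
    bob 0.5
    END [name]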
1185 
1186 void
1187 classdef_free(classdef_t *classdef)
1188 {
1189  int32 i;
1190  for (i = 0; i < classdef->n_words; ++i)
1191  ckd_free(classdef->words[i]);
1192  ckd_free(classdef->words);
1193  ckd_free(classdef->weights);
1194  ckd_free(classdef);
1195 }
1196 
1197 
1198 int32
1199 ngram_model_read_classdef(ngram_model_t *model,
1200  const char *file_name)
1201 {
1202  hash_table_t *classes;
1203  glist_t hl = NULL;
1204  gnode_t *gn;
1205  int32 rv = -1;
1206 
1207  classes = hash_table_new(0, FALSE);
1208  if (read_classdef_file(classes, file_name) < 0) {
1209  hash_table_free(classes);
1210  return -1;
1211  }
1212 
1213  /* Create a new class in the language model for each classdef. */
1214  hl = hash_table_tolist(classes, NULL);
1215  for (gn = hl; gn; gn = gnode_next(gn)) {
1216  hash_entry_t *he = gnode_ptr(gn);
1217  classdef_t *classdef = he->val;
1218 
1219  if (ngram_model_add_class(model, he->key, 1.0,
1220  classdef->words,
1221  classdef->weights,
1222  classdef->n_words) < 0)
1223  goto error_out;
1224  }
1225  rv = 0;
1226 
1227 error_out:
1228  for (gn = hl; gn; gn = gnode_next(gn)) {
1229  hash_entry_t *he = gnode_ptr(gn);
1230  ckd_free((char *)he->key);
1231  classdef_free(he->val);
1232  }
1233  glist_free(hl);
1234  hash_table_free(classes);
1235  return rv;
1236 }