SphinxBase  0.6
ngram_model_arpa.c
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1999-2007 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 /*
38  * \file ngram_model_arpa.c ARPA format language models
39  *
40  * Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
41  */
42 
#include <string.h>
#include <limits.h>
#include <assert.h>

#include "sphinxbase/ckd_alloc.h"
#include "sphinxbase/err.h"
#include "sphinxbase/pio.h"
#include "sphinxbase/listelem_alloc.h"
#include "sphinxbase/strfuncs.h"

#include "ngram_model_arpa.h"
54 
55 static ngram_funcs_t ngram_model_arpa_funcs;
56 
57 #define TSEG_BASE(m,b) ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
58 #define FIRST_BG(m,u) ((m)->lm3g.unigrams[u].bigrams)
59 #define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))
60 
61 /*
62  * Read and return #unigrams, #bigrams, #trigrams as stated in input file.
63  */
64 static int
65 ReadNgramCounts(lineiter_t **li, int32 * n_ug, int32 * n_bg, int32 * n_tg)
66 {
67  int32 ngram, ngram_cnt;
68 
69  /* skip file until past the '\data\' marker */
70  while (*li) {
71  string_trim((*li)->buf, STRING_BOTH);
72  if (strcmp((*li)->buf, "\\data\\") == 0)
73  break;
74  *li = lineiter_next(*li);
75  }
76  if (*li == NULL || strcmp((*li)->buf, "\\data\\") != 0) {
77  E_INFO("No \\data\\ mark in LM file\n");
78  return -1;
79  }
80 
81  *n_ug = *n_bg = *n_tg = 0;
82  while ((*li = lineiter_next(*li))) {
83  if (sscanf((*li)->buf, "ngram %d=%d", &ngram, &ngram_cnt) != 2)
84  break;
85  switch (ngram) {
86  case 1:
87  *n_ug = ngram_cnt;
88  break;
89  case 2:
90  *n_bg = ngram_cnt;
91  break;
92  case 3:
93  *n_tg = ngram_cnt;
94  break;
95  default:
96  E_ERROR("Unknown ngram (%d)\n", ngram);
97  return -1;
98  }
99  }
100  if (*li == NULL) {
101  E_ERROR("EOF while reading ngram counts\n");
102  return -1;
103  }
104 
105  /* Position iterator to the unigrams header '\1-grams:\' */
106  while ((*li = lineiter_next(*li))) {
107  string_trim((*li)->buf, STRING_BOTH);
108  if (strcmp((*li)->buf, "\\1-grams:") == 0)
109  break;
110  }
111  if (*li == NULL) {
112  E_ERROR_SYSTEM("Failed to read \\1-grams: mark");
113  return -1;
114  }
115 
116  if ((*n_ug <= 0) || (*n_bg < 0) || (*n_tg < 0)) {
117  E_ERROR("Bad or missing ngram count\n");
118  return -1;
119  }
120  return 0;
121 }
122 
123 /*
124  * Read in the unigrams from given file into the LM structure model.
125  * On entry to this procedure, the iterator is positioned to the
126  * header line '\1-grams:'.
127  */
/*
 * Read in the unigrams from given file into the LM structure model.
 * On entry to this procedure, the iterator is positioned to the
 * header line '\1-grams:'.
 *
 * Each unigram line is "<log10 prob> <word> [<log10 backoff>]".
 * Returns 0 on success, -1 if more unigrams appear than the \data\
 * header declared.  If fewer appear, the model's counts are reduced
 * to match, with a warning.
 */
static int
ReadUnigrams(lineiter_t **li, ngram_model_arpa_t * model)
{
    ngram_model_t *base = &model->base;
    int32 wcnt;                 /* number of unigrams read so far */
    float p1;                   /* log10 unigram probability from the file */

    E_INFO("Reading unigrams\n");

    wcnt = 0;
    while ((*li = lineiter_next(*li))) {
        char *wptr[3], *name;
        float32 bo_wt = 0.0f;   /* backoff defaults to log10(1) = 0 when absent */
        int n;

        string_trim((*li)->buf, STRING_BOTH);
        /* A following section header terminates the unigram list. */
        if (strcmp((*li)->buf, "\\2-grams:") == 0
            || strcmp((*li)->buf, "\\end\\") == 0)
            break;

        if ((n = str2words((*li)->buf, wptr, 3)) < 2) {
            /* Blank lines are skipped silently; anything else malformed
             * is reported but does not abort the read. */
            if ((*li)->buf[0] != '\0')
                E_WARN("Format error; unigram ignored: %s\n", (*li)->buf);
            continue;
        }
        else {
            p1 = (float)atof_c(wptr[0]);
            name = wptr[1];
            if (n == 3)
                bo_wt = (float)atof_c(wptr[2]);
        }

        if (wcnt >= base->n_counts[0]) {
            E_ERROR("Too many unigrams\n");
            return -1;
        }

        /* Associate name with word id */
        base->word_str[wcnt] = ckd_salloc(name);
        /* hash_table_enter returns the existing value on collision, so a
         * mismatch with wcnt means this word was already entered. */
        if ((hash_table_enter(base->wid, base->word_str[wcnt], (void *)(long)wcnt))
            != (void *)(long)wcnt) {
            E_WARN("Duplicate word in dictionary: %s\n", base->word_str[wcnt]);
        }
        /* Convert file log10 values into the model's internal log base. */
        model->lm3g.unigrams[wcnt].prob1.l = logmath_log10_to_log(base->lmath, p1);
        model->lm3g.unigrams[wcnt].bo_wt1.l = logmath_log10_to_log(base->lmath, bo_wt);
        wcnt++;
    }

    /* Reconcile declared count with what was actually read. */
    if (base->n_counts[0] != wcnt) {
        E_WARN("lm_t.ucount(%d) != #unigrams read(%d)\n",
               base->n_counts[0], wcnt);
        base->n_counts[0] = wcnt;
        base->n_words = wcnt;
    }
    return 0;
}
184 
185 /*
186  * Read bigrams from given file into given model structure.
187  */
188 static int
189 ReadBigrams(lineiter_t **li, ngram_model_arpa_t * model)
190 {
191  ngram_model_t *base = &model->base;
192  int32 w1, w2, prev_w1, bgcount;
193  bigram_t *bgptr;
194 
195  E_INFO("Reading bigrams\n");
196 
197  bgcount = 0;
198  bgptr = model->lm3g.bigrams;
199  prev_w1 = -1;
200 
201  while ((*li = lineiter_next(*li))) {
202  float32 p, bo_wt = 0.0f;
203  int32 p2, bo_wt2;
204  char *wptr[4], *word1, *word2;
205  int n;
206 
207  string_trim((*li)->buf, STRING_BOTH);
208  wptr[3] = NULL;
209  if ((n = str2words((*li)->buf, wptr, 4)) < 3) {
210  if ((*li)->buf[0] != '\0')
211  break;
212  continue;
213  }
214  else {
215  p = (float32)atof_c(wptr[0]);
216  word1 = wptr[1];
217  word2 = wptr[2];
218  if (wptr[3])
219  bo_wt = (float32)atof_c(wptr[3]);
220  }
221 
222  if ((w1 = ngram_wid(base, word1)) == NGRAM_INVALID_WID) {
223  E_ERROR("Unknown word: %s, skipping bigram (%s %s)\n",
224  word1, word1, word2);
225  continue;
226  }
227  if ((w2 = ngram_wid(base, word2)) == NGRAM_INVALID_WID) {
228  E_ERROR("Unknown word: %s, skipping bigram (%s %s)\n",
229  word2, word1, word2);
230  continue;
231  }
232 
233  /* FIXME: Should use logmath_t quantization here. */
234  /* HACK!! to quantize probs to 4 decimal digits */
235  p = (float32)((int32)(p * 10000)) / 10000;
236  bo_wt = (float32)((int32)(bo_wt * 10000)) / 10000;
237 
238  p2 = logmath_log10_to_log(base->lmath, p);
239  bo_wt2 = logmath_log10_to_log(base->lmath, bo_wt);
240 
241  if (bgcount >= base->n_counts[1]) {
242  E_ERROR("Too many bigrams\n");
243  return -1;
244  }
245 
246  bgptr->wid = w2;
247  bgptr->prob2 = sorted_id(&model->sorted_prob2, &p2);
248  if (base->n_counts[2] > 0)
249  bgptr->bo_wt2 = sorted_id(&model->sorted_bo_wt2, &bo_wt2);
250 
251  if (w1 != prev_w1) {
252  if (w1 < prev_w1) {
253  E_ERROR("Bigrams not in unigram order\n");
254  return -1;
255  }
256 
257  for (prev_w1++; prev_w1 <= w1; prev_w1++)
258  model->lm3g.unigrams[prev_w1].bigrams = bgcount;
259  prev_w1 = w1;
260  }
261  bgcount++;
262  bgptr++;
263 
264  if ((bgcount & 0x0000ffff) == 0) {
265  E_INFOCONT(".");
266  }
267  }
268  if (*li == NULL || ((strcmp((*li)->buf, "\\end\\") != 0)
269  && (strcmp((*li)->buf, "\\3-grams:") != 0))) {
270  E_ERROR("Bad bigram: %s\n", (*li)->buf);
271  return -1;
272  }
273 
274  for (prev_w1++; prev_w1 <= base->n_counts[0]; prev_w1++)
275  model->lm3g.unigrams[prev_w1].bigrams = bgcount;
276 
277  return 0;
278 }
279 
280 /*
281  * Very similar to ReadBigrams.
282  */
283 static int
284 ReadTrigrams(lineiter_t **li, ngram_model_arpa_t * model)
285 {
286  ngram_model_t *base = &model->base;
287  int32 i, w1, w2, w3, prev_w1, prev_w2, tgcount, prev_bg, bg, endbg;
288  int32 seg, prev_seg, prev_seg_lastbg;
289  trigram_t *tgptr;
290  bigram_t *bgptr;
291 
292  E_INFO("Reading trigrams\n");
293 
294  tgcount = 0;
295  tgptr = model->lm3g.trigrams;
296  prev_w1 = -1;
297  prev_w2 = -1;
298  prev_bg = -1;
299  prev_seg = -1;
300 
301  while ((*li = lineiter_next(*li))) {
302  float32 p;
303  int32 p3;
304  char *wptr[4], *word1, *word2, *word3;
305 
306  string_trim((*li)->buf, STRING_BOTH);
307  if (str2words((*li)->buf, wptr, 4) != 4) {
308  if ((*li)->buf[0] != '\0')
309  break;
310  continue;
311  }
312  else {
313  p = (float32)atof_c(wptr[0]);
314  word1 = wptr[1];
315  word2 = wptr[2];
316  word3 = wptr[3];
317  }
318 
319  if ((w1 = ngram_wid(base, word1)) == NGRAM_INVALID_WID) {
320  E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
321  word1, word1, word2, word3);
322  continue;
323  }
324  if ((w2 = ngram_wid(base, word2)) == NGRAM_INVALID_WID) {
325  E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
326  word2, word1, word2, word3);
327  continue;
328  }
329  if ((w3 = ngram_wid(base, word3)) == NGRAM_INVALID_WID) {
330  E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
331  word3, word1, word2, word3);
332  continue;
333  }
334 
335  /* FIXME: Should use logmath_t quantization here. */
336  /* HACK!! to quantize probs to 4 decimal digits */
337  p = (float32)((int32)(p * 10000)) / 10000;
338  p3 = logmath_log10_to_log(base->lmath, p);
339 
340  if (tgcount >= base->n_counts[2]) {
341  E_ERROR("Too many trigrams\n");
342  return -1;
343  }
344 
345  tgptr->wid = w3;
346  tgptr->prob3 = sorted_id(&model->sorted_prob3, &p3);
347 
348  if ((w1 != prev_w1) || (w2 != prev_w2)) {
349  /* Trigram for a new bigram; update tg info for all previous bigrams */
350  if ((w1 < prev_w1) || ((w1 == prev_w1) && (w2 < prev_w2))) {
351  E_ERROR("Trigrams not in bigram order\n");
352  return -1;
353  }
354 
355  bg = (w1 !=
356  prev_w1) ? model->lm3g.unigrams[w1].bigrams : prev_bg + 1;
357  endbg = model->lm3g.unigrams[w1 + 1].bigrams;
358  bgptr = model->lm3g.bigrams + bg;
359  for (; (bg < endbg) && (bgptr->wid != w2); bg++, bgptr++);
360  if (bg >= endbg) {
361  E_ERROR("Missing bigram for trigram: %s", (*li)->buf);
362  return -1;
363  }
364 
365  /* bg = bigram entry index for <w1,w2>. Update tseg_base */
366  seg = bg >> LOG_BG_SEG_SZ;
367  for (i = prev_seg + 1; i <= seg; i++)
368  model->lm3g.tseg_base[i] = tgcount;
369 
370  /* Update trigrams pointers for all bigrams until bg */
371  if (prev_seg < seg) {
372  int32 tgoff = 0;
373 
374  if (prev_seg >= 0) {
375  tgoff = tgcount - model->lm3g.tseg_base[prev_seg];
376  if (tgoff > 65535) {
377  E_ERROR("Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
378  return -1;
379  }
380  }
381 
382  prev_seg_lastbg = ((prev_seg + 1) << LOG_BG_SEG_SZ) - 1;
383  bgptr = model->lm3g.bigrams + prev_bg;
384  for (++prev_bg, ++bgptr; prev_bg <= prev_seg_lastbg;
385  prev_bg++, bgptr++)
386  bgptr->trigrams = tgoff;
387 
388  for (; prev_bg <= bg; prev_bg++, bgptr++)
389  bgptr->trigrams = 0;
390  }
391  else {
392  int32 tgoff;
393 
394  tgoff = tgcount - model->lm3g.tseg_base[prev_seg];
395  if (tgoff > 65535) {
396  E_ERROR("Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
397  return -1;
398  }
399 
400  bgptr = model->lm3g.bigrams + prev_bg;
401  for (++prev_bg, ++bgptr; prev_bg <= bg; prev_bg++, bgptr++)
402  bgptr->trigrams = tgoff;
403  }
404 
405  prev_w1 = w1;
406  prev_w2 = w2;
407  prev_bg = bg;
408  prev_seg = seg;
409  }
410 
411  tgcount++;
412  tgptr++;
413 
414  if ((tgcount & 0x0000ffff) == 0) {
415  E_INFOCONT(".");
416  }
417  }
418  if (*li == NULL || strcmp((*li)->buf, "\\end\\") != 0) {
419  E_ERROR("Bad trigram: %s\n", (*li)->buf);
420  return -1;
421  }
422 
423  for (prev_bg++; prev_bg <= base->n_counts[1]; prev_bg++) {
424  if ((prev_bg & (BG_SEG_SZ - 1)) == 0)
425  model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ] = tgcount;
426  if ((tgcount - model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ]) > 65535) {
427  E_ERROR("Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
428  return -1;
429  }
430  model->lm3g.bigrams[prev_bg].trigrams =
431  tgcount - model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ];
432  }
433  return 0;
434 }
435 
436 static unigram_t *
437 new_unigram_table(int32 n_ug)
438 {
439  unigram_t *table;
440  int32 i;
441 
442  table = ckd_calloc(n_ug, sizeof(unigram_t));
443  for (i = 0; i < n_ug; i++) {
444  table[i].prob1.l = INT_MIN;
445  table[i].bo_wt1.l = INT_MIN;
446  }
447  return table;
448 }
449 
451 ngram_model_arpa_read(cmd_ln_t *config,
452  const char *file_name,
453  logmath_t *lmath)
454 {
455  lineiter_t *li;
456  FILE *fp;
457  int32 is_pipe;
458  int32 n_unigram;
459  int32 n_bigram;
460  int32 n_trigram;
461  int32 n;
462  ngram_model_arpa_t *model;
463  ngram_model_t *base;
464 
465  if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
466  E_ERROR("File %s not found\n", file_name);
467  return NULL;
468  }
469  li = lineiter_start(fp);
470 
471  /* Read #unigrams, #bigrams, #trigrams from file */
472  if (ReadNgramCounts(&li, &n_unigram, &n_bigram, &n_trigram) == -1) {
473  lineiter_free(li);
474  fclose_comp(fp, is_pipe);
475  return NULL;
476  }
477  E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
478 
479  /* Allocate space for LM, including initial OOVs and placeholders; initialize it */
480  model = ckd_calloc(1, sizeof(*model));
481  base = &model->base;
482  if (n_trigram > 0)
483  n = 3;
484  else if (n_bigram > 0)
485  n = 2;
486  else
487  n = 1;
488  /* Initialize base model. */
489  ngram_model_init(base, &ngram_model_arpa_funcs, lmath, n, n_unigram);
490  base->n_counts[0] = n_unigram;
491  base->n_counts[1] = n_bigram;
492  base->n_counts[2] = n_trigram;
493  base->writable = TRUE;
494 
495  /*
496  * Allocate one extra unigram and bigram entry: sentinels to terminate
497  * followers (bigrams and trigrams, respectively) of previous entry.
498  */
499  model->lm3g.unigrams = new_unigram_table(n_unigram + 1);
500  model->lm3g.bigrams =
501  ckd_calloc(n_bigram + 1, sizeof(bigram_t));
502  if (n_trigram > 0)
503  model->lm3g.trigrams =
504  ckd_calloc(n_trigram, sizeof(trigram_t));
505 
506  if (n_trigram > 0) {
507  model->lm3g.tseg_base =
508  ckd_calloc((n_bigram + 1) / BG_SEG_SZ + 1,
509  sizeof(int32));
510  }
511  if (ReadUnigrams(&li, model) == -1) {
512  fclose_comp(fp, is_pipe);
513  ngram_model_free(base);
514  return NULL;
515  }
516  E_INFO("%8d = #unigrams created\n", base->n_counts[0]);
517 
518  init_sorted_list(&model->sorted_prob2);
519  if (base->n_counts[2] > 0)
520  init_sorted_list(&model->sorted_bo_wt2);
521 
522  if (base->n_counts[1] > 0) {
523  if (ReadBigrams(&li, model) == -1) {
524  fclose_comp(fp, is_pipe);
525  ngram_model_free(base);
526  return NULL;
527  }
528 
529  base->n_counts[1] = FIRST_BG(model, base->n_counts[0]);
530  model->lm3g.n_prob2 = model->sorted_prob2.free;
531  model->lm3g.prob2 = vals_in_sorted_list(&model->sorted_prob2);
532  free_sorted_list(&model->sorted_prob2);
533  E_INFO("%8d = #bigrams created\n", base->n_counts[1]);
534  E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2);
535  }
536 
537  if (base->n_counts[2] > 0) {
538  /* Create trigram bo-wts array */
539  model->lm3g.n_bo_wt2 = model->sorted_bo_wt2.free;
540  model->lm3g.bo_wt2 = vals_in_sorted_list(&model->sorted_bo_wt2);
541  free_sorted_list(&model->sorted_bo_wt2);
542  E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2);
543 
544  init_sorted_list(&model->sorted_prob3);
545 
546  if (ReadTrigrams(&li, model) == -1) {
547  fclose_comp(fp, is_pipe);
548  ngram_model_free(base);
549  return NULL;
550  }
551 
552  base->n_counts[2] = FIRST_TG(model, base->n_counts[1]);
553  model->lm3g.n_prob3 = model->sorted_prob3.free;
554  model->lm3g.prob3 = vals_in_sorted_list(&model->sorted_prob3);
555  E_INFO("%8d = #trigrams created\n", base->n_counts[2]);
556  E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3);
557 
558  free_sorted_list(&model->sorted_prob3);
559 
560  /* Initialize tginfo */
561  model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *));
562  model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
563  }
564 
565  lineiter_free(li);
566  fclose_comp(fp, is_pipe);
567  return base;
568 }
569 
570 int
571 ngram_model_arpa_write(ngram_model_t *model,
572  const char *file_name)
573 {
574  ngram_iter_t *itor;
575  FILE *fh;
576  int i;
577 
578  if ((fh = fopen(file_name, "w")) == NULL) {
579  E_ERROR_SYSTEM("Failed to open %s for writing", file_name);
580  return -1;
581  }
582  fprintf(fh, "This is an ARPA-format language model file, generated by CMU Sphinx\n");
583 
584  /* The ARPA format doesn't require any extra information that
585  * N-Gram iterators can't give us, so this is very
586  * straightforward compared with DMP writing. */
587 
588  /* Write N-gram counts. */
589  fprintf(fh, "\\data\\\n");
590  for (i = 0; i < model->n; ++i) {
591  fprintf(fh, "ngram %d=%d\n", i+1, model->n_counts[i]);
592  }
593 
594  /* Write N-grams */
595  for (i = 0; i < model->n; ++i) {
596  fprintf(fh, "\n\\%d-grams:\n", i + 1);
597  for (itor = ngram_model_mgrams(model, i); itor; itor = ngram_iter_next(itor)) {
598  int32 const *wids;
599  int32 score, bowt;
600  int j;
601 
602  wids = ngram_iter_get(itor, &score, &bowt);
603  fprintf(fh, "%.4f ", logmath_log_to_log10(model->lmath, score));
604  for (j = 0; j <= i; ++j) {
605  assert(wids[j] < model->n_counts[0]);
606  fprintf(fh, "%s ", model->word_str[wids[j]]);
607  }
608  if (i < model->n-1)
609  fprintf(fh, "%.4f", logmath_log_to_log10(model->lmath, bowt));
610  fprintf(fh, "\n");
611  }
612  }
613  fprintf(fh, "\n\\end\\\n");
614  return fclose(fh);
615 }
616 
/* Apply language weight, word insertion penalty and unigram weight to
 * the model in place, delegating to the shared lm3g implementation.
 * Always returns 0. */
static int
ngram_model_arpa_apply_weights(ngram_model_t *base, float32 lw,
                               float32 wip, float32 uw)
{
    ngram_model_arpa_t *model = (ngram_model_arpa_t *)base;
    lm3g_apply_weights(base, &model->lm3g, lw, wip, uw);
    return 0;
}
625 
626 /* Lousy "templating" for things that are largely the same in DMP and
627  * ARPA models, except for the bigram and trigram types and some
628  * names. */
629 #define NGRAM_MODEL_TYPE ngram_model_arpa_t
630 #include "lm3g_templates.c"
631 
/* Release the ARPA-specific (lm3g) storage of the model.  Base-level
 * fields (word strings, hash table, the model struct itself) are freed
 * by the generic ngram_model_free() machinery, not here. */
static void
ngram_model_arpa_free(ngram_model_t *base)
{
    ngram_model_arpa_t *model = (ngram_model_arpa_t *)base;
    ckd_free(model->lm3g.unigrams);
    ckd_free(model->lm3g.bigrams);
    ckd_free(model->lm3g.trigrams);
    ckd_free(model->lm3g.prob2);
    ckd_free(model->lm3g.bo_wt2);
    ckd_free(model->lm3g.prob3);
    /* Frees the cached tginfo lists and their allocator. */
    lm3g_tginfo_free(base, &model->lm3g);
    ckd_free(model->lm3g.tseg_base);
}
645 
/* Virtual function table binding the generic ngram_model_t interface to
 * this ARPA implementation; most operations come from the shared lm3g
 * templates included above. */
static ngram_funcs_t ngram_model_arpa_funcs = {
    ngram_model_arpa_free,          /* free */
    ngram_model_arpa_apply_weights, /* apply_weights */
    lm3g_template_score,            /* score */
    lm3g_template_raw_score,        /* raw_score */
    lm3g_template_add_ug,           /* add_ug */
    lm3g_template_flush,            /* flush */
    lm3g_template_iter,             /* iter */
    lm3g_template_mgrams,           /* mgrams */
    lm3g_template_successors,       /* successors */
    lm3g_template_iter_get,         /* iter_get */
    lm3g_template_iter_next,        /* iter_next */
    lm3g_template_iter_free         /* iter_free */
};
lmprob_t bo_wt1
Unigram backoff weight.
Definition: lm3g_model.h:93
listelem_alloc_t * le
List element allocator for tginfo.
Definition: lm3g_model.h:156
Miscellaneous useful string functions.
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition: ckd_alloc.h:248
int32 n_prob3
prob3 size
Definition: lm3g_model.h:151
hash_table_t * wid
Mapping of unigram names to word IDs.
char ** word_str
Unigram names.
#define E_INFO
Print logging information to standard error stream.
Definition: err.h:147
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT int32 ngram_wid(ngram_model_t *model, const char *word)
Look up numerical word ID.
Definition: ngram_model.c:771
SPHINXBASE_EXPORT int32 const * ngram_iter_get(ngram_iter_t *itor, int32 *out_score, int32 *out_bowt)
Get information from the current M-gram in an iterator.
Definition: ngram_model.c:750
lmprob_t * prob2
Table of actual bigram probs.
Definition: lm3g_model.h:146
Bigram structure.
#define NGRAM_INVALID_WID
Impossible word ID.
Definition: ngram_model.h:84
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_next(ngram_iter_t *itor)
Advance an M-gram iterator.
Definition: ngram_model.c:758
uint8 writable
Are word strings writable?
int32 * n_counts
Counts for 1, 2, 3, ...
SPHINXBASE_EXPORT int ngram_model_free(ngram_model_t *model)
Release memory associated with an N-Gram model.
Definition: ngram_model.c:254
Line iterator for files.
Definition: pio.h:177
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition: ckd_alloc.h:264
uint16 prob3
Index into array of actual trigram probs.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition: ckd_alloc.c:241
#define E_WARN
Print warning information to standard error stream.
Definition: err.h:164
Unigram structure (common among all lm3g implementations)
Definition: lm3g_model.h:91
int32 n_bo_wt2
bo_wt2 size
Definition: lm3g_model.h:149
int32 bigrams
Index of 1st entry in lm_t.bigrams[].
Definition: lm3g_model.h:94
Subclass of ngram_model for ARPA file reading.
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
Definition: strfuncs.c:56
lmprob_t * prob3
Table of actual trigram probs.
Definition: lm3g_model.h:150
SPHINXBASE_EXPORT void lineiter_free(lineiter_t *li)
Stop reading lines from a file.
Definition: pio.c:358
Trigram information cache.
Definition: lm3g_model.h:129
Trigram structure.
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
Definition: pio.c:338
SPHINXBASE_EXPORT lineiter_t * lineiter_start(FILE *fh)
Start reading lines from a file.
Definition: pio.c:255
ngram_model_t base
Base ngram_model_t structure.
Fast memory allocator for uniformly sized objects.
uint16 trigrams
Index of 1st entry in lm_t.trigrams[], RELATIVE TO its segment base (see above)
#define E_INFOCONT
Print logging information without header, to standard error stream.
Definition: err.h:153
uint8 n
This is an n-gram model (1, 2, 3, ...).
uint16 prob2
Index into array of actual bigram probs.
Implementation of logging routines.
logmath_t * lmath
Log-math object.
uint16 bo_wt2
Index into array of actual bigram backoff wts.
Both ends of string.
Definition: strfuncs.h:73
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
Definition: hash_table.c:508
SPHINXBASE_EXPORT FILE * fopen_comp(const char *file, const char *mode, int32 *ispipe)
Like fopen, but use popen and zcat if it is determined that "file" is compressed (i.e., has a .z, .Z, .gz, or .GZ extension).
Definition: pio.c:98
lmprob_t prob1
Unigram probability.
Definition: lm3g_model.h:92
lm3g_model_t lm3g
Shared lm3g structure.
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Definition: strfuncs.c:115
Opaque structure used to hold the results of command-line parsing.
#define E_ERROR
Print error message to standard error stream.
Definition: err.h:169
lmprob_t * bo_wt2
Table of actual bigram backoff weights.
Definition: lm3g_model.h:148
Implementation-specific functions for operating on ngram_model_t objects.
SPHINXBASE_EXPORT listelem_alloc_t * listelem_alloc_init(size_t elemsize)
Initialize and return a list element allocator.
Base iterator structure for N-grams.
SPHINXBASE_EXPORT float64 logmath_log_to_log10(logmath_t *lmath, int logb_p)
Convert integer log in base B to base 10 log (in floating point).
Definition: logmath.c:480
SPHINXBASE_EXPORT char * string_trim(char *string, enum string_edge_e which)
Remove whitespace from a string, modifying it in-place.
Definition: strfuncs.c:89
Common implementation of ngram_model_t.
int32 free
first free element in list
Definition: lm3g_model.h:84
SPHINXBASE_EXPORT void fclose_comp(FILE *fp, int32 ispipe)
Close a file opened using fopen_comp.
Definition: pio.c:175
#define E_ERROR_SYSTEM
Print error text; Call perror("");.
Definition: err.h:142
uint32 wid
Index of unigram entry for this.
SPHINXBASE_EXPORT ngram_iter_t * ngram_model_mgrams(ngram_model_t *model, int m)
Iterate over all M-grams.
Definition: ngram_model.c:689
tginfo_t ** tginfo
tginfo[lw2] is head of linked list of trigram information for some cached subset of bigrams (*...
Definition: lm3g_model.h:154
int32 n_prob2
prob2 size
Definition: lm3g_model.h:147
file IO related operations.
SPHINXBASE_EXPORT int logmath_log10_to_log(logmath_t *lmath, float64 log_p)
Convert base 10 log (in floating point) to integer log in base B.
Definition: logmath.c:474
int32 * tseg_base
tseg_base[i>>LOG_BG_SEG_SZ] = index of 1st trigram for bigram segment (i>>LOG_BG_SEG_SZ) ...
Definition: lm3g_model.h:152
int32 n_words
Number of actual word strings (NOT the same as the number of unigrams, due to class words)...
uint32 wid
Index of unigram entry for this.