SphinxBase  0.6
ngram_model_dmp.c
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1999-2007 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 /*
38  * \file ngram_model_dmp.c DMP format language models
39  *
40  * Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
41  */
42 
43 #include <assert.h>
44 #include <stdio.h>
45 #include <string.h>
46 #include <stdlib.h>
47 #include <limits.h>
48 
49 #include "sphinxbase/ckd_alloc.h"
50 #include "sphinxbase/pio.h"
51 #include "sphinxbase/err.h"
52 #include "sphinxbase/byteorder.h"
54 
55 #include "ngram_model_dmp.h"
56 
57 static const char darpa_hdr[] = "Darpa Trigram LM";
58 static ngram_funcs_t ngram_model_dmp_funcs;
59 
60 #define TSEG_BASE(m,b) ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
61 #define FIRST_BG(m,u) ((m)->lm3g.unigrams[u].bigrams)
62 #define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))
63 
64 static unigram_t *
65 new_unigram_table(int32 n_ug)
66 {
67  unigram_t *table;
68  int32 i;
69 
70  table = ckd_calloc(n_ug, sizeof(unigram_t));
71  for (i = 0; i < n_ug; i++) {
72  table[i].prob1.f = -99.0;
73  table[i].bo_wt1.f = -99.0;
74  }
75  return table;
76 }
77 
79 ngram_model_dmp_read(cmd_ln_t *config,
80  const char *file_name,
81  logmath_t *lmath)
82 {
83  ngram_model_t *base;
84  ngram_model_dmp_t *model;
85  FILE *fp;
86  int do_mmap, do_swap;
87  int32 is_pipe;
88  int32 i, j, k, vn, n, ts;
89  int32 n_unigram;
90  int32 n_bigram;
91  int32 n_trigram;
92  char str[1024];
93  unigram_t *ugptr;
94  bigram_t *bgptr;
95  trigram_t *tgptr;
96  char *tmp_word_str;
97  char *map_base = NULL;
98  size_t offset = 0;
99 
100  base = NULL;
101  do_mmap = FALSE;
102  if (config)
103  do_mmap = cmd_ln_boolean_r(config, "-mmap");
104 
105  if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) {
106  E_ERROR("Dump file %s not found\n", file_name);
107  goto error_out;
108  }
109 
110  if (is_pipe && do_mmap) {
111  E_WARN("Dump file is compressed, will not use memory-mapped I/O\n");
112  do_mmap = 0;
113  }
114 
115  do_swap = FALSE;
116  if (fread(&k, sizeof(k), 1, fp) != 1)
117  goto error_out;
118  if (k != strlen(darpa_hdr)+1) {
119  SWAP_INT32(&k);
120  if (k != strlen(darpa_hdr)+1) {
121  E_ERROR("Wrong magic header size number %x: %s is not a dump file\n", k, file_name);
122  goto error_out;
123  }
124  do_swap = 1;
125  }
126  if (fread(str, 1, k, fp) != (size_t) k) {
127  E_ERROR("Cannot read header\n");
128  goto error_out;
129  }
130  if (strncmp(str, darpa_hdr, k) != 0) {
131  E_ERROR("Wrong header %s: %s is not a dump file\n", darpa_hdr);
132  goto error_out;
133  }
134 
135  if (do_mmap) {
136  if (do_swap) {
137  E_INFO
138  ("Byteswapping required, will not use memory-mapped I/O for LM file\n");
139  do_mmap = 0;
140  }
141  else {
142  E_INFO("Will use memory-mapped I/O for LM file\n");
143 #ifdef __ADSPBLACKFIN__ /* This is true for both VisualDSP++ and uClinux. */
144  E_FATAL("memory mapping is not supported at the moment.");
145 #else
146 #endif
147  }
148  }
149 
150  if (fread(&k, sizeof(k), 1, fp) != 1)
151  goto error_out;
152  if (do_swap) SWAP_INT32(&k);
153  if (fread(str, 1, k, fp) != (size_t) k) {
154  E_ERROR("Cannot read LM filename in header\n");
155  goto error_out;
156  }
157 
158  /* read version#, if present (must be <= 0) */
159  if (fread(&vn, sizeof(vn), 1, fp) != 1)
160  goto error_out;
161  if (do_swap) SWAP_INT32(&vn);
162  if (vn <= 0) {
163  /* read and don't compare timestamps (we don't care) */
164  if (fread(&ts, sizeof(ts), 1, fp) != 1)
165  goto error_out;
166  if (do_swap) SWAP_INT32(&ts);
167 
168  /* read and skip format description */
169  for (;;) {
170  if (fread(&k, sizeof(k), 1, fp) != 1)
171  goto error_out;
172  if (do_swap) SWAP_INT32(&k);
173  if (k == 0)
174  break;
175  if (fread(str, 1, k, fp) != (size_t) k) {
176  E_ERROR("Failed to read word\n");
177  goto error_out;
178  }
179  }
180  /* read model->ucount */
181  if (fread(&n_unigram, sizeof(n_unigram), 1, fp) != 1)
182  goto error_out;
183  if (do_swap) SWAP_INT32(&n_unigram);
184  }
185  else {
186  n_unigram = vn;
187  }
188 
189  /* read model->bcount, tcount */
190  if (fread(&n_bigram, sizeof(n_bigram), 1, fp) != 1)
191  goto error_out;
192  if (do_swap) SWAP_INT32(&n_bigram);
193  if (fread(&n_trigram, sizeof(n_trigram), 1, fp) != 1)
194  goto error_out;
195  if (do_swap) SWAP_INT32(&n_trigram);
196  E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
197 
198  /* Allocate space for LM, including initial OOVs and placeholders; initialize it */
199  model = ckd_calloc(1, sizeof(*model));
200  base = &model->base;
201  if (n_trigram > 0)
202  n = 3;
203  else if (n_bigram > 0)
204  n = 2;
205  else
206  n = 1;
207  ngram_model_init(base, &ngram_model_dmp_funcs, lmath, n, n_unigram);
208  base->n_counts[0] = n_unigram;
209  base->n_counts[1] = n_bigram;
210  base->n_counts[2] = n_trigram;
211 
212  /* read unigrams (always in memory, as they contain dictionary
213  * mappings that can't be precomputed, and also could have OOVs added) */
214  model->lm3g.unigrams = new_unigram_table(n_unigram + 1);
215  ugptr = model->lm3g.unigrams;
216  for (i = 0; i <= n_unigram; ++i) {
217  /* Skip over the mapping ID, we don't care about it. */
218  if (fread(ugptr, sizeof(int32), 1, fp) != 1) {
219  E_ERROR("Failed to read maping id %d\n", i);
220  goto error_out;
221  }
222  /* Read the actual unigram structure. */
223  if (fread(ugptr, sizeof(unigram_t), 1, fp) != 1) {
224  E_ERROR("Failed to read unigrams data\n");
225  ngram_model_free(base);
226  fclose_comp(fp, is_pipe);
227  return NULL;
228  }
229  /* Byte swap if necessary. */
230  if (do_swap) {
231  SWAP_INT32(&ugptr->prob1.l);
232  SWAP_INT32(&ugptr->bo_wt1.l);
233  SWAP_INT32(&ugptr->bigrams);
234  }
235  /* Convert values to log. */
236  ugptr->prob1.l = logmath_log10_to_log(lmath, ugptr->prob1.f);
237  ugptr->bo_wt1.l = logmath_log10_to_log(lmath, ugptr->bo_wt1.f);
238  E_DEBUG(2, ("ug %d: prob %d bo %d bigrams %d\n",
239  i, ugptr->prob1.l, ugptr->bo_wt1.l, ugptr->bigrams));
240  ++ugptr;
241  }
242  E_INFO("%8d = LM.unigrams(+trailer) read\n", n_unigram);
243 
244  /* Now mmap() the file and read in the rest of the (read-only) stuff. */
245  if (do_mmap) {
246  offset = ftell(fp);
247 
248  /* Check for improper word alignment. */
249  if (offset & 0x3) {
250  E_WARN("-mmap specified, but trigram index is not word-aligned. Will not memory-map.\n");
251  do_mmap = FALSE;
252  }
253  else {
254  model->dump_mmap = mmio_file_read(file_name);
255  if (model->dump_mmap == NULL) {
256  do_mmap = FALSE;
257  }
258  else {
259  map_base = mmio_file_ptr(model->dump_mmap);
260  }
261  }
262  }
263 
264  if (n_bigram > 0) {
265  /* read bigrams */
266  if (do_mmap) {
267  model->lm3g.bigrams = (bigram_t *) (map_base + offset);
268  offset += (n_bigram + 1) * sizeof(bigram_t);
269  }
270  else {
271  model->lm3g.bigrams =
272  ckd_calloc(n_bigram + 1, sizeof(bigram_t));
273  if (fread(model->lm3g.bigrams, sizeof(bigram_t), n_bigram + 1, fp)
274  != (size_t) n_bigram + 1) {
275  E_ERROR("Failed to read bigrams data\n");
276  goto error_out;
277  }
278  if (do_swap) {
279  for (i = 0, bgptr = model->lm3g.bigrams; i <= n_bigram;
280  i++, bgptr++) {
281  SWAP_INT16(&bgptr->wid);
282  SWAP_INT16(&bgptr->prob2);
283  SWAP_INT16(&bgptr->bo_wt2);
284  SWAP_INT16(&bgptr->trigrams);
285  }
286  }
287  }
288  E_INFO("%8d = LM.bigrams(+trailer) read\n", n_bigram);
289  }
290 
291  /* read trigrams */
292  if (n_trigram > 0) {
293  if (do_mmap) {
294  model->lm3g.trigrams = (trigram_t *) (map_base + offset);
295  offset += n_trigram * sizeof(trigram_t);
296  }
297  else {
298  model->lm3g.trigrams =
299  ckd_calloc(n_trigram, sizeof(trigram_t));
300  if (fread
301  (model->lm3g.trigrams, sizeof(trigram_t), n_trigram, fp)
302  != (size_t) n_trigram) {
303  E_ERROR("Failed to read trigrams data\n");
304  goto error_out;
305  }
306  if (do_swap) {
307  for (i = 0, tgptr = model->lm3g.trigrams; i < n_trigram;
308  i++, tgptr++) {
309  SWAP_INT16(&tgptr->wid);
310  SWAP_INT16(&tgptr->prob3);
311  }
312  }
313  }
314  E_INFO("%8d = LM.trigrams read\n", n_trigram);
315  /* Initialize tginfo */
316  model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *));
317  model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
318  }
319 
320  if (n_bigram > 0) {
321  /* read n_prob2 and prob2 array (in memory) */
322  if (do_mmap)
323  fseek(fp, offset, SEEK_SET);
324  if (fread(&k, sizeof(k), 1, fp) != 1)
325  goto error_out;
326  if (do_swap) SWAP_INT32(&k);
327  model->lm3g.n_prob2 = k;
328  model->lm3g.prob2 = ckd_calloc(k, sizeof(*model->lm3g.prob2));
329  if (fread(model->lm3g.prob2, sizeof(*model->lm3g.prob2), k, fp) != (size_t) k) {
330  E_ERROR("fread(prob2) failed\n");
331  goto error_out;
332  }
333  for (i = 0; i < k; i++) {
334  if (do_swap)
335  SWAP_INT32(&model->lm3g.prob2[i].l);
336  /* Convert values to log. */
337  model->lm3g.prob2[i].l = logmath_log10_to_log(lmath, model->lm3g.prob2[i].f);
338  }
339  E_INFO("%8d = LM.prob2 entries read\n", k);
340  }
341 
342  /* read n_bo_wt2 and bo_wt2 array (in memory) */
343  if (base->n > 2) {
344  if (fread(&k, sizeof(k), 1, fp) != 1)
345  goto error_out;
346  if (do_swap) SWAP_INT32(&k);
347  model->lm3g.n_bo_wt2 = k;
348  model->lm3g.bo_wt2 = ckd_calloc(k, sizeof(*model->lm3g.bo_wt2));
349  if (fread(model->lm3g.bo_wt2, sizeof(*model->lm3g.bo_wt2), k, fp) != (size_t) k) {
350  E_ERROR("Failed to read backoff weights\n");
351  goto error_out;
352  }
353  for (i = 0; i < k; i++) {
354  if (do_swap)
355  SWAP_INT32(&model->lm3g.bo_wt2[i].l);
356  /* Convert values to log. */
357  model->lm3g.bo_wt2[i].l = logmath_log10_to_log(lmath, model->lm3g.bo_wt2[i].f);
358  }
359  E_INFO("%8d = LM.bo_wt2 entries read\n", k);
360  }
361 
362  /* read n_prob3 and prob3 array (in memory) */
363  if (base->n > 2) {
364  if (fread(&k, sizeof(k), 1, fp) != 1)
365  goto error_out;
366  if (do_swap) SWAP_INT32(&k);
367  model->lm3g.n_prob3 = k;
368  model->lm3g.prob3 = ckd_calloc(k, sizeof(*model->lm3g.prob3));
369  if (fread(model->lm3g.prob3, sizeof(*model->lm3g.prob3), k, fp) != (size_t) k) {
370  E_ERROR("Failed to read trigram probability\n");
371  goto error_out;
372  }
373  for (i = 0; i < k; i++) {
374  if (do_swap)
375  SWAP_INT32(&model->lm3g.prob3[i].l);
376  /* Convert values to log. */
377  model->lm3g.prob3[i].l = logmath_log10_to_log(lmath, model->lm3g.prob3[i].f);
378  }
379  E_INFO("%8d = LM.prob3 entries read\n", k);
380  }
381 
382  /* read tseg_base size and tseg_base */
383  if (do_mmap)
384  offset = ftell(fp);
385  if (n_trigram > 0) {
386  if (do_mmap) {
387  memcpy(&k, map_base + offset, sizeof(k));
388  offset += sizeof(int32);
389  model->lm3g.tseg_base = (int32 *) (map_base + offset);
390  offset += k * sizeof(int32);
391  }
392  else {
393  k = (n_bigram + 1) / BG_SEG_SZ + 1;
394  if (fread(&k, sizeof(k), 1, fp) != 1)
395  goto error_out;
396  if (do_swap) SWAP_INT32(&k);
397  model->lm3g.tseg_base = ckd_calloc(k, sizeof(int32));
398  if (fread(model->lm3g.tseg_base, sizeof(int32), k, fp) !=
399  (size_t) k) {
400  E_ERROR("Failed to read trigram index\n");
401  goto error_out;
402  }
403  if (do_swap)
404  for (i = 0; i < k; i++)
405  SWAP_INT32(&model->lm3g.tseg_base[i]);
406  }
407  E_INFO("%8d = LM.tseg_base entries read\n", k);
408  }
409 
410  /* read ascii word strings */
411  if (do_mmap) {
412  memcpy(&k, map_base + offset, sizeof(k));
413  offset += sizeof(int32);
414  tmp_word_str = (char *) (map_base + offset);
415  offset += k;
416  }
417  else {
418  base->writable = TRUE;
419  if (fread(&k, sizeof(k), 1, fp) != 1)
420  goto error_out;
421  if (do_swap) SWAP_INT32(&k);
422  tmp_word_str = ckd_calloc(k, 1);
423  if (fread(tmp_word_str, 1, k, fp) != (size_t) k) {
424  E_ERROR("Failed to read words\n");
425  goto error_out;
426  }
427  }
428 
429  /* First make sure string just read contains n_counts[0] words (PARANOIA!!) */
430  for (i = 0, j = 0; i < k; i++)
431  if (tmp_word_str[i] == '\0')
432  j++;
433  if (j != n_unigram) {
434  E_ERROR("Error reading word strings (%d doesn't match n_unigrams %d)\n",
435  j, n_unigram);
436  goto error_out;
437  }
438 
439  /* Break up string just read into words */
440  if (do_mmap) {
441  j = 0;
442  for (i = 0; i < n_unigram; i++) {
443  base->word_str[i] = tmp_word_str + j;
444  if (hash_table_enter(base->wid, base->word_str[i],
445  (void *)(long)i) != (void *)(long)i) {
446  E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]);
447  }
448  j += strlen(base->word_str[i]) + 1;
449  }
450  }
451  else {
452  j = 0;
453  for (i = 0; i < n_unigram; i++) {
454  base->word_str[i] = ckd_salloc(tmp_word_str + j);
455  if (hash_table_enter(base->wid, base->word_str[i],
456  (void *)(long)i) != (void *)(long)i) {
457  E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]);
458  }
459  j += strlen(base->word_str[i]) + 1;
460  }
461  free(tmp_word_str);
462  }
463  E_INFO("%8d = ascii word strings read\n", i);
464 
465  fclose_comp(fp, is_pipe);
466  return base;
467 
468 error_out:
469  if (fp)
470  fclose_comp(fp, is_pipe);
471  ngram_model_free(base);
472  return NULL;
473 }
474 
476 ngram_model_dmp_build(ngram_model_t *base)
477 {
478  ngram_model_dmp_t *model;
479  ngram_model_t *newbase;
480  ngram_iter_t *itor;
481  sorted_list_t sorted_prob2;
482  sorted_list_t sorted_bo_wt2;
483  sorted_list_t sorted_prob3;
484  bigram_t *bgptr;
485  trigram_t *tgptr;
486  int i, bgcount, tgcount, seg;
487 
488  if (base->funcs == &ngram_model_dmp_funcs) {
489  E_INFO("Using existing DMP model.\n");
490  return (ngram_model_dmp_t *)ngram_model_retain(base);
491  }
492 
493  /* Initialize new base model structure with params from base. */
494  E_INFO("Building DMP model...\n");
495  model = ckd_calloc(1, sizeof(*model));
496  newbase = &model->base;
497  ngram_model_init(newbase, &ngram_model_dmp_funcs,
498  logmath_retain(base->lmath),
499  base->n, base->n_counts[0]);
500  /* Copy N-gram counts over. */
501  memcpy(newbase->n_counts, base->n_counts,
502  base->n * sizeof(*base->n_counts));
503  /* Make sure word strings are freed. */
504  newbase->writable = TRUE;
505  /* Initialize unigram table and string table. */
506  model->lm3g.unigrams = new_unigram_table(newbase->n_counts[0] + 1);
507  for (itor = ngram_model_mgrams(base, 0); itor;
508  itor = ngram_iter_next(itor)) {
509  int32 prob1, bo_wt1;
510  int32 const *wids;
511 
512  /* Can't guarantee they will go in unigram order, so just to
513  * be correct, we do this... */
514  wids = ngram_iter_get(itor, &prob1, &bo_wt1);
515  model->lm3g.unigrams[wids[0]].prob1.l = prob1;
516  model->lm3g.unigrams[wids[0]].bo_wt1.l = bo_wt1;
517  newbase->word_str[wids[0]] = ckd_salloc(ngram_word(base, wids[0]));
518  if ((hash_table_enter_int32(newbase->wid,
519  newbase->word_str[wids[0]], wids[0]))
520  != wids[0]) {
521  E_WARN("Duplicate word in dictionary: %s\n", newbase->word_str[wids[0]]);
522  }
523  }
524  E_INFO("%8d = #unigrams created\n", newbase->n_counts[0]);
525 
526  if (newbase->n < 2)
527  return model;
528 
529  /* Construct quantized probability table for bigrams and
530  * (optionally) trigrams. Hesitate to use the "sorted list" thing
531  * since it isn't so useful, but it's there already. */
532  init_sorted_list(&sorted_prob2);
533  if (newbase->n > 2) {
534  init_sorted_list(&sorted_bo_wt2);
535  init_sorted_list(&sorted_prob3);
536  }
537  /* Construct bigram and trigram arrays. */
538  bgptr = model->lm3g.bigrams = ckd_calloc(newbase->n_counts[1] + 1, sizeof(bigram_t));
539  if (newbase->n > 2) {
540  tgptr = model->lm3g.trigrams = ckd_calloc(newbase->n_counts[2], sizeof(trigram_t));
541  model->lm3g.tseg_base =
542  ckd_calloc((newbase->n_counts[1] + 1) / BG_SEG_SZ + 1, sizeof(int32));
543  }
544  else
545  tgptr = NULL;
546  /* Since bigrams and trigrams have to be contiguous with others
547  * with the same N-1-gram, we traverse them in depth-first order
548  * to build the bigram and trigram arrays. */
549  for (i = 0; i < newbase->n_counts[0]; ++i) {
550  ngram_iter_t *uitor;
551  bgcount = bgptr - model->lm3g.bigrams;
552  /* First bigram index (same as next if no bigrams...) */
553  model->lm3g.unigrams[i].bigrams = bgcount;
554  E_DEBUG(2, ("unigram %d: %s => bigram %d\n", i, newbase->word_str[i], bgcount));
555  /* All bigrams corresponding to unigram i */
556  uitor = ngram_ng_iter(base, i, NULL, 0);
557  for (itor = ngram_iter_successors(uitor);
558  itor; ++bgptr, itor = ngram_iter_next(itor)) {
559  int32 prob2, bo_wt2;
560  int32 const *wids;
561  ngram_iter_t *titor;
562 
563  wids = ngram_iter_get(itor, &prob2, &bo_wt2);
564 
565  assert (bgptr - model->lm3g.bigrams < newbase->n_counts[1]);
566 
567  bgptr->wid = wids[1];
568  bgptr->prob2 = sorted_id(&sorted_prob2, &prob2);
569  if (newbase->n > 2) {
570  tgcount = (tgptr - model->lm3g.trigrams);
571  bgcount = (bgptr - model->lm3g.bigrams);
572 
573  /* Backoff weight (only if there are trigrams...) */
574  bgptr->bo_wt2 = sorted_id(&sorted_bo_wt2, &bo_wt2);
575 
576  /* Find bigram segment for this bigram (this isn't
577  * used unless there are trigrams) */
578  seg = bgcount >> LOG_BG_SEG_SZ;
579  /* If we just crossed a bigram segment boundary, then
580  * point tseg_base for the new segment to the current
581  * trigram pointer. */
582  if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
583  model->lm3g.tseg_base[seg] = tgcount;
584  /* Now calculate the trigram offset. */
585  bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg];
586  E_DEBUG(2, ("bigram %d %s %s => trigram %d:%d\n",
587  bgcount,
588  newbase->word_str[wids[0]],
589  newbase->word_str[wids[1]],
590  seg, bgptr->trigrams));
591 
592  /* And fill in successors' trigram info. */
593  for (titor = ngram_iter_successors(itor);
594  titor; ++tgptr, titor = ngram_iter_next(titor)) {
595  int32 prob3, dummy;
596 
597  assert(tgptr - model->lm3g.trigrams < newbase->n_counts[2]);
598  wids = ngram_iter_get(titor, &prob3, &dummy);
599  tgptr->wid = wids[2];
600  tgptr->prob3 = sorted_id(&sorted_prob3, &prob3);
601  E_DEBUG(2, ("trigram %d %s %s %s => prob %d\n",
602  tgcount,
603  newbase->word_str[wids[0]],
604  newbase->word_str[wids[1]],
605  newbase->word_str[wids[2]],
606  tgptr->prob3));
607  }
608  }
609  }
610  ngram_iter_free(uitor);
611  }
612  /* Add sentinal unigram and bigram records. */
613  bgcount = bgptr - model->lm3g.bigrams;
614  tgcount = tgptr - model->lm3g.trigrams;
615  seg = bgcount >> LOG_BG_SEG_SZ;
616  if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
617  model->lm3g.tseg_base[seg] = tgcount;
618  model->lm3g.unigrams[i].bigrams = bgcount;
619  if (newbase->n > 2)
620  bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg];
621 
622  /* Now create probability tables. */
623  model->lm3g.n_prob2 = sorted_prob2.free;
624  model->lm3g.prob2 = vals_in_sorted_list(&sorted_prob2);
625  E_INFO("%8d = #bigrams created\n", newbase->n_counts[1]);
626  E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2);
627  free_sorted_list(&sorted_prob2);
628  if (newbase->n > 2) {
629  /* Create trigram bo-wts array. */
630  model->lm3g.n_bo_wt2 = sorted_bo_wt2.free;
631  model->lm3g.bo_wt2 = vals_in_sorted_list(&sorted_bo_wt2);
632  free_sorted_list(&sorted_bo_wt2);
633  E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2);
634  /* Create trigram probability table. */
635  model->lm3g.n_prob3 = sorted_prob3.free;
636  model->lm3g.prob3 = vals_in_sorted_list(&sorted_prob3);
637  E_INFO("%8d = #trigrams created\n", newbase->n_counts[2]);
638  E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3);
639  free_sorted_list(&sorted_prob3);
640  /* Initialize tginfo */
641  model->lm3g.tginfo = ckd_calloc(newbase->n_counts[0], sizeof(tginfo_t *));
642  model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
643  }
644 
645  return model;
646 }
647 
648 static void
649 fwrite_int32(FILE *fh, int32 val)
650 {
651  fwrite(&val, 4, 1, fh);
652 }
653 
654 static void
655 fwrite_ug(FILE *fh, unigram_t *ug, logmath_t *lmath)
656 {
657  int32 bogus = -1;
658  float32 log10val;
659 
660  /* Bogus dictionary mapping field. */
661  fwrite(&bogus, 4, 1, fh);
662  /* Convert values to log10. */
663  log10val = logmath_log_to_log10(lmath, ug->prob1.l);
664  fwrite(&log10val, 4, 1, fh);
665  log10val = logmath_log_to_log10(lmath, ug->bo_wt1.l);
666  fwrite(&log10val, 4, 1, fh);
667  fwrite_int32(fh, ug->bigrams);
668 }
669 
670 static void
671 fwrite_bg(FILE *fh, bigram_t *bg)
672 {
673  fwrite(bg, sizeof(*bg), 1, fh);
674 }
675 
676 static void
677 fwrite_tg(FILE *fh, trigram_t *tg)
678 {
679  fwrite(tg, sizeof(*tg), 1, fh);
680 }
681 
/*
 * Human-readable description of the dump layout.  Each entry is written
 * into the file header as a length-prefixed string; the table ends with
 * a NULL entry.
 */
static char const *fmtdesc[] = {
    "BEGIN FILE FORMAT DESCRIPTION",
    "Header string length (int32) and string (including trailing 0)",
    "Original LM filename string-length (int32) and filename (including trailing 0)",
    "(int32) version number (present iff value <= 0)",
    "(int32) original LM file modification timestamp (iff version# present)",
    "(int32) string-length and string (including trailing 0) (iff version# present)",
    "... previous entry continued any number of times (iff version# present)",
    "(int32) 0 (terminating sequence of strings) (iff version# present)",
    "(int32) log_bg_seg_sz (present iff different from default value of LOG2_BG_SEG_SZ)",
    "(int32) lm_t.ucount (must be > 0)",
    "(int32) lm_t.bcount",
    "(int32) lm_t.tcount",
    "lm_t.ucount+1 unigrams (including sentinel)",
    "lm_t.bcount+1 bigrams (including sentinel 64 bits (bg_t) each if version=-1/-2, 128 bits (bg32_t) each if version=-3",
    "lm_t.tcount trigrams (present iff lm_t.tcount > 0 32 bits (tg_t) each if version=-1/-2, 64 bits (tg32_t) each if version=-3)",
    "(int32) lm_t.n_prob2",
    "(int32) lm_t.prob2[]",
    "(int32) lm_t.n_bo_wt2 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.bo_wt2[] (present iff lm_t.tcount > 0)",
    "(int32) lm_t.n_prob3 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.prob3[] (present iff lm_t.tcount > 0)",
    "(int32) (lm_t.bcount+1)/BG_SEG_SZ+1 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.tseg_base[] (present iff lm_t.tcount > 0)",
    "(int32) Sum(all word string-lengths, including trailing 0 for each)",
    "All word strings (including trailing 0 for each)",
    "END FILE FORMAT DESCRIPTION",
    NULL,
};
713 
714 static void
715 ngram_model_dmp_write_header(FILE * fh)
716 {
717  int32 k;
718  k = strlen(darpa_hdr) + 1;
719  fwrite_int32(fh, k);
720  fwrite(darpa_hdr, 1, k, fh);
721 }
722 
723 static void
724 ngram_model_dmp_write_lm_filename(FILE * fh, const char *lmfile)
725 {
726  int32 k;
727 
728  k = strlen(lmfile) + 1;
729  fwrite_int32(fh, k);
730  fwrite(lmfile, 1, k, fh);
731 }
732 
733 #define LMDMP_VERSION_TG_16BIT -1
737 static void
738 ngram_model_dmp_write_version(FILE * fh, int32 mtime)
739 {
740  fwrite_int32(fh, LMDMP_VERSION_TG_16BIT); /* version # */
741  fwrite_int32(fh, mtime);
742 }
743 
744 static void
745 ngram_model_dmp_write_ngram_counts(FILE * fh, ngram_model_t *model)
746 {
747  fwrite_int32(fh, model->n_counts[0]);
748  fwrite_int32(fh, model->n_counts[1]);
749  fwrite_int32(fh, model->n_counts[2]);
750 }
751 
752 static void
753 ngram_model_dmp_write_fmtdesc(FILE * fh)
754 {
755  int32 i, k;
756  long pos;
757 
758  /* Write file format description into header */
759  for (i = 0; fmtdesc[i] != NULL; i++) {
760  k = strlen(fmtdesc[i]) + 1;
761  fwrite_int32(fh, k);
762  fwrite(fmtdesc[i], 1, k, fh);
763  }
764  /* Pad it out in order to achieve 32-bit alignment */
765  pos = ftell(fh);
766  k = pos & 3;
767  if (k) {
768  fwrite_int32(fh, 4-k);
769  fwrite("!!!!", 1, 4-k, fh);
770  }
771  fwrite_int32(fh, 0);
772 }
773 
774 static void
775 ngram_model_dmp_write_unigram(FILE *fh, ngram_model_t *model)
776 {
777  ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
778  int32 i;
779 
780  for (i = 0; i <= model->n_counts[0]; i++) {
781  fwrite_ug(fh, &(lm->lm3g.unigrams[i]), model->lmath);
782  }
783 }
784 
785 
786 static void
787 ngram_model_dmp_write_bigram(FILE *fh, ngram_model_t *model)
788 {
789  ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
790  int32 i;
791 
792  for (i = 0; i <= model->n_counts[1]; i++) {
793  fwrite_bg(fh, &(lm->lm3g.bigrams[i]));
794  }
795 
796 }
797 
798 static void
799 ngram_model_dmp_write_trigram(FILE *fh, ngram_model_t *model)
800 {
801  ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
802  int32 i;
803 
804  for (i = 0; i < model->n_counts[2]; i++) {
805  fwrite_tg(fh, &(lm->lm3g.trigrams[i]));
806  }
807 }
808 
809 static void
810 ngram_model_dmp_write_bgprob(FILE *fh, ngram_model_t *model)
811 {
812  ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
813  int32 i;
814 
815  fwrite_int32(fh, lm->lm3g.n_prob2);
816  for (i = 0; i < lm->lm3g.n_prob2; i++) {
817  float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob2[i].l);
818  fwrite(&log10val, 4, 1, fh);
819  }
820 }
821 
822 static void
823 ngram_model_dmp_write_tgbowt(FILE *fh, ngram_model_t *model)
824 {
825  ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
826  int32 i;
827 
828  fwrite_int32(fh, lm->lm3g.n_bo_wt2);
829  for (i = 0; i < lm->lm3g.n_bo_wt2; i++) {
830  float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.bo_wt2[i].l);
831  fwrite(&log10val, 4, 1, fh);
832  }
833 }
834 
835 static void
836 ngram_model_dmp_write_tgprob(FILE *fh, ngram_model_t *model)
837 {
838  ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
839  int32 i;
840 
841  fwrite_int32(fh, lm->lm3g.n_prob3);
842  for (i = 0; i < lm->lm3g.n_prob3; i++) {
843  float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob3[i].l);
844  fwrite(&log10val, 4, 1, fh);
845  }
846 }
847 
848 static void
849 ngram_model_dmp_write_tg_segbase(FILE *fh, ngram_model_t *model)
850 {
851  ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model;
852  int32 i, k;
853 
854  k = (model->n_counts[1] + 1) / BG_SEG_SZ + 1;
855  fwrite_int32(fh, k);
856  for (i = 0; i < k; i++)
857  fwrite_int32(fh, lm->lm3g.tseg_base[i]);
858 }
859 
860 static void
861 ngram_model_dmp_write_wordstr(FILE *fh, ngram_model_t *model)
862 {
863  int32 i, k;
864 
865  k = 0;
866  for (i = 0; i < model->n_counts[0]; i++)
867  k += strlen(model->word_str[i]) + 1;
868  fwrite_int32(fh, k);
869  for (i = 0; i < model->n_counts[0]; i++)
870  fwrite(model->word_str[i], 1,
871  strlen(model->word_str[i]) + 1, fh);
872 }
873 
874 int
875 ngram_model_dmp_write(ngram_model_t *base,
876  const char *file_name)
877 {
878  ngram_model_dmp_t *model;
879  ngram_model_t *newbase;
880  FILE *fh;
881 
882  /* First, construct a DMP model from the base model. */
883  model = ngram_model_dmp_build(base);
884  newbase = &model->base;
885 
886  /* Now write it, confident in the knowledge that it's the right
887  * kind of language model internally. */
888  if ((fh = fopen(file_name, "wb")) == NULL) {
889  E_ERROR("Cannot create file %s\n", file_name);
890  return -1;
891  }
892  ngram_model_dmp_write_header(fh);
893  ngram_model_dmp_write_lm_filename(fh, file_name);
894  ngram_model_dmp_write_version(fh, 0);
895  ngram_model_dmp_write_fmtdesc(fh);
896  ngram_model_dmp_write_ngram_counts(fh, newbase);
897  ngram_model_dmp_write_unigram(fh, newbase);
898  if (newbase->n > 1) {
899  ngram_model_dmp_write_bigram(fh, newbase);
900  if (newbase->n > 2) {
901  ngram_model_dmp_write_trigram(fh, newbase);
902  }
903  ngram_model_dmp_write_bgprob(fh, newbase);
904  if (newbase->n > 2) {
905  ngram_model_dmp_write_tgbowt(fh, newbase);
906  ngram_model_dmp_write_tgprob(fh, newbase);
907  ngram_model_dmp_write_tg_segbase(fh, newbase);
908  }
909  }
910  ngram_model_dmp_write_wordstr(fh, newbase);
911  ngram_model_free(newbase);
912 
913  return fclose(fh);
914 }
915 
916 static int
917 ngram_model_dmp_apply_weights(ngram_model_t *base, float32 lw,
918  float32 wip, float32 uw)
919 {
920  ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
921  lm3g_apply_weights(base, &model->lm3g, lw, wip, uw);
922  return 0;
923 }
924 
925 /* Lousy "templating" for things that are largely the same in DMP and
926  * ARPA models, except for the bigram and trigram types and some
927  * names. */
928 #define NGRAM_MODEL_TYPE ngram_model_dmp_t
929 #include "lm3g_templates.c"
930 
931 static void
932 ngram_model_dmp_free(ngram_model_t *base)
933 {
934  ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
935 
936  ckd_free(model->lm3g.unigrams);
937  ckd_free(model->lm3g.prob2);
938  if (model->dump_mmap) {
939  mmio_file_unmap(model->dump_mmap);
940  }
941  else {
942  ckd_free(model->lm3g.bigrams);
943  if (base->n > 2) {
944  ckd_free(model->lm3g.trigrams);
945  ckd_free(model->lm3g.tseg_base);
946  }
947  }
948  if (base->n > 2) {
949  ckd_free(model->lm3g.bo_wt2);
950  ckd_free(model->lm3g.prob3);
951  }
952 
953  lm3g_tginfo_free(base, &model->lm3g);
954 }
955 
956 static ngram_funcs_t ngram_model_dmp_funcs = {
957  ngram_model_dmp_free, /* free */
958  ngram_model_dmp_apply_weights, /* apply_weights */
959  lm3g_template_score, /* score */
960  lm3g_template_raw_score, /* raw_score */
961  lm3g_template_add_ug, /* add_ug */
962  lm3g_template_flush, /* flush */
963  lm3g_template_iter, /* iter */
964  lm3g_template_mgrams, /* mgrams */
965  lm3g_template_successors, /* successors */
966  lm3g_template_iter_get, /* iter_get */
967  lm3g_template_iter_next, /* iter_next */
968  lm3g_template_iter_free /* iter_free */
969 };
lmprob_t bo_wt1
Unigram backoff weight.
Definition: lm3g_model.h:93
listelem_alloc_t * le
List element allocator for tginfo.
Definition: lm3g_model.h:156
struct ngram_funcs_s * funcs
Implementation-specific methods.
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition: ckd_alloc.h:248
int32 n_prob3
prob3 size
Definition: lm3g_model.h:151
#define hash_table_enter_int32(h, k, v)
Add a 32-bit integer value to a hash table.
Definition: hash_table.h:228
hash_table_t * wid
Mapping of unigram names to word IDs.
char ** word_str
Unigram names.
SPHINXBASE_EXPORT void ngram_iter_free(ngram_iter_t *itor)
Terminate an M-gram iterator.
Definition: ngram_model.c:764
#define E_DEBUG(level, x)
Print debugging information to standard error stream.
Definition: err.h:212
#define E_INFO
Print logging information to standard error stream.
Definition: err.h:147
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT int32 const * ngram_iter_get(ngram_iter_t *itor, int32 *out_score, int32 *out_bowt)
Get information from the current M-gram in an iterator.
Definition: ngram_model.c:750
lmprob_t * prob2
Table of actual bigram probs.
Definition: lm3g_model.h:146
Bigram structure.
SPHINXBASE_EXPORT void mmio_file_unmap(mmio_file_t *mf)
Unmap a file, releasing memory associated with it.
Definition: mmio.c:240
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_next(ngram_iter_t *itor)
Advance an M-gram iterator.
Definition: ngram_model.c:758
uint8 writable
Are word strings writable?
int32 * n_counts
Counts for 1, 2, 3, ...
SPHINXBASE_EXPORT int ngram_model_free(ngram_model_t *model)
Release memory associated with an N-Gram model.
Definition: ngram_model.c:254
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition: ckd_alloc.h:264
uint16 prob3
Index into array of actual trigram probs.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_successors(ngram_iter_t *itor)
Iterate over all M-gram successors of an M-1-gram.
Definition: ngram_model.c:741
Subclass of ngram_model for DMP file reading.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition: ckd_alloc.c:241
#define E_WARN
Print warning information to standard error stream.
Definition: err.h:164
ngram_model_t base
Base ngram_model_t structure.
Unigram structure (common among all lm3g implementations)
Definition: lm3g_model.h:91
int32 n_bo_wt2
bo_wt2 size
Definition: lm3g_model.h:149
int32 bigrams
Index of 1st entry in lm_t.bigrams[].
Definition: lm3g_model.h:94
lmprob_t * prob3
Table of actual trigram probs.
Definition: lm3g_model.h:150
SPHINXBASE_EXPORT void * mmio_file_ptr(mmio_file_t *mf)
Get a pointer to the memory mapped for a file.
Definition: mmio.c:251
Trigram information cache.
Definition: lm3g_model.h:129
Trigram structure.
SPHINXBASE_EXPORT logmath_t * logmath_retain(logmath_t *lmath)
Retain ownership of a log table.
Definition: logmath.c:335
SPHINXBASE_EXPORT ngram_model_t * ngram_model_retain(ngram_model_t *model)
Retain ownership of an N-Gram model.
Definition: ngram_model.c:239
Fast memory allocator for uniformly sized objects.
uint16 trigrams
Index of 1st entry in lm_t.trigrams[], RELATIVE TO its segment base (see above)
uint8 n
This is an n-gram model (1, 2, 3, ...).
uint16 prob2
Index into array of actual bigram probs.
Implementation of logging routines.
The sorted list.
Definition: lm3g_model.h:82
logmath_t * lmath
Log-math object.
uint16 bo_wt2
Index into array of actual bigram backoff wts.
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
Definition: hash_table.c:508
SPHINXBASE_EXPORT FILE * fopen_comp(const char *file, const char *mode, int32 *ispipe)
Like fopen, but use popen and zcat if it is determined that "file" is compressed (i.e., has a .z, .Z, .gz, or .GZ extension).
Definition: pio.c:98
mmio_file_t * dump_mmap
mmap() of dump file (or NULL if none)
SPHINXBASE_EXPORT ngram_iter_t * ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
Get an iterator over M-grams pointing to the specified M-gram.
Definition: ngram_model.c:731
lmprob_t prob1
Unigram probability.
Definition: lm3g_model.h:92
Opaque structure used to hold the results of command-line parsing.
#define E_FATAL
Exit with non-zero status after error message.
Definition: err.h:127
#define E_ERROR
Print error message to standard error stream.
Definition: err.h:169
lmprob_t * bo_wt2
Table of actual bigram backoff weights.
Definition: lm3g_model.h:148
Implementation-specific functions for operating on ngram_model_t objects.
SPHINXBASE_EXPORT listelem_alloc_t * listelem_alloc_init(size_t elemsize)
Initialize and return a list element allocator.
Base iterator structure for N-grams.
#define cmd_ln_boolean_r(c, n)
Retrieve a boolean value from a command-line object.
Definition: cmd_ln.h:334
SPHINXBASE_EXPORT float64 logmath_log_to_log10(logmath_t *lmath, int logb_p)
Convert integer log in base B to base 10 log (in floating point).
Definition: logmath.c:480
lm3g_model_t lm3g
Common lm3g_model_t structure.
Common implementation of ngram_model_t.
int32 free
first free element in list
Definition: lm3g_model.h:84
SPHINXBASE_EXPORT const char * ngram_word(ngram_model_t *model, int32 wid)
Look up word string for numerical word ID.
Definition: ngram_model.c:782
SPHINXBASE_EXPORT void fclose_comp(FILE *fp, int32 ispipe)
Close a file opened using fopen_comp.
Definition: pio.c:175
SPHINXBASE_EXPORT mmio_file_t * mmio_file_read(const char *filename)
Memory-map a file for reading.
Definition: mmio.c:206
uint32 wid
Index of unigram entry for this.
SPHINXBASE_EXPORT ngram_iter_t * ngram_model_mgrams(ngram_model_t *model, int m)
Iterate over all M-grams.
Definition: ngram_model.c:689
tginfo_t ** tginfo
tginfo[lw2] is head of linked list of trigram information for some cached subset of bigrams (*,lw2).
Definition: lm3g_model.h:154
int32 n_prob2
prob2 size
Definition: lm3g_model.h:147
file IO related operations.
SPHINXBASE_EXPORT int logmath_log10_to_log(logmath_t *lmath, float64 log_p)
Convert base 10 log (in floating point) to integer log in base B.
Definition: logmath.c:474
int32 * tseg_base
tseg_base[i>>LOG_BG_SEG_SZ] = index of 1st trigram for bigram segment (i>>LOG_BG_SEG_SZ).
Definition: lm3g_model.h:152
uint32 wid
Index of unigram entry for this.