52 #include "sphinxbase/byteorder.h"
55 #include "ngram_model_dmp.h"
/* Magic string identifying a DMP dump file.  The reader checks that the
 * stored header length equals strlen(darpa_hdr)+1 and that the header
 * bytes match this string; the writer emits it with its trailing NUL. */
57 static const char darpa_hdr[] =
"Darpa Trigram LM";
/* Trigram segment base for bigram b: lm3g.tseg_base is indexed by the
 * bigram's segment number, b >> LOG_BG_SEG_SZ. */
60 #define TSEG_BASE(m,b) ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
/* The `bigrams` field of unigram u, i.e. the index of its first bigram. */
61 #define FIRST_BG(m,u) ((m)->lm3g.unigrams[u].bigrams)
/* Absolute index of the first trigram of bigram b: the bigram's
 * segment-relative `trigrams` offset added to its segment base. */
62 #define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))
65 new_unigram_table(int32 n_ug)
71 for (i = 0; i < n_ug; i++) {
72 table[i].
prob1.f = -99.0;
79 ngram_model_dmp_read(
cmd_ln_t *config,
80 const char *file_name,
88 int32 i, j, k, vn, n, ts;
97 char *map_base = NULL;
105 if ((fp =
fopen_comp(file_name,
"rb", &is_pipe)) == NULL) {
106 E_ERROR(
"Dump file %s not found\n", file_name);
110 if (is_pipe && do_mmap) {
111 E_WARN(
"Dump file is compressed, will not use memory-mapped I/O\n");
116 if (fread(&k,
sizeof(k), 1, fp) != 1)
118 if (k != strlen(darpa_hdr)+1) {
120 if (k != strlen(darpa_hdr)+1) {
121 E_ERROR(
"Wrong magic header size number %x: %s is not a dump file\n", k, file_name);
126 if (fread(str, 1, k, fp) != (
size_t) k) {
127 E_ERROR(
"Cannot read header\n");
130 if (strncmp(str, darpa_hdr, k) != 0) {
131 E_ERROR(
"Wrong header %s: %s is not a dump file\n", darpa_hdr);
138 (
"Byteswapping required, will not use memory-mapped I/O for LM file\n");
142 E_INFO(
"Will use memory-mapped I/O for LM file\n");
143 #ifdef __ADSPBLACKFIN__
144 E_FATAL(
"memory mapping is not supported at the moment.");
150 if (fread(&k,
sizeof(k), 1, fp) != 1)
152 if (do_swap) SWAP_INT32(&k);
153 if (fread(str, 1, k, fp) != (
size_t) k) {
154 E_ERROR(
"Cannot read LM filename in header\n");
159 if (fread(&vn,
sizeof(vn), 1, fp) != 1)
161 if (do_swap) SWAP_INT32(&vn);
164 if (fread(&ts,
sizeof(ts), 1, fp) != 1)
166 if (do_swap) SWAP_INT32(&ts);
170 if (fread(&k,
sizeof(k), 1, fp) != 1)
172 if (do_swap) SWAP_INT32(&k);
175 if (fread(str, 1, k, fp) != (
size_t) k) {
176 E_ERROR(
"Failed to read word\n");
181 if (fread(&n_unigram,
sizeof(n_unigram), 1, fp) != 1)
183 if (do_swap) SWAP_INT32(&n_unigram);
190 if (fread(&n_bigram,
sizeof(n_bigram), 1, fp) != 1)
192 if (do_swap) SWAP_INT32(&n_bigram);
193 if (fread(&n_trigram,
sizeof(n_trigram), 1, fp) != 1)
195 if (do_swap) SWAP_INT32(&n_trigram);
196 E_INFO(
"ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
203 else if (n_bigram > 0)
207 ngram_model_init(base, &ngram_model_dmp_funcs, lmath, n, n_unigram);
214 model->
lm3g.unigrams = new_unigram_table(n_unigram + 1);
215 ugptr = model->
lm3g.unigrams;
216 for (i = 0; i <= n_unigram; ++i) {
218 if (fread(ugptr,
sizeof(int32), 1, fp) != 1) {
219 E_ERROR(
"Failed to read maping id %d\n", i);
223 if (fread(ugptr,
sizeof(
unigram_t), 1, fp) != 1) {
224 E_ERROR(
"Failed to read unigrams data\n");
231 SWAP_INT32(&ugptr->
prob1.l);
232 SWAP_INT32(&ugptr->
bo_wt1.l);
238 E_DEBUG(2, (
"ug %d: prob %d bo %d bigrams %d\n",
242 E_INFO(
"%8d = LM.unigrams(+trailer) read\n", n_unigram);
250 E_WARN(
"-mmap specified, but trigram index is not word-aligned. Will not memory-map.\n");
268 offset += (n_bigram + 1) *
sizeof(
bigram_t);
271 model->
lm3g.bigrams =
273 if (fread(model->
lm3g.bigrams,
sizeof(
bigram_t), n_bigram + 1, fp)
274 != (
size_t) n_bigram + 1) {
275 E_ERROR(
"Failed to read bigrams data\n");
279 for (i = 0, bgptr = model->
lm3g.bigrams; i <= n_bigram;
281 SWAP_INT16(&bgptr->
wid);
282 SWAP_INT16(&bgptr->
prob2);
283 SWAP_INT16(&bgptr->
bo_wt2);
288 E_INFO(
"%8d = LM.bigrams(+trailer) read\n", n_bigram);
298 model->
lm3g.trigrams =
302 != (
size_t) n_trigram) {
303 E_ERROR(
"Failed to read trigrams data\n");
307 for (i = 0, tgptr = model->
lm3g.trigrams; i < n_trigram;
309 SWAP_INT16(&tgptr->
wid);
310 SWAP_INT16(&tgptr->
prob3);
314 E_INFO(
"%8d = LM.trigrams read\n", n_trigram);
323 fseek(fp, offset, SEEK_SET);
324 if (fread(&k,
sizeof(k), 1, fp) != 1)
326 if (do_swap) SWAP_INT32(&k);
330 E_ERROR(
"fread(prob2) failed\n");
333 for (i = 0; i < k; i++) {
339 E_INFO(
"%8d = LM.prob2 entries read\n", k);
344 if (fread(&k,
sizeof(k), 1, fp) != 1)
346 if (do_swap) SWAP_INT32(&k);
350 E_ERROR(
"Failed to read backoff weights\n");
353 for (i = 0; i < k; i++) {
359 E_INFO(
"%8d = LM.bo_wt2 entries read\n", k);
364 if (fread(&k,
sizeof(k), 1, fp) != 1)
366 if (do_swap) SWAP_INT32(&k);
370 E_ERROR(
"Failed to read trigram probability\n");
373 for (i = 0; i < k; i++) {
379 E_INFO(
"%8d = LM.prob3 entries read\n", k);
387 memcpy(&k, map_base + offset,
sizeof(k));
388 offset +=
sizeof(int32);
390 offset += k *
sizeof(int32);
393 k = (n_bigram + 1) / BG_SEG_SZ + 1;
394 if (fread(&k,
sizeof(k), 1, fp) != 1)
396 if (do_swap) SWAP_INT32(&k);
400 E_ERROR(
"Failed to read trigram index\n");
404 for (i = 0; i < k; i++)
407 E_INFO(
"%8d = LM.tseg_base entries read\n", k);
412 memcpy(&k, map_base + offset,
sizeof(k));
413 offset +=
sizeof(int32);
414 tmp_word_str = (
char *) (map_base + offset);
419 if (fread(&k,
sizeof(k), 1, fp) != 1)
421 if (do_swap) SWAP_INT32(&k);
423 if (fread(tmp_word_str, 1, k, fp) != (
size_t) k) {
424 E_ERROR(
"Failed to read words\n");
430 for (i = 0, j = 0; i < k; i++)
431 if (tmp_word_str[i] ==
'\0')
433 if (j != n_unigram) {
434 E_ERROR(
"Error reading word strings (%d doesn't match n_unigrams %d)\n",
442 for (i = 0; i < n_unigram; i++) {
443 base->
word_str[i] = tmp_word_str + j;
445 (
void *)(
long)i) != (
void *)(
long)i) {
453 for (i = 0; i < n_unigram; i++) {
456 (
void *)(
long)i) != (
void *)(
long)i) {
463 E_INFO(
"%8d = ascii word strings read\n", i);
486 int i, bgcount, tgcount, seg;
488 if (base->
funcs == &ngram_model_dmp_funcs) {
489 E_INFO(
"Using existing DMP model.\n");
494 E_INFO(
"Building DMP model...\n");
496 newbase = &model->
base;
497 ngram_model_init(newbase, &ngram_model_dmp_funcs,
506 model->
lm3g.unigrams = new_unigram_table(newbase->
n_counts[0] + 1);
515 model->
lm3g.unigrams[wids[0]].
prob1.l = prob1;
516 model->
lm3g.unigrams[wids[0]].
bo_wt1.l = bo_wt1;
519 newbase->
word_str[wids[0]], wids[0]))
521 E_WARN(
"Duplicate word in dictionary: %s\n", newbase->
word_str[wids[0]]);
532 init_sorted_list(&sorted_prob2);
533 if (newbase->
n > 2) {
534 init_sorted_list(&sorted_bo_wt2);
535 init_sorted_list(&sorted_prob3);
539 if (newbase->
n > 2) {
549 for (i = 0; i < newbase->
n_counts[0]; ++i) {
551 bgcount = bgptr - model->
lm3g.bigrams;
554 E_DEBUG(2, (
"unigram %d: %s => bigram %d\n", i, newbase->
word_str[i], bgcount));
565 assert (bgptr - model->
lm3g.bigrams < newbase->
n_counts[1]);
567 bgptr->
wid = wids[1];
568 bgptr->
prob2 = sorted_id(&sorted_prob2, &prob2);
569 if (newbase->
n > 2) {
570 tgcount = (tgptr - model->
lm3g.trigrams);
571 bgcount = (bgptr - model->
lm3g.bigrams);
574 bgptr->
bo_wt2 = sorted_id(&sorted_bo_wt2, &bo_wt2);
578 seg = bgcount >> LOG_BG_SEG_SZ;
582 if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
586 E_DEBUG(2, (
"bigram %d %s %s => trigram %d:%d\n",
597 assert(tgptr - model->
lm3g.trigrams < newbase->
n_counts[2]);
599 tgptr->
wid = wids[2];
600 tgptr->
prob3 = sorted_id(&sorted_prob3, &prob3);
601 E_DEBUG(2, (
"trigram %d %s %s %s => prob %d\n",
613 bgcount = bgptr - model->
lm3g.bigrams;
614 tgcount = tgptr - model->
lm3g.trigrams;
615 seg = bgcount >> LOG_BG_SEG_SZ;
616 if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
624 model->
lm3g.
prob2 = vals_in_sorted_list(&sorted_prob2);
627 free_sorted_list(&sorted_prob2);
628 if (newbase->
n > 2) {
631 model->
lm3g.
bo_wt2 = vals_in_sorted_list(&sorted_bo_wt2);
632 free_sorted_list(&sorted_bo_wt2);
636 model->
lm3g.
prob3 = vals_in_sorted_list(&sorted_prob3);
639 free_sorted_list(&sorted_prob3);
649 fwrite_int32(FILE *fh, int32 val)
651 fwrite(&val, 4, 1, fh);
661 fwrite(&bogus, 4, 1, fh);
664 fwrite(&log10val, 4, 1, fh);
666 fwrite(&log10val, 4, 1, fh);
673 fwrite(bg,
sizeof(*bg), 1, fh);
679 fwrite(tg,
sizeof(*tg), 1, fh);
684 static char const *fmtdesc[] = {
685 "BEGIN FILE FORMAT DESCRIPTION",
686 "Header string length (int32) and string (including trailing 0)",
687 "Original LM filename string-length (int32) and filename (including trailing 0)",
688 "(int32) version number (present iff value <= 0)",
689 "(int32) original LM file modification timestamp (iff version# present)",
690 "(int32) string-length and string (including trailing 0) (iff version# present)",
691 "... previous entry continued any number of times (iff version# present)",
692 "(int32) 0 (terminating sequence of strings) (iff version# present)",
693 "(int32) log_bg_seg_sz (present iff different from default value of LOG2_BG_SEG_SZ)",
694 "(int32) lm_t.ucount (must be > 0)",
695 "(int32) lm_t.bcount",
696 "(int32) lm_t.tcount",
697 "lm_t.ucount+1 unigrams (including sentinel)",
698 "lm_t.bcount+1 bigrams (including sentinel 64 bits (bg_t) each if version=-1/-2, 128 bits (bg32_t) each if version=-3",
699 "lm_t.tcount trigrams (present iff lm_t.tcount > 0 32 bits (tg_t) each if version=-1/-2, 64 bits (tg32_t) each if version=-3)",
700 "(int32) lm_t.n_prob2",
701 "(int32) lm_t.prob2[]",
702 "(int32) lm_t.n_bo_wt2 (present iff lm_t.tcount > 0)",
703 "(int32) lm_t.bo_wt2[] (present iff lm_t.tcount > 0)",
704 "(int32) lm_t.n_prob3 (present iff lm_t.tcount > 0)",
705 "(int32) lm_t.prob3[] (present iff lm_t.tcount > 0)",
706 "(int32) (lm_t.bcount+1)/BG_SEG_SZ+1 (present iff lm_t.tcount > 0)",
707 "(int32) lm_t.tseg_base[] (present iff lm_t.tcount > 0)",
708 "(int32) Sum(all word string-lengths, including trailing 0 for each)",
709 "All word strings (including trailing 0 for each)",
710 "END FILE FORMAT DESCRIPTION",
715 ngram_model_dmp_write_header(FILE * fh)
718 k = strlen(darpa_hdr) + 1;
720 fwrite(darpa_hdr, 1, k, fh);
724 ngram_model_dmp_write_lm_filename(FILE * fh,
const char *lmfile)
728 k = strlen(lmfile) + 1;
730 fwrite(lmfile, 1, k, fh);
/* Version number written to the dump header by
 * ngram_model_dmp_write_version().  The embedded format description lists
 * versions -1/-2 as using 64-bit bigram and 32-bit trigram records. */
733 #define LMDMP_VERSION_TG_16BIT -1
738 ngram_model_dmp_write_version(FILE * fh, int32 mtime)
740 fwrite_int32(fh, LMDMP_VERSION_TG_16BIT);
741 fwrite_int32(fh, mtime);
745 ngram_model_dmp_write_ngram_counts(FILE * fh,
ngram_model_t *model)
747 fwrite_int32(fh, model->
n_counts[0]);
748 fwrite_int32(fh, model->
n_counts[1]);
749 fwrite_int32(fh, model->
n_counts[2]);
753 ngram_model_dmp_write_fmtdesc(FILE * fh)
759 for (i = 0; fmtdesc[i] != NULL; i++) {
760 k = strlen(fmtdesc[i]) + 1;
762 fwrite(fmtdesc[i], 1, k, fh);
768 fwrite_int32(fh, 4-k);
769 fwrite(
"!!!!", 1, 4-k, fh);
775 ngram_model_dmp_write_unigram(FILE *fh,
ngram_model_t *model)
780 for (i = 0; i <= model->
n_counts[0]; i++) {
781 fwrite_ug(fh, &(lm->
lm3g.unigrams[i]), model->
lmath);
792 for (i = 0; i <= model->
n_counts[1]; i++) {
793 fwrite_bg(fh, &(lm->
lm3g.bigrams[i]));
799 ngram_model_dmp_write_trigram(FILE *fh,
ngram_model_t *model)
804 for (i = 0; i < model->
n_counts[2]; i++) {
805 fwrite_tg(fh, &(lm->
lm3g.trigrams[i]));
818 fwrite(&log10val, 4, 1, fh);
831 fwrite(&log10val, 4, 1, fh);
844 fwrite(&log10val, 4, 1, fh);
849 ngram_model_dmp_write_tg_segbase(FILE *fh,
ngram_model_t *model)
854 k = (model->
n_counts[1] + 1) / BG_SEG_SZ + 1;
856 for (i = 0; i < k; i++)
861 ngram_model_dmp_write_wordstr(FILE *fh,
ngram_model_t *model)
866 for (i = 0; i < model->
n_counts[0]; i++)
867 k += strlen(model->
word_str[i]) + 1;
869 for (i = 0; i < model->
n_counts[0]; i++)
871 strlen(model->
word_str[i]) + 1, fh);
876 const char *file_name)
883 model = ngram_model_dmp_build(base);
884 newbase = &model->
base;
888 if ((fh = fopen(file_name,
"wb")) == NULL) {
889 E_ERROR(
"Cannot create file %s\n", file_name);
892 ngram_model_dmp_write_header(fh);
893 ngram_model_dmp_write_lm_filename(fh, file_name);
894 ngram_model_dmp_write_version(fh, 0);
895 ngram_model_dmp_write_fmtdesc(fh);
896 ngram_model_dmp_write_ngram_counts(fh, newbase);
897 ngram_model_dmp_write_unigram(fh, newbase);
898 if (newbase->
n > 1) {
899 ngram_model_dmp_write_bigram(fh, newbase);
900 if (newbase->
n > 2) {
901 ngram_model_dmp_write_trigram(fh, newbase);
903 ngram_model_dmp_write_bgprob(fh, newbase);
904 if (newbase->
n > 2) {
905 ngram_model_dmp_write_tgbowt(fh, newbase);
906 ngram_model_dmp_write_tgprob(fh, newbase);
907 ngram_model_dmp_write_tg_segbase(fh, newbase);
910 ngram_model_dmp_write_wordstr(fh, newbase);
917 ngram_model_dmp_apply_weights(
ngram_model_t *base, float32 lw,
918 float32 wip, float32 uw)
921 lm3g_apply_weights(base, &model->
lm3g, lw, wip, uw);
928 #define NGRAM_MODEL_TYPE ngram_model_dmp_t
929 #include "lm3g_templates.c"
953 lm3g_tginfo_free(base, &model->
lm3g);
957 ngram_model_dmp_free,
958 ngram_model_dmp_apply_weights,
960 lm3g_template_raw_score,
961 lm3g_template_add_ug,
964 lm3g_template_mgrams,
965 lm3g_template_successors,
966 lm3g_template_iter_get,
967 lm3g_template_iter_next,
968 lm3g_template_iter_free
lmprob_t bo_wt1
Unigram backoff weight.
listelem_alloc_t * le
List element allocator for tginfo.
struct ngram_funcs_s * funcs
Implementation-specific methods.
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
#define hash_table_enter_int32(h, k, v)
Add a 32-bit integer value to a hash table.
hash_table_t * wid
Mapping of unigram names to word IDs.
char ** word_str
Unigram names.
SPHINXBASE_EXPORT void ngram_iter_free(ngram_iter_t *itor)
Terminate an M-gram iterator.
#define E_DEBUG(level, x)
Print debugging information to standard error stream.
#define E_INFO
Print logging information to standard error stream.
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT int32 const * ngram_iter_get(ngram_iter_t *itor, int32 *out_score, int32 *out_bowt)
Get information from the current M-gram in an iterator.
lmprob_t * prob2
Table of actual bigram probs.
SPHINXBASE_EXPORT void mmio_file_unmap(mmio_file_t *mf)
Unmap a file, releasing memory associated with it.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_next(ngram_iter_t *itor)
Advance an M-gram iterator.
uint8 writable
Are word strings writable?
int32 * n_counts
Counts for 1-grams, 2-grams, 3-grams, ...
SPHINXBASE_EXPORT int ngram_model_free(ngram_model_t *model)
Release memory associated with an N-Gram model.
#define ckd_salloc(ptr)
Macro for ckd_salloc
uint16 prob3
Index into array of actual trigram probs.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_successors(ngram_iter_t *itor)
Iterate over all M-gram successors of an M-1-gram.
Subclass of ngram_model for DMP file reading.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
#define E_WARN
Print warning information to standard error stream.
ngram_model_t base
Base ngram_model_t structure.
Unigram structure (common among all lm3g implementations)
int32 n_bo_wt2
Number of entries in the bo_wt2 table.
int32 bigrams
Index of 1st entry in lm_t.bigrams[].
lmprob_t * prob3
Table of actual trigram probs.
SPHINXBASE_EXPORT void * mmio_file_ptr(mmio_file_t *mf)
Get a pointer to the memory mapped for a file.
Trigram information cache.
SPHINXBASE_EXPORT logmath_t * logmath_retain(logmath_t *lmath)
Retain ownership of a log table.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_retain(ngram_model_t *model)
Retain ownership of an N-Gram model.
Fast memory allocator for uniformly sized objects.
uint16 trigrams
Index of 1st entry in lm_t.trigrams[], RELATIVE TO its segment base (see above)
uint8 n
This is an n-gram model (1, 2, 3, ...).
uint16 prob2
Index into array of actual bigram probs.
Implementation of logging routines.
logmath_t * lmath
Log-math object.
uint16 bo_wt2
Index into array of actual bigram backoff wts.
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
SPHINXBASE_EXPORT FILE * fopen_comp(const char *file, const char *mode, int32 *ispipe)
Like fopen, but use popen and zcat if it is determined that "file" is compressed (i.e., has a .z, .Z, .gz, or .GZ extension).
mmio_file_t * dump_mmap
mmap() of dump file (or NULL if none)
SPHINXBASE_EXPORT ngram_iter_t * ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
Get an iterator over M-grams pointing to the specified M-gram.
lmprob_t prob1
Unigram probability.
Opaque structure used to hold the results of command-line parsing.
#define E_FATAL
Exit with non-zero status after error message.
#define E_ERROR
Print error message to standard error stream.
lmprob_t * bo_wt2
Table of actual bigram backoff weights.
Implementation-specific functions for operating on ngram_model_t objects.
SPHINXBASE_EXPORT listelem_alloc_t * listelem_alloc_init(size_t elemsize)
Initialize and return a list element allocator.
Base iterator structure for N-grams.
#define cmd_ln_boolean_r(c, n)
Retrieve a boolean value from a command-line object.
SPHINXBASE_EXPORT float64 logmath_log_to_log10(logmath_t *lmath, int logb_p)
Convert integer log in base B to base 10 log (in floating point).
lm3g_model_t lm3g
Common lm3g_model_t structure.
Common implementation of ngram_model_t.
int32 free
First free element in list.
SPHINXBASE_EXPORT const char * ngram_word(ngram_model_t *model, int32 wid)
Look up word string for numerical word ID.
SPHINXBASE_EXPORT void fclose_comp(FILE *fp, int32 ispipe)
Close a file opened using fopen_comp.
SPHINXBASE_EXPORT mmio_file_t * mmio_file_read(const char *filename)
Memory-map a file for reading.
uint32 wid
Index of unigram entry for this.
SPHINXBASE_EXPORT ngram_iter_t * ngram_model_mgrams(ngram_model_t *model, int m)
Iterate over all M-grams.
tginfo_t ** tginfo
tginfo[lw2] is head of linked list of trigram information for some cached subset of bigrams (*,lw2).
file IO related operations.
SPHINXBASE_EXPORT int logmath_log10_to_log(logmath_t *lmath, float64 log_p)
Convert base 10 log (in floating point) to integer log in base B.
int32 * tseg_base
tseg_base[i>>LOG_BG_SEG_SZ] = index of 1st trigram for bigram segment (i>>LOG_BG_SEG_SZ) ...
uint32 wid
Index of unigram entry for this.