53 #include "ngram_model_arpa.h"
/* Index of the first trigram for the bigram segment that contains bigram b:
 * reads lm3g.tseg_base[] at position b >> LOG_BG_SEG_SZ. */
57 #define TSEG_BASE(m,b) ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
/* Index of the first bigram entry belonging to unigram u
 * (the `bigrams` field of lm3g.unigrams[u]). */
58 #define FIRST_BG(m,u) ((m)->lm3g.unigrams[u].bigrams)
/* Absolute index of the first trigram entry belonging to bigram b: the
 * bigram's `trigrams` field is stored relative to its segment base, so the
 * segment base from TSEG_BASE is added back here. */
59 #define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))
65 ReadNgramCounts(
lineiter_t **li, int32 * n_ug, int32 * n_bg, int32 * n_tg)
67 int32 ngram, ngram_cnt;
72 if (strcmp((*li)->buf,
"\\data\\") == 0)
76 if (*li == NULL || strcmp((*li)->buf,
"\\data\\") != 0) {
77 E_INFO(
"No \\data\\ mark in LM file\n");
81 *n_ug = *n_bg = *n_tg = 0;
83 if (sscanf((*li)->buf,
"ngram %d=%d", &ngram, &ngram_cnt) != 2)
96 E_ERROR(
"Unknown ngram (%d)\n", ngram);
101 E_ERROR(
"EOF while reading ngram counts\n");
108 if (strcmp((*li)->buf,
"\\1-grams:") == 0)
116 if ((*n_ug <= 0) || (*n_bg < 0) || (*n_tg < 0)) {
117 E_ERROR(
"Bad or missing ngram count\n");
135 E_INFO(
"Reading unigrams\n");
139 char *wptr[3], *name;
140 float32 bo_wt = 0.0f;
144 if (strcmp((*li)->buf,
"\\2-grams:") == 0
145 || strcmp((*li)->buf,
"\\end\\") == 0)
148 if ((n =
str2words((*li)->buf, wptr, 3)) < 2) {
149 if ((*li)->buf[0] !=
'\0')
150 E_WARN(
"Format error; unigram ignored: %s\n", (*li)->buf);
154 p1 = (float)
atof_c(wptr[0]);
157 bo_wt = (float)
atof_c(wptr[2]);
161 E_ERROR(
"Too many unigrams\n");
168 != (
void *)(
long)wcnt) {
169 E_WARN(
"Duplicate word in dictionary: %s\n", base->
word_str[wcnt]);
177 E_WARN(
"lm_t.ucount(%d) != #unigrams read(%d)\n",
192 int32 w1, w2, prev_w1, bgcount;
195 E_INFO(
"Reading bigrams\n");
198 bgptr = model->
lm3g.bigrams;
202 float32 p, bo_wt = 0.0f;
204 char *wptr[4], *word1, *word2;
209 if ((n =
str2words((*li)->buf, wptr, 4)) < 3) {
210 if ((*li)->buf[0] !=
'\0')
215 p = (float32)
atof_c(wptr[0]);
219 bo_wt = (float32)
atof_c(wptr[3]);
223 E_ERROR(
"Unknown word: %s, skipping bigram (%s %s)\n",
224 word1, word1, word2);
228 E_ERROR(
"Unknown word: %s, skipping bigram (%s %s)\n",
229 word2, word1, word2);
235 p = (float32)((int32)(p * 10000)) / 10000;
236 bo_wt = (float32)((int32)(bo_wt * 10000)) / 10000;
247 bgptr->
prob2 = sorted_id(&model->sorted_prob2, &p2);
249 bgptr->
bo_wt2 = sorted_id(&model->sorted_bo_wt2, &bo_wt2);
253 E_ERROR(
"Bigrams not in unigram order\n");
257 for (prev_w1++; prev_w1 <= w1; prev_w1++)
264 if ((bgcount & 0x0000ffff) == 0) {
268 if (*li == NULL || ((strcmp((*li)->buf,
"\\end\\") != 0)
269 && (strcmp((*li)->buf,
"\\3-grams:") != 0))) {
270 E_ERROR(
"Bad bigram: %s\n", (*li)->buf);
274 for (prev_w1++; prev_w1 <= base->
n_counts[0]; prev_w1++)
287 int32 i, w1, w2, w3, prev_w1, prev_w2, tgcount, prev_bg, bg, endbg;
288 int32 seg, prev_seg, prev_seg_lastbg;
292 E_INFO(
"Reading trigrams\n");
295 tgptr = model->
lm3g.trigrams;
304 char *wptr[4], *word1, *word2, *word3;
307 if (
str2words((*li)->buf, wptr, 4) != 4) {
308 if ((*li)->buf[0] !=
'\0')
313 p = (float32)
atof_c(wptr[0]);
320 E_ERROR(
"Unknown word: %s, skipping trigram (%s %s %s)\n",
321 word1, word1, word2, word3);
325 E_ERROR(
"Unknown word: %s, skipping trigram (%s %s %s)\n",
326 word2, word1, word2, word3);
330 E_ERROR(
"Unknown word: %s, skipping trigram (%s %s %s)\n",
331 word3, word1, word2, word3);
337 p = (float32)((int32)(p * 10000)) / 10000;
341 E_ERROR(
"Too many trigrams\n");
346 tgptr->
prob3 = sorted_id(&model->sorted_prob3, &p3);
348 if ((w1 != prev_w1) || (w2 != prev_w2)) {
350 if ((w1 < prev_w1) || ((w1 == prev_w1) && (w2 < prev_w2))) {
351 E_ERROR(
"Trigrams not in bigram order\n");
356 prev_w1) ? model->
lm3g.unigrams[w1].
bigrams : prev_bg + 1;
358 bgptr = model->
lm3g.bigrams + bg;
359 for (; (bg < endbg) && (bgptr->
wid != w2); bg++, bgptr++);
361 E_ERROR(
"Missing bigram for trigram: %s", (*li)->buf);
366 seg = bg >> LOG_BG_SEG_SZ;
367 for (i = prev_seg + 1; i <= seg; i++)
371 if (prev_seg < seg) {
377 E_ERROR(
"Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
382 prev_seg_lastbg = ((prev_seg + 1) << LOG_BG_SEG_SZ) - 1;
383 bgptr = model->
lm3g.bigrams + prev_bg;
384 for (++prev_bg, ++bgptr; prev_bg <= prev_seg_lastbg;
388 for (; prev_bg <= bg; prev_bg++, bgptr++)
396 E_ERROR(
"Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
400 bgptr = model->
lm3g.bigrams + prev_bg;
401 for (++prev_bg, ++bgptr; prev_bg <= bg; prev_bg++, bgptr++)
414 if ((tgcount & 0x0000ffff) == 0) {
418 if (*li == NULL || strcmp((*li)->buf,
"\\end\\") != 0) {
419 E_ERROR(
"Bad trigram: %s\n", (*li)->buf);
423 for (prev_bg++; prev_bg <= base->
n_counts[1]; prev_bg++) {
424 if ((prev_bg & (BG_SEG_SZ - 1)) == 0)
426 if ((tgcount - model->
lm3g.
tseg_base[prev_bg >> LOG_BG_SEG_SZ]) > 65535) {
427 E_ERROR(
"Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
437 new_unigram_table(int32 n_ug)
443 for (i = 0; i < n_ug; i++) {
444 table[i].
prob1.l = INT_MIN;
445 table[i].
bo_wt1.l = INT_MIN;
451 ngram_model_arpa_read(
cmd_ln_t *config,
452 const char *file_name,
465 if ((fp =
fopen_comp(file_name,
"r", &is_pipe)) == NULL) {
466 E_ERROR(
"File %s not found\n", file_name);
472 if (ReadNgramCounts(&li, &n_unigram, &n_bigram, &n_trigram) == -1) {
477 E_INFO(
"ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
484 else if (n_bigram > 0)
489 ngram_model_init(base, &ngram_model_arpa_funcs, lmath, n, n_unigram);
499 model->
lm3g.unigrams = new_unigram_table(n_unigram + 1);
500 model->
lm3g.bigrams =
503 model->
lm3g.trigrams =
511 if (ReadUnigrams(&li, model) == -1) {
518 init_sorted_list(&model->sorted_prob2);
520 init_sorted_list(&model->sorted_bo_wt2);
523 if (ReadBigrams(&li, model) == -1) {
531 model->
lm3g.
prob2 = vals_in_sorted_list(&model->sorted_prob2);
532 free_sorted_list(&model->sorted_prob2);
540 model->
lm3g.
bo_wt2 = vals_in_sorted_list(&model->sorted_bo_wt2);
541 free_sorted_list(&model->sorted_bo_wt2);
544 init_sorted_list(&model->sorted_prob3);
546 if (ReadTrigrams(&li, model) == -1) {
554 model->
lm3g.
prob3 = vals_in_sorted_list(&model->sorted_prob3);
558 free_sorted_list(&model->sorted_prob3);
572 const char *file_name)
578 if ((fh = fopen(file_name,
"w")) == NULL) {
582 fprintf(fh,
"This is an ARPA-format language model file, generated by CMU Sphinx\n");
589 fprintf(fh,
"\\data\\\n");
590 for (i = 0; i < model->
n; ++i) {
591 fprintf(fh,
"ngram %d=%d\n", i+1, model->
n_counts[i]);
595 for (i = 0; i < model->
n; ++i) {
596 fprintf(fh,
"\n\\%d-grams:\n", i + 1);
604 for (j = 0; j <= i; ++j) {
605 assert(wids[j] < model->
n_counts[0]);
606 fprintf(fh,
"%s ", model->
word_str[wids[j]]);
613 fprintf(fh,
"\n\\end\\\n");
618 ngram_model_arpa_apply_weights(
ngram_model_t *base, float32 lw,
619 float32 wip, float32 uw)
622 lm3g_apply_weights(base, &model->
lm3g, lw, wip, uw);
/* Define the concrete model type, then pull in the shared lm3g template source.
 * NOTE(review): presumably lm3g_templates.c uses NGRAM_MODEL_TYPE to generate
 * the lm3g_template_* functions listed in the function table below — confirm
 * against that file. */
629 #define NGRAM_MODEL_TYPE ngram_model_arpa_t
630 #include "lm3g_templates.c"
642 lm3g_tginfo_free(base, &model->
lm3g);
647 ngram_model_arpa_free,
648 ngram_model_arpa_apply_weights,
650 lm3g_template_raw_score,
651 lm3g_template_add_ug,
654 lm3g_template_mgrams,
655 lm3g_template_successors,
656 lm3g_template_iter_get,
657 lm3g_template_iter_next,
658 lm3g_template_iter_free
lmprob_t bo_wt1
Unigram backoff weight.
listelem_alloc_t * le
List element allocator for tginfo.
Miscellaneous useful string functions.
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
hash_table_t * wid
Mapping of unigram names to word IDs.
char ** word_str
Unigram names.
#define E_INFO
Print logging information to standard error stream.
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT int32 ngram_wid(ngram_model_t *model, const char *word)
Look up numerical word ID.
SPHINXBASE_EXPORT int32 const * ngram_iter_get(ngram_iter_t *itor, int32 *out_score, int32 *out_bowt)
Get information from the current M-gram in an iterator.
lmprob_t * prob2
Table of actual bigram probs.
#define NGRAM_INVALID_WID
Impossible word ID.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_next(ngram_iter_t *itor)
Advance an M-gram iterator.
uint8 writable
Are word strings writable?
int32 * n_counts
Counts for 1, 2, 3, ...
SPHINXBASE_EXPORT int ngram_model_free(ngram_model_t *model)
Release memory associated with an N-Gram model.
#define ckd_salloc(ptr)
Macro for ckd_salloc
uint16 prob3
Index into array of actual trigram probs.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
#define E_WARN
Print warning information to standard error stream.
Unigram structure (common among all lm3g implementations)
int32 n_bo_wt2
bo_wt2 size
int32 bigrams
Index of 1st entry in lm_t.bigrams[].
Subclass of ngram_model for ARPA file reading.
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
lmprob_t * prob3
Table of actual trigram probs.
SPHINXBASE_EXPORT void lineiter_free(lineiter_t *li)
Stop reading lines from a file.
Trigram information cache.
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
SPHINXBASE_EXPORT lineiter_t * lineiter_start(FILE *fh)
Start reading lines from a file.
ngram_model_t base
Base ngram_model_t structure.
Fast memory allocator for uniformly sized objects.
uint16 trigrams
Index of 1st entry in lm_t.trigrams[], RELATIVE TO its segment base (see above)
#define E_INFOCONT
Print logging information without header, to standard error stream.
uint8 n
This is an n-gram model (1, 2, 3, ...).
uint16 prob2
Index into array of actual bigram probs.
Implementation of logging routines.
logmath_t * lmath
Log-math object.
uint16 bo_wt2
Index into array of actual bigram backoff wts.
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
SPHINXBASE_EXPORT FILE * fopen_comp(const char *file, const char *mode, int32 *ispipe)
Like fopen, but use popen and zcat if it is determined that "file" is compressed (i.e., has a .z, .Z, .gz, or .GZ extension).
lmprob_t prob1
Unigram probability.
lm3g_model_t lm3g
Shared lm3g structure.
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Opaque structure used to hold the results of command-line parsing.
#define E_ERROR
Print error message to standard error stream.
lmprob_t * bo_wt2
Table of actual bigram backoff weights.
Implementation-specific functions for operating on ngram_model_t objects.
SPHINXBASE_EXPORT listelem_alloc_t * listelem_alloc_init(size_t elemsize)
Initialize and return a list element allocator.
Base iterator structure for N-grams.
SPHINXBASE_EXPORT float64 logmath_log_to_log10(logmath_t *lmath, int logb_p)
Convert integer log in base B to base 10 log (in floating point).
SPHINXBASE_EXPORT char * string_trim(char *string, enum string_edge_e which)
Remove whitespace from a string, modifying it in-place.
Common implementation of ngram_model_t.
int32 free
first free element in list
SPHINXBASE_EXPORT void fclose_comp(FILE *fp, int32 ispipe)
Close a file opened using fopen_comp.
#define E_ERROR_SYSTEM
Print error text, then call perror("").
uint32 wid
Index of unigram entry for this.
SPHINXBASE_EXPORT ngram_iter_t * ngram_model_mgrams(ngram_model_t *model, int m)
Iterate over all M-grams.
tginfo_t ** tginfo
tginfo[lw2] is head of linked list of trigram information for some cached subset of bigrams (*...
file IO related operations.
SPHINXBASE_EXPORT int logmath_log10_to_log(logmath_t *lmath, float64 log_p)
Convert base 10 log (in floating point) to integer log in base B.
int32 * tseg_base
tseg_base[i>>LOG_BG_SEG_SZ] = index of 1st trigram for bigram segment (i>>LOG_BG_SEG_SZ) ...
int32 n_words
Number of actual word strings (NOT the same as the number of unigrams, due to class words)...
uint32 wid
Index of unigram entry for this.