/* Range below which binary search degrades to a linear scan. */
45 #define BINARY_SEARCH_THRESH 16
/*
 * Locate the bigram whose word ID equals `w` inside the `n`-entry
 * array `bg`.  Narrows the range [b, e) by binary search while it is
 * wider than BINARY_SEARCH_THRESH, then finishes with a linear scan.
 * Returns the index into `bg`, or -1 if `w` is not present.
 * NOTE(review): interior lines of the search loop are not visible in
 * this excerpt; comments describe only what is shown.
 */
47 find_bg(
bigram_t * bg, int32 n, int32 w)
54 while (e - b > BINARY_SEARCH_THRESH) {
58 else if (bg[i].wid > w)
/* Small remaining range: plain linear scan for w. */
65 for (i = b; (i < e) && (bg[i].wid != w); i++);
66 return ((i < e) ? i : -1);
/*
 * Bigram (log) score for word lw2 following word lw1.
 * With no usable history (lw1 < 0) or a model of order < 2, the
 * unigram probability of lw2 is returned directly.  Otherwise lw1's
 * bigram span is located via FIRST_BG and searched with find_bg();
 * on a miss the score backs off to bo_wt1(lw1) + prob1(lw2).
 */
70 lm3g_bg_score(NGRAM_MODEL_TYPE *model,
71 int32 lw1, int32 lw2, int32 *n_used)
76 if (lw1 < 0 || model->base.n < 2) {
/* No bigram context available: fall back to the unigram prob. */
78 return model->lm3g.unigrams[lw2].prob1.l;
/* b = index of lw1's first bigram; n = number of bigrams for lw1. */
81 b = FIRST_BG(model, lw1);
82 n = FIRST_BG(model, lw1 + 1) - b;
83 bg = model->lm3g.bigrams + b;
85 if ((i = find_bg(bg, n, lw2)) >= 0) {
/* Bigram not found: back off via lw1's unigram backoff weight. */
93 score = model->lm3g.unigrams[lw1].bo_wt1.l + model->lm3g.unigrams[lw2].prob1.l;
/*
 * Build a tginfo cache entry for the bigram context (lw1, lw2):
 * the trigram array pointer, the number of trigrams, and the bigram
 * backoff weight.  The new entry is linked at the head of the
 * model->lm3g.tginfo[lw2] list (entries chained on lw2, keyed by w1).
 * NOTE(review): allocation of `tginfo` and the not-found branch are
 * not visible in this excerpt.
 */
100 load_tginfo(NGRAM_MODEL_TYPE *model, int32 lw1, int32 lw2)
/* Push the new entry on the head of lw2's cache list. */
110 tginfo->
next = model->lm3g.tginfo[lw2];
111 model->lm3g.tginfo[lw2] = tginfo;
/* Locate the bigram (lw1, lw2) inside lw1's bigram span. */
114 b = model->lm3g.unigrams[lw1].bigrams;
115 n = model->lm3g.unigrams[lw1 + 1].bigrams - b;
116 bg = model->lm3g.bigrams + b;
118 if ((n > 0) && ((i = find_bg(bg, n, lw2)) >= 0)) {
/* Bigram found: record its backoff weight (index into bo_wt2[]). */
119 tginfo->
bowt = model->lm3g.bo_wt2[bg[i].
bo_wt2].l;
/* t = index of the first trigram for this bigram. */
123 t = FIRST_TG(model, b);
125 tginfo->
tg = model->lm3g.trigrams + t;
/* Trigram count = distance to the next bigram's first trigram. */
128 tginfo->
n_tg = FIRST_TG(model, b + 1) - t;
/*
 * Trigram analogue of find_bg(): locate word ID `w` in the `n`-entry
 * trigram array `tg` by binary search down to BINARY_SEARCH_THRESH,
 * then a linear scan.  Returns the index, or -1 if not found.
 * NOTE(review): interior lines of the search loop are not visible in
 * this excerpt.
 */
138 find_tg(
trigram_t * tg, int32 n, int32 w)
144 while (e - b > BINARY_SEARCH_THRESH) {
148 else if (tg[i].wid > w)
/* Small remaining range: plain linear scan for w. */
154 for (i = b; (i < e) && (tg[i].wid != w); i++);
155 return ((i < e) ? i : -1);
/*
 * Trigram (log) score for lw3 following (lw1, lw2).
 * Degrades to the bigram score when the model order is < 3 or either
 * history word is missing.  The per-lw2 tginfo cache is consulted
 * first; on a miss it is populated via load_tginfo(), and a hit found
 * deeper in the list is moved to the front (prev_tginfo relink).
 * A trigram miss backs off: bowt(lw1,lw2) + bg_score(lw2, lw3).
 */
159 lm3g_tg_score(NGRAM_MODEL_TYPE *model, int32 lw1,
160 int32 lw2, int32 lw3, int32 *n_used)
167 if ((base->
n < 3) || (lw1 < 0) || (lw2 < 0))
168 return (lm3g_bg_score(model, lw2, lw3, n_used));
/* Scan lw2's cache list for an entry whose w1 matches lw1. */
171 for (tginfo = model->lm3g.tginfo[lw2]; tginfo; tginfo = tginfo->
next) {
172 if (tginfo->
w1 == lw1)
174 prev_tginfo = tginfo;
/* Cache miss: build the entry for (lw1, lw2). */
178 load_tginfo(model, lw1, lw2);
179 tginfo = model->lm3g.tginfo[lw2];
181 else if (prev_tginfo) {
/* Move-to-front: relink the found entry at the list head. */
183 tginfo->
next = model->lm3g.tginfo[lw2];
184 model->lm3g.tginfo[lw2] = tginfo;
192 if ((i = find_tg(tg, n, lw3)) >= 0) {
/* Trigram not found: back off through the cached bigram bowt. */
198 score = tginfo->
bowt + lm3g_bg_score(model, lw2, lw3, n_used);
/*
 * Score dispatcher: routes to the unigram probability, bigram score,
 * or trigram score based on the amount of available history.
 * (history[0] is the most recent word, history[1] the one before.)
 * NOTE(review): the function's name line and the n_hist checks are
 * not visible in this excerpt.
 */
206 int32 *history, int32 n_hist,
209 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base;
/* No history: plain unigram probability. */
214 return model->lm3g.unigrams[wid].prob1.l;
216 return lm3g_bg_score(model, history[0], wid, n_used);
220 return lm3g_tg_score(model, history[1], history[0], wid, n_used);
/*
 * "Raw" score variant: computes the same n-gram score but removes
 * the word insertion penalty (log_wip) and undoes the language
 * weight (lw) scaling before returning.  The start symbol "<s>" is
 * special-cased (branch taken only when wid is not "<s>").
 * NOTE(review): the function's name line and several interior lines
 * are not visible in this excerpt.
 */
226 int32 *history, int32 n_hist,
229 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base;
/* Unigram case: strip the insertion penalty, then unscale by lw. */
237 score = model->lm3g.unigrams[wid].prob1.l - base->
log_wip;
239 score = (int32)(score / base->
lw);
241 if (strcmp(base->
word_str[wid],
"<s>") != 0) {
249 score = lm3g_bg_score(model, history[0], wid, n_used);
254 score = lm3g_tg_score(model, history[1], history[0], wid, n_used);
/* Remove insertion penalty and language-weight scaling. */
258 return (int32)((score - base->
log_wip) / base->
lw);
/*
 * Thin wrapper: add (or re-weight) the unigram `wid` by delegating to
 * the shared lm3g_add_ug() helper with this model's lm3g data.
 * NOTE(review): the function's name line is not visible in this
 * excerpt.
 */
263 int32 wid, int32 lweight)
265 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base;
266 return lm3g_add_ug(base, &model->lm3g, wid, lweight);
/*
 * Thin wrapper: discard the cached trigram information by delegating
 * to lm3g_tginfo_reset().  NOTE(review): the function's name line is
 * not visible in this excerpt.
 */
272 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base;
273 lm3g_tginfo_reset(base, &model->lm3g);
/*
 * Construct an n-gram iterator positioned at the (history..., wid)
 * entry.  With no history it points at wid's unigram; with one
 * history word it searches that word's bigram span; with two it uses
 * the tginfo cache (same lookup + move-to-front discipline as
 * lm3g_tg_score) and then searches the trigram list.
 * NOTE(review): allocation of `itor`, the failure paths, and several
 * interior lines are not visible in this excerpt.
 */
285 int32 *history, int32 n_hist)
287 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base;
290 ngram_iter_init((
ngram_iter_t *)itor, base, n_hist, FALSE);
/* n_hist == 0: position on the unigram for wid. */
294 itor->ug = model->lm3g.unigrams + wid;
297 else if (n_hist == 1) {
/* Bigram case: search history[0]'s bigram span for wid. */
300 itor->ug = model->lm3g.unigrams + history[0];
301 b = FIRST_BG(model, history[0]);
302 n = FIRST_BG(model, history[0] + 1) - b;
303 itor->bg = model->lm3g.bigrams + b;
305 if ((i = find_bg(itor->bg, n, wid)) < 0) {
312 else if (n_hist == 2) {
/* Trigram case: history[1] = w1, history[0] = w2. */
316 itor->ug = model->lm3g.unigrams + history[1];
/* Look up (w1, w2) in w2's tginfo cache list. */
318 for (tginfo = model->lm3g.tginfo[history[0]];
319 tginfo; tginfo = tginfo->
next) {
320 if (tginfo->
w1 == history[1])
322 prev_tginfo = tginfo;
/* Cache miss: build the tginfo entry. */
326 load_tginfo(model, history[1], history[0]);
327 tginfo = model->lm3g.tginfo[history[0]];
329 else if (prev_tginfo) {
/* Move-to-front relink of the found cache entry. */
331 tginfo->
next = model->lm3g.tginfo[history[0]];
332 model->lm3g.tginfo[history[0]] = tginfo;
339 itor->tg = tginfo->
tg;
340 if ((i = find_tg(itor->tg, n, wid)) >= 0) {
/* Recover the parent bigram: advance bg until its trigram span
 * covers the trigram we landed on. */
345 itor->bg = model->lm3g.bigrams;
346 while (FIRST_TG(model, (itor->bg - model->lm3g.bigrams + 1))
347 <= (itor->tg - model->lm3g.trigrams))
/*
 * Initialize an iterator over all m-grams of a given order `m`
 * (0 = unigrams, 1 = bigrams, 2 = trigrams).  Starts all three
 * pointers at the beginning of their arrays, then aligns the parent
 * bigram/unigram pointers with the first entry of the requested
 * order.  NOTE(review): the function's name line and several
 * interior lines are not visible in this excerpt.
 */
367 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base;
371 itor->ug = model->lm3g.unigrams;
372 itor->bg = model->lm3g.bigrams;
373 itor->tg = model->lm3g.trigrams;
/* For trigram iteration: advance bg until its trigram span covers
 * the first trigram. */
376 if (m > 1 && base->
n_counts[1] > 1) {
377 while (FIRST_TG(model, (itor->bg - model->lm3g.bigrams + 1))
378 <= (itor->tg - model->lm3g.trigrams))
/* For bigram/trigram iteration: advance ug until its bigram span
 * covers the current bigram. */
383 if (m > 0 && base->
n_counts[0] > 1) {
384 while (itor->ug[1].
bigrams <= (itor->bg - model->lm3g.bigrams))
/*
 * Build an iterator over the successors of the m-gram `bitor` (an
 * (m+1)-gram iterator).  The empty-span checks detect whether the
 * current unigram has no bigram successors (its bigram index equals
 * the next unigram's) or the current bigram has no trigram
 * successors (same FIRST_TG as the next bigram, or at the trigram
 * count).  NOTE(review): allocation of `itor`, the early-return
 * paths, and the DEBUG guard around the printf are not visible in
 * this excerpt.
 */
394 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)bitor->model;
/* Unigram parent with an empty bigram span (or past the end)? */
403 if (((itor->ug + 1) - model->lm3g.unigrams < bitor->model->
n_counts[0] &&
404 itor->ug->
bigrams == (itor->ug + 1)->bigrams) ||
/* Position on the unigram's first bigram successor. */
409 itor->bg = model->lm3g.bigrams + itor->ug->
bigrams;
/* Bigram parent with an empty trigram span (or past the end)? */
415 if (((itor->bg + 1) - model->lm3g.bigrams < bitor->model->
n_counts[1] &&
416 FIRST_TG (model, itor->bg - model->lm3g.bigrams) ==
417 FIRST_TG (model, (itor->bg + 1) - model->lm3g.bigrams)) ||
418 FIRST_TG (model, itor->bg - model->lm3g.bigrams) == bitor->model->
n_counts[2])
/* Position on the bigram's first trigram successor. */
422 itor->tg = (model->lm3g.trigrams
423 + FIRST_TG(model, (itor->bg - model->lm3g.bigrams)));
/* Diagnostic dump of the (w1, w2) context and first trigram. */
425 printf(
"%s %s => %d (%s)\n",
426 model->base.word_str[itor->ug - model->lm3g.unigrams],
427 model->base.word_str[itor->bg->
wid],
428 FIRST_TG(model, (itor->bg - model->lm3g.bigrams)),
429 model->base.word_str[itor->tg->
wid]);
/* Successor iterator is one order deeper than its parent. */
438 ngram_iter_init((
ngram_iter_t *)itor, bitor->model, bitor->
m + 1, TRUE);
/*
 * Read out the current iterator position: fill base->wids with the
 * word IDs of the unigram/bigram/trigram pointers that are set, and
 * return the (log) probability and backoff weight of the deepest
 * available order through *out_score / *out_bowt.
 * NOTE(review): the function's name line, the per-order dispatch,
 * and the DEBUG guard around the printf are not visible in this
 * excerpt.
 */
447 int32 *out_score, int32 *out_bowt)
449 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base->model;
/* wids[0] = unigram word ID (index into the unigram array). */
452 base->
wids[0] = itor->ug - model->lm3g.unigrams;
453 if (itor->bg) base->
wids[1] = itor->bg->
wid;
454 if (itor->tg) base->
wids[2] = itor->tg->
wid;
/* Diagnostic dump of the current word-ID triple. */
456 printf(
"itor_get: %d %d %d\n", base->
wids[0], base->
wids[1], base->
wids[2]);
/* Unigram: probability and backoff weight stored directly. */
461 *out_score = itor->ug->
prob1.l;
462 *out_bowt = itor->ug->
bo_wt1.l;
/* Bigram: both values are indices into quantized tables. */
465 *out_score = model->lm3g.prob2[itor->bg->
prob2].l;
466 if (model->lm3g.bo_wt2)
467 *out_bowt = model->lm3g.bo_wt2[itor->bg->
bo_wt2].l;
/* Trigram: probability index only (no deeper backoff). */
472 *out_score = model->lm3g.prob3[itor->tg->
prob3].l;
/*
 * Advance the iterator to the next m-gram of its order, re-syncing
 * parent pointers: after stepping a bigram, advance ug until its
 * bigram span covers the new bg; after stepping a trigram, advance
 * bg (and then ug) likewise.  Walking a parent pointer past the end
 * of its array indicates a corrupt model and is reported via
 * E_ERROR.  NOTE(review): the per-order dispatch, the pointer
 * increments, and the end-of-iteration handling are not visible in
 * this excerpt.
 */
484 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base->model;
/* Past the last unigram: iteration is finished. */
491 if (itor->ug - model->lm3g.unigrams >= base->model->
n_counts[0])
/* Past the last bigram: iteration is finished. */
497 if (itor->bg - model->lm3g.bigrams >= base->model->
n_counts[1])
/* Re-sync the unigram parent with the advanced bigram. */
501 while (itor->bg - model->lm3g.bigrams >= itor->ug[1].
bigrams) {
507 if (itor->ug == model->lm3g.unigrams + base->model->
n_counts[0]) {
508 E_ERROR(
"Bigram %d has no valid unigram parent\n",
509 itor->bg - model->lm3g.bigrams);
/* Past the last trigram: iteration is finished. */
517 if (itor->tg - model->lm3g.trigrams >= base->model->
n_counts[2])
/* Re-sync the bigram parent with the advanced trigram. */
520 while (itor->tg - model->lm3g.trigrams >=
521 FIRST_TG(model, (itor->bg - model->lm3g.bigrams + 1))) {
525 if (itor->bg == model->lm3g.bigrams + base->model->
n_counts[1]) {
526 E_ERROR(
"Trigram %d has no valid bigram parent\n",
527 itor->tg - model->lm3g.trigrams);
/* Then re-sync the unigram parent with the new bigram. */
533 while (itor->bg - model->lm3g.bigrams >= itor->ug[1].
bigrams) {
535 if (itor->ug == model->lm3g.unigrams + base->model->
n_counts[0]) {
536 E_ERROR(
"Trigram %d has no valid unigram parent\n",
537 itor->tg - model->lm3g.trigrams);
lmprob_t bo_wt1
Unigram backoff weight.
int32 log_uniform
Log of uniform (0-gram) probability.
int32 w1
lw1 component of bigram lw1,lw2.
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
char ** word_str
Unigram names.
SPHINXBASE_EXPORT void ngram_iter_free(ngram_iter_t *itor)
Terminate an M-gram iterator.
#define listelem_malloc(le)
Allocate a list element and return pointer to it.
int32 log_uniform_weight
Log of uniform weight (i.e. 1 - unigram interpolation weight).
SPHINXBASE_EXPORT int logmath_log(logmath_t *lmath, float64 p)
Convert linear floating point number to integer log in base B.
int32 * n_counts
Counts for 1, 2, 3, ... grams.
int32 used
Whether used since the last lm_reset.
uint16 prob3
Index into array of actual trigram probs.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Unigram structure (common among all lm3g implementations)
int32 bigrams
Index of 1st entry in lm_t.bigrams[].
trigram_t * tg
Trigrams for lw1,lw2.
Trigram information cache.
int32 n_tg
number tg for parent bigram lw1,lw2
uint8 n
This is an n-gram model (1, 2, 3, ...).
uint16 prob2
Index into array of actual bigram probs.
logmath_t * lmath
Log-math object.
uint16 bo_wt2
Index into array of actual bigram backoff wts.
struct tginfo_s * next
Next lw1 with same parent lw2; NULL if none.
lmprob_t prob1
Unigram probability.
#define E_ERROR
Print error message to standard error stream.
float32 lw
Language model scaling factor.
Base iterator structure for N-grams.
int32 * wids
Scratch space for word IDs.
Common implementation of ngram_model_t.
int32 bowt
tg bowt for lw1,lw2
SPHINXBASE_EXPORT float64 logmath_exp(logmath_t *lmath, int logb_p)
Convert integer log in base B to linear floating point.
uint32 wid
Index of unigram entry for this.
int16 successor
Is this a successor iterator?
int32 log_wip
Log of word insertion penalty.
uint32 wid
Index of unigram entry for this.