61 #include "ngram_model_internal.h"
68 ext = strrchr(file_name,
'.');
73 while (--ext >= file_name) {
74 if (*ext ==
'.')
break;
76 if (ext < file_name) {
81 while (--ext >= file_name) {
82 if (*ext ==
'.')
break;
84 if (ext < file_name) {
122 const char *file_name,
130 if ((model = ngram_model_arpa_read(config, file_name, lmath)) != NULL)
132 if ((model = ngram_model_dmp_read(config, file_name, lmath)) != NULL)
137 model = ngram_model_arpa_read(config, file_name, lmath);
140 model = ngram_model_dmp_read(config, file_name, lmath);
143 E_ERROR(
"language model file type not supported\n");
154 lw = cmd_ln_float32_r(config,
"-lw");
156 wip = cmd_ln_float32_r(config,
"-wip");
158 uw = cmd_ln_float32_r(config,
"-uw");
179 return ngram_model_arpa_write(model, file_name);
181 return ngram_model_dmp_write(model, file_name);
183 E_ERROR(
"language model file type not supported\n");
186 E_ERROR(
"language model file type not supported\n");
194 int32 n, int32 n_unigram)
203 if (base->
lmath != lmath) {
218 for (i = 0; i < base->
n_words; ++i) {
266 for (i = 0; i < model->
n_words; ++i) {
277 for (j = 0; j < lmclass->
n_words; ++j) {
280 for (j = 0; j < lmclass->
n_hash; ++j) {
281 if (lmclass->nword_hash[j].
wid != -1) {
288 ngram_class_free(model->
classes[i]);
312 for (i = 0; i < model->
n_words; ++i) {
321 if (outstr[0] ==
'<' || outstr[0] ==
'[') {
340 E_WARN(
"Duplicate word in dictionary after conversion: %s\n",
346 model->
wid = new_wid;
362 if ((ic = iconv_open(to, from)) == (iconv_t)-1) {
374 for (i = 0; i < model->
n_words; ++i) {
375 if (strlen(model->
word_str[i]) > maxlen)
376 maxlen = strlen(model->
word_str[i]);
383 maxlen = maxlen *
sizeof(int) + 15;
388 for (i = 0; i < model->
n_words; ++i) {
389 ICONV_CONST
char *in;
391 size_t inleft, outleft, result;
394 in = (ICONV_CONST
char *)model->
word_str[i];
400 while ((result = iconv(ic, &in, &inleft, &out, &outleft)) == (size_t)-1) {
401 if (errno != E2BIG) {
410 iconv(ic, NULL, NULL, NULL, NULL);
415 in = (ICONV_CONST
char *)model->
word_str[i];
420 if ((result = iconv(ic, NULL, NULL, &out, &outleft)) == (size_t)-1) {
421 if (errno != E2BIG) {
430 iconv(ic, NULL, NULL, NULL, NULL);
435 goto start_conversion;
438 result = maxlen - outleft;
450 memcpy(model->
word_str[i], outbuf, result);
456 E_WARN(
"Duplicate word in dictionary after conversion: %s\n",
464 model->
wid = new_wid;
478 float32 lw, float32 wip, float32 uw)
487 if (out_log_wip) *out_log_wip = model->
log_wip;
488 if (out_log_uw) *out_log_uw = model->
log_uw;
495 int32 n_hist, int32 *n_used)
497 int32 score, class_weight = 0;
505 if (NGRAM_IS_CLASSWID(wid)) {
508 class_weight = ngram_class_prob(lmclass, wid);
509 if (class_weight == 1)
513 for (i = 0; i < n_hist; ++i) {
517 score = (*model->
funcs->
score)(model, wid, history, n_hist, n_used);
520 return score + class_weight;
533 va_start(history, word);
535 while ((hword = va_arg(history,
const char *)) != NULL)
540 va_start(history, word);
542 while ((hword = va_arg(history,
const char *)) != NULL) {
543 histid[n_hist] =
ngram_wid(model, hword);
549 histid, n_hist, &n_used);
571 int32 n_hist, int32 *n_used)
573 int32 prob, class_weight = 0;
581 if (NGRAM_IS_CLASSWID(wid)) {
584 class_weight = ngram_class_prob(lmclass, wid);
585 if (class_weight == 1)
589 for (i = 0; i < n_hist; ++i) {
596 return prob + class_weight;
609 va_start(history, word);
611 while ((hword = va_arg(history,
const char *)) != NULL)
616 va_start(history, word);
618 while ((hword = va_arg(history,
const char *)) != NULL) {
619 histid[n_hist] =
ngram_wid(model, hword);
625 histid, n_hist, &n_used);
638 prob = (int32)(prob / base->
lw);
680 int m,
int successor)
710 va_start(history, word);
712 while ((hword = va_arg(history,
const char *)) != NULL)
717 va_start(history, word);
719 while ((hword = va_arg(history,
const char *)) != NULL) {
720 histid[n_hist] =
ngram_wid(model, hword);
733 if (n_hist >= model->
n)
737 return (*model->
funcs->
iter)(model, wid, history, n_hist);
744 if (itor->
m == itor->model->
n - 1)
754 return (*itor->model->
funcs->
iter_get)(itor, out_score, out_bowt);
785 wid = NGRAM_BASEWID(wid);
805 wid = NGRAM_CLASSWID(wid, classid);
809 E_ERROR(
"Duplicate definition of word %s\n", word);
823 E_ERROR(
"Hash insertion failed for word %s => %p (should not happen)\n",
833 const char *word, float32 weight)
839 E_WARN(
"Can't add word '%s' to read-only language model. "
840 "Disable mmap with '-mmap no' to make it writable\n", word);
844 wid = ngram_add_word_internal(model, word, -1);
871 lmclass->nword_hash = NULL;
874 for (gn = classwords; gn; gn = gnode_next(gn)) {
875 tprob += gnode_float32(gn);
877 if (tprob > 1.1 || tprob < 0.9) {
878 E_WARN(
"Total class probability is %f, will normalize\n", tprob);
879 for (gn = classwords; gn; gn = gnode_next(gn)) {
880 gn->data.fl /= tprob;
883 for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
891 ngram_class_add_word(
ngram_class_t *lmclass, int32 wid, int32 lweight)
895 if (lmclass->nword_hash == NULL) {
897 lmclass->nword_hash =
ckd_malloc(NGRAM_HASH_SIZE *
sizeof(*lmclass->nword_hash));
898 memset(lmclass->nword_hash, 0xff, NGRAM_HASH_SIZE *
sizeof(*lmclass->nword_hash));
899 lmclass->
n_hash = NGRAM_HASH_SIZE;
905 hash = wid & (lmclass->
n_hash - 1);
906 if (lmclass->nword_hash[hash].
wid == -1) {
908 lmclass->nword_hash[hash].
wid = wid;
909 lmclass->nword_hash[hash].
prob1 = lweight;
916 while (lmclass->nword_hash[hash].
next != -1)
917 hash = lmclass->nword_hash[hash].
next;
922 lmclass->nword_hash =
ckd_realloc(lmclass->nword_hash,
923 lmclass->
n_hash * 2 *
sizeof(*lmclass->nword_hash));
924 memset(lmclass->nword_hash + lmclass->
n_hash,
925 0xff, lmclass->
n_hash *
sizeof(*lmclass->nword_hash));
932 for (next = 0; next < lmclass->
n_hash; ++next)
933 if (lmclass->nword_hash[next].
wid == -1)
936 assert(next != lmclass->
n_hash);
938 lmclass->nword_hash[next].
wid = wid;
939 lmclass->nword_hash[next].
prob1 = lweight;
940 lmclass->nword_hash[hash].
next = next;
956 const char *classname,
961 int32 classid, tag_wid, wid, i, scale;
969 E_ERROR(
"No such word or class tag: %s\n", classname);
972 for (classid = 0; classid < model->
n_classes; ++classid) {
978 E_ERROR(
"Word %s is not a class tag (call ngram_model_add_class() first)\n", classname);
981 lmclass = model->
classes[classid];
984 wid = ngram_add_word_internal(model, word, classid);
994 for (i = 0; i < lmclass->
n_words; ++i)
995 lmclass->
prob1[i] += scale;
996 for (i = 0; i < lmclass->
n_hash; ++i)
997 if (lmclass->nword_hash[i].
wid != -1)
998 lmclass->nword_hash[i].
prob1 += scale;
1001 return ngram_class_add_word(lmclass, wid,
logmath_log(model->
lmath, fprob));
1006 const char *classname,
1007 float32 classweight,
1009 const float32 *weights,
1014 int32 i, start_wid = -1;
1015 int32 classid, tag_wid;
1025 E_ERROR(
"Number of classes cannot exceed 128 (sorry)\n");
1029 for (i = 0; i < n_words; ++i) {
1032 wid = ngram_add_word_internal(model, words[i], classid);
1035 if (start_wid == -1)
1036 start_wid = NGRAM_BASEWID(wid);
1040 lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
1042 if (lmclass == NULL)
1051 model->
classes[classid] = lmclass;
1058 int32 base_wid = NGRAM_BASEWID(wid);
1060 if (base_wid < lmclass->start_wid
1065 hash = wid & (lmclass->
n_hash - 1);
1066 while (hash != -1 && lmclass->nword_hash[hash].
wid != wid)
1067 hash = lmclass->nword_hash[hash].
next;
1070 return lmclass->nword_hash[hash].
prob1;
1078 read_classdef_file(
hash_table_t *classes,
const char *file_name)
1087 char *classname = NULL;
1089 if ((fp =
fopen_comp(file_name,
"r", &is_pipe)) == NULL) {
1090 E_ERROR(
"File %s not found\n", file_name);
1100 if (fgets(line,
sizeof(line), fp) == NULL)
1109 if (n_words == 2 && 0 == strcmp(wptr[0],
"END")) {
1114 if (classname == NULL || 0 != strcmp(wptr[1], classname))
1123 classdef->words =
ckd_calloc(classdef->n_words,
1124 sizeof(*classdef->words));
1125 classdef->weights =
ckd_calloc(classdef->n_words,
1126 sizeof(*classdef->weights));
1128 weight = classprobs;
1129 for (i = 0; i < classdef->n_words; ++i) {
1131 classdef->weights[i] = gnode_float32(weight);
1132 word = gnode_next(word);
1133 weight = gnode_next(weight);
1138 classdef_free(classdef);
1153 fprob = (float32)
atof_c(wptr[1]);
1163 if (n_words == 2 && 0 == strcmp(wptr[0],
"LMCLASS")) {
1177 for (gn = classwords; gn; gn = gnode_next(gn))
1190 for (i = 0; i < classdef->n_words; ++i)
1200 const char *file_name)
1208 if (read_classdef_file(classes, file_name) < 0) {
1215 for (gn = hl; gn; gn = gnode_next(gn)) {
1222 classdef->n_words) < 0)
1228 for (gn = hl; gn; gn = gnode_next(gn)) {
1231 classdef_free(he->
val);
SPHINXBASE_EXPORT int32 ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Quick "raw" probability lookup for a general N-Gram.
struct ngram_funcs_s * funcs
Implementation-specific methods.
SPHINXBASE_EXPORT int32 hash_table_lookup_int32(hash_table_t *h, const char *key, int32 *val)
Look up a 32-bit integer value in a hash table.
int32 next
Index of next bucket (or -1 for no collision)
SPHINXBASE_EXPORT glist_t glist_add_float32(glist_t g, float32 val)
Create and prepend a new list node containing a single-precision float.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_read(cmd_ln_t *config, const char *file_name, ngram_file_type_t file_type, logmath_t *lmath)
Read an N-Gram model from a file on disk.
Miscellaneous useful string functions.
SPHINXBASE_EXPORT int32 hash_table_lookup(hash_table_t *h, const char *key, void **val)
Look up a key in a hash table and optionally return the associated value.
SPHINXBASE_EXPORT int ngram_model_write(ngram_model_t *model, const char *file_name, ngram_file_type_t format)
Write an N-Gram model to disk.
SPHINXBASE_EXPORT int ngram_model_casefold(ngram_model_t *model, int kase)
Case-fold word strings in an N-Gram model.
int32 log_uniform
Log of uniform (0-gram) probability.
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
SPHINXBASE_EXPORT int32 ngram_unknown_wid(ngram_model_t *model)
Get the unknown word ID for a language model.
#define hash_table_enter_int32(h, k, v)
Add a 32-bit integer value to a hash table.
hash_table_t * wid
Mapping of unigram names to word IDs.
char ** word_str
Unigram names.
SPHINXBASE_EXPORT void ngram_iter_free(ngram_iter_t *itor)
Terminate an M-gram iterator.
SPHINXBASE_EXPORT int32 const * ngram_model_get_counts(ngram_model_t *model)
Get the counts of the various N-grams in the model.
ngram_iter_t *(* iter_next)(ngram_iter_t *itor)
Implementation-specific function for iterating.
Sphinx's memory allocation/deallocation routines.
int32 log_uniform_weight
Log of uniform weight (i.e. 1 - unigram weight).
SPHINXBASE_EXPORT int32 ngram_wid(ngram_model_t *model, const char *word)
Look up numerical word ID.
SPHINXBASE_EXPORT int32 const * ngram_iter_get(ngram_iter_t *itor, int32 *out_score, int32 *out_bowt)
Get information from the current M-gram in an iterator.
int32 n_hash
Number of buckets in nword_hash (power of 2)
SPHINXBASE_EXPORT glist_t hash_table_tolist(hash_table_t *h, int32 *count)
Build a glist of valid hash_entry_t pointers from the given hash table.
void(* free)(ngram_model_t *model)
Implementation-specific function for freeing an ngram_model_t.
#define NGRAM_INVALID_WID
Impossible word ID.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_next(ngram_iter_t *itor)
Advance an M-gram iterator.
SPHINXBASE_EXPORT int cmd_ln_exists_r(cmd_ln_t *cmdln, char const *name)
Re-entrant version of cmd_ln_exists().
File name related operations.
int(* apply_weights)(ngram_model_t *model, float32 lw, float32 wip, float32 uw)
Implementation-specific function for applying language model weights.
SPHINXBASE_EXPORT int logmath_log(logmath_t *lmath, float64 p)
Convert linear floating point number to integer log in base B.
SPHINXBASE_EXPORT int32 ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used)
Quick trigram score lookup.
A node in a generic list.
uint8 writable
Are word strings writable?
ngram_iter_t *(* iter)(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
Implementation-specific function for iterating.
int32 * n_counts
Counts for 1, 2, 3, ... grams.
SPHINXBASE_EXPORT int ngram_model_free(ngram_model_t *model)
Release memory associated with an N-Gram model.
#define ckd_salloc(ptr)
Macro for ckd_salloc
SPHINXBASE_EXPORT hash_table_t * hash_table_new(int32 size, int32 casearg)
Allocate a new hash table for a given expected size.
SPHINXBASE_EXPORT void hash_table_empty(hash_table_t *h)
Delete all entries from a hash_table.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_successors(ngram_iter_t *itor)
Iterate over all M-gram successors of an M-1-gram.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
SPHINXBASE_EXPORT glist_t glist_add_ptr(glist_t g, void *ptr)
Create and prepend a new list node, with the given user-defined data, at the HEAD of the given generic list.
int32 n_words
Number of base words for this class.
#define E_WARN
Print warning information to standard error stream.
SPHINXBASE_EXPORT int32 strcmp_nocase(const char *str1, const char *str2)
(FIXME! The implementation is incorrect!) Case-insensitive string compare.
int32 log_zero
Zero probability, cached here for quick lookup.
SPHINXBASE_EXPORT void hash_table_free(hash_table_t *h)
Free the specified hash table; the caller is responsible for freeing the key strings pointed to by the table entries.
int refcount
Reference count.
A note by ARCHAN at 20050510: Technically what we use is so-called "hash table with buckets" which is...
int32 n_1g_alloc
Number of allocated word strings (for new word addition)
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
SPHINXBASE_EXPORT int32 strncmp_nocase(const char *str1, const char *str2, size_t len)
Like strcmp_nocase() but with a maximum length.
int32(* raw_score)(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Implementation-specific function for querying raw language model probability.
ARPABO text format (the standard).
SPHINXBASE_EXPORT glist_t glist_reverse(glist_t g)
Reverse the order of the given glist.
SPHINXBASE_EXPORT int32 ngram_prob(ngram_model_t *model, const char *word,...)
Get the "raw" log-probability for a general N-Gram.
SPHINXBASE_EXPORT int ngram_model_apply_weights(ngram_model_t *model, float32 lw, float32 wip, float32 uw)
Apply a language weight, insertion penalty, and unigram weight to a language model.
int32 tag_wid
Base word ID for this class tag.
SPHINXBASE_EXPORT void glist_free(glist_t g)
Free the given generic list; user-defined data contained within is not automatically freed...
SPHINXBASE_EXPORT ngram_model_t * ngram_model_retain(ngram_model_t *model)
Retain ownership of an N-Gram model.
SPHINXBASE_EXPORT int32 ngram_score(ngram_model_t *model, const char *word,...)
Get the score (scaled, interpolated log-probability) for a general N-Gram.
#define gnode_ptr(g)
Head of a list of gnodes.
SPHINXBASE_EXPORT int32 ngram_model_get_size(ngram_model_t *model)
Get the order of the N-gram model (i.e. the "N" in "N-gram").
int32 n_hash_inuse
Number of words in nword_hash.
SPHINXBASE_EXPORT char const * ngram_type_to_str(int type)
Get the canonical name for an N-Gram file type.
SPHINXBASE_EXPORT int32 ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used)
Quick bigram score lookup.
SPHINXBASE_EXPORT int32 ngram_model_add_word(ngram_model_t *model, const char *word, float32 weight)
Add a word (unigram) to the language model.
uint8 n
This is an n-gram model (1, 2, 3, ...).
Implementation of logging routines.
logmath_t * lmath
Log-math object.
SPHINXBASE_EXPORT float32 ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip, int32 *out_log_uw)
Get the current weights from a language model.
SPHINXBASE_EXPORT int32 ngram_model_add_class(ngram_model_t *model, const char *classname, float32 classweight, char **words, const float32 *weights, int32 n_words)
Add a new class to a language model.
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
SPHINXBASE_EXPORT FILE * fopen_comp(const char *file, const char *mode, int32 *ispipe)
Like fopen, but use popen and zcat if it is determined that "file" is compressed (i.e., has a .z, .Z, .gz, or .GZ extension).
One class definition from a classdef file.
SPHINXBASE_EXPORT ngram_file_type_t ngram_str_to_type(const char *str_name)
Get the N-Gram file type from a string.
int32 log_uw
Log of unigram weight.
int32 start_wid
Starting base word ID for this class' words.
SPHINXBASE_EXPORT ngram_iter_t * ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
Get an iterator over M-grams pointing to the specified M-gram.
SPHINXBASE_EXPORT int32 ngram_model_read_classdef(ngram_model_t *model, const char *file_name)
Read a class definition file and add classes to a language model.
SPHINXBASE_EXPORT int logmath_get_zero(logmath_t *lmath)
Get the smallest possible value represented in this base.
int32(* add_ug)(ngram_model_t *model, int32 wid, int32 lweight)
Implementation-specific function for adding unigrams.
SPHINXBASE_EXPORT void ngram_model_flush(ngram_model_t *lm)
Flush any cached N-Gram information.
uint8 n_classes
Number of classes (maximum 128)
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Opaque structure used to hold the results of command-line parsing.
#define ckd_malloc(sz)
Macro for ckd_malloc
enum ngram_file_type_e ngram_file_type_t
File types for N-Gram files.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter(ngram_model_t *model, const char *word,...)
Get an iterator over M-grams pointing to the specified M-gram.
#define E_ERROR
Print error message to standard error stream.
void(* iter_free)(ngram_iter_t *itor)
Implementation-specific function for iterating.
Implementation-specific functions for operating on ngram_model_t objects.
SPHINXBASE_EXPORT ngram_file_type_t ngram_file_name_to_type(const char *file_name)
Guess the file type for an N-Gram model from the filename.
float32 lw
Language model scaling factor.
Base iterator structure for N-grams.
int32 const *(* iter_get)(ngram_iter_t *itor, int32 *out_score, int32 *out_bowt)
Implementation-specific function for iterating.
Implementation of ngram_class_t.
ngram_iter_t *(* successors)(ngram_iter_t *itor)
Implementation-specific function for iterating.
SPHINXBASE_EXPORT void lcase(char *str)
Convert str to all lower case.
int32 prob1
Probability for this word.
int32 * wids
Scratch space for word IDs.
void(* flush)(ngram_model_t *model)
Implementation-specific function for purging N-Gram cache.
SPHINXBASE_EXPORT void ucase(char *str)
Convert str to all upper case.
Common implementation of ngram_model_t.
void * val
Key-length; the key string does not have to be a C-style NULL terminated string; it can have arbitrar...
int32 wid
Word ID of this bucket.
SPHINXBASE_EXPORT int32 ngram_score_to_prob(ngram_model_t *model, int32 score)
Convert score to "raw" log-probability.
SPHINXBASE_EXPORT const char * ngram_word(ngram_model_t *model, int32 wid)
Look up word string for numerical word ID.
SPHINXBASE_EXPORT void fclose_comp(FILE *fp, int32 ispipe)
Close a file opened using fopen_comp.
int32(* score)(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Implementation-specific function for querying language model score.
SPHINXBASE_EXPORT int32 ngram_zero(ngram_model_t *model)
Get the "zero" log-probability value for a language model.
#define E_ERROR_SYSTEM
Print error text, then call perror("").
Fast integer logarithmic addition operations.
struct ngram_class_s ** classes
Word class definitions.
#define ckd_realloc(ptr, sz)
Macro for ckd_realloc
int16 successor
Is this a successor iterator?
SPHINXBASE_EXPORT int32 ngram_model_add_class_word(ngram_model_t *model, const char *classname, const char *word, float32 weight)
Add a word to a class in a language model.
SPHINXBASE_EXPORT ngram_iter_t * ngram_model_mgrams(ngram_model_t *model, int m)
Iterate over all M-grams.
ngram_iter_t *(* mgrams)(ngram_model_t *model, int32 m)
Implementation-specific function for iterating.
SPHINXBASE_EXPORT int32 glist_count(glist_t g)
Count the number of elements in a given linked list.
Determine file type automatically.
SPHINXBASE_EXPORT int32 ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Quick general N-Gram score lookup.
File I/O related operations.
Locale-independent implementation of case swapping operation.
int32 * prob1
Probability table for base words.
int32 log_wip
Log of word insertion penalty.
SPHINXBASE_EXPORT int ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
Re-encode word strings in an N-Gram model.
int32 n_words
Number of actual word strings (NOT the same as the number of unigrams, due to class words)...