55 my_compare(
const void *a,
const void *b)
58 if (strcmp(*(
char *
const *)a,
"<UNK>") == 0)
60 else if (strcmp(*(
char *
const *)b,
"<UNK>") == 0)
63 return strcmp(*(
char *
const *)a, *(
char *
const *)b);
79 for (i = 0; i < set->
n_models; ++i) {
81 for (j = 0; j < models[i]->
n_words; ++j) {
90 ngram_model_init(base, &ngram_model_set_funcs, lmath, n, hash_table_inuse(vocab));
94 for (gn = hlist; gn; gn = gnode_next(gn)) {
96 base->
word_str[i++] = (
char *)ent->key;
106 for (i = 0; i < base->
n_words; ++i) {
111 for (j = 0; j < set->
n_models; ++j) {
124 const float32 *weights,
137 lmath = models[0]->
lmath;
138 for (i = 1; i < n_models; ++i) {
141 E_ERROR(
"Log-math parameters don't match, will not create LM set\n");
156 for (i = 0; i < n_models; ++i)
164 for (i = 0; i < n_models; ++i) {
165 model->
lms[i] = models[i];
170 if (models[i]->n > n)
177 build_widmap(base, lmath, n);
183 const char *lmctlfile,
189 __BIGSTACKVARIABLE__
char str[1024];
197 if ((ctlfp = fopen(lmctlfile,
"r")) == NULL) {
204 if ((c = strrchr(lmctlfile,
'/')) || (c = strrchr(lmctlfile,
'\\'))) {
207 memcpy(basedir, lmctlfile, c - lmctlfile + 1);
212 E_INFO(
"Reading LM control file '%s'\n", lmctlfile);
214 E_INFO(
"Will prepend '%s' to unqualified paths\n", basedir);
216 if (fscanf(ctlfp,
"%1023s", str) == 1) {
217 if (strcmp(str,
"{") == 0) {
219 while ((fscanf(ctlfp,
"%1023s", str) == 1)
220 && (strcmp(str,
"}") != 0)) {
226 E_INFO(
"Reading classdef from '%s'\n", deffile);
227 if (read_classdef_file(classes, deffile) < 0) {
234 if (strcmp(str,
"}") != 0) {
235 E_ERROR(
"Unexpected EOF in %s\n", lmctlfile);
240 if (fscanf(ctlfp,
"%1023s", str) != 1)
248 while (str[0] !=
'\0') {
252 if (basedir && str[0] !=
'/' && str[0] !=
'\\')
256 E_INFO(
"Reading lm from '%s'\n", lmfile);
262 if (fscanf(ctlfp,
"%1023s", str) != 1) {
263 E_ERROR(
"LMname missing after LMFileName '%s'\n", lmfile);
271 if (fscanf(ctlfp,
"%1023s", str) == 1) {
272 if (strcmp(str,
"{") == 0) {
274 while ((fscanf(ctlfp,
"%1023s", str) == 1) &&
275 (strcmp(str,
"}") != 0)) {
280 E_ERROR(
"Unknown class %s in control file\n", str);
285 classdef->words, classdef->weights,
286 classdef->n_words) < 0) {
289 E_INFO(
"Added class %s containing %d words\n",
290 str, classdef->n_words);
292 if (strcmp(str,
"}") != 0) {
293 E_ERROR(
"Unexpected EOF in %s\n", lmctlfile);
296 if (fscanf(ctlfp,
"%1023s", str) != 1)
317 lm_array =
ckd_calloc(n_models,
sizeof(*lm_array));
318 name_array =
ckd_calloc(n_models,
sizeof(*name_array));
321 for (i = 0; i < n_models; ++i) {
324 lm_node = gnode_next(lm_node);
325 name_node = gnode_next(name_node);
338 for (gn = lms; gn; gn = gnode_next(gn)) {
343 for (gn = lmnames; gn; gn = gnode_next(gn)) {
348 for (gn = hlist; gn; gn = gnode_next(gn)) {
351 classdef_free(he->
val);
373 if (set == NULL || set->
n_models == 0)
383 if (++itor->cur == itor->set->
n_models) {
400 if (lmname) *lmname = itor->set->
names[itor->cur];
401 return itor->set->
lms[itor->cur];
415 return set->
lms[set->
cur];
420 if (0 == strcmp(set->
names[i], name))
436 if (0 == strcmp(set->
names[i], name))
441 return set->
lms[set->
cur];
461 if (set->
cur == -1 || set_wid >= base->
n_words)
475 else if (set->
cur == -1) {
477 for (i = 0; i < set->
n_models; ++i) {
491 const float32 *weights)
496 if (names && weights) {
500 for (i = 0; i < set->
n_models; ++i) {
502 if (0 == strcmp(names[i], set->
names[j]))
505 E_ERROR(
"Unknown LM name %s\n", names[i]);
538 if (model->
n > base->
n) {
541 (model->
n - 1) *
sizeof(*set->
maphist));
545 fprob = weight * 1.0 / set->
n_models;
553 for (i = 0; i < set->
n_models - 1; ++i)
562 sizeof (**new_widmap));
563 for (i = 0; i < base->
n_words; ++i) {
565 memcpy(new_widmap[i], set->
widmap[i],
566 (set->
n_models - 1) *
sizeof(**new_widmap));
574 build_widmap(base, base->
lmath, base->
n);
586 int32 lmidx, scale, n, i;
589 for (lmidx = 0; lmidx < set->
n_models; ++lmidx)
590 if (0 == strcmp(name, set->
names[lmidx]))
594 submodel = set->
lms[lmidx];
606 set->
names[lmidx] = NULL;
607 for (i = 0; i < set->
n_models; ++i) {
609 set->
lms[i] = set->
lms[i+1];
614 if (set->
lms[i]->
n > n)
625 for (i = 0; i < base->
n_words; ++i) {
626 memmove(set->
widmap[i] + lmidx, set->
widmap[i] + lmidx + 1,
631 build_widmap(base, base->
lmath, n);
646 for (i = 0; i < base->
n_words; ++i) {
657 for (i = 0; i < n_words; ++i) {
661 for (j = 0; j < set->
n_models; ++j) {
668 ngram_model_set_apply_weights(
ngram_model_t *base, float32 lw,
669 float32 wip, float32 uw)
682 int32 *history, int32 n_hist,
691 if (n_hist > base->
n - 1)
692 n_hist = base->
n - 1;
695 if (set->
cur == -1) {
697 for (i = 0; i < set->
n_models; ++i) {
700 mapwid = set->
widmap[wid][i];
701 for (j = 0; j < n_hist; ++j) {
710 mapwid, set->
maphist, n_hist, n_used));
717 for (j = 0; j < n_hist; ++j) {
724 mapwid, set->
maphist, n_hist, n_used);
732 int32 *history, int32 n_hist,
741 if (n_hist > base->
n - 1)
742 n_hist = base->
n - 1;
745 if (set->
cur == -1) {
747 for (i = 0; i < set->
n_models; ++i) {
750 mapwid = set->
widmap[wid][i];
751 for (j = 0; j < n_hist; ++j) {
760 mapwid, set->
maphist, n_hist, n_used));
767 for (j = 0; j < n_hist; ++j) {
774 mapwid, set->
maphist, n_hist, n_used);
782 int32 wid, int32 lweight)
793 for (i = 0; i < set->
n_models; ++i) {
797 if (set->
cur == -1 || set->
cur == i) {
814 else if (set->
cur == -1)
829 for (i = 0; i < base->
n_words; ++i)
831 memcpy(set->
widmap[wid], newwid, set->
n_models *
sizeof(*newwid));
864 ngram_model_set_free,
865 ngram_model_set_apply_weights,
866 ngram_model_set_score,
867 ngram_model_set_raw_score,
868 ngram_model_set_add_ug,
869 ngram_model_set_flush
SPHINXBASE_EXPORT int32 ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Quick "raw" probability lookup for a general N-Gram.
char ** names
Names for language models.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_read(cmd_ln_t *config, const char *file_name, ngram_file_type_t file_type, logmath_t *lmath)
Read an N-Gram model from a file on disk.
Miscellaneous useful string functions.
SPHINXBASE_EXPORT int32 hash_table_lookup(hash_table_t *h, const char *key, void **val)
Look up a key in a hash table and optionally return the associated value.
#define ckd_calloc_2d(d1, d2, sz)
Macro for ckd_calloc_2d
ngram_model_t * ngram_model_set_interp(ngram_model_t *base, const char **names, const float32 *weights)
Set interpolation weights for a set and enable interpolation.
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
ngram_model_t * ngram_model_set_add(ngram_model_t *base, ngram_model_t *model, const char *name, float32 weight, int reuse_widmap)
Add a language model to a set.
SPHINXBASE_EXPORT int32 ngram_unknown_wid(ngram_model_t *model)
Get the unknown word ID for a language model.
#define hash_table_enter_int32(h, k, v)
Add a 32-bit integer value to a hash table.
hash_table_t * wid
Mapping of unigram names to word IDs.
char ** word_str
Unigram names.
ngram_model_t * ngram_model_set_select(ngram_model_t *base, const char *name)
Select a single language model from a set for scoring.
#define E_INFO
Print logging information to standard error stream.
Sphinx's memory allocation/deallocation routines.
int32 * lweights
Log interpolation weights.
SPHINXBASE_EXPORT int32 ngram_wid(ngram_model_t *model, const char *word)
Look up numerical word ID.
SPHINXBASE_EXPORT glist_t hash_table_tolist(hash_table_t *h, int32 *count)
Build a glist of valid hash_entry_t pointers from the given hash table.
#define NGRAM_INVALID_WID
Impossible word ID.
Operations related to file names.
int32 ** widmap
Word ID mapping for submodels.
SPHINXBASE_EXPORT int logmath_log(logmath_t *lmath, float64 p)
Convert linear floating point number to integer log in base B.
A node in a generic list.
Subclass of ngram_model for grouping language models.
uint8 writable
Are word strings writable?
SPHINXBASE_EXPORT int ngram_model_free(ngram_model_t *model)
Release memory associated with an N-Gram model.
#define ckd_salloc(ptr)
Macro for ckd_salloc
int32 n_models
Number of models in this set.
SPHINXBASE_EXPORT hash_table_t * hash_table_new(int32 size, int32 casearg)
Allocate a new hash table for a given expected size.
SPHINXBASE_EXPORT void hash_table_empty(hash_table_t *h)
Delete all entries from a hash_table.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
SPHINXBASE_EXPORT glist_t glist_add_ptr(glist_t g, void *ptr)
Create and prepend a new list node, with the given user-defined data, at the HEAD of the given generi...
int32 * maphist
Word ID mapping for N-Gram history.
int32 log_zero
Zero probability, cached here for quick lookup.
SPHINXBASE_EXPORT void hash_table_free(hash_table_t *h)
Free the specified hash table; the caller is responsible for freeing the key strings pointed to by th...
A note by ARCHAN at 20050510: Technically what we use is so-called "hash table with buckets" which is...
int32 n_1g_alloc
Number of allocated word strings (for new word addition)
SPHINXBASE_EXPORT glist_t glist_reverse(glist_t g)
Reverse the order of the given glist.
Iterator over a model set.
SPHINXBASE_EXPORT int logmath_get_shift(logmath_t *lmath)
Get the shift of the values in a log table.
SPHINXBASE_EXPORT int ngram_model_apply_weights(ngram_model_t *model, float32 lw, float32 wip, float32 uw)
Apply a language weight, insertion penalty, and unigram weight to a language model.
ngram_model_t base
Base ngram_model_t structure.
SPHINXBASE_EXPORT void glist_free(glist_t g)
Free the given generic list; user-defined data contained within is not automatically freed...
int32 cur
Currently selected model, or -1 for none.
SPHINXBASE_EXPORT float64 logmath_get_base(logmath_t *lmath)
Get the log base.
ngram_model_set_iter_t * ngram_model_set_iter(ngram_model_t *base)
Begin iterating over language models in a set.
SPHINXBASE_EXPORT int path_is_absolute(const char *file)
Test whether a pathname is absolute for the current OS.
ngram_model_t * ngram_model_set_init(cmd_ln_t *config, ngram_model_t **models, char **names, const float32 *weights, int32 n_models)
Create a set of language models sharing a common space of word IDs.
#define gnode_ptr(g)
Head of a list of gnodes.
SPHINXBASE_EXPORT int32 ngram_model_add_word(ngram_model_t *model, const char *word, float32 weight)
Add a word (unigram) to the language model.
uint8 n
This is an n-gram model (1, 2, 3, ...).
Implementation of logging routines.
logmath_t * lmath
Log-math object.
ngram_model_t ** lms
Language models in this set.
ngram_model_t * ngram_model_set_lookup(ngram_model_t *base, const char *name)
Look up a language model by name from a set.
const char * ngram_model_set_current(ngram_model_t *base)
Get the current language model name, if any.
SPHINXBASE_EXPORT int32 ngram_model_add_class(ngram_model_t *model, const char *classname, float32 classweight, char **words, const float32 *weights, int32 n_words)
Add a new class to a language model.
ngram_model_t * ngram_model_set_iter_model(ngram_model_set_iter_t *itor, char const **lmname)
Get language model and associated name from an iterator.
One class definition from a classdef file.
void ngram_model_set_map_words(ngram_model_t *base, const char **words, int32 n_words)
Set the word-to-ID mapping for this model set.
SPHINXBASE_EXPORT void ngram_model_flush(ngram_model_t *lm)
Flush any cached N-Gram information.
Opaque structure used to hold the results of command-line parsing.
SPHINXBASE_EXPORT char * string_join(const char *base,...)
Concatenate a NULL-terminated argument list of strings, returning a newly allocated string...
#define E_ERROR
Print error message to standard error stream.
Implementation-specific functions for operating on ngram_model_t objects.
SPHINXBASE_EXPORT void ckd_free_2d(void *ptr)
Free a 2-D array (ptr) previously allocated by ckd_calloc_2d.
int32 ngram_model_set_count(ngram_model_t *base)
Returns the number of language models in a set.
int32 ngram_model_set_known_wid(ngram_model_t *base, int32 set_wid)
Test whether a word ID corresponds to a known word in the current state of the language model set...
void ngram_model_set_iter_free(ngram_model_set_iter_t *itor)
Finish iteration over a language model set.
ngram_model_set_iter_t * ngram_model_set_iter_next(ngram_model_set_iter_t *itor)
Move to the next language model in a set.
Common implementation of ngram_model_t.
void * val
Key-length; the key string does not have to be a C-style NULL terminated string; it can have arbitrar...
#define E_ERROR_SYSTEM
Print error text, then call perror("").
ngram_model_t * ngram_model_set_read(cmd_ln_t *config, const char *lmctlfile, logmath_t *lmath)
Read a set of language models from a control file.
SPHINXBASE_EXPORT float64 logmath_exp(logmath_t *lmath, int logb_p)
Convert integer log in base B to linear floating point.
#define ckd_realloc(ptr, sz)
Macro for ckd_realloc
int32 ngram_model_set_current_wid(ngram_model_t *base, int32 set_wid)
Query the word-ID mapping for the current language model.
SPHINXBASE_EXPORT int32 glist_count(glist_t g)
Count the number of elements in a given linked list.
Determine file type automatically.
SPHINXBASE_EXPORT int32 ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Quick general N-Gram score lookup.
ngram_model_t * ngram_model_set_remove(ngram_model_t *base, const char *name, int reuse_widmap)
Remove a language model from a set.
SPHINXBASE_EXPORT int logmath_add(logmath_t *lmath, int logb_p, int logb_q)
Add two values in log space (i.e.
int32 n_words
Number of actual word strings (NOT the same as the number of unigrams, due to class words)...