43 #ifndef __NGRAM_MODEL_H__
44 #define __NGRAM_MODEL_H__
49 #include <sphinxbase/sphinxbase_export.h>
84 #define NGRAM_INVALID_WID -1
108 const char *file_name,
211 float32 lw, float32 wip, float32 uw);
265 int32 w3, int32 w2, int32 w1,
281 int32 n_hist, int32 *n_used);
304 int32 n_hist, int32 *n_used);
441 const char *word, float32 weight);
458 const char *file_name);
470 const char *classname,
473 const float32 *weights,
487 const char *classname,
519 const float32 *weights,
554 const char *lmctlfile,
599 char const **lmname);
637 const float32 *weights);
SPHINXBASE_EXPORT int32 ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Quick "raw" probability lookup for a general N-Gram.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_init(cmd_ln_t *config, ngram_model_t **models, char **names, const float32 *weights, int32 n_models)
Create a set of language models sharing a common space of word IDs.
Command-line and other configuration parsing and handling.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_read(cmd_ln_t *config, const char *file_name, ngram_file_type_t file_type, logmath_t *lmath)
Read an N-Gram model from a file on disk.
SPHINXBASE_EXPORT int ngram_model_write(ngram_model_t *model, const char *file_name, ngram_file_type_t format)
Write an N-Gram model to disk.
SPHINXBASE_EXPORT int ngram_model_casefold(ngram_model_t *model, int kase)
Case-fold word strings in an N-Gram model.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_lookup(ngram_model_t *set, const char *name)
Look up a language model by name from a set.
SPHINXBASE_EXPORT int32 ngram_unknown_wid(ngram_model_t *model)
Get the unknown word ID for a language model.
SPHINXBASE_EXPORT void ngram_iter_free(ngram_iter_t *itor)
Terminate an M-gram iterator.
SPHINXBASE_EXPORT int32 const * ngram_model_get_counts(ngram_model_t *model)
Get the counts of the various N-grams in the model.
SPHINXBASE_EXPORT ngram_model_set_iter_t * ngram_model_set_iter_next(ngram_model_set_iter_t *itor)
Move to the next language model in a set.
SPHINXBASE_EXPORT int32 ngram_wid(ngram_model_t *model, const char *word)
Look up numerical word ID.
SPHINXBASE_EXPORT int32 const * ngram_iter_get(ngram_iter_t *itor, int32 *out_score, int32 *out_bowt)
Get information from the current M-gram in an iterator.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_next(ngram_iter_t *itor)
Advance an M-gram iterator.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_read(cmd_ln_t *config, const char *lmctlfile, logmath_t *lmath)
Read a set of language models from a control file.
SPHINXBASE_EXPORT int32 ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used)
Quick trigram score lookup.
SPHINXBASE_EXPORT int ngram_model_free(ngram_model_t *model)
Release memory associated with an N-Gram model.
Basic type definitions used in Sphinx.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_successors(ngram_iter_t *itor)
Iterate over all M-gram successors of an M-1-gram.
SPHINXBASE_EXPORT int32 ngram_model_set_count(ngram_model_t *set)
Returns the number of language models in a set.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_add(ngram_model_t *set, ngram_model_t *model, const char *name, float32 weight, int reuse_widmap)
Add a language model to a set.
ARPABO text format (the standard).
SPHINXBASE_EXPORT int32 ngram_prob(ngram_model_t *model, const char *word,...)
Get the "raw" log-probability for a general N-Gram.
Iterator over a model set.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_remove(ngram_model_t *set, const char *name, int reuse_widmap)
Remove a language model from a set.
SPHINXBASE_EXPORT int ngram_model_apply_weights(ngram_model_t *model, float32 lw, float32 wip, float32 uw)
Apply a language weight, insertion penalty, and unigram weight to a language model.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_retain(ngram_model_t *model)
Retain ownership of an N-Gram model.
SPHINXBASE_EXPORT int32 ngram_score(ngram_model_t *model, const char *word,...)
Get the score (scaled, interpolated log-probability) for a general N-Gram.
SPHINXBASE_EXPORT int32 ngram_model_get_size(ngram_model_t *model)
Get the order of the N-gram model (i.e. the "N" in "N-gram").
SPHINXBASE_EXPORT char const * ngram_type_to_str(int type)
Get the canonical name for an N-Gram file type.
SPHINXBASE_EXPORT void ngram_model_set_map_words(ngram_model_t *set, const char **words, int32 n_words)
Set the word-to-ID mapping for this model set.
SPHINXBASE_EXPORT int32 ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used)
Quick bigram score lookup.
SPHINXBASE_EXPORT int32 ngram_model_add_word(ngram_model_t *model, const char *word, float32 weight)
Add a word (unigram) to the language model.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_iter_model(ngram_model_set_iter_t *itor, char const **lmname)
Get language model and associated name from an iterator.
SPHINXBASE_EXPORT float32 ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip, int32 *out_log_uw)
Get the current weights from a language model.
SPHINXBASE_EXPORT int32 ngram_model_add_class(ngram_model_t *model, const char *classname, float32 classweight, char **words, const float32 *weights, int32 n_words)
Add a new class to a language model.
SPHINXBASE_EXPORT ngram_file_type_t ngram_str_to_type(const char *str_name)
Get the N-Gram file type from a string.
SPHINXBASE_EXPORT ngram_iter_t * ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
Get an iterator over M-grams pointing to the specified M-gram.
SPHINXBASE_EXPORT int32 ngram_model_read_classdef(ngram_model_t *model, const char *file_name)
Read a class definition file and add classes to a language model.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_interp(ngram_model_t *set, const char **names, const float32 *weights)
Set interpolation weights for a set and enable interpolation.
SPHINXBASE_EXPORT ngram_model_set_iter_t * ngram_model_set_iter(ngram_model_t *set)
Begin iterating over language models in a set.
SPHINXBASE_EXPORT void ngram_model_flush(ngram_model_t *lm)
Flush any cached N-Gram information.
Opaque structure used to hold the results of command-line parsing.
enum ngram_file_type_e ngram_file_type_t
File types for N-Gram files.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter(ngram_model_t *model, const char *word,...)
Get an iterator over M-grams pointing to the specified M-gram.
ngram_file_type_e
File types for N-Gram files.
SPHINXBASE_EXPORT void ngram_model_set_iter_free(ngram_model_set_iter_t *itor)
Finish iteration over a language model set.
Sphinx .DMP32 format (NOT SUPPORTED)
SPHINXBASE_EXPORT ngram_file_type_t ngram_file_name_to_type(const char *file_name)
Guess the file type for an N-Gram model from the filename.
SPHINXBASE_EXPORT const char * ngram_model_set_current(ngram_model_t *set)
Get the current language model name, if any.
Base iterator structure for N-grams.
Implementation of ngram_class_t.
ngram_case_e
Constants for case folding.
Common implementation of ngram_model_t.
SPHINXBASE_EXPORT int32 ngram_score_to_prob(ngram_model_t *model, int32 score)
Convert score to "raw" log-probability.
SPHINXBASE_EXPORT const char * ngram_word(ngram_model_t *model, int32 wid)
Look up word string for numerical word ID.
SPHINXBASE_EXPORT int32 ngram_zero(ngram_model_t *model)
Get the "zero" log-probability value for a language model.
Memory-mapped I/O wrappers for files.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_select(ngram_model_t *set, const char *name)
Select a single language model from a set for scoring.
Fast integer logarithmic addition operations.
SPHINXBASE_EXPORT int32 ngram_model_set_current_wid(ngram_model_t *set, int32 set_wid)
Query the word-ID mapping for the current language model.
enum ngram_case_e ngram_case_t
Constants for case folding.
SPHINXBASE_EXPORT int32 ngram_model_add_class_word(ngram_model_t *model, const char *classname, const char *word, float32 weight)
Add a word to a class in a language model.
SPHINXBASE_EXPORT ngram_iter_t * ngram_model_mgrams(ngram_model_t *model, int m)
Iterate over all M-grams.
SPHINXBASE_EXPORT int32 ngram_model_set_known_wid(ngram_model_t *set, int32 set_wid)
Test whether a word ID corresponds to a known word in the current state of the language model set...
Determine file type automatically.
SPHINXBASE_EXPORT int32 ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Quick general N-Gram score lookup.
SPHINXBASE_EXPORT int ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
Re-encode word strings in an N-Gram model.