|
| language_model (const std::string &config_file) |
| Creates an N-gram language model based on the corpus specified in the config file.
|
|
| language_model (const std::string &config_file, size_t n) |
| Creates an N-gram language model based on the corpus specified in the config file. More...
|
|
std::string | generate (unsigned int seed) const |
| Randomly generates one token sequence based on <s> and </s> symbols. More...
|
|
std::string | next_token (const std::deque< std::string > &tokens, double random) const |
|
double | perplexity (const std::string &tokens) const |
|
double | perplexity_per_word (const std::string &tokens) const |
|
double | prob (std::deque< std::string > tokens) const |
|
|
void | learn_model (const std::string &config_file) |
| Builds the probabilities associated with this language model. More...
|
|
std::string | make_string (const std::deque< std::string > &tokens) const |
|
std::deque< std::string > | make_deque (const std::string &tokens) const |
|
|
std::unique_ptr< language_model > | interp_ |
| The language_model used to interpolate with this one for smoothing.
|
|
std::unordered_map< std::string, std::unordered_map< std::string, double > > | dist_ |
| Contains the N-gram distribution probabilities (N-1 words -> (w, prob))
|
|
size_t | N_ |
| The value of N in this N-gram language model.
|
|
|
static constexpr double | lambda_ = 0.7 |
| The interpolation coefficient for smoothing LM probabilities.
|
|
meta::lm::language_model::language_model |
( |
const std::string & |
config_file, |
|
|
size_t |
n |
|
) |
| |
Creates an N-gram language model based on the corpus specified in the config file.
- Parameters
-
n | The value of n, which overrides any setting in the config file |
std::string meta::lm::language_model::generate |
( |
unsigned int |
seed | ) |
const |
Randomly generates one token sequence based on <s> and </s> symbols.
- Returns
- a random sequence of tokens based on this language model
std::string meta::lm::language_model::next_token |
( |
const std::deque< std::string > & |
tokens, |
|
|
double |
random |
|
) |
| const |
- Parameters
-
tokens | The previous N - 1 tokens |
random | A random number on [0, 1] used for choosing the next token |
- Returns
- the next token based on the previous tokens
double meta::lm::language_model::perplexity |
( |
const std::string & |
tokens | ) |
const |
- Parameters
-
tokens | A sequence of tokens |
- Returns
- the perplexity of this token sequence given the current language model: \( \sqrt[n]{\prod_{i=1}^n\frac{1}{p(w_i|w_{i-n}\cdots w_{i-1})}} \)
double meta::lm::language_model::perplexity_per_word |
( |
const std::string & |
tokens | ) |
const |
- Parameters
-
tokens | A sequence of tokens |
- Returns
- the perplexity of this token sequence given the current language model normalized by the length of the sequence
double meta::lm::language_model::prob |
( |
std::deque< std::string > |
tokens | ) |
const |
- Parameters
-
tokens | A sequence of n tokens |
- Returns
- the probability of seeing the nth token given the previous n - 1 tokens
void meta::lm::language_model::learn_model |
( |
const std::string & |
config_file | ) |
|
|
private |
Builds the probabilities associated with this language model.
- Parameters
-
config_file | The config file that specifies the location of the corpus |
std::string meta::lm::language_model::make_string |
( |
const std::deque< std::string > & |
tokens | ) |
const |
|
private |
- Parameters
-
tokens | A deque of tokens to convert to a string |
- Returns
- the string version of the deque (space delimited)
std::deque< std::string > meta::lm::language_model::make_deque |
( |
const std::string & |
tokens | ) |
const |
|
private |
- Parameters
-
tokens | A string of space-delimited tokens to convert to a deque |
- Returns
- a deque of the tokens
The documentation for this class was generated from the following files: