ModErn Text Analysis
META Enumerates Textual Applications
language_model.h
Go to the documentation of this file.
1 
10 #ifndef META_LANGUAGE_MODEL_H_
11 #define META_LANGUAGE_MODEL_H_
12 
13 #include <deque>
14 #include <memory>
15 #include <string>
16 #include <unordered_map>
17 
18 namespace meta
19 {
20 namespace lm
21 {
22 class language_model
23 {
24  public:
29  language_model(const std::string& config_file);
30 
36  language_model(const std::string& config_file, size_t n);
37 
42  std::string generate(unsigned int seed) const;
43 
49  std::string next_token(const std::deque<std::string>& tokens,
50  double random) const;
51 
58  double perplexity(const std::string& tokens) const;
59 
65  double perplexity_per_word(const std::string& tokens) const;
66 
72  double prob(std::deque<std::string> tokens) const;
73 
74  private:
75 
81  void learn_model(const std::string& config_file);
82 
87  std::string make_string(const std::deque<std::string>& tokens) const;
88 
93  std::deque<std::string> make_deque(const std::string& tokens) const;
94 
96  std::unique_ptr<language_model> interp_;
97 
99  std::unordered_map
100  <std::string, std::unordered_map<std::string, double>> dist_;
101 
103  size_t N_;
104 
106  constexpr static double lambda_ = 0.7;
107 };
108 }
109 }
110 
111 #endif
112 
void learn_model(const std::string &config_file)
Builds the probabilities associated with this language model.
Definition: language_model.cpp:53
std::string next_token(const std::deque< std::string > &tokens, double random) const
Definition: language_model.cpp:102
std::string generate(unsigned int seed) const
Randomly generates one token sequence based on the <s> and </s> sentence-boundary symbols.
Definition: language_model.cpp:121
double perplexity_per_word(const std::string &tokens) const
Definition: language_model.cpp:189
static constexpr double lambda_
The interpolation coefficient for smoothing LM probabilities.
Definition: language_model.h:106
std::string make_string(const std::deque< std::string > &tokens) const
Definition: language_model.cpp:206
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:24
double prob(std::deque< std::string > tokens) const
Definition: language_model.cpp:147
Definition: language_model.h:22
double perplexity(const std::string &tokens) const
Definition: language_model.cpp:172
std::unique_ptr< language_model > interp_
The language_model used to interpolate with this one for smoothing.
Definition: language_model.h:96
std::unordered_map< std::string, std::unordered_map< std::string, double > > dist_
Contains the N-gram distribution probabilities (N-1 words -> (w, prob))
Definition: language_model.h:100
language_model(const std::string &config_file)
Creates an N-gram language model based on the corpus specified in the config file.
Definition: language_model.cpp:28
size_t N_
The value of N in this n-gram.
Definition: language_model.h:103
std::deque< std::string > make_deque(const std::string &tokens) const
Definition: language_model.cpp:194