ModErn Text Analysis
META Enumerates Textual Applications
lda_gibbs.h
Go to the documentation of this file.
1 
10 #ifndef META_LDA_GIBBS_H_
11 #define META_LDA_GIBBS_H_
12 
13 #include <random>
14 
15 #include "stats/multinomial.h"
16 #include "topics/lda_model.h"
17 #include "util/dense_matrix.h"
18 
19 namespace meta
20 {
21 namespace topics
22 {
23 
// An LDA topic model implemented using a collapsed Gibbs sampler.
// Derives from lda_model (the LDA topic model base class).
29 class lda_gibbs : public lda_model
30 {
31  public:
    // Constructs the LDA model over the given documents (the forward index
    // idx) with num_topics topics. alpha and beta are the hyperparameters for
    // the priors on the topic distributions and the topic proportions.
45  lda_gibbs(std::shared_ptr<index::forward_index> idx, uint64_t num_topics,
46  double alpha, double beta);
47 
    // Destructor: virtual for potential subclassing.
51  virtual ~lda_gibbs() = default;
52 
    // Runs the sampler for a maximum of num_iters iterations, or until the
    // given convergence criterion is met (convergence defaults to 1e-6).
65  virtual void run(uint64_t num_iters, double convergence = 1e-6);
66 
67  protected:
    // Samples a topic for the given term in the given document from the
    // full conditional distribution. Returns the sampled topic_id.
79  topic_id sample_topic(term_id term, doc_id doc);
80 
    // Computes a (not necessarily normalized) sampling weight for assigning
    // `topic` to `term` in `doc`. Virtual, so subclasses may supply their own
    // weighting.
91  virtual double compute_sampling_weight(term_id term, doc_id doc,
92  topic_id topic) const;
93 
    // Overrides the lda_model hook taking a (term, topic) pair.
    // NOTE(review): presumably returns P(term | topic) under the current
    // counts -- confirm against lda_gibbs.cpp.
101  virtual double
102  compute_term_topic_probability(term_id term,
103  topic_id topic) const override;
104 
    // Overrides the lda_model hook taking a (doc, topic) pair.
    // NOTE(review): presumably returns P(topic | doc) under the current
    // counts -- confirm against lda_gibbs.cpp.
112  virtual double compute_doc_topic_probability(doc_id doc,
113  topic_id topic) const override;
114 
    // Initializes the first set of topic assignments for inference.
120  virtual void initialize();
121 
    // Performs a sampling iteration. `iter` is the iteration number.
    // NOTE(review): `init` (default false) presumably marks the
    // initialization pass made by initialize() -- confirm.
129  virtual void perform_iteration(uint64_t iter, bool init = false);
130 
    // Decreases all counts associated with the given topic, term, and
    // document by one.
139  virtual void decrease_counts(topic_id topic, term_id term, doc_id doc);
140 
    // Increases all counts associated with the given topic, term, and
    // document by one.
149  virtual void increase_counts(topic_id topic, term_id term, doc_id doc);
150 
    // NOTE(review): presumably the log likelihood of the corpus under the
    // current topic assignments, used by run() for the convergence check --
    // confirm against lda_gibbs.cpp.
154  double corpus_log_likelihood() const;
155 
    // lda_gibbs cannot be copy assigned.
159  lda_gibbs& operator=(const lda_gibbs&) = delete;
160 
    // lda_gibbs cannot be copy constructed.
164  lda_gibbs(const lda_gibbs& other) = delete;
165 
    // The topic assignment for every word in every document.
174  std::vector<std::vector<topic_id>> doc_word_topic_;
175 
    // The word distributions for each topic.
179  std::vector<stats::multinomial<term_id>> phi_;
180 
    // The topic distributions for each document.
184  std::vector<stats::multinomial<topic_id>> theta_;
185 
    // The random number generator for the sampler.
189  std::mt19937_64 rng_;
190 };
191 }
192 }
193 
194 #endif
lda_gibbs & operator=(const lda_gibbs &)=delete
lda_gibbs cannot be copy assigned.
double corpus_log_likelihood() const
Definition: lda_gibbs.cpp:160
An LDA topic model implemented using a collapsed Gibbs sampler.
Definition: lda_gibbs.h:29
An LDA topic model base class.
Definition: lda_model.h:25
std::mt19937_64 rng_
The random number generator for the sampler.
Definition: lda_gibbs.h:189
virtual void perform_iteration(uint64_t iter, bool init=false)
Performs a sampling iteration.
Definition: lda_gibbs.cpp:111
std::vector< std::vector< topic_id > > doc_word_topic_
The topic assignment for every word in every document.
Definition: lda_gibbs.h:174
virtual ~lda_gibbs()=default
Destructor: virtual for potential subclassing.
lda_gibbs(std::shared_ptr< index::forward_index > idx, uint64_t num_topics, double alpha, double beta)
Constructs the LDA model over the given documents, with the given number of topics, and hyperparameters $\alpha$ and $\beta$ for the priors on $\phi$ (topic distributions) and $\theta$ (topic proportions), respectively.
Definition: lda_gibbs.cpp:18
std::vector< stats::multinomial< topic_id > > theta_
The topic distributions for each document, $\theta_d$.
Definition: lda_gibbs.h:184
virtual void increase_counts(topic_id topic, term_id term, doc_id doc)
Increases all counts associated with the given topic, term, and document by one.
Definition: lda_gibbs.cpp:154
virtual double compute_term_topic_probability(term_id term, topic_id topic) const override
Definition: lda_gibbs.cpp:94
virtual double compute_doc_topic_probability(doc_id doc, topic_id topic) const override
Definition: lda_gibbs.cpp:100
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:24
virtual void decrease_counts(topic_id topic, term_id term, doc_id doc)
Decreases all counts associated with the given topic, term, and document by one.
Definition: lda_gibbs.cpp:148
virtual void initialize()
Initializes the first set of topic assignments for inference.
Definition: lda_gibbs.cpp:106
virtual void run(uint64_t num_iters, double convergence=1e-6)
Runs the sampler for a maximum number of iterations, or until the given convergence criterion is met...
Definition: lda_gibbs.cpp:44
std::vector< stats::multinomial< term_id > > phi_
The word distributions for each topic, $\phi_t$.
Definition: lda_gibbs.h:179
topic_id sample_topic(term_id term, doc_id doc)
Samples a topic from the full conditional distribution $P(z_i = j \mid w, \boldsymbol{z})$.
Definition: lda_gibbs.cpp:76
virtual double compute_sampling_weight(term_id term, doc_id doc, topic_id topic) const
Computes a weight proportional to $P(z_i = j \mid w, \boldsymbol{z})$.
Definition: lda_gibbs.cpp:87