ModErn Text Analysis
META Enumerates Textual Applications
sequence_analyzer.h
Go to the documentation of this file.
1 
10 #ifndef META_SEQUENCE_SEQUENCE_ANALYZER_H_
11 #define META_SEQUENCE_SEQUENCE_ANALYZER_H_
12 
13 #include <algorithm>
14 #include <vector>
15 #include <functional>
16 #include <unordered_map>
17 
18 #include "meta.h"
19 #include "sequence/sequence.h"
20 #include "util/invertible_map.h"
21 
22 namespace meta
23 {
24 namespace sequence
25 {
26 
50 {
51  public:
55  sequence_analyzer() = default; // Default constructor.
56 
63  sequence_analyzer(const std::string& prefix); // Constructs an analyzer from a prefix; presumably loads a saved analyzer from that folder like load() -- TODO confirm in sequence_analyzer.cpp.
64 
68  sequence_analyzer(const sequence_analyzer&) = default; // Defaulted copy constructor.
69 
74 
79 
84 
89  void load(const std::string& prefix); // Loads a sequence analyzer from a folder given by prefix.
90 
96  void save(const std::string& prefix) const; // Saves the sequence analyzer into the folder given by prefix.
97 
104  void analyze(sequence& sequence); // Analyzes a sequence, generating new label_ids and feature_ids for unseen elements.
105 
113  void analyze(sequence& sequence, uint64_t idx); // NOTE(review): presumably analyzes only the observation at position idx -- confirm.
114 
122  void analyze(sequence& sequence) const; // Const overload; NOTE(review): presumably does not create new ids -- confirm.
123 
130  void analyze(sequence& sequence, uint64_t idx) const; // Const overload of the single-index form -- semantics to be confirmed.
131 
139  feature_id feature(const std::string& feature); // Looks up the feature id for the given string representation.
140 
150  feature_id feature(const std::string& feature) const; // Const lookup; const_collector::add treats a result equal to num_features() as "brand new feature".
151 
155  uint64_t num_features() const; // Number of features; NOTE(review): presumably the size of the feature_id mapping -- confirm.
156 
161  label_id label(tag_t lbl) const; // Maps a tag_t to its label_id.
162 
167  tag_t tag(label_id lbl) const; // Maps a label_id back to its tag_t.
168 
172  uint64_t num_labels() const; // Number of labels; NOTE(review): presumably the size of the label_id mapping -- confirm.
173 
177  const std::string& prefix() const; // Returns a reference to the analyzer's prefix string.
178 
183 
190  template <class Function> // Function: any callable storable in obs_fns_, i.e. convertible to std::function<void(const sequence&, uint64_t, collector&)>.
191  void add_observation_function(Function&& function) // Adds an observation function to the list of functions to be used for analyzing observations.
192  {
193  obs_fns_.emplace_back(std::forward<Function>(function)); // perfect-forward so the callable is copied or moved according to how the caller passed it
194  }
195 
200  class collector // Interface class used for analyzing observations inside user-provided feature functions.
201  {
202  public:
208  { // Body of collector(observation* obs), which constructs the collector over a given observation; its signature line (207) is absent from this listing.
209  // nothing
210  }
211 
216  { // Body of ~collector(), which writes all analyzed information out to the observation; its signature line (215) is absent from this listing.
217  using pair = std::pair<feature_id, double>;
218  std::sort(feats_.begin(), feats_.end(), // order the collected features by ascending feature_id
219  [](const pair& lhs, const pair& rhs)
220  {
221  return lhs.first < rhs.first;
222  });
223 
224  obs_->features(std::move(feats_)); // hand the sorted feature vector over to the observation
225  }
226 
232  virtual void add(const std::string& feat, double amount) // Adds a new feature to this observation: resolves feat via feature() and buffers (id, amount) in feats_.
233  {
234  feats_.emplace_back(feature(feat), amount);
235  }
236 
237  protected:
242  virtual feature_id feature(const std::string& feat) = 0; // Maps a feature string to its feature_id; supplied by derived collectors.
243 
248  };
249 
250  class exception : public std::runtime_error // Exception type nested in sequence_analyzer; behaves as a std::runtime_error.
251  {
252  public:
253  using std::runtime_error::runtime_error; // inherit the std::runtime_error constructors
254  };
255 
256  private:
260  template <class Analyzer> // Analyzer is instantiated in this header as sequence_analyzer and as const sequence_analyzer.
261  class basic_collector : public collector // Implementation-detail collector that resolves feature ids through an analyzer.
262  {
263  public:
269  basic_collector(Analyzer* analyzer, observation* obs) // Creates the collector with the given analyzer; obs is passed on to the collector base.
270  : collector{obs}, analyzer_{analyzer}
271  {
272  // nothing
273  }
274 
275  protected:
277  Analyzer* analyzer_; // back-pointer to the analyzer for this collector
278 
279  virtual feature_id feature(const std::string& feat) // Implements collector::feature by delegating to the analyzer's feature() lookup.
280  {
281  return analyzer_->feature(feat); // the overload called depends on whether Analyzer is const-qualified
282  }
283  };
284 
285  public:
289  class default_collector : public basic_collector<sequence_analyzer> // Non-const version of the collector.
290  {
291  public: // listing line 292 is absent here; presumably the constructor declaration -- TODO confirm against the real header
293  };
294 
299  class const_collector : public basic_collector<const sequence_analyzer> // Const version of the collector.
300  {
301  public: // listing line 302 is absent here; presumably the constructor declaration -- TODO confirm against the real header
303 
304  // special case add to not actually add if a brand new feature id
305  // is found
306  virtual void add(const std::string& feat, double amount) // Replaces collector::add so that unseen features are dropped rather than recorded.
307  {
308  auto fid = feature(feat); // lookup goes through the const analyzer
309  if (fid != analyzer_->num_features()) // an id equal to num_features() marks a brand new feature; skip it
310  feats_.emplace_back(fid, amount);
311  }
312  };
313 
314  private:
319  void load_feature_id_mapping(const std::string& prefix); // Loads the feature_id mapping from disk.
320 
325  void load_label_id_mapping(const std::string& prefix); // Loads the label_id mapping from disk.
326 
333  void add_feature(observation& obs, const std::string& feature, // Adds a feature to an observation; weight defaults to 1.0.
334  double weight = 1.0);
335 
337  std::vector<std::function<void(const sequence&, uint64_t, collector&)>> // Type of the observation functions member; its name (obs_fns_) sits on listing line 338, which is absent here.
339 
341  std::unordered_map<std::string, feature_id> feature_id_mapping_; // The feature_id mapping (string to id).
342 
345 };
346 
352 }
353 }
354 #endif
observation * obs_
the observation we are collecting data for
Definition: sequence_analyzer.h:245
Contains top-level namespace documentation for the META toolkit.
~collector()
Writes all analyzed information out to the observation.
Definition: sequence_analyzer.h:215
virtual void add(const std::string &feat, double amount)
Adds a new feature to this observation.
Definition: sequence_analyzer.h:232
Definition: sequence_analyzer.h:250
void save(const std::string &prefix) const
Saves the sequence analyzer into the folder given by prefix.
Definition: sequence_analyzer.cpp:71
basic_collector(Analyzer *analyzer, observation *obs)
Creates the collector with the given analyzer.
Definition: sequence_analyzer.h:269
virtual feature_id feature(const std::string &feat)
Definition: sequence_analyzer.h:279
sequence_analyzer & operator=(const sequence_analyzer &)=default
Sequence analyzers may be copy assigned.
collector(observation *obs)
Constructs the collector over a given observation.
Definition: sequence_analyzer.h:207
const util::invertible_map< tag_t, label_id > & labels() const
Definition: sequence_analyzer.cpp:153
feature_id feature(const std::string &feature)
Looks up the feature id for the given string representation.
Definition: sequence_analyzer.cpp:130
Non-const version of the collector.
Definition: sequence_analyzer.h:289
util::invertible_map< tag_t, label_id > label_id_mapping_
The label_id mapping (tag_t to label_id)
Definition: sequence_analyzer.h:344
const std::string & prefix() const
sequence_analyzer()=default
Default constructor.
void analyze(sequence &sequence)
Analyzes a sequence, generating new label_ids and feature_ids for unseen elements.
Definition: sequence_analyzer.cpp:92
Interface class used for analyzing observations inside user-provided feature functions.
Definition: sequence_analyzer.h:200
Represents a tagged sequence of observations.
Definition: sequence.h:24
std::unordered_map< std::string, feature_id > feature_id_mapping_
The feature_id mapping (string to id)
Definition: sequence_analyzer.h:341
uint64_t num_features() const
Definition: sequence_analyzer.cpp:148
tag_t tag(label_id lbl) const
Definition: sequence_analyzer.cpp:163
const feature_vector & features() const
Definition: observation.cpp:64
std::vector< std::pair< feature_id, double >> feature_vector
internal feature vector for observations
Definition: observation.h:36
virtual void add(const std::string &feat, double amount)
Adds a new feature to this observation.
Definition: sequence_analyzer.h:306
std::vector< std::function< void(const sequence &, uint64_t, collector &)> > obs_fns_
The observation functions.
Definition: sequence_analyzer.h:338
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:24
void load_label_id_mapping(const std::string &prefix)
Loads the label_id mapping from disk.
Definition: sequence_analyzer.cpp:63
observation::feature_vector feats_
the feature vector that will be placed into the observation
Definition: sequence_analyzer.h:247
virtual feature_id feature(const std::string &feat)=0
void add_feature(observation &obs, const std::string &feature, double weight=1.0)
Adds a feature to an observation.
Analyzer * analyzer_
back-pointer to the analyzer for this collector
Definition: sequence_analyzer.h:277
void load(const std::string &prefix)
Loads a sequence analyzer from a folder given by prefix.
Definition: sequence_analyzer.cpp:29
Represents an observation in a tagged sequence.
Definition: observation.h:32
Const version of the collector.
Definition: sequence_analyzer.h:299
sequence_analyzer default_pos_analyzer()
Constructs a sequence_analyzer that is specialized for part-of-speech tagging.
Definition: sequence_analyzer.cpp:190
Analyzer that operates over sequences, generating features based on a set of "observation functions"...
Definition: sequence_analyzer.h:49
label_id label(tag_t lbl) const
Definition: sequence_analyzer.cpp:158
uint64_t num_labels() const
Definition: sequence_analyzer.cpp:168
void add_observation_function(Function &&function)
Adds an observation function to the list of functions to be used for analyzing observations.
Definition: sequence_analyzer.h:191
void load_feature_id_mapping(const std::string &prefix)
Loads the feature_id mapping from disk.
Definition: sequence_analyzer.cpp:37
Implementation-detail collector.
Definition: sequence_analyzer.h:261