ModErn Text Analysis
META Enumerates Textual Applications
sr_parser.h
Go to the documentation of this file.
1 
9 #ifndef META_PARSER_SR_PARSER_H_
10 #define META_PARSER_SR_PARSER_H_
11 
12 #include <map>
13 #include <random>
14 #include <unordered_map>
15 
17 #include "meta.h"
18 #include "parallel/thread_pool.h"
20 #include "parser/transition_map.h"
21 #include "sequence/sequence.h"
22 #include "util/optional.h"
23 #include "util/sparse_vector.h"
24 
25 namespace meta
26 {
27 namespace parser
28 {
29 
30 class parse_tree;
31 class state;
32 
41 class sr_parser
42 {
43  public:
47  enum class training_algorithm
48  {
49  EARLY_TERMINATION,
50  BEAM_SEARCH
51  };
52 
56  struct training_options
57  {
62  uint64_t batch_size = 25;
63 
68  uint64_t beam_size = 8;
69 
73  uint64_t max_iterations = 40;
74 
79  std::random_device::result_type seed = std::random_device{}();
80 
84  uint64_t num_threads = std::thread::hardware_concurrency();
85 
91  training_algorithm algorithm = training_algorithm::EARLY_TERMINATION;
92 
96  training_options() = default;
97 
101  training_options(const training_options&) = default;
102  };
103 
107  sr_parser() = default;
108 
114  sr_parser(const std::string& prefix);
115 
122  parse_tree parse(const sequence::sequence& sentence) const;
123 
131  void train(std::vector<parse_tree>& trees, training_options options);
132 
136  void save(const std::string& prefix) const;
137 
141  class exception : public std::runtime_error
142  {
143  public:
144  using std::runtime_error::runtime_error;
145  };
146 
150  using feature_vector = std::unordered_map<std::string, float>;
151 
156 
160  using weight_vectors = std::unordered_map<std::string, weight_vector>;
161 
162  private:
166  class training_data;
167 
171  struct training_batch
172  {
173  training_data& data;
174  size_t start;
175  size_t end;
176  };
177 
182  class state_analyzer;
183 
187  void load(const std::string& prefix);
188 
197  std::tuple<weight_vectors, uint64_t, uint64_t>
198  train_batch(training_batch batch, parallel::thread_pool& pool,
199  const training_options& options);
200 
210  std::pair<uint64_t, uint64_t> train_instance(
211  const parse_tree& tree, const std::vector<trans_id>& transitions,
212  const training_options& options, weight_vectors& update) const;
213 
224  std::pair<uint64_t, uint64_t>
225  train_early_termination(const parse_tree& tree,
226  const std::vector<trans_id>& transitions,
227  weight_vectors& update) const;
228 
238  std::pair<uint64_t, uint64_t> train_beam_search(
239  const parse_tree& tree, const std::vector<trans_id>& transitions,
240  const training_options& options, weight_vectors& update) const;
241 
252  trans_id best_transition(const feature_vector& features, const state& state,
253  bool check_legality = false) const;
254 
255  using scored_trans = std::pair<trans_id, float>;
256 
267  std::vector<scored_trans>
268  best_transitions(const feature_vector& features, const state& state,
269  size_t num, bool check_legality = false) const;
270 
274  transition_map trans_;
275 
279  classify::linear_model<std::string, float, trans_id> model_;
280 
284  uint64_t beam_size_ = 1;
285 };
286 }
287 }
288 #endif
parse_tree parse(const sequence::sequence &sentence) const
Parses a POS-tagged sentence (represented as a sequence::sequence).
Definition: sr_parser.cpp:37
uint64_t beam_size
How many states should be kept on the beam? (valid for beam search only)
Definition: sr_parser.h:68
Contains top-level namespace documentation for the META toolkit.
uint64_t num_threads
How many threads to use for training.
Definition: sr_parser.h:84
std::random_device::result_type seed
The seed for the random number generator used for shuffling examples during training.
Definition: sr_parser.h:79
transition_map trans_
Storage for the ids for each transition.
Definition: sr_parser.h:274
Analyzer responsible for converting a parser state to a feature_vector.
Definition: state_analyzer.h:24
trans_id best_transition(const feature_vector &features, const state &state, bool check_legality=false) const
Computes the most likely transition according to the current model.
Definition: sr_parser.cpp:416
Represents the parse tree for a sentence.
Definition: parse_tree.h:32
classify::linear_model< std::string, float, trans_id > model_
Storage for the weights for each possible transition.
Definition: sr_parser.h:279
Represents the current parser state of a shift-reduce parser.
Definition: state.h:36
std::pair< uint64_t, uint64_t > train_early_termination(const parse_tree &tree, const std::vector< trans_id > &transitions, weight_vectors &update) const
Calculates a weight update on a single tree, using the greedy early termination training strategy...
Definition: sr_parser.cpp:264
std::pair< uint64_t, uint64_t > train_beam_search(const parse_tree &tree, const std::vector< trans_id > &transitions, const training_options &options, weight_vectors &update) const
Calculates a weight update on a single tree, using beam search.
Definition: sr_parser.cpp:297
uint64_t batch_size
How many trees should be put together into a single batch for learning?
Definition: sr_parser.h:62
Represents a tagged sequence of observations.
Definition: sequence.h:24
void save(const std::string &prefix) const
Definition: sr_parser.cpp:436
A training batch.
Definition: sr_parser.h:171
std::unordered_map< std::string, weight_vector > weight_vectors
A collection of weight vectors by feature type.
Definition: sr_parser.h:160
Exception thrown during parser actions.
Definition: sr_parser.h:141
uint64_t max_iterations
How many iterations to run the training algorithm for?
Definition: sr_parser.h:73
std::unordered_map< std::string, float > feature_vector
Sparse vector representation of a state's features.
Definition: sr_parser.h:150
std::pair< uint64_t, uint64_t > train_instance(const parse_tree &tree, const std::vector< trans_id > &transitions, const training_options &options, weight_vectors &update) const
Calculates a weight update on a single tree.
Definition: sr_parser.cpp:246
Represents a sparse vector, indexed by type Index and storing values of type Value.
Definition: sparse_vector.h:28
Training options required for learning a parser model.
Definition: sr_parser.h:56
std::vector< scored_trans > best_transitions(const feature_vector &features, const state &state, size_t num, bool check_legality=false) const
Computes the most likely transitions according to the current model.
Definition: sr_parser.cpp:426
A shift-reduce constituency parser.
Definition: sr_parser.h:41
Training data for the parser.
Definition: training_data.h:22
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:24
void load(const std::string &prefix)
Definition: sr_parser.cpp:451
training_options()=default
Default constructor.
Represents a collection of a fixed number of threads, which tasks can be added to.
Definition: thread_pool.h:33
uint64_t beam_size_
Beam size used during training.
Definition: sr_parser.h:284
An invertible map that maps transitions to ids.
Definition: transition_map.h:23
std::tuple< weight_vectors, uint64_t, uint64_t > train_batch(training_batch batch, parallel::thread_pool &pool, const training_options &options)
Calculates a weight update on a given batch of training trees.
Definition: sr_parser.cpp:203
sr_parser()=default
Default constructor.
void train(std::vector< parse_tree > &trees, training_options options)
Trains a model on the given parse trees using the supplied training options.
Definition: sr_parser.cpp:144
training_algorithm algorithm
The algorithm to use for training.
Definition: sr_parser.h:91
training_algorithm
The set of training algorithms available for the parser.
Definition: sr_parser.h:47