ModErn Text Analysis
META Enumerates Textual Applications
batch_training.h
Go to the documentation of this file.
1 
10 #ifndef META_BATCH_TRAINING_H_
11 #define META_BATCH_TRAINING_H_
12 
#include <algorithm>
#include <random>
#include <stdexcept>
#include <vector>
16 
17 #include "logging/logger.h"
18 #include "meta.h"
19 
20 namespace meta
21 {
22 namespace classify
23 {
24 
38 template <class Index, class Classifier>
39 void batch_train(Index& idx, Classifier& cls,
40  const std::vector<doc_id>& training_set, uint64_t batch_size)
41 {
42  auto docs = training_set;
43  std::mt19937 gen(std::random_device{}());
44  std::shuffle(docs.begin(), docs.end(), gen);
45 
46  // integer-math ceil(docs.size() / batch_size)
47  auto num_batches = (docs.size() + batch_size - 1) / batch_size;
48  for (uint64_t i = 0; i < num_batches; ++i)
49  {
50  LOG(progress) << "\rTraining batch " << i + 1 << "/" << num_batches
51  << ENDLG;
52  auto end = std::min<uint64_t>((i + 1) * batch_size, docs.size());
53  std::vector<doc_id> batch{docs.begin() + (i * batch_size),
54  docs.begin() + end};
55  idx.clear_cache();
56  cls.train(batch);
57  }
58  LOG(progress) << '\n' << ENDLG;
59 }
60 
61 }
62 }
63 #endif
Contains top-level namespace documentation for the META toolkit.
void batch_train(Index &idx, Classifier &cls, const std::vector< doc_id > &training_set, uint64_t batch_size)
This trains a classifier in an online fashion, using batches of size batch_size from the training_set...
Definition: batch_training.h:39
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:24