ModErn Text Analysis
META Enumerates Textual Applications
chunk_handler.h
Go to the documentation of this file.
1 
10 #ifndef META_INDEX_CHUNK_HANDLER_H_
11 #define META_INDEX_CHUNK_HANDLER_H_
12 
13 #include <atomic>
14 #include <functional>
15 #include <mutex>
16 #include <queue>
17 #include <stdexcept>
18 #include <unordered_set>
19 #include <utility>
20 #include <vector>
21 
22 #include "index/chunk.h"
23 #include "util/optional.h"
24 
25 namespace meta
26 {
27 namespace index
28 {
29 
/// An interface for writing and merging inverted chunks of postings_data
/// for a disk_index.
34 template <class Index>
// NOTE(review): listing line 35 (the `class chunk_handler` header that this
// template introduces) is absent from this extract; the constructor on
// line 93 names the class.
36 {
37  public:
    /// Aliases for the Index's postings-data type and the key types that
    /// type declares.
38  using index_pdata_type = typename Index::index_pdata_type;
39  using primary_key_type = typename index_pdata_type::primary_key_type;
40  using secondary_key_type = typename index_pdata_type::secondary_key_type;
    // NOTE(review): listing line 41 is absent from this extract; `chunk_t`
    // (used by chunks_ on line 146) is not declared anywhere in view and is
    // presumably defined there — confirm against the real header.
42 
    /// The object that is fed postings_data by the index.
46  class producer
47  {
48  public:
        /// Constructs a producer for the given handler.
        /// @param parent the chunk_handler this producer operates on
        ///        (presumably stored as the back-pointer parent_ — confirm
        ///        in chunk_handler.tcc)
53  producer(chunk_handler* parent);
54 
        /// Handler for when a given secondary_key has been processed and is
        /// ready to be added to the in-memory chunk.
        /// @param key the secondary key that was processed
        /// @param counts a container of counts for that key; its element
        ///        type is not visible in this header — TODO confirm
61  template <class Container>
62  void operator()(const secondary_key_type& key, const Container& counts);
63 
        /// Destroys the producer, writing to disk any chunk data still
        /// resident in memory.
68  ~producer();
69 
70  private:
        /// Flushes the current in-memory chunk to disk.
74  void flush_chunk();
75 
        /// Current in-memory chunk.
77  std::unordered_set<index_pdata_type> pdata_;
78 
        /// Current size of the in-memory chunk.
80  uint64_t chunk_size_;
81 
        /// Maximum allowed size of a chunk in bytes before it is written.
83  const static uint64_t constexpr max_size = 1024 * 1024 * 128; // 128 MB
84 
        // NOTE(review): listing lines 85-86 are absent from this extract;
        // the generated docs place `chunk_handler* parent_` (back-pointer to
        // the handler this producer is operating on) at line 86.
87  };
88 
    /// Constructs a chunk_handler that writes to the given prefix.
    /// @param prefix the prefix for all chunks to be written
93  chunk_handler(const std::string& prefix);
94 
    // NOTE(review): the generated docs list `producer make_producer()`
    // ("Creates a producer for this chunk_handler"), but its declaration is
    // absent from this extract; it presumably sits between listing lines 94
    // and 102 — confirm against the real header.
102 
    /// NOTE(review): no description survives in this extract; presumably the
    /// number of chunks — confirm in chunk_handler.tcc.
106  uint32_t size() const;
107 
    /// NOTE(review): no description survives in this extract; semantics of
    /// the "final size" must be confirmed in chunk_handler.tcc.
112  uint64_t final_size() const;
113 
    /// Merge the remaining on-disk chunks.
117  void merge_chunks();
118 
    /// NOTE(review): presumably returns the number of unique primary keys
    /// encountered while merging (the value the generated docs describe for
    /// unique_primary_keys_) — confirm in chunk_handler.tcc.
122  uint64_t unique_primary_keys() const;
123 
    /// Simple exception class for chunk_handler interactions.
127  class chunk_handler_exception : public std::runtime_error
128  {
        // Inherits all of std::runtime_error's constructors.
129  using std::runtime_error::runtime_error;
130  };
131 
132  private:
    /// NOTE(review): no description survives in this extract; presumably
    /// writes the given postings data out as a new on-disk chunk. The
    /// parameter is a non-const reference, so the callee may modify it —
    /// confirm in chunk_handler.tcc.
137  void write_chunk(std::vector<index_pdata_type>& pdata);
138 
    /// The prefix for all chunks to be written.
140  std::string prefix_;
141 
    /// The current chunk number.
143  std::atomic<uint32_t> chunk_num_{0};
144 
    /// Queue of chunks on disk that need to be merged.
146  std::priority_queue<chunk_t> chunks_;
147 
    /// Mutex used for protecting the chunk queue.
149  mutable std::mutex mutables_;
150 
    // NOTE(review): listing line 152 is absent from this extract; the
    // generated docs place `util::optional<uint64_t> unique_primary_keys_`
    // (number of unique primary keys encountered while merging) there.
153 };
154 }
155 }
156 
157 #include "index/chunk_handler.tcc"
158 #endif
counts_t counts(const std::string &text, bool contains_label=true)
Definition: libsvm_parser.cpp:28
static const uint64_t constexpr max_size
Maximum allowed size of a chunk in bytes before it is written.
Definition: chunk_handler.h:83
Simple exception class for chunk_handler interactions.
Definition: chunk_handler.h:127
An interface for writing and merging inverted chunks of postings_data for a disk_index.
Definition: chunk_handler.h:35
~producer()
Destroys the producer, writing to disk any chunk data still resident in memory.
Definition: chunk_handler.tcc:71
std::mutex mutables_
Mutex used for protecting the chunk queue.
Definition: chunk_handler.h:149
std::string prefix_
The prefix for all chunks to be written.
Definition: chunk_handler.h:140
chunk_handler * parent_
Back-pointer to the handler this producer is operating on.
Definition: chunk_handler.h:86
uint64_t chunk_size_
Current size of the in-memory chunk.
Definition: chunk_handler.h:80
void write_chunk(std::vector< index_pdata_type > &pdata)
Definition: chunk_handler.tcc:90
uint32_t size() const
Definition: chunk_handler.tcc:207
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:24
std::atomic< uint32_t > chunk_num_
The current chunk number.
Definition: chunk_handler.h:143
std::unordered_set< index_pdata_type > pdata_
Current in-memory chunk.
Definition: chunk_handler.h:77
uint64_t unique_primary_keys() const
Definition: chunk_handler.tcc:189
std::priority_queue< chunk_t > chunks_
Queue of chunks on disk that need to be merged.
Definition: chunk_handler.h:146
Represents a portion of a disk_index's postings file.
Definition: chunk.h:28
producer(chunk_handler *parent)
Definition: chunk_handler.tcc:18
chunk_handler(const std::string &prefix)
Constructs a chunk_handler that writes to the given prefix.
Definition: chunk_handler.tcc:77
void flush_chunk()
Flushes the current in-memory chunk to disk.
Definition: chunk_handler.tcc:56
uint64_t final_size() const
Definition: chunk_handler.tcc:198
util::optional< uint64_t > unique_primary_keys_
Number of unique primary keys encountered while merging.
Definition: chunk_handler.h:152
producer make_producer()
Creates a producer for this chunk_handler.
Definition: chunk_handler.tcc:84
void merge_chunks()
Merge the remaining on-disk chunks.
Definition: chunk_handler.tcc:131
The object that is fed postings_data by the index.
Definition: chunk_handler.h:46
void operator()(const secondary_key_type &key, const Container &counts)
Handler for when a given secondary_key has been processed and is ready to be added to the in-memory chunk.
Definition: chunk_handler.tcc:26