ModErn Text Analysis
META Enumerates Textual Applications
postings_data.h
Go to the documentation of this file.
1 
10 #ifndef META_POSTINGS_DATA_
11 #define META_POSTINGS_DATA_
12 
13 #include <fstream>
14 #include <limits>
15 #include <utility>
16 #include <vector>
17 
18 #include "meta.h"
21 #include "util/sparse_vector.h"
22 
23 namespace meta
24 {
25 namespace index
26 {
27 
28 template <class, class>
29 class postings_data;
30 
31 template <class PrimaryKey, class SecondaryKey>
32 io::compressed_file_reader& operator>>(io::compressed_file_reader&,
33  postings_data<PrimaryKey,
34  SecondaryKey>&);
35 
44 template <class PrimaryKey, class SecondaryKey>
45 class postings_data
46 {
47  public:
48  using primary_key_type = PrimaryKey;
49  using secondary_key_type = SecondaryKey;
50  using pair_t = std::pair<SecondaryKey, double>;
51  using count_t = std::vector<pair_t>;
52 
57  static_assert(
58  (std::is_integral<PrimaryKey>::value
59  || std::is_base_of<util::numeric, PrimaryKey>::value
60  || std::is_same<PrimaryKey, std::string>::value)
61  &&
62  (std::is_integral<SecondaryKey>::value
63  || std::is_base_of<util::numeric, SecondaryKey>::value),
64  "primary and secondary keys in postings data must be numeric types");
65 
70  static_assert(sizeof(uint64_t) == sizeof(double),
71  "sizeof(uint64_t) must equal sizeof(double) since "
72  "reinterpret_cast is used in postings_data");
73 
77  postings_data() = default;
78 
83  postings_data(PrimaryKey p_id);
84 
89  void merge_with(postings_data& other);
90 
96  void increase_count(SecondaryKey s_id, double amount);
97 
103  double count(SecondaryKey s_id) const;
104 
109  const count_t& counts() const;
110 
114  void set_counts(const count_t& counts);
115 
121  bool operator<(const postings_data& other) const;
122 
130  {
131  pd.counts_.clear();
132  uint32_t num_pairs = in.next();
133  for (uint32_t i = 0; i < num_pairs; ++i)
134  {
135  SecondaryKey s_id = SecondaryKey{in.next()};
136  uint64_t count = in.next();
137  pd.counts_.emplace_back(s_id, static_cast<double>(count));
138  }
139  }
140 
147  friend io::compressed_file_reader& operator>>
150 
160  {
161  if (pd.counts_.empty())
162  return out;
163 
164  out.write(pd.p_id_);
165  uint32_t size = pd.counts_.size();
166  out.write(size);
167  for (auto& p : pd.counts_)
168  {
169  out.write(p.first);
170  out.write(p.second);
171  }
172 
173  return out;
174  }
175 
183  void write_compressed(io::compressed_file_writer& writer) const;
184 
193 
197  void write_libsvm(std::ofstream& out) const
198  {
199  out << p_id_;
200  for (auto& c : counts_)
201  out << ' ' << (c.first + 1) << ':' << c.second;
202  out << '\n';
203  }
204 
208  PrimaryKey primary_key() const;
209 
213  void set_primary_key(PrimaryKey new_key);
214 
218  uint64_t inverse_frequency() const;
219 
223  uint64_t bytes_used() const;
224 
225  private:
227  PrimaryKey p_id_;
228 
231 
233  const static uint64_t delimiter_ = std::numeric_limits<uint64_t>::max();
234 };
235 
242 template <class PrimaryKey, class SecondaryKey>
244  postings_data<PrimaryKey,
245  SecondaryKey>& pd)
246 {
247  pd.p_id_ = in.next();
248  stream_helper(in, pd);
249  return in;
250 }
251 
258 template <>
259 inline io::compressed_file_reader& operator>>
261 {
262  pd.p_id_ = in.next_string();
263  stream_helper(in, pd);
264  return in;
265 }
266 
273 template <class PrimaryKey, class SecondaryKey>
274 bool operator==(const postings_data<PrimaryKey, SecondaryKey>& lhs,
275  const postings_data<PrimaryKey, SecondaryKey>& rhs);
276 }
277 }
278 
279 namespace std
280 {
281 template <class PrimaryKey, class SecondaryKey>
285 struct hash<meta::index::postings_data<PrimaryKey, SecondaryKey>>
286 {
292  size_t operator()(const pdata_t& pd) const
293  {
294  return std::hash<PrimaryKey>{}(pd.primary_key());
295  }
296 };
297 }
298 
299 #include "index/postings_data.tcc"
300 #endif
bool operator<(const postings_data &other) const
Definition: postings_data.tcc:88
Contains top-level namespace documentation for the META toolkit.
void write(uint64_t value)
Writes a value to the end of the compressed file.
Definition: compressed_file_writer.cpp:72
void merge_with(postings_data &other)
Definition: postings_data.tcc:22
PrimaryKey p_id_
Primary id this postings_data represents.
Definition: postings_data.h:227
double count(SecondaryKey s_id) const
Definition: postings_data.tcc:61
const count_t & counts() const
Definition: postings_data.tcc:68
STL namespace.
Represents a file of unsigned integers compressed using gamma compression.
Definition: compressed_file_reader.h:38
std::string next_string()
Definition: compressed_file_reader.cpp:68
uint64_t next()
Definition: compressed_file_reader.cpp:99
friend void stream_helper(io::compressed_file_reader &in, postings_data< PrimaryKey, SecondaryKey > &pd)
Helper function used by istream operator.
Definition: postings_data.h:128
uint64_t bytes_used() const
Definition: postings_data.tcc:205
void read_compressed(io::compressed_file_reader &reader)
Reads compressed postings_data into this object.
Definition: postings_data.tcc:155
static const uint64_t delimiter_
delimiter used when writing to compressed files
Definition: postings_data.h:233
uint64_t inverse_frequency() const
void set_counts(const count_t &counts)
Definition: postings_data.tcc:74
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:24
size_t operator()(const pdata_t &pd) const
Definition: postings_data.h:292
io::compressed_file_reader & operator>>(io::compressed_file_reader &, postings_data< PrimaryKey, SecondaryKey > &)
Reads semi-compressed postings data from a compressed file.
Definition: postings_data.h:243
void set_primary_key(PrimaryKey new_key)
Definition: postings_data.tcc:82
void write_libsvm(std::ofstream &out) const
Definition: postings_data.h:197
Writes to a file of unsigned integers using gamma compression.
Definition: compressed_file_writer.h:25
void write_compressed(io::compressed_file_writer &writer) const
Writes this postings_data to a compressed file.
Definition: postings_data.tcc:109
friend io::compressed_file_writer & operator<<(io::compressed_file_writer &out, const postings_data< PrimaryKey, SecondaryKey > &pd)
Writes semi-compressed postings data to a compressed file.
Definition: postings_data.h:157
A class to represent the per-PrimaryKey data in an index's postings file.
Definition: forward_index.h:30
util::sparse_vector< SecondaryKey, double > counts_
The (secondary_key_type, count) pairs.
Definition: postings_data.h:230
void increase_count(SecondaryKey s_id, double amount)
Definition: postings_data.tcc:55
postings_data()=default
PrimaryKeys may only be integral types or strings; SecondaryKeys may only be integral types...
bool operator==(const postings_data< PrimaryKey, SecondaryKey > &lhs, const postings_data< PrimaryKey, SecondaryKey > &rhs)
Definition: postings_data.tcc:95
PrimaryKey primary_key() const
Definition: postings_data.tcc:102