ModErn Text Analysis
META Enumerates Textual Applications
disk_index_impl.h
Go to the documentation of this file.
1 
10 #ifndef META_INDEX_DISK_INDEX_IMPL_H_
11 #define META_INDEX_DISK_INDEX_IMPL_H_
12 
13 #include <mutex>
14 
15 #include "index/disk_index.h"
16 #include "index/string_list.h"
17 #include "index/vocabulary_map.h"
18 #include "util/disk_vector.h"
19 #include "util/invertible_map.h"
20 #include "util/optional.h"
21 
22 namespace meta
23 {
24 namespace index
25 {
26 
27 class string_list_writer;
28 
33 {
34  DOC_IDS_MAPPING = 0,
35  DOC_IDS_MAPPING_INDEX,
36  DOC_SIZES,
37  DOC_LABELS,
38  DOC_UNIQUETERMS,
39  LABEL_IDS_MAPPING,
40  POSTINGS,
41  TERM_IDS_MAPPING,
42  TERM_IDS_MAPPING_INVERSE
43 };
44 
49 {
50  public:
52  friend disk_index;
53 
57  const static std::vector<const char*> files;
58 
64  void initialize_metadata(uint64_t num_docs = 0);
65 
70  void load_doc_sizes(uint64_t num_docs = 0);
71 
76  void load_labels(uint64_t num_docs = 0);
77 
82  void load_unique_terms(uint64_t num_docs = 0);
83 
87  void load_doc_id_mapping();
88 
92  void load_term_id_mapping();
93 
97  void load_label_id_mapping();
98 
102  void load_postings();
103 
107  void save_label_id_mapping();
108 
116 
122  void set_label(doc_id id, const class_label& label);
123 
129  void set_length(doc_id id, uint64_t length);
130 
136  void set_unique_terms(doc_id id, uint64_t terms);
137 
141  const io::mmap_file& postings() const;
142 
146  uint64_t total_unique_terms() const;
147 
152  label_id doc_label_id(doc_id id) const;
153 
157  std::vector<class_label> class_labels() const;
158 
159  private:
165  label_id get_label_id(const class_label& lbl);
166 
168  std::string index_name_;
169 
175 
181 
187 
195 
198 
201 
208 
210  mutable std::mutex mutex_;
211 };
212 }
213 }
214 #endif
static const std::vector< const char * > files
Filenames used in the index.
Definition: disk_index_impl.h:57
index_file
Collection of all the files that comprise a disk_index.
Definition: disk_index_impl.h:32
A class for representing optional values.
Definition: vocabulary_map.h:21
util::optional< string_list > doc_id_mapping_
doc_id -> document path mapping.
Definition: disk_index_impl.h:174
void load_doc_sizes(uint64_t num_docs=0)
Loads the doc sizes.
Definition: disk_index.cpp:142
util::optional< vocabulary_map > term_id_mapping_
Maps string terms to term_ids.
Definition: disk_index_impl.h:197
A class for writing large lists of strings to disk with an associated index file for fast random access.
Definition: string_list_writer.h:34
uint64_t length(const std::string &str)
Definition: utf.cpp:136
void save_label_id_mapping()
Saves the label_id mapping.
Definition: disk_index.cpp:180
Memory-maps a text file read-only.
Definition: mmap_file.h:24
std::mutex mutex_
mutex for thread-safe operations
Definition: disk_index_impl.h:210
void load_unique_terms(uint64_t num_docs=0)
Loads the unique terms per document.
Definition: disk_index.cpp:154
uint64_t num_docs() const
Definition: disk_index.cpp:91
uint64_t total_unique_terms() const
Definition: disk_index.cpp:211
util::optional< io::mmap_file > postings_
The memory-mapped postings file, held in a util::optional.
Definition: disk_index_impl.h:207
The implementation of a disk_index.
Definition: disk_index_impl.h:48
util::invertible_map< class_label, label_id > label_ids_
Assigns an integer to each class label (used for liblinear mappings)
Definition: disk_index_impl.h:200
util::optional< util::disk_vector< double > > doc_sizes_
doc_id -> document length mapping.
Definition: disk_index_impl.h:180
void set_length(doc_id id, uint64_t length)
Sets the size of a document.
Definition: disk_index.cpp:196
void set_unique_terms(doc_id id, uint64_t terms)
Sets the number of unique terms for a document.
Definition: disk_index.cpp:201
friend disk_index
friend the interface
Definition: disk_index_impl.h:52
void initialize_metadata(uint64_t num_docs=0)
Initializes the following metadata maps: doc_sizes_, labels_, unique_terms_.
Definition: disk_index.cpp:135
void load_labels(uint64_t num_docs=0)
Loads the doc labels.
Definition: disk_index.cpp:148
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:24
void load_term_id_mapping()
Loads the term_id mapping.
Definition: disk_index.cpp:165
std::string index_name_
the location of this index
Definition: disk_index_impl.h:168
void set_label(doc_id id, const class_label &label)
Sets the label for a document.
Definition: disk_index.cpp:191
void load_postings()
Loads the postings file.
Definition: disk_index.cpp:175
void load_label_id_mapping()
Loads the label_id mapping.
Definition: disk_index.cpp:170
label_id get_label_id(const class_label &lbl)
Definition: disk_index.cpp:121
label_id doc_label_id(doc_id id) const
Definition: disk_index.cpp:216
const io::mmap_file & postings() const
Definition: disk_index.cpp:206
string_list_writer make_doc_id_writer(uint64_t num_docs) const
Creates a string_list_writer for writing the docids mapping.
Definition: disk_index.cpp:186
util::optional< util::disk_vector< uint64_t > > unique_terms_
Holds how many unique terms there are per-document.
Definition: disk_index_impl.h:194
class_label label(doc_id d_id) const
Definition: disk_index.cpp:46
util::optional< util::disk_vector< label_id > > labels_
Maps which class a document belongs to (if any).
Definition: disk_index_impl.h:186
std::vector< class_label > class_labels() const
Definition: disk_index.cpp:221
void load_doc_id_mapping()
Loads the doc_id mapping.
Definition: disk_index.cpp:160