ModErn Text Analysis
META Enumerates Textual Applications
Main Page
Related Pages
Namespaces
Classes
Files
File List
File Members
include
index
disk_index_impl.h
Go to the documentation of this file.
1
10
#ifndef META_INDEX_DISK_INDEX_IMPL_H_
11
#define META_INDEX_DISK_INDEX_IMPL_H_
12
13
#include <mutex>
14
15
#include "
index/disk_index.h
"
16
#include "
index/string_list.h
"
17
#include "
index/vocabulary_map.h
"
18
#include "
util/disk_vector.h
"
19
#include "
util/invertible_map.h
"
20
#include "
util/optional.h
"
21
22
namespace
meta
23
{
24
namespace
index
25
{
26
27
class
string_list_writer;
28
32
/**
 * Collection of all the files that comprise a disk_index.
 *
 * The first enumerator is pinned to 0 and the rest follow consecutively, so
 * the values run from 0 (DOC_IDS_MAPPING) to 8 (TERM_IDS_MAPPING_INVERSE).
 *
 * NOTE(review): the enumerators look like positional keys into
 * disk_index_impl::files ("Filenames used in the index") -- confirm against
 * the definition of that vector before reordering or inserting entries.
 */
enum index_file
{
    DOC_IDS_MAPPING = 0,
    DOC_IDS_MAPPING_INDEX,
    DOC_SIZES,
    DOC_LABELS,
    DOC_UNIQUETERMS,
    LABEL_IDS_MAPPING,
    POSTINGS,
    TERM_IDS_MAPPING,
    TERM_IDS_MAPPING_INVERSE
};
44
48
class
disk_index::disk_index_impl
49
{
50
public
:
52
friend
disk_index
;
53
57
const
static
std::vector<const char*>
files
;
58
64
void
initialize_metadata
(uint64_t
num_docs
= 0);
65
70
void
load_doc_sizes
(uint64_t
num_docs
= 0);
71
76
void
load_labels
(uint64_t
num_docs
= 0);
77
82
void
load_unique_terms
(uint64_t
num_docs
= 0);
83
87
void
load_doc_id_mapping
();
88
92
void
load_term_id_mapping
();
93
97
void
load_label_id_mapping
();
98
102
void
load_postings
();
103
107
void
save_label_id_mapping
();
108
115
string_list_writer
make_doc_id_writer
(uint64_t
num_docs
)
const
;
116
122
void
set_label
(doc_id
id
,
const
class_label&
label
);
123
129
void
set_length
(doc_id
id
, uint64_t
length
);
130
136
void
set_unique_terms
(doc_id
id
, uint64_t terms);
137
141
const
io::mmap_file
&
postings
()
const
;
142
146
uint64_t
total_unique_terms
()
const
;
147
152
label_id
doc_label_id
(doc_id
id
)
const
;
153
157
std::vector<class_label>
class_labels
()
const
;
158
159
private
:
165
label_id
get_label_id
(
const
class_label& lbl);
166
168
std::string
index_name_
;
169
174
util::optional<string_list>
doc_id_mapping_
;
175
180
util::optional<util::disk_vector<double>
>
doc_sizes_
;
181
186
util::optional<util::disk_vector<label_id>
>
labels_
;
187
194
util::optional<util::disk_vector<uint64_t>
>
unique_terms_
;
195
197
util::optional<vocabulary_map>
term_id_mapping_
;
198
200
util::invertible_map<class_label, label_id>
label_ids_
;
201
207
util::optional<io::mmap_file>
postings_
;
208
210
mutable
std::mutex
mutex_
;
211
};
212
}
213
}
214
#endif
meta::index::disk_index::disk_index_impl::files
static const std::vector< const char * > files
Filenames used in the index.
Definition:
disk_index_impl.h:57
meta::index::index_file
index_file
Collection of all the files that comprise a disk_index.
Definition:
disk_index_impl.h:32
meta::util::optional
A class for representing optional values.
Definition:
vocabulary_map.h:21
optional.h
meta::index::disk_index::disk_index_impl::doc_id_mapping_
util::optional< string_list > doc_id_mapping_
doc_id -> document path mapping.
Definition:
disk_index_impl.h:174
meta::index::disk_index::disk_index_impl::load_doc_sizes
void load_doc_sizes(uint64_t num_docs=0)
Loads the doc sizes.
Definition:
disk_index.cpp:142
meta::index::disk_index::disk_index_impl::term_id_mapping_
util::optional< vocabulary_map > term_id_mapping_
Maps string terms to term_ids.
Definition:
disk_index_impl.h:197
meta::index::string_list_writer
A class for writing large lists of strings to disk with an associated index file for fast random access.
Definition:
string_list_writer.h:34
meta::utf::length
uint64_t length(const std::string &str)
Definition:
utf.cpp:136
meta::index::disk_index::disk_index_impl::save_label_id_mapping
void save_label_id_mapping()
Saves the label_id mapping.
Definition:
disk_index.cpp:180
meta::io::mmap_file
Memory maps a text file readonly.
Definition:
mmap_file.h:24
meta::index::disk_index::disk_index_impl::mutex_
std::mutex mutex_
mutex for thread-safe operations
Definition:
disk_index_impl.h:210
meta::index::disk_index::disk_index_impl::load_unique_terms
void load_unique_terms(uint64_t num_docs=0)
Loads the unique terms per document.
Definition:
disk_index.cpp:154
meta::index::disk_index::num_docs
uint64_t num_docs() const
Definition:
disk_index.cpp:91
vocabulary_map.h
meta::index::disk_index::disk_index_impl::total_unique_terms
uint64_t total_unique_terms() const
Definition:
disk_index.cpp:211
meta::index::disk_index::disk_index_impl::postings_
util::optional< io::mmap_file > postings_
A pointer to a memory-mapped postings file.
Definition:
disk_index_impl.h:207
meta::index::disk_index::disk_index_impl
The implementation of a disk_index.
Definition:
disk_index_impl.h:48
meta::index::disk_index::disk_index_impl::label_ids_
util::invertible_map< class_label, label_id > label_ids_
Assigns an integer to each class label (used for liblinear mappings)
Definition:
disk_index_impl.h:200
meta::index::disk_index::disk_index_impl::doc_sizes_
util::optional< util::disk_vector< double > > doc_sizes_
doc_id -> document length mapping.
Definition:
disk_index_impl.h:180
meta::index::disk_index::disk_index_impl::set_length
void set_length(doc_id id, uint64_t length)
Sets the size of a document.
Definition:
disk_index.cpp:196
meta::index::disk_index::disk_index_impl::set_unique_terms
void set_unique_terms(doc_id id, uint64_t terms)
Sets the number of unique terms for a document.
Definition:
disk_index.cpp:201
meta::index::disk_index::disk_index_impl::disk_index
friend disk_index
friend the interface
Definition:
disk_index_impl.h:52
meta::index::disk_index::disk_index_impl::initialize_metadata
void initialize_metadata(uint64_t num_docs=0)
Initializes the following metadata maps: doc_sizes_, labels_, unique_terms_.
Definition:
disk_index.cpp:135
meta::index::disk_index::disk_index_impl::load_labels
void load_labels(uint64_t num_docs=0)
Loads the doc labels.
Definition:
disk_index.cpp:148
disk_index.h
meta
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition:
analyzer.h:24
meta::index::disk_index::disk_index_impl::load_term_id_mapping
void load_term_id_mapping()
Loads the term_id mapping.
Definition:
disk_index.cpp:165
meta::index::disk_index::disk_index_impl::index_name_
std::string index_name_
the location of this index
Definition:
disk_index_impl.h:168
meta::index::disk_index::disk_index_impl::set_label
void set_label(doc_id id, const class_label &label)
Sets the label for a document.
Definition:
disk_index.cpp:191
string_list.h
meta::index::disk_index::disk_index_impl::load_postings
void load_postings()
Loads the postings file.
Definition:
disk_index.cpp:175
disk_vector.h
meta::index::disk_index::disk_index_impl::load_label_id_mapping
void load_label_id_mapping()
Loads the label_id mapping.
Definition:
disk_index.cpp:170
meta::index::disk_index::disk_index_impl::get_label_id
label_id get_label_id(const class_label &lbl)
Definition:
disk_index.cpp:121
invertible_map.h
meta::index::disk_index::disk_index_impl::doc_label_id
label_id doc_label_id(doc_id id) const
Definition:
disk_index.cpp:216
meta::index::disk_index::disk_index_impl::postings
const io::mmap_file & postings() const
Definition:
disk_index.cpp:206
meta::index::disk_index::disk_index_impl::make_doc_id_writer
string_list_writer make_doc_id_writer(uint64_t num_docs) const
Creates a string_list_writer for writing the docids mapping.
Definition:
disk_index.cpp:186
meta::util::invertible_map< class_label, label_id >
meta::index::disk_index::disk_index_impl::unique_terms_
util::optional< util::disk_vector< uint64_t > > unique_terms_
Holds how many unique terms there are per-document.
Definition:
disk_index_impl.h:194
meta::index::disk_index::label
class_label label(doc_id d_id) const
Definition:
disk_index.cpp:46
meta::index::disk_index::disk_index_impl::labels_
util::optional< util::disk_vector< label_id > > labels_
Maps which class a document belongs to (if any).
Definition:
disk_index_impl.h:186
meta::index::disk_index::disk_index_impl::class_labels
std::vector< class_label > class_labels() const
Definition:
disk_index.cpp:221
meta::index::disk_index::disk_index_impl::load_doc_id_mapping
void load_doc_id_mapping()
Loads the doc_id mapping.
Definition:
disk_index.cpp:160
Generated on Tue Mar 3 2015 23:20:16 for ModErn Text Analysis by
Doxygen 1.8.9.1