ModErn Text Analysis
META Enumerates Textual Applications
line_corpus.h
Go to the documentation of this file.
1 
10 #ifndef META_LINE_CORPUS_H_
11 #define META_LINE_CORPUS_H_
12 
13 #include <string>
14 #include <vector>
15 #include <utility>
16 #include "io/parser.h"
17 #include "corpus/corpus.h"
18 
19 namespace meta
20 {
21 namespace corpus
22 {
23 
/// Fills document objects with content line-by-line from an input file.
/// Derives from `corpus` and overrides its has_next()/next()/size() members.
// NOTE(review): this class uses std::unique_ptr, but the header does not
// include <memory> directly -- presumably it arrives transitively through
// "io/parser.h" or "corpus/corpus.h"; confirm.
29 class line_corpus : public corpus
30 {
31  public:
    /// Constructs a line_corpus.
    /// @param file path/name of the input file (passed by const reference)
    /// @param encoding the encoding, taken by value -- presumably the text
    ///        encoding of the corpus; confirm against corpus::encoding()
    /// @param num_lines defaults to 0; presumably the number of lines in the
    ///        file. How a value of 0 is treated is decided in
    ///        line_corpus.cpp and is not visible here -- TODO confirm.
40  line_corpus(const std::string& file, std::string encoding,
41  uint64_t num_lines = 0);
42 
    /// Override of corpus::has_next(); const, so it does not modify the
    /// corpus. Presumably reports whether another document can be read.
46  bool has_next() const override;
47 
    /// Override of corpus::next(); returns a `document` by value.
    /// Presumably yields the next line of the input as a document -- confirm
    /// in line_corpus.cpp.
51  document next() override;
52 
    /// Override of corpus::size(); returns a 64-bit unsigned count.
    /// Presumably the number of documents (lines) in the corpus -- confirm.
56  uint64_t size() const override;
57 
58  private:
    /// The current document we are on.
60  doc_id cur_id_;
61 
    /// The number of lines in the file.
63  uint64_t num_lines_;
64 
    // NOTE(review): the generated member index for this header lists an
    // `io::parser parser_` member ("Parser to read the corpus file") defined
    // at line_corpus.h:66, but no such declaration appears in this listing
    // (it jumps from 64 to 67). Verify against the real header.
67 
    /// Parser to read the class labels.
69  std::unique_ptr<io::parser> class_parser_;
70 
    /// Parser to read the document names.
72  std::unique_ptr<io::parser> name_parser_;
73 };
74 }
75 }
76 
77 #endif
line_corpus(const std::string &file, std::string encoding, uint64_t num_lines=0)
Definition: line_corpus.cpp:18
std::unique_ptr< io::parser > class_parser_
Parser to read the class labels.
Definition: line_corpus.h:69
Parses a text file by reading it completely into memory, delimiting tokens by user request...
Definition: parser.h:29
document next() override
Definition: line_corpus.cpp:52
Represents an indexable document.
Definition: document.h:31
const std::string & encoding() const
Definition: corpus.cpp:21
Provides an interface to multiple corpus input formats.
Definition: corpus.h:27
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:24
io::parser parser_
Parser to read the corpus file.
Definition: line_corpus.h:66
bool has_next() const override
Definition: line_corpus.cpp:47
doc_id cur_id_
The current document we are on.
Definition: line_corpus.h:60
std::unique_ptr< io::parser > name_parser_
Parser to read the document names.
Definition: line_corpus.h:72
Fills document objects with content line-by-line from an input file.
Definition: line_corpus.h:29
uint64_t size() const override
Definition: line_corpus.cpp:69
uint64_t num_lines_
The number of lines in the file.
Definition: line_corpus.h:63