ModErn Text Analysis
META Enumerates Textual Applications
document.h
Go to the documentation of this file.
1 
10 #ifndef META_DOCUMENT_H_
11 #define META_DOCUMENT_H_
12 
13 #include <string>
14 #include <unordered_map>
15 
16 #include "meta.h"
17 #include "util/optional.h"
18 
19 namespace meta
20 {
21 namespace corpus
22 {
23 
24 /**
25  * Represents an indexable document.
26  *
27  * Declares a path, a doc_id, a class_label, a short name, a token length,
28  * per-term counts, and optional content together with its encoding. The
29  * member functions are only declared here; they are defined in document.cpp.
30  */
31 class document
32 {
33  public:
34  /**
35   * Constructor.
36   * @param path The path for this document; defaults to "[NONE]"
37   * @param d_id The id for this document; defaults to doc_id{0}
38   * @param label The class label for this document; defaults to "[NONE]"
39   */
40  document(const std::string& path = "[NONE]", doc_id d_id = doc_id{0},
41  const class_label& label = class_label{"[NONE]"});
42 
43  /**
44   * Increment the count of the specified term.
45   * @param term The term whose count is incremented
46   * @param amount The amount to increment the count by
47   */
48  void increment(const std::string& term, double amount);
49 
50  /**
51   * @return the path of this document (returned by value)
52   */
53  std::string path() const;
54 
55  /**
56   * @return the class label of this document (const reference)
57   */
58  const class_label& label() const;
59 
60  /**
61   * @return the short name of this document (returned by value)
62   */
63  std::string name() const;
64 
65  /**
66   * @param n The short name to assign to this document
67   */
68  void name(const std::string& n);
69 
70  /**
71   * @return the length of this document as a uint64_t; presumably the
72   * value of length_, which is declared as size_t -- confirm in document.cpp
73   */
74  uint64_t length() const;
75 
76  /**
77   * Get the number of occurrences for a particular term.
78   * @param term The term to look up
79   * @return the count for that term
80   */
81  double count(const std::string& term) const;
82 
83  /**
84   * @return a const reference to the map from term to count
85   */
86  const std::unordered_map<std::string, double>& counts() const;
87 
88  /**
89   * Supplies the content of this document together with its encoding.
90   *
91   * NOTE(review): presumably stores into content_ and encoding_; the
92   * definition lives in document.cpp -- confirm there.
93   * @param content The content for this document
94   * @param encoding The encoding of the content; defaults to "utf-8"
95   */
96  void content(const std::string& content,
97  const std::string& encoding = "utf-8");
98 
99  /**
100   * @param encoding The encoding to use for this document's content
101   * (presumably stored in encoding_ -- confirm in document.cpp)
102   */
103  void encoding(const std::string& encoding);
104 
105  /**
106   * @return the content of this document (see contains_content())
107   */
108  const std::string& content() const;
109 
110  /**
111   * @return the encoding for the content (const reference)
112   */
113  const std::string& encoding() const;
114 
115  /**
116   * @return the doc_id of this document
117   */
118  doc_id id() const;
119 
120  /**
121   * @return whether this document contains content
122   */
123  bool contains_content() const;
124 
125  /**
126   * @param label The class label to assign to this document (taken by
127   * value)
128   */
129  void label(class_label label);
130 
131  private:
132  /// Where this document is on disk.
133  std::string path_;
134 
135  /// The document id for this document.
136  doc_id d_id_;
137 
138  /// Which category this document would be classified into.
139  class_label label_;
140 
141  /// The short name for this document (not the full path)
142  std::string name_;
143 
144  /// The number of (non-unique) tokens in this document.
145  size_t length_;
146 
147  /// Counts of how many times each token appears.
148  std::unordered_map<std::string, double> counts_;
149 
150  /// What the document contains.
151  util::optional<std::string> content_;
152 
153  /// The encoding for the content.
154  std::string encoding_;
155 };
156 }
157 }
158 
159 #endif
Contains top-level namespace documentation for the META toolkit.
std::string name_
The short name for this document (not the full path)
Definition: document.h:142
void increment(const std::string &term, double amount)
Increment the count of the specified term.
Definition: document.cpp:23
const class_label & label() const
Definition: document.cpp:34
const std::string & content() const
Definition: document.cpp:76
document(const std::string &path="[NONE]", doc_id d_id=doc_id{0}, const class_label &label=class_label{"[NONE]"})
Constructor.
Definition: document.cpp:15
class_label label_
Which category this document would be classified into.
Definition: document.h:139
std::string encoding_
The encoding for the content.
Definition: document.h:154
bool contains_content() const
Definition: document.cpp:94
std::string path_
Where this document is on disk.
Definition: document.h:133
const std::string & encoding() const
Definition: document.cpp:84
Represents an indexable document.
Definition: document.h:31
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:24
const std::unordered_map< std::string, double > & counts() const
Definition: document.cpp:59
uint64_t length() const
Definition: document.cpp:49
std::string name() const
Definition: document.cpp:39
doc_id id() const
Definition: document.cpp:89
std::unordered_map< std::string, double > counts_
Counts of how many times each token appears.
Definition: document.h:148
doc_id d_id_
The document id for this document.
Definition: document.h:136
std::string path() const
Definition: document.cpp:29
size_t length_
The number of (non-unique) tokens in this document.
Definition: document.h:145
double count(const std::string &term) const
Get the number of occurrences for a particular term.
Definition: document.cpp:54
util::optional< std::string > content_
What the document contains.
Definition: document.h:151