ModErn Text Analysis
META Enumerates Textual Applications
sentence_boundary.h
Go to the documentation of this file.
1 
9 #ifndef META_SENTENCE_BOUNDARY_H_
10 #define META_SENTENCE_BOUNDARY_H_
11 
12 #include <deque>
13 #include <memory>
14 #include <unordered_set>
15 
17 #include "util/clonable.h"
18 #include "util/optional.h"
19 
// Forward declaration of cpptoml::table. This header only refers to the type
// by const reference (in load_heuristics and make_filter below), so a
// declaration is sufficient here.
20 namespace cpptoml
21 {
22 class table;
23 }
24 
25 namespace meta
26 {
27 namespace analyzers
28 {
29 namespace filters
30 {
31 
/// Filter that adds sentence boundary tokens to streams of tokens.
///
/// Derives from util::clonable<token_stream, sentence_boundary>, the template
/// class that facilitates polymorphic cloning. token_stream itself is declared
/// elsewhere and is not visible in this listing.
///
/// NOTE(review): the member index for this header lists a data member
/// `util::optional<std::string> prev_` ("The previous token.") defined at
/// line 110 of sentence_boundary.h, but no such declaration appears in this
/// listing (the embedded numbering jumps from 108 to 111) — confirm against
/// the original file.
37 class sentence_boundary : public util::clonable<token_stream, sentence_boundary>
38 {
39  public:
    /// Loads the maps that contain the heuristics for the sentence boundary
    /// instances.
    /// @param config the cpptoml table passed by the caller; which keys are
    /// read is decided in sentence_boundary.cpp (not visible here)
44  static void load_heuristics(const cpptoml::table& config);
45 
    /// Constructs a sentence_boundary filter, reading tokens from the given
    /// source.
    /// @param source the token stream this filter takes ownership of
51  sentence_boundary(std::unique_ptr<token_stream> source);
52 
58 
    /// Sets the content for the beginning of the filter chain.
    /// @param content the new content string
63  void set_content(const std::string& content) override;
64 
    /// Overrides token_stream's next(); presumably yields the next token of
    /// the filtered stream — TODO confirm against sentence_boundary.cpp.
68  std::string next() override;
69 
    /// Overrides token_stream's boolean conversion; presumably reports whether
    /// more tokens are available — TODO confirm against sentence_boundary.cpp.
73  operator bool() const override;
74 
    /// Identifier for this filter.
76  const static std::string id;
77 
78  private:
    /// Helper returning a token as a std::string; defined in
    /// sentence_boundary.cpp. Presumably pops from the tokens_ buffer —
    /// TODO confirm.
82  std::string current_token();
83 
    /// Determines if the given token is a possible end of sentence
    /// punctuation marker.
89  static bool possible_punc(const std::string& token);
90 
    /// Determines if the given token can be the last word in a sentence.
95  static bool possible_end(const std::string& token);
96 
    /// Determines if the given token can be the beginning of a sentence.
101  static bool possible_start(const std::string& token);
102 
    /// The source to read tokens from.
104  std::unique_ptr<token_stream> source_;
105 
    /// The current buffered tokens.
107  std::deque<std::string> tokens_;
108 
111 
    /// The set of possible punctuation marks, shared among all instances.
113  static std::unordered_set<std::string> punc_set;
114 
    /// The set of words that may not start sentences, shared among all
    /// instances.
119  static std::unordered_set<std::string> start_exception_set;
120 
    /// The set of words that may not end sentences, shared among all
    /// instances.
125  static std::unordered_set<std::string> end_exception_set;
126 
    /// Whether or not the heuristics above have been loaded.
132  static bool heuristics_loaded;
133 };
134 
/// Specialization of the factory method used to create sentence_boundary
/// filters. Only declared here; the definition lives in sentence_boundary.cpp.
/// The parameters are unnamed in this declaration: a token_stream passed by
/// unique_ptr and a cpptoml table passed by const reference (presumably the
/// source stream to wrap and the filter's configuration — TODO confirm).
139 template <>
140 std::unique_ptr<token_stream>
141  make_filter<sentence_boundary>(std::unique_ptr<token_stream>,
142  const cpptoml::table&);
143 }
144 }
145 }
146 #endif
std::string next() override
Definition: sentence_boundary.cpp:86
static const std::string id
Identifier for this filter.
Definition: sentence_boundary.h:76
static bool possible_end(const std::string &token)
Determines if the given token can be the last word in a sentence.
Definition: sentence_boundary.cpp:160
void set_content(const std::string &content) override
Sets the content for the beginning of the filter chain.
Definition: sentence_boundary.cpp:43
static std::unordered_set< std::string > end_exception_set
The set of words that may not end sentences, shared among all instances.
Definition: sentence_boundary.h:125
static std::unordered_set< std::string > start_exception_set
The set of words that may not start sentences, shared among all instances.
Definition: sentence_boundary.h:119
static void load_heuristics(const cpptoml::table &config)
Loads the maps that contain the heuristics for the sentence boundary instances.
Definition: sentence_boundary.cpp:51
static std::unordered_set< std::string > punc_set
The set of possible punctuation marks, shared among all instances.
Definition: sentence_boundary.h:113
static bool heuristics_loaded
Whether or not the heuristics above have been loaded.
Definition: sentence_boundary.h:132
Template class to facilitate polymorphic cloning.
Definition: clonable.h:28
std::unique_ptr< token_stream > source_
The source to read tokens from.
Definition: sentence_boundary.h:104
std::string current_token()
Definition: sentence_boundary.cpp:147
util::optional< std::string > prev_
The previous token.
Definition: sentence_boundary.h:110
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:24
sentence_boundary(std::unique_ptr< token_stream > source)
Constructs a sentence_boundary filter, reading tokens from the given source and configured via the gi...
Definition: sentence_boundary.cpp:26
static bool possible_start(const std::string &token)
Determines if the given token can be the beginning of a sentence.
Definition: sentence_boundary.cpp:166
std::unique_ptr< token_stream > make_filter< sentence_boundary >(std::unique_ptr< token_stream >, const cpptoml::table &)
Specialization of the factory method used to create sentence_boundary filters.
Definition: sentence_boundary.cpp:173
std::deque< std::string > tokens_
The current buffered tokens.
Definition: sentence_boundary.h:107
Filter that adds sentence boundary tokens ("<s>" and "</s>") to streams of tokens.
Definition: sentence_boundary.h:37
Definition: analyzer.h:19
static bool possible_punc(const std::string &token)
Determines if the given token is a possible end of sentence punctuation marker.
Definition: sentence_boundary.cpp:155