ModErn Text Analysis
META Enumerates Textual Applications
english_normalizer.h
Go to the documentation of this file.
1 
9 #ifndef META_ENGLISH_NORMALIZER_H_
10 #define META_ENGLISH_NORMALIZER_H_
11 
12 #include <deque>
13 #include <memory>
14 #include "analyzers/token_stream.h"
15 #include "util/clonable.h"
16 #include "util/optional.h"
17 
18 namespace meta
19 {
20 namespace analyzers
21 {
22 namespace filters
23 {
24 
32  : public util::clonable<token_stream, english_normalizer>
33 {
34  public:
40  english_normalizer(std::unique_ptr<token_stream> source);
41 
47 
52  void set_content(const std::string& content) override;
53 
57  std::string next() override;
58 
62  operator bool() const override;
63 
65  const static std::string id;
66 
67  private:
72  bool is_whitespace(const std::string& token) const;
73 
79  void parse_token(const std::string& token);
80 
87  uint64_t starting_quotes(uint64_t start, const std::string& token);
88 
93  bool is_quote(char c);
94 
100  uint64_t strip_dashes(uint64_t start, const std::string& token);
101 
109  uint64_t word(uint64_t start, const std::string& token);
110 
114  std::string current_token();
115 
117  std::unique_ptr<token_stream> source_;
118 
120  std::deque<std::string> tokens_;
121 };
122 }
123 }
124 }
125 #endif
std::unique_ptr< token_stream > source_
The source to read tokens from.
Definition: english_normalizer.h:117
uint64_t strip_dashes(uint64_t start, const std::string &token)
Reads consecutive dash characters.
Definition: english_normalizer.cpp:140
english_normalizer(std::unique_ptr< token_stream > source)
Constructs an english_normalizer which reads tokens from the given source.
Definition: english_normalizer.cpp:19
static const std::string id
Identifier for this filter.
Definition: english_normalizer.h:65
std::string current_token()
Definition: english_normalizer.cpp:186
std::string next() override
Obtains the next token in the sequence.
Definition: english_normalizer.cpp:37
Template class to facilitate polymorphic cloning.
Definition: clonable.h:28
std::deque< std::string > tokens_
Buffered tokens to return.
Definition: english_normalizer.h:120
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:24
bool is_quote(char c)
Checks if the given character is a passable quote symbol.
Definition: english_normalizer.cpp:135
bool is_whitespace(const std::string &token) const
Determines if the given token is a whitespace token.
Definition: english_normalizer.cpp:71
void parse_token(const std::string &token)
Converts the given non-whitespace token into a series of tokens and places them on the buffer...
Definition: english_normalizer.cpp:77
uint64_t starting_quotes(uint64_t start, const std::string &token)
Checks for starting quotes in the token, adding a normalized begin quote token to the stream if they ...
Definition: english_normalizer.cpp:117
uint64_t word(uint64_t start, const std::string &token)
Reads "word" characters (alpha numeric and dashes) starting at start from the given token...
Definition: english_normalizer.cpp:150
void set_content(const std::string &content) override
Sets the content for the beginning of the filter chain.
Definition: english_normalizer.cpp:31
Filter that normalizes english language tokens.
Definition: english_normalizer.h:31