ModErn Text Analysis
META Enumerates Textual Applications
ptb_normalizer.h
Go to the documentation of this file.
1 
9 #ifndef META_PTB_NORMALIZER_H_
10 #define META_PTB_NORMALIZER_H_
11 
12 #include <deque>
13 #include <memory>
14 #include "analyzers/token_stream.h"
15 #include "util/clonable.h"
16 
17 namespace meta
18 {
19 namespace analyzers
20 {
21 namespace filters
22 {
23 
// A filter that normalizes text to match Penn Treebank conventions.
// It is itself a token_stream (via util::clonable) and wraps another
// token_stream that supplies the raw tokens.
29 class ptb_normalizer : public util::clonable<token_stream, ptb_normalizer>
30 {
31  public:
    // Constructs a ptb_normalizer which reads tokens from the given source.
    // The source is passed as a unique_ptr by value.
37  ptb_normalizer(std::unique_ptr<token_stream> source);
38 
    // Copy constructor: builds this filter from another ptb_normalizer.
    // NOTE(review): source_ is a unique_ptr, so the copy presumably clones
    // the underlying stream -- confirm in ptb_normalizer.cpp.
43  ptb_normalizer(const ptb_normalizer& other);
44 
    // Sets the content for the beginning of the filter chain.
49  void set_content(const std::string& content) override;
50 
    // Obtains the next token in the sequence.
54  std::string next() override;
55 
    // Boolean conversion overriding the base-class operator.
    // NOTE(review): presumably true while more tokens can be returned --
    // confirm against the implementation.
59  operator bool() const override;
60 
    // Identifier for this filter.
62  const static std::string id;
63 
64  private:
    // Returns a token as a string; takes no arguments.
    // NOTE(review): exact semantics are not documented here -- see the
    // definition in ptb_normalizer.cpp.
68  std::string current_token();
69 
    // Performs token normalization, splitting, etc. on the given token.
76  void parse_token(const std::string& token);
77 
    // The source to read tokens from.
79  std::unique_ptr<token_stream> source_;
80 
    // Buffered tokens to return.
82  std::deque<std::string> tokens_;
83 };
84 }
85 }
86 }
87 #endif
std::string next() override
Obtains the next token in the sequence.
Definition: ptb_normalizer.cpp:37
std::string current_token()
Definition: ptb_normalizer.cpp:90
std::unique_ptr< token_stream > source_
The source to read tokens from.
Definition: ptb_normalizer.h:79
static const std::string id
Identifier for this filter.
Definition: ptb_normalizer.h:62
A filter that normalizes text to match Penn Treebank conventions.
Definition: ptb_normalizer.h:29
ptb_normalizer(std::unique_ptr< token_stream > source)
Constructs a ptb_normalizer which reads tokens from the given source.
Definition: ptb_normalizer.cpp:19
Template class to facilitate polymorphic cloning.
Definition: clonable.h:28
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:24
void parse_token(const std::string &token)
Performs token normalization, splitting, etc.
Definition: ptb_normalizer.cpp:97
void set_content(const std::string &content) override
Sets the content for the beginning of the filter chain.
Definition: ptb_normalizer.cpp:31
std::deque< std::string > tokens_
Buffered tokens to return.
Definition: ptb_normalizer.h:82