ModErn Text Analysis
META Enumerates Textual Applications
analyzer.h
Go to the documentation of this file.
1 
11 #ifndef META_ANALYZER_H_
12 #define META_ANALYZER_H_
13 
14 #include <stdexcept>
15 #include <memory>
16 
17 #include "io/parser.h"
18 
19 namespace cpptoml
20 {
21 class table;
22 }
23 
24 namespace meta
25 {
26 
27 namespace corpus
28 {
29 class document;
30 }
31 
32 namespace analyzers
33 {
34 
35 class token_stream;
36 
// Abstract base class for analyzers: a framework to produce token counts
// from documents. It cannot be instantiated directly because tokenize() and
// clone() are pure virtual; the static helpers below are declared here and,
// per the generated cross-references, defined in analyzer.cpp.
41 class analyzer
42 {
43  public:
    // A default virtual destructor, so derived analyzers can be destroyed
    // through a pointer to this base class.
47  virtual ~analyzer() = default;
48 
    // Tokenizes a document. Pure virtual. The document is taken by
    // non-const reference, so implementations are able to modify it
    // (presumably to record the resulting token counts -- TODO confirm).
53  virtual void tokenize(corpus::document& doc) = 0;
54 
    // Clones this analyzer. Pure virtual. The copy is returned as a
    // std::unique_ptr, so the caller owns it.
58  virtual std::unique_ptr<analyzer> clone() const = 0;
59 
    // Returns an owning pointer to an analyzer for the given cpptoml
    // configuration table.
    // NOTE(review): the configuration keys consulted are not visible in
    // this header -- see analyzer.cpp.
64  static std::unique_ptr<analyzer> load(const cpptoml::table& config);
65 
    // Returns an owning pointer to a token_stream for the given
    // configuration table. By its name this is presumably the default
    // chain of filters -- TODO confirm against analyzer.cpp.
71  static std::unique_ptr<token_stream>
72  default_filter_chain(const cpptoml::table& config);
73 
    // Returns an owning pointer to a token_stream given two configuration
    // tables: `global` and `config`.
    // NOTE(review): presumably `global` is the whole configuration file and
    // `config` the analyzer-specific group -- verify against callers.
79  static std::unique_ptr<token_stream>
80  load_filters(const cpptoml::table& global,
81  const cpptoml::table& config);
82 
    // Takes ownership of `src` (passed by value as a std::unique_ptr) and
    // returns an owning pointer to a token_stream built using `config`.
    // NOTE(review): presumably the result wraps `src` with the configured
    // filter -- TODO confirm.
88  static std::unique_ptr<token_stream>
89  load_filter(std::unique_ptr<token_stream> src,
90  const cpptoml::table& config);
91 
    // Returns an io::parser (by value) for the given document, taking a
    // file `extension` and a set of `delims` as strings.
    // NOTE(review): how `extension` and `delims` are applied is not visible
    // here -- see analyzer.cpp.
100  static io::parser create_parser(const corpus::document& doc,
101  const std::string& extension,
102  const std::string& delims);
103 
    // Returns a std::string for the given document; by its name, presumably
    // the document's text content -- TODO confirm where it is read from.
108  static std::string get_content(const corpus::document& doc);
109 
    // NOTE(review): this access specifier is redundant -- the preceding
    // section is already public.
110  public:
    // Basic exception for analyzer interactions. Adds no members of its
    // own; it inherits the constructors of std::runtime_error.
114  class analyzer_exception : public std::runtime_error
115  {
116  public:
117  using std::runtime_error::runtime_error;
118  };
119 };
120 }
121 }
122 #endif
virtual ~analyzer()=default
A default virtual destructor.
static io::parser create_parser(const corpus::document &doc, const std::string &extension, const std::string &delims)
Definition: analyzer.cpp:36
static std::unique_ptr< analyzer > load(const cpptoml::table &config)
Definition: analyzer.cpp:99
static std::unique_ptr< token_stream > load_filters(const cpptoml::table &global, const cpptoml::table &config)
Definition: analyzer.cpp:77
static std::unique_ptr< token_stream > default_filter_chain(const cpptoml::table &config)
Definition: analyzer.cpp:49
Parses a text file by reading it completely into memory, delimiting tokens by user request...
Definition: parser.h:29
Represents an indexable document.
Definition: document.h:31
Provides an interface to multiple corpus input formats.
Definition: corpus.h:27
virtual void tokenize(corpus::document &doc)=0
Tokenizes a document.
static std::string get_content(const corpus::document &doc)
Definition: analyzer.cpp:27
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:24
Basic exception for analyzer interactions.
Definition: analyzer.h:114
A class that provides a framework to produce token counts from documents.
Definition: analyzer.h:41
static std::unique_ptr< token_stream > load_filter(std::unique_ptr< token_stream > src, const cpptoml::table &config)
Definition: analyzer.cpp:67
Base class that represents a stream of tokens that have been extracted from a document.
Definition: token_stream.h:27
virtual std::unique_ptr< analyzer > clone() const =0
Clones this analyzer.
Definition: analyzer.h:19