Filter that adds sentence boundary tokens ("<s>" and "</s>") to streams of tokens.
More...
#include <sentence_boundary.h>
|
static void | load_heuristics (const cpptoml::table &config) |
| Loads the sets that contain the heuristics shared by all sentence_boundary instances.
|
|
|
static const std::string | id = "sentence-boundary" |
| Identifier for this filter.
|
|
|
static bool | possible_punc (const std::string &token) |
| Determines if the given token is a possible end of sentence punctuation marker. More...
|
|
static bool | possible_end (const std::string &token) |
| Determines if the given token can be the last word in a sentence. More...
|
|
static bool | possible_start (const std::string &token) |
| Determines if the given token can be the beginning of a sentence. More...
|
|
|
static std::unordered_set< std::string > | punc_set {} |
| The set of possible punctuation marks, shared among all instances.
|
|
static std::unordered_set< std::string > | start_exception_set {} |
| The set of words that may not start sentences, shared among all instances.
|
|
static std::unordered_set< std::string > | end_exception_set {} |
| The set of words that may not end sentences, shared among all instances.
|
|
static bool | heuristics_loaded = false |
| Whether or not the heuristics above have been loaded. More...
|
|
Filter that adds sentence boundary tokens ("<s>" and "</s>") to streams of tokens.
This filter requires that whitespace and punctuation be present in the source stream.
meta::analyzers::filters::sentence_boundary::sentence_boundary |
( |
std::unique_ptr< token_stream > |
source | ) |
|
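Constructs a sentence_boundary filter, reading tokens from the given source. The filter takes no configuration argument of its own: the shared heuristics must already have been loaded with load_heuristics() before any filter is constructed.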
- Parameters
-
source | The source to construct this filter from |
meta::analyzers::filters::sentence_boundary::sentence_boundary |
( |
const sentence_boundary & |
other | ) |
|
Copy constructor.
- Parameters
-
other | The sentence_boundary filter to copy |
void meta::analyzers::filters::sentence_boundary::set_content |
( |
const std::string & |
content | ) |
|
|
override |
Sets the content for the beginning of the filter chain.
- Parameters
-
content | The string content to set |
std::string meta::analyzers::filters::sentence_boundary::next |
( |
| ) |
|
|
override |
- Returns
- the next token in the sequence.
std::string meta::analyzers::filters::sentence_boundary::current_token |
( |
| ) |
|
|
private |
- Returns
- the next buffered token.
bool meta::analyzers::filters::sentence_boundary::possible_punc |
( |
const std::string & |
token | ) |
|
|
static private |
Determines if the given token is a possible end of sentence punctuation marker.
- Parameters
-
token | The token to check |
bool meta::analyzers::filters::sentence_boundary::possible_end |
( |
const std::string & |
token | ) |
|
|
static private |
Determines if the given token can be the last word in a sentence.
- Parameters
-
token | The token to check |
bool meta::analyzers::filters::sentence_boundary::possible_start |
( |
const std::string & |
token | ) |
|
|
static private |
Determines if the given token can be the beginning of a sentence.
- Parameters
-
token | The token to check |
bool meta::analyzers::filters::sentence_boundary::heuristics_loaded = false |
|
static private |
Whether or not the heuristics above have been loaded.
Must be set by loading the heuristics before constructing any sentence_boundary filters.
The documentation for this class was generated from the following files: