Filter that normalizes English-language tokens.
More...
#include <english_normalizer.h>
|
static const std::string | id = "normalize" |
| Identifier for this filter.
|
|
|
bool | is_whitespace (const std::string &token) const |
| Determines if the given token is a whitespace token. More...
|
|
void | parse_token (const std::string &token) |
| Converts the given non-whitespace token into a series of tokens and places them on the buffer. More...
|
|
uint64_t | starting_quotes (uint64_t start, const std::string &token) |
| Checks for starting quotes in the token, adding a normalized begin quote token to the stream if they exist. More...
|
|
bool | is_quote (char c) |
| Checks if the given character is a passable quote symbol. More...
|
|
uint64_t | strip_dashes (uint64_t start, const std::string &token) |
| Reads consecutive dash characters. More...
|
|
uint64_t | word (uint64_t start, const std::string &token) |
| Reads "word" characters (alphanumeric characters and dashes) from the given token, beginning at index start. More...
|
|
std::string | current_token () |
|
|
std::unique_ptr< token_stream > | source_ |
| The source to read tokens from.
|
|
std::deque< std::string > | tokens_ |
| Buffered tokens to return.
|
|
Filter that normalizes English-language tokens.
Normalization is applied to whitespace (adjacent whitespace tokens are collapsed into a single normalized space token) and to punctuation (which is split off from words using basic heuristics).
meta::analyzers::filters::english_normalizer::english_normalizer |
( |
std::unique_ptr< token_stream > |
source | ) |
|
Constructs an english_normalizer which reads tokens from the given source.
- Parameters
-
source | The source to construct the filter from |
meta::analyzers::filters::english_normalizer::english_normalizer |
( |
const english_normalizer & |
other | ) |
|
Copy constructor.
- Parameters
-
other | The english_normalizer to copy |
void meta::analyzers::filters::english_normalizer::set_content |
( |
const std::string & |
content | ) |
|
|
override |
Sets the content for the beginning of the filter chain.
- Parameters
-
content | The string content to set |
bool meta::analyzers::filters::english_normalizer::is_whitespace |
( |
const std::string & |
token | ) |
const |
|
private |
Determines if the given token is a whitespace token.
- Parameters
-
token | The token to check |
void meta::analyzers::filters::english_normalizer::parse_token |
( |
const std::string & |
token | ) |
|
|
private |
Converts the given non-whitespace token into a series of tokens and places them on the buffer.
- Parameters
-
token | The non-whitespace token to convert |
uint64_t meta::analyzers::filters::english_normalizer::starting_quotes |
( |
uint64_t |
start, |
|
|
const std::string & |
token |
|
) |
| |
|
private |
Checks for starting quotes in the token, adding a normalized begin quote token to the stream if they exist.
- Parameters
-
start | The index to start searching at |
token | The given token |
bool meta::analyzers::filters::english_normalizer::is_quote |
( |
char |
c | ) |
|
|
private |
Checks if the given character is a passable quote symbol.
- Parameters
-
c | The character to check |
uint64_t meta::analyzers::filters::english_normalizer::strip_dashes |
( |
uint64_t |
start, |
|
|
const std::string & |
token |
|
) |
| |
|
private |
Reads consecutive dash characters.
- Parameters
-
start | The index to start searching at |
token | The given token |
uint64_t meta::analyzers::filters::english_normalizer::word |
( |
uint64_t |
start, |
|
|
const std::string & |
token |
|
) |
| |
|
private |
Reads "word" characters (alphanumeric characters and dashes) from the given token, beginning at index start.
The first character (the one at index start) is not checked and is assumed to be part of the word.
- Parameters
-
start | The index to start searching at |
token | The given token |
std::string meta::analyzers::filters::english_normalizer::current_token |
( |
| ) |
|
|
private |
- Returns
- the next buffered token.
The documentation for this class was generated from the following files: