Text Processing#

This module provides a set of classes and functions for preprocessing and filtering texts, including normalization, length filtering, etc.

class flexrag.text_process.TextUnit(content, reserved=True, processed_by=<factory>)[source]#
dump(path)#

Dump the dataclass to a YAML file.

dumps()#

Dump the dataclass to a YAML string.

classmethod load(path)#

Load the dataclass from a YAML file.

classmethod loads(s)#

Load the dataclass from a YAML string.

class flexrag.text_process.Processor[source]#
__call__(input_text)[source]#

Process the input text. If the input text is filtered out by the processor, the reserved flag of the input TextUnit will be set to False.

Parameters:

input_text (TextUnit) – The input text to process.

Returns:

The processed text.

Return type:

TextUnit

class flexrag.text_process.TextProcessPipelineConfig(processor_type=<factory>, length_filter_config=<factory>, token_normalize_config=<factory>, truncate_config=<factory>)#

Configuration class for processor (name: TextProcessPipelineConfig, default: None).

Parameters:
  • processor_type (list[str]) – The processor types to use.

  • length_filter_config (LengthFilterConfig) – The config for LengthFilter.

  • token_normalize_config (TokenNormalizerConfig) – The config for TokenNormalizer.

  • truncate_config (TruncatorConfig) – The config for Truncator.

class flexrag.text_process.TextProcessPipeline(cfg)[source]#
class flexrag.text_process.TokenNormalizerConfig(lang='en', penn=True, norm_quote_commas=True, norm_numbers=True, pre_replace_unicode_punct=False, post_remove_control_chars=False, perl_parity=False)[source]#
dump(path)#

Dump the dataclass to a YAML file.

dumps()#

Dump the dataclass to a YAML string.

classmethod load(path)#

Load the dataclass from a YAML file.

classmethod loads(s)#

Load the dataclass from a YAML string.

class flexrag.text_process.TokenNormalizer(cfg)[source]#

Bases: Processor

class flexrag.text_process.ChineseSimplifier[source]#

Bases: Processor

class flexrag.text_process.Lowercase[source]#

Bases: Processor

class flexrag.text_process.Unifier[source]#

Bases: Processor

class flexrag.text_process.TruncatorConfig(max_chars=None, max_bytes=None, max_tokens=None, tokenizer_config=<factory>)[source]#
dump(path)#

Dump the dataclass to a YAML file.

dumps()#

Dump the dataclass to a YAML string.

classmethod load(path)#

Load the dataclass from a YAML file.

classmethod loads(s)#

Load the dataclass from a YAML string.

class flexrag.text_process.Truncator(cfg)[source]#

Bases: Processor

class flexrag.text_process.AnswerSimplifier[source]#

Bases: Processor

class flexrag.text_process.ExactDeduplicate[source]#

Bases: Processor