Text Processing#

This module provides a set of classes and functions for preprocessing and filtering text, including normalization, truncation, and length filtering.

class flexrag.text_process.TextUnit(content, reserved=True, processed_by=<factory>)[源代码]#
dump(path)#

Dump the dataclass to a YAML file.

dumps()#

Dump the dataclass to a YAML string.

classmethod load(path)#

Load the dataclass from a YAML file.

classmethod loads(s)#

Load the dataclass from a YAML string.

class flexrag.text_process.Processor[源代码]#
__call__(input_text)[源代码]#

Process the input text. If the input text is filtered out by the processor, the reserved flag of the input TextUnit will be set to False.

参数:

input_text (TextUnit) -- The input text to process.

返回:

The processed text.

返回类型:

TextUnit

class flexrag.text_process.TextProcessPipelineConfig(processor_type=<factory>, length_filter_config=<factory>, token_normalize_config=<factory>, truncate_config=<factory>)#

Configuration class for processor (name: TextProcessPipelineConfig, default: None).

参数:
  • processor_type (list[str]) -- The types of the processors to use.

  • length_filter_config (LengthFilterConfig) -- The config for LengthFilter.

  • token_normalize_config (TokenNormalizerConfig) -- The config for TokenNormalizer.

  • truncate_config (TruncatorConfig) -- The config for Truncator.

class flexrag.text_process.TextProcessPipeline(cfg)[源代码]#
class flexrag.text_process.TokenNormalizerConfig(lang='en', penn=True, norm_quote_commas=True, norm_numbers=True, pre_replace_unicode_punct=False, post_remove_control_chars=False, perl_parity=False)[源代码]#
dump(path)#

Dump the dataclass to a YAML file.

dumps()#

Dump the dataclass to a YAML string.

classmethod load(path)#

Load the dataclass from a YAML file.

classmethod loads(s)#

Load the dataclass from a YAML string.

class flexrag.text_process.TokenNormalizer(cfg)[源代码]#

基类:Processor

class flexrag.text_process.ChineseSimplifier[源代码]#

基类:Processor

class flexrag.text_process.Lowercase[源代码]#

基类:Processor

class flexrag.text_process.Unifier[源代码]#

基类:Processor

class flexrag.text_process.TruncatorConfig(max_chars=None, max_bytes=None, max_tokens=None, tokenizer_config=<factory>)[源代码]#
dump(path)#

Dump the dataclass to a YAML file.

dumps()#

Dump the dataclass to a YAML string.

classmethod load(path)#

Load the dataclass from a YAML file.

classmethod loads(s)#

Load the dataclass from a YAML string.

class flexrag.text_process.Truncator(cfg)[源代码]#

基类:Processor

class flexrag.text_process.AnswerSimplifier[源代码]#

基类:Processor

class flexrag.text_process.ExactDeduplicate[源代码]#

基类:Processor