Source code for flexrag.chunking.sentence_splitter

from abc import ABC, abstractmethod
from functools import partial

from flexrag.utils import Register, configure


class SentenceSplitterBase(ABC):
    """Abstract interface shared by all sentence splitters.

    Concrete splitters must implement :meth:`split`, which cuts a text into
    sentences, and the :attr:`reversible` property, which tells whether
    concatenating the returned sentences reproduces the original text.
    """

    @abstractmethod
    def split(self, text: str) -> list[str]:
        """Split the given text into sentences.

        :param text: The text to split.
        :type text: str
        :return: The sentences of the text.
        :rtype: list[str]
        """
        return None

    @property
    @abstractmethod
    def reversible(self) -> bool:
        """Return True if the split sentences can be concatenated back into the original text."""
        return None
# Registry of sentence splitter implementations. In this module it is used as
# a class decorator that takes a string name and a ``config_class`` keyword.
SENTENCE_SPLITTERS = Register[SentenceSplitterBase]("sentence_splitter")
@configure
class NLTKSentenceSplitterConfig:
    """Options for the NLTK-based sentence splitter.

    :param language: Language name handed to the NLTK sentence tokenizer.
        Defaults to "english".
    :type language: str
    """

    language: str = "english"
@SENTENCE_SPLITTERS("nltk_splitter", config_class=NLTKSentenceSplitterConfig)
class NLTKSentenceSplitter(SentenceSplitterBase):
    """Split text into sentences using NLTK's PunktSentenceTokenizer.

    For more information, see
    https://www.nltk.org/api/nltk.tokenize.punkt.html#module-nltk.tokenize.punkt.
    """

    def __init__(self, cfg: NLTKSentenceSplitterConfig) -> None:
        """Bind ``nltk.sent_tokenize`` to the configured language.

        :param cfg: The splitter configuration; only ``cfg.language`` is read.
        :type cfg: NLTKSentenceSplitterConfig
        :raises ImportError: If NLTK is not installed.
        """
        try:
            import nltk
        except ImportError as err:
            # Chain the original error so the underlying import failure stays visible.
            raise ImportError("NLTK is required for NLTKSentenceSplitter.") from err
        self.splitter = partial(nltk.sent_tokenize, language=cfg.language)
        return

    def split(self, text: str) -> list[str]:
        """Split the given text into sentences.

        Every sentence except the last one gets a single trailing space
        appended, so joining the result yields space-separated sentences.

        :param text: The text to split.
        :type text: str
        :return: The sentences of the text; an empty list if the tokenizer
            finds no sentence.
        :rtype: list[str]
        """
        sentences = self.splitter(text)
        if not sentences:
            # Nothing was tokenized (e.g. empty text); indexing the last
            # element of an empty list would raise IndexError.
            return []
        return [sent + " " for sent in sentences[:-1]] + [sentences[-1]]

    @property
    def reversible(self) -> bool:
        """NLTKSentenceSplitter is not reversible as it may lose spaces between sentences."""
        return False
# Ready-made split patterns, grouped by language code and then by granularity.
# Every pattern is a zero-width lookbehind, so splitting on it does not consume
# the separator characters themselves.
PREDEFINED_SPLIT_PATTERNS = {
    "en": {
        "big_paragraph": r"(?<=\R{2,})",
        "paragraph": r"(?<=\R)",
        "sentence": r"(?<=[.?!])",
        "subsentence": r"(?<=[,;\"'{}<>\[\]`~])",
        "word": r"(?<=\s)",
    },
    "zh": {
        "big_paragraph": r"(?<=\R{2,})",
        "paragraph": r"(?<=\R)",
        "sentence": r"(?<=[。!?])",
        # Misspelt key kept as an alias so existing lookups keep working.
        "setence": r"(?<=[。!?])",
        "subsentence": r"(?<=[,;:“”‘’《》【】、])",
    },
}
@configure
class RegexSplitterConfig:
    """Options for the regex-based sentence splitter.

    :param pattern: The regular expression used to split the text.
        Defaults to ``PREDEFINED_SPLIT_PATTERNS["en"]["sentence"]``.
    :type pattern: str

    Note that a pattern which consumes characters may drop the separators
    between sentences. A good practice is to use lookbehind and lookahead
    assertions so that the separators are not consumed.
    """

    pattern: str = PREDEFINED_SPLIT_PATTERNS["en"]["sentence"]
@SENTENCE_SPLITTERS("regex", config_class=RegexSplitterConfig)
class RegexSplitter(SentenceSplitterBase):
    """Sentence splitter driven by a regular expression pattern.

    The pattern is compiled with the third-party `regex` module, whose
    behaviour might differ slightly from the built-in `re` module.
    """

    def __init__(self, cfg: RegexSplitterConfig) -> None:
        """Compile ``cfg.pattern`` once so every ``split`` call reuses it.

        :param cfg: The splitter configuration; only ``cfg.pattern`` is read.
        :type cfg: RegexSplitterConfig
        """
        import regex

        compiled = regex.compile(cfg.pattern)
        self.pattern = compiled

    def split(self, text: str) -> list[str]:
        """Split the given text at every match of the compiled pattern.

        :param text: The text to split.
        :type text: str
        :return: The pieces returned by the pattern's ``split`` method.
        :rtype: list[str]
        """
        pieces = self.pattern.split(text)
        return pieces

    @property
    def reversible(self) -> bool:
        """Always True; whether the pieces really rebuild the text depends on the pattern used."""
        return True
@configure
class SpacySentenceSplitterConfig:
    """Options for the spaCy-based sentence splitter.

    :param model: Name of the spaCy model to load for sentence splitting.
        Defaults to "en_core_web_sm".
    :type model: str
    """

    model: str = "en_core_web_sm"
@SENTENCE_SPLITTERS("spacy", config_class=SpacySentenceSplitterConfig)
class SpacySentenceSplitter(SentenceSplitterBase):
    """SpacySentenceSplitter splits text into sentences using spaCy's sentence splitter."""

    def __init__(self, cfg: SpacySentenceSplitterConfig) -> None:
        """Load the spaCy pipeline named by ``cfg.model``.

        :param cfg: The splitter configuration; only ``cfg.model`` is read.
        :type cfg: SpacySentenceSplitterConfig
        :raises ImportError: If spaCy is not installed.
        """
        try:
            import spacy
        except ImportError as err:
            # Chain the original error so the underlying import failure stays visible.
            raise ImportError("spaCy is required for SpacySentenceSplitter.") from err
        self.nlp = spacy.load(cfg.model)
        return

    def split(self, text: str) -> list[str]:
        """Split the given text into sentences.

        :param text: The text to split.
        :type text: str
        :return: The ``text`` of each sentence span produced by the pipeline.
        :rtype: list[str]
        """
        return [sent.text for sent in self.nlp(text).sents]

    @property
    def reversible(self) -> bool:
        """SpacySentenceSplitter is not reversible as it may lose spaces between sentences."""
        return False
# Config produced by the SENTENCE_SPLITTERS registry, with "regex" passed as
# the default choice.
# NOTE(review): presumably this combines the config classes of all registered
# splitters -- confirm against Register.make_config.
SentenceSplitterConfig = SENTENCE_SPLITTERS.make_config( default="regex", config_name="SentenceSplitterConfig" )