Source code for flexrag.chunking.sentence_splitter
from abc import ABC, abstractmethod
from functools import partial
from flexrag.utils import Register, configure
[docs]
class SentenceSplitterBase(ABC):
    """Abstract interface shared by every sentence splitter.

    Concrete splitters implement :meth:`split` to cut a text into sentences and
    :attr:`reversible` to report whether the produced sentences can be
    concatenated back into the original text.
    """

    @abstractmethod
    def split(self, text: str) -> list[str]:
        """Split the given text into sentences.

        :param text: The text to split.
        :type text: str
        :return: The sentences of the text.
        :rtype: list[str]
        """
        return None

    @property
    @abstractmethod
    def reversible(self) -> bool:
        """Return True if the split sentences can be concatenated back to the original text."""
        return None
# Registry of sentence splitter implementations. The concrete splitters in this
# module register themselves by using it as a class decorator, passing a short
# name and their configuration class.
SENTENCE_SPLITTERS = Register[SentenceSplitterBase]("sentence_splitter")
[docs]
@configure
class NLTKSentenceSplitterConfig:
    """Configuration for NLTKSentenceSplitter.

    :param language: The language to use for the sentence splitter. Default is "english".
    :type language: str
    """

    # Forwarded as the ``language`` argument of ``nltk.sent_tokenize``.
    language: str = "english"
[docs]
@SENTENCE_SPLITTERS("nltk_splitter", config_class=NLTKSentenceSplitterConfig)
class NLTKSentenceSplitter(SentenceSplitterBase):
    """NLTKSentenceSplitter splits text into sentences using NLTK's PunktSentenceTokenizer.

    For more information, see https://www.nltk.org/api/nltk.tokenize.punkt.html#module-nltk.tokenize.punkt.

    :param cfg: The configuration holding the tokenizer language.
    :type cfg: NLTKSentenceSplitterConfig
    :raises ImportError: If NLTK is not installed.
    """

    def __init__(self, cfg: NLTKSentenceSplitterConfig) -> None:
        # NLTK is an optional dependency, so it is imported lazily here.
        try:
            import nltk
        except ImportError:
            raise ImportError("NLTK is required for NLTKSentenceSplitter.")
        # Bind the configured language once so ``split`` only has to pass the text.
        self.splitter = partial(nltk.sent_tokenize, language=cfg.language)

    def split(self, text: str) -> list[str]:
        """Split the given text into sentences.

        Every sentence except the last one gets a single trailing space appended,
        so joining the result with an empty string yields readable text.

        :param text: The text to split.
        :type text: str
        :return: The sentences of the text; an empty list if the tokenizer
            finds no sentence (e.g. for empty text).
        :rtype: list[str]
        """
        sentences = self.splitter(text)
        # Guard against an empty result: indexing the last element would raise
        # IndexError otherwise.
        if not sentences:
            return []
        return [sentence + " " for sentence in sentences[:-1]] + [sentences[-1]]

    @property
    def reversible(self) -> bool:
        """NLTKSentenceSplitter is not reversible as it may lose spaces between sentences."""
        return False
# Predefined split patterns, keyed by language code and then by granularity.
# Every pattern is a lookbehind assertion, so splitting on it does not consume
# the separator characters. The patterns are compiled with the ``regex`` module
# by RegexSplitter.
PREDEFINED_SPLIT_PATTERNS = {
    "en": {
        "big_paragraph": r"(?<=\R{2,})",
        "paragraph": r"(?<=\R)",
        "sentence": r"(?<=[.?!])",
        "subsentence": r"(?<=[,;\"'{}<>\[\]`~])",
        "word": r"(?<=\s)",
    },
    "zh": {
        "big_paragraph": r"(?<=\R{2,})",
        "paragraph": r"(?<=\R)",
        "sentence": r"(?<=[。!?])",
        # Misspelled legacy key, kept as an alias so existing lookups keep working.
        "setence": r"(?<=[。!?])",
        "subsentence": r"(?<=[,;:“”‘’《》【】、])",
    },
}
[docs]
@configure
class RegexSplitterConfig:
    """Configuration for RegexSplitter.

    :param pattern: The regular expression pattern to split the text.
        Default is ``PREDEFINED_SPLIT_PATTERNS["en"]["sentence"]``
    :type pattern: str

    Note that some patterns may lose the separators between sentences.
    A good practice is to use lookbehind and lookahead assertions to avoid consuming the separator.
    """

    # Compiled with the ``regex`` module when the splitter is constructed.
    pattern: str = PREDEFINED_SPLIT_PATTERNS["en"]["sentence"]
[docs]
@SENTENCE_SPLITTERS("regex", config_class=RegexSplitterConfig)
class RegexSplitter(SentenceSplitterBase):
    """RegexSplitter splits text into sentences using a regular expression pattern.

    Note that this splitter uses the `regex` module, which might be slightly different from the built-in `re` module.

    :param cfg: The configuration holding the split pattern.
    :type cfg: RegexSplitterConfig
    :raises ImportError: If the `regex` module is not installed.
    """

    def __init__(self, cfg: RegexSplitterConfig) -> None:
        # ``regex`` is a third-party package; fail with an explicit message,
        # like the other splitters do for their optional dependencies.
        try:
            import regex
        except ImportError as err:
            raise ImportError("regex is required for RegexSplitter.") from err
        # Compile once so every ``split`` call reuses the same pattern object.
        self.pattern = regex.compile(cfg.pattern)

    def split(self, text: str) -> list[str]:
        """Split the given text into sentences.

        :param text: The text to split.
        :type text: str
        :return: The non-empty pieces of the text, in their original order.
        :rtype: list[str]
        """
        # Splitting can produce empty pieces (for instance when the text ends with
        # a separator, or when the text itself is empty) and ``None`` for capture
        # groups that did not take part in a match. Neither is a sentence, and
        # dropping them does not change the concatenation of the result.
        return [piece for piece in self.pattern.split(text) if piece]

    @property
    def reversible(self) -> bool:
        """The default RegexSplitter is reversible. However, the reversibility depends on the pattern used."""
        return True
[docs]
@configure
class SpacySentenceSplitterConfig:
    """Configuration for SpacySentenceSplitter.

    :param model: The spaCy model to use for sentence splitting. Default is "en_core_web_sm".
    :type model: str
    """

    # Passed to ``spacy.load`` when the splitter is constructed.
    model: str = "en_core_web_sm"
[docs]
@SENTENCE_SPLITTERS("spacy", config_class=SpacySentenceSplitterConfig)
class SpacySentenceSplitter(SentenceSplitterBase):
    """Sentence splitter backed by a spaCy pipeline.

    The pipeline named in the configuration is loaded once at construction time
    and its sentence segmentation is used to split the text.

    :param cfg: The configuration holding the spaCy model name.
    :type cfg: SpacySentenceSplitterConfig
    :raises ImportError: If spaCy is not installed.
    """

    def __init__(self, cfg: SpacySentenceSplitterConfig) -> None:
        # spaCy is an optional dependency, hence the lazy import.
        try:
            import spacy
        except ImportError:
            raise ImportError("spaCy is required for SpacySentenceSplitter.")
        self.nlp = spacy.load(cfg.model)

    def split(self, text: str) -> list[str]:
        """Split the given text into sentences.

        :param text: The text to split.
        :type text: str
        :return: The text of each sentence found by the pipeline.
        :rtype: list[str]
        """
        parsed = self.nlp(text)
        return [span.text for span in parsed.sents]

    @property
    def reversible(self) -> bool:
        """SpacySentenceSplitter is not reversible as it may lose spaces between sentences."""
        return False
# Configuration built from the splitter registry, with "regex" passed as the
# default choice. NOTE(review): the exact shape of the generated config is
# defined by ``Register.make_config``, which is not visible here.
SentenceSplitterConfig = SENTENCE_SPLITTERS.make_config(
    default="regex", config_name="SentenceSplitterConfig"
)