# Source code for flexrag.chunking.chunker_base
from abc import ABC, abstractmethod
from typing import Optional, Union

from flexrag.utils import Register, data
@data
class Chunk:
    """A piece of text produced by a chunker.

    :param text: The text content of the chunk.
    :type text: str
    :param start: The index in the original text at which the chunk starts.
        Defaults to None.
    :type start: Optional[int]
    :param end: The index in the original text at which the chunk ends.
        Defaults to None.
    :type end: Optional[int]
    """

    text: str
    start: Optional[int] = None
    end: Optional[int] = None
[docs]
class ChunkerBase(ABC):
    """Abstract base class that defines the interface shared by all chunkers.

    Subclasses must implement the `chunk` method to split a text into chunks.
    """

    @abstractmethod
    def chunk(
        self, text: str, return_str: bool = False
    ) -> Union[list[Chunk], list[str]]:
        """Chunk the given text into smaller chunks.

        :param text: The text to chunk.
        :type text: str
        :param return_str: If True, return the chunks as strings instead of Chunk objects.
            Default is False.
        :type return_str: bool
        :return: The chunks of the text, as Chunk objects or, when ``return_str``
            is True, as plain strings.
        :rtype: Union[list[Chunk], list[str]]
        """
        # Placeholder body only: concrete subclasses provide the actual
        # splitting logic. Kept as a plain return so super().chunk() calls
        # from subclasses keep returning None instead of raising.
        return
# Module-level `Register` instance parameterised on `ChunkerBase` and created
# with the name "chunker".
# NOTE(review): presumably the registry through which concrete chunkers are
# registered and looked up — confirm against `flexrag.utils.Register`.
CHUNKERS = Register[ChunkerBase]("chunker")