Source code for flexrag.document_parser.docling_parser
import os
from flexrag.utils import configure
from .document_parser_base import DOCUMENTPARSERS, Document, DocumentParserBase
[docs]
@configure
class DoclingConfig:
    """Options for the Docling-based document parser.

    Each field is passed through unchanged to the keyword argument of the
    same name on docling's ``PdfPipelineOptions`` when a ``DoclingParser``
    is constructed.
    """

    # Forwarded as ``PdfPipelineOptions(do_ocr=...)``; disabled by default.
    do_ocr: bool = False

    # Forwarded as ``PdfPipelineOptions(do_table_structure=...)``; enabled by default.
    do_table_structure: bool = True

    # Forwarded as ``PdfPipelineOptions(generate_page_images=...)``; disabled by default.
    generate_page_images: bool = False

    # Forwarded as ``PdfPipelineOptions(generate_picture_images=...)``; disabled by default.
    generate_picture_images: bool = False
[docs]
@DOCUMENTPARSERS("docling", config_class=DoclingConfig)
class DoclingParser(DocumentParserBase):
    """Document parser that converts files with docling's ``DocumentConverter``."""

    def __init__(self, config: DoclingConfig):
        """Build the underlying docling converter.

        Args:
            config: Parser options; every field is forwarded to docling's
                ``PdfPipelineOptions`` for the PDF input format.

        Raises:
            ImportError: If the optional ``docling`` package is not installed.
        """
        # docling is an optional dependency, so it is imported lazily here
        # rather than at module import time.
        try:
            from docling.datamodel.base_models import InputFormat
            from docling.datamodel.pipeline_options import PdfPipelineOptions
            from docling.document_converter import DocumentConverter, PdfFormatOption
        except ImportError as err:
            # Keep the original failure attached as the cause for debugging.
            raise ImportError(
                "Docling is not installed. Please install it via `pip install docling`."
            ) from err

        pdf_pipeline_options = PdfPipelineOptions(
            do_ocr=config.do_ocr,
            do_table_structure=config.do_table_structure,
            generate_page_images=config.generate_page_images,
            generate_picture_images=config.generate_picture_images,
        )
        self.doc_converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options)
            }
        )

    @staticmethod
    def _collect_pil_images(items) -> list:
        """Return the PIL image of every item in *items* that carries an image."""
        return [
            item.image.pil_image
            for item in items
            if getattr(item, "image", None) is not None
        ]

    def parse(self, input_file_path: str) -> Document:
        """Convert a file into a ``Document``.

        The converted document is exported to markdown for ``text`` and its
        name is used as ``title``. Page and picture images are attached as
        ``screenshots`` and ``images`` only when the conversion produced any.

        Args:
            input_file_path: Path of the file to convert.

        Returns:
            The parsed ``Document``.

        Raises:
            FileNotFoundError: If ``input_file_path`` does not exist.
        """
        # Explicit check instead of ``assert`` so validation survives ``python -O``.
        if not os.path.exists(input_file_path):
            raise FileNotFoundError(f"Input file does not exist: {input_file_path}")

        document_ = self.doc_converter.convert(input_file_path).document
        document = Document(
            source_file_path=input_file_path,
            text=document_.export_to_markdown(),
            title=document_.name,
        )

        # Images live on the docling result (``document_``), not on the output
        # ``Document``. docling-core documents ``pages`` as a mapping keyed by
        # page number, so iterate its values; fall back to direct iteration in
        # case a sequence-like container is returned.
        pages = document_.pages
        page_items = pages.values() if hasattr(pages, "values") else pages
        screenshots = self._collect_pil_images(page_items)
        if screenshots:
            document.screenshots = screenshots

        # Pictures without an attached image (image generation disabled) are skipped.
        images = self._collect_pil_images(document_.pictures)
        if images:
            document.images = images

        return document