Source code for flexrag.retriever.web_retrievers.web_reader

import os
import re
from abc import ABC, abstractmethod
from typing import Optional

import httpx

from flexrag.models import GENERATORS, GenerationConfig, GeneratorConfig
from flexrag.prompt import ChatPrompt, ChatTurn
from flexrag.utils import Register, configure
from flexrag.utils.configure import extract_config
from flexrag.utils.dataclasses import RetrievedContext

from .utils import WebResource
from .web_downloader import (
    WEB_DOWNLOADERS,
    PlaywrightWebDownloader,
    PlaywrightWebDownloaderConfig,
    WebDownloaderConfig,
)


class WebReaderBase(ABC):
    """Abstract interface shared by every ``WebReader``.

    A web reader turns web resources into contexts whose content can be
    passed to the LLM.
    """

    @abstractmethod
    def read(self, resources: list[WebResource]) -> list[RetrievedContext]:
        """Convert web resources into LLM readable contexts.

        :param resources: Resources sought from the web.
        :type resources: list[WebResource]
        :return: Contexts that can be fed into the LLM.
        :rtype: list[RetrievedContext]
        """
        # Abstract placeholder: concrete readers provide the implementation.
        return None

    @property
    @abstractmethod
    def fields(self) -> list[str]:
        """Names of the data fields produced by this reader."""
        # Abstract placeholder: concrete readers provide the implementation.
        return None
# Registry of web reader implementations, created under the name "web_reader".
# The reader classes in this module add themselves to it through the
# ``@WEB_READERS(<name>, config_class=...)`` decorator.
WEB_READERS = Register[WebReaderBase]("web_reader")
@configure
class JinaReaderLMConfig(GeneratorConfig, WebDownloaderConfig, GenerationConfig):
    """Settings for the ``JinaReaderLM``.

    The options of the generator, the web downloader and the generation
    process are inherited from the base configurations.

    :param use_v2_prompt: Wrap the page in the jinaai/ReaderLM-v2 prompt. Default is False.
    :type use_v2_prompt: bool
    :param pre_clean_html: Pre-clean the HTML content before it is sent to the model. Default is False.
    :type pre_clean_html: bool
    :param clean_svg: Clean the SVG content during pre-cleaning. Default is False.
    :type clean_svg: bool
    :param clean_base64: Clean the base64 images during pre-cleaning. Default is False.
    :type clean_base64: bool
    """

    use_v2_prompt: bool = False
    pre_clean_html: bool = False
    clean_svg: bool = False
    clean_base64: bool = False
@WEB_READERS("jina_readerlm", config_class=JinaReaderLMConfig)
class JinaReaderLM(WebReaderBase):
    """The JinaReaderLM downloads and parses the HTML content using the Jina ReaderLM model."""

    # Patterns used by ``clean_html`` to strip noisy markup before prompting.
    SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
    STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>"
    META_PATTERN = r"<[ ]*meta.*?>"
    COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>"
    LINK_PATTERN = r"<[ ]*link.*?>"
    BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
    SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"

    def __init__(self, cfg: JinaReaderLMConfig):
        """Load the generator and the downloader and select the prompt template.

        :param cfg: The configuration of the reader. The same object is used to
            load the generator and the web downloader.
        :type cfg: JinaReaderLMConfig
        """
        self.reader = GENERATORS.load(cfg)
        self.downloader = WEB_DOWNLOADERS.load(cfg)
        self.cfg = extract_config(cfg, JinaReaderLMConfig)
        if self.cfg.use_v2_prompt:
            self.template = (
                "Extract the main content from the given HTML and convert it to Markdown format."
                "\n```html\n{text}\n```"
            )
        else:
            # Without the v2 prompt the page itself is the whole user message.
            self.template = "{text}"

    def read(self, resources: list[WebResource]) -> list[RetrievedContext]:
        """Download the resources and convert every page with the reader model.

        Resources whose downloaded ``data`` is ``None`` are skipped, so the
        result may be shorter than ``resources``.

        :param resources: Resources sought from the web.
        :type resources: list[WebResource]
        :return: One context per downloaded page, holding the ``raw_content``
            and ``processed_content`` fields.
        :rtype: list[RetrievedContext]
        """
        resources = self.downloader.download(resources)

        # Collect (resource, page) pairs for every page that was downloaded.
        # The raw page is used as-is unless pre-cleaning is enabled; handling
        # both cases inside the same loop keeps ``page`` defined for each
        # resource regardless of the ``pre_clean_html`` setting.
        downloaded = []
        for resource in resources:
            page = resource.data
            if page is None:
                continue
            if self.cfg.pre_clean_html:
                page = JinaReaderLM.clean_html(
                    html=page,
                    clean_svg=self.cfg.clean_svg,
                    clean_base64=self.cfg.clean_base64,
                )
            downloaded.append((resource, page))

        # Nothing to convert: avoid calling the model with an empty batch.
        if not downloaded:
            return []

        # prepare prompts
        prompts = [
            ChatPrompt(
                history=[ChatTurn(role="user", content=self.template.format(text=page))]
            )
            for _, page in downloaded
        ]

        # chat with the reader; the first response of each prompt is kept
        responses = self.reader.chat(prompts, generation_config=self.cfg)
        texts = [response[0] for response in responses]

        # ``raw_content`` is the page that was sent to the model, i.e. the
        # pre-cleaned HTML when ``pre_clean_html`` is enabled.
        return [
            RetrievedContext(
                retriever="web",
                query=resource.query,
                data={"raw_content": page, "processed_content": text},
                source=resource.url,
            )
            for (resource, page), text in zip(downloaded, texts)
        ]

    @property
    def fields(self) -> list[str]:
        """The ``JinaReaderLM`` will return the ``raw_content`` and ``processed_content`` fields."""
        return ["raw_content", "processed_content"]

    @staticmethod
    def replace_svg(html: str, new_content: str = "this is a placeholder") -> str:
        """Replace the body of every ``<svg>`` element with ``new_content``.

        The opening and closing tags are kept; only the content between them
        is substituted.

        :param html: The HTML text to process.
        :type html: str
        :param new_content: The text placed inside each ``<svg>`` element.
        :type new_content: str
        :return: The HTML text with the SVG bodies replaced.
        :rtype: str
        """
        return re.sub(
            JinaReaderLM.SVG_PATTERN,
            lambda match: f"{match.group(1)}{new_content}{match.group(3)}",
            html,
            flags=re.DOTALL,
        )

    @staticmethod
    def replace_base64_images(html: str, new_image_src: str = "#") -> str:
        """Replace every ``<img>`` tag carrying a base64 data URI.

        :param html: The HTML text to process.
        :type html: str
        :param new_image_src: The ``src`` value of the replacement ``<img>`` tag.
        :type new_image_src: str
        :return: The HTML text with the base64 images replaced.
        :rtype: str
        """
        return re.sub(
            JinaReaderLM.BASE64_IMG_PATTERN,
            f'<img src="{new_image_src}"/>',
            html,
        )

    @staticmethod
    def clean_html(html: str, clean_svg: bool = False, clean_base64: bool = False) -> str:
        """Strip scripts, styles, meta tags, comments and link tags from the HTML.

        :param html: The HTML text to clean.
        :type html: str
        :param clean_svg: Whether to also replace the content of SVG elements.
        :type clean_svg: bool
        :param clean_base64: Whether to also replace base64 encoded images.
        :type clean_base64: bool
        :return: The cleaned HTML text.
        :rtype: str
        """
        flags = re.IGNORECASE | re.MULTILINE | re.DOTALL
        # The removal order is kept fixed: script, style, meta, comment, link.
        for pattern in (
            JinaReaderLM.SCRIPT_PATTERN,
            JinaReaderLM.STYLE_PATTERN,
            JinaReaderLM.META_PATTERN,
            JinaReaderLM.COMMENT_PATTERN,
            JinaReaderLM.LINK_PATTERN,
        ):
            html = re.sub(pattern, "", html, flags=flags)

        if clean_svg:
            html = JinaReaderLM.replace_svg(html)
        if clean_base64:
            html = JinaReaderLM.replace_base64_images(html)
        return html
@configure
class JinaReaderConfig:
    """Settings for the ``JinaReader``.

    :param base_url: Base URL of the Jina Reader API. Default is "https://r.jina.ai".
    :type base_url: str
    :param api_key: API key for the Jina Reader API. When it is not provided,
        the environment variable `JINA_API_KEY` is used instead. Defaults to None.
    :type api_key: Optional[str]
    :param proxy: The proxy to use. Defaults to None.
    :type proxy: Optional[str]
    """

    base_url: str = "https://r.jina.ai"
    api_key: Optional[str] = None
    proxy: Optional[str] = None
@WEB_READERS("jina_reader", config_class=JinaReaderConfig)
class JinaReader(WebReaderBase):
    """Reader that converts web pages through the Jina Reader API."""

    def __init__(self, cfg: JinaReaderConfig):
        """Create the HTTP client used to query the Jina Reader API.

        :param cfg: The configuration of the reader.
        :type cfg: JinaReaderConfig
        :raises ValueError: If no API key is found in the configuration or in
            the ``JINA_API_KEY`` environment variable.
        """
        # The configured key takes precedence over the environment variable.
        token = cfg.api_key if cfg.api_key else os.getenv("JINA_API_KEY")
        if not token:
            raise ValueError(
                "API key for Jina Reader is not provided. "
                "Please set it in the configuration or as an environment variable 'JINA_API_KEY'."
            )
        auth_headers = {"Authorization": f"Bearer {token}"}
        self.client = httpx.Client(
            base_url=cfg.base_url,
            headers=auth_headers,
            proxy=cfg.proxy,
            follow_redirects=True,
        )

    def read(self, resources: list[WebResource]) -> list[RetrievedContext]:
        """Request every resource from the API and keep the successful answers.

        :param resources: Resources sought from the web.
        :type resources: list[WebResource]
        :return: One context for each resource answered with status code 200.
        :rtype: list[RetrievedContext]
        """
        # Issue all the requests first; the target URL is appended to the base URL.
        answers = []
        for resource in resources:
            answers.append(self.client.get(f"/{resource.url}"))

        # Keep only the resources that were answered successfully.
        return [
            RetrievedContext(
                retriever="web",
                query=resource.query,
                data={"processed_content": answer.text},
                source=resource.url,
            )
            for resource, answer in zip(resources, answers)
            if answer.status_code == 200
        ]

    @property
    def fields(self):
        """The only field produced by the ``JinaReader`` is ``processed_content``."""
        return ["processed_content"]
@WEB_READERS("snippet")
class SnippetWebReader(WebReaderBase):
    """Reader that returns the snippet stored in each resource as is.

    This is useful if the resources are retrieved by the ``SearchEngine``,
    and the snippets are sufficient for the LLM to generate the response.
    """

    def read(self, resources: list[WebResource]) -> list[RetrievedContext]:
        """Wrap the snippet of every resource into a context.

        :param resources: Resources sought from the web.
        :type resources: list[WebResource]
        :return: One context per resource, holding the ``snippet`` field.
        :rtype: list[RetrievedContext]
        """
        contexts = []
        for resource in resources:
            # A resource without a snippet yields an empty string.
            snippet = resource.metadata.get("snippet", "")
            context = RetrievedContext(
                retriever="web",
                query=resource.query,
                data={"snippet": snippet},
                source=resource.url,
            )
            contexts.append(context)
        return contexts

    @property
    def fields(self):
        """The only field produced by the ``SnippetWebReader`` is ``snippet``."""
        return ["snippet"]
@configure
class ScreenshotWebReaderConfig(PlaywrightWebDownloaderConfig):
    """Settings for the ``ScreenshotWebReader``.

    The options are those of the Playwright web downloader, with
    ``return_screenshot`` defaulting to True.
    """

    return_screenshot: bool = True
@WEB_READERS("screenshot", config_class=ScreenshotWebReaderConfig)
class ScreenshotWebReader(WebReaderBase):
    """The ScreenshotWebReader reads the web pages by taking screenshots."""

    def __init__(self, cfg: ScreenshotWebReaderConfig):
        """Create the Playwright downloader used to capture the pages.

        :param cfg: The configuration of the reader.
        :type cfg: ScreenshotWebReaderConfig
        :raises AssertionError: If ``cfg.return_screenshot`` is not enabled.
        """
        super().__init__()
        # Raise explicitly instead of using ``assert`` so that the check is
        # still performed when Python runs with optimizations (``-O``).
        # The exception type and message are kept for backward compatibility.
        if not cfg.return_screenshot:
            raise AssertionError("`return_screenshot` must be True")
        self.downloader = PlaywrightWebDownloader(cfg)

    def read(self, resources: list[WebResource]) -> list[RetrievedContext]:
        """Download the resources and wrap the downloaded data into contexts.

        :param resources: Resources sought from the web.
        :type resources: list[WebResource]
        :return: One context per downloaded resource, holding the
            ``screenshot`` field.
        :rtype: list[RetrievedContext]
        """
        resources = self.downloader.download(resources)
        # NOTE(review): ``r.data`` is forwarded without a ``None`` check;
        # confirm whether the downloader can return empty data here.
        return [
            RetrievedContext(
                retriever="web",
                query=r.query,
                data={"screenshot": r.data},
                source=r.url,
            )
            for r in resources
        ]

    @property
    def fields(self) -> list[str]:
        """The ``ScreenshotWebReader`` will return the ``screenshot`` field."""
        return ["screenshot"]
# Configuration type generated by the ``WEB_READERS`` registry under the name
# "WebReaderConfig".
# NOTE(review): presumably it lets the user pick one of the registered readers
# together with its config — confirm against ``Register.make_config``.
WebReaderConfig = WEB_READERS.make_config(config_name="WebReaderConfig")