Source code for flexrag.retriever.web_retrievers.web_seeker

import os
from abc import ABC, abstractmethod
from typing import Annotated, Optional

import httpx

from flexrag.utils import Choices, Register, configure

from .utils import WebResource


[docs] class WebSeekerBase(ABC): """The base class for the WebSeeker. The WebSeeker is used to seek the web resources for a given query. The web resources could be sought by walking through a set of given web pages, by using a search engine, etc. The subclasses should implement the ``seek`` method. """
[docs] @abstractmethod def seek(self, query: str, top_k: int = 10, **kwargs) -> list[WebResource]: """Seek the web resources. :param query: The query to seek. :type query: str :param top_k: The number of resources to seek. Default is 10. :type top_k: int :param kwargs: The additional keyword arguments. :return: The web resources. :rtype: list[WebResource] """ raise NotImplementedError
WEB_SEEKERS = Register[WebSeekerBase]("web_seeker") SEARCH_ENGINES = Register[WebSeekerBase]("search_engine")
[docs] @configure class BingEngineConfig: """The configuration for the ``BingSeeker``. :param subscription_key: The subscription key for the Bing Search API. Default is os.environ.get("BING_SEARCH_KEY", "EMPTY"). :type subscription_key: str :param base_url: The base_url for the Bing Search API. Default is "https://api.bing.microsoft.com/v7.0/search". :type base_url: str :param timeout: The timeout for the requests. Default is 3.0. :type timeout: float :param market: The market to use. see https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes for more information). Default is en-US. :type market: str :param lang: The language to use. Default is "en". :type lang: str :param freshness: To get articles discovered by Bing during a specific timeframe, specify a date range in the form, YYYY-MM-DD..YYYY-MM-DD. For example, &freshness=2019-02-01..2019-05-30. Default is None. :type freshness: Optional[str] """ subscription_key: str = os.environ.get("BING_SEARCH_KEY", "EMPTY") base_url: str = "https://api.bing.microsoft.com/v7.0/search" timeout: float = 3.0 market: str = "en-US" lang: str = "en" freshness: Optional[str] = None
[docs] @SEARCH_ENGINES("bing", config_class=BingEngineConfig) @WEB_SEEKERS("bing", config_class=BingEngineConfig) class BingEngine(WebSeekerBase): """The BingSeeker retrieves the web pages using the Bing Search API.""" def __init__(self, cfg: BingEngineConfig): super().__init__() self.client = httpx.Client( base_url=cfg.base_url, headers={"Ocp-Apim-Subscription-Key": cfg.subscription_key}, timeout=cfg.timeout, follow_redirects=True, ) self.market = cfg.market self.freshness = cfg.freshness self.lang = cfg.lang return
[docs] def seek( self, query: str, top_k: int = 10, **search_kwargs, ) -> list[WebResource]: params = {"q": query, "mkt": self.market, "count": top_k, "setLang": self.lang} if self.freshness is not None: params["freshness"] = self.freshness params.update(search_kwargs) response = self.client.get("", params=params) response.raise_for_status() result = response.json() if "webPages" not in result: return [] result = [ WebResource( query=query, url=i["url"], metadata={ "engine": "Bing", "snippet": i["snippet"], }, ) for i in result["webPages"]["value"] ] return result
[docs] @configure class DuckDuckGoEngineConfig: """The configuration for the ``DuckDuckGoEngine``. :param proxy: The proxy to use. Default is None. :type proxy: Optional[str] """ proxy: Optional[str] = None
[docs] @SEARCH_ENGINES("ddg", config_class=DuckDuckGoEngineConfig) @WEB_SEEKERS("ddg", config_class=DuckDuckGoEngineConfig) class DuckDuckGoEngine(WebSeekerBase): """The DuckDuckGoEngine retrieves the web pages using the DuckDuckGo Search API.""" def __init__(self, cfg: DuckDuckGoEngineConfig): super().__init__() from duckduckgo_search import DDGS self.ddgs = DDGS(proxy=cfg.proxy) return
[docs] def seek( self, query: str, top_k: int = 10, **search_kwargs, ) -> list[WebResource]: result = self.ddgs.text(query, max_results=top_k, **search_kwargs) result = [ WebResource( query=query, url=i["href"], metadata={ "engine": "DuckDuckGo", "title": i["title"], "snippet": i["body"], }, ) for i in result ] return result
[docs] @configure class GoogleEngineConfig: """The configuration for the ``GoogleEngine``. :param subscription_key: The subscription key for the Google Search API. If not provided, it will use the environment variable `GOOGLE_SEARCH_KEY`. Defaults to None. :type subscription_key: str :param search_engine_id: The search engine id for the Google Search API. If not provided, it will use the environment variable `GOOGLE_SEARCH_ENGINE_ID`. Defaults to None. :type search_engine_id: str :param endpoint: The endpoint for the Google Search API. Default is "https://customsearch.googleapis.com/customsearch/v1". :type endpoint: str :param proxy: The proxy to use. Default is None. :type proxy: Optional[str] :param timeout: The timeout for the requests. Default is 3.0. :type timeout: float """ subscription_key: Optional[str] = None search_engine_id: Optional[str] = None endpoint: str = "https://customsearch.googleapis.com/customsearch/v1" proxy: Optional[str] = None timeout: float = 3.0
[docs] @SEARCH_ENGINES("google", config_class=GoogleEngineConfig) @WEB_SEEKERS("google", config_class=GoogleEngineConfig) class GoogleEngine(WebSeekerBase): """The GoogleEngine retrieves the web pages using the `Google Custom Search` API.""" name = "google" def __init__(self, cfg: GoogleEngineConfig): super().__init__() self.subscription_key = cfg.subscription_key or os.getenv("GOOGLE_SEARCH_KEY") if not self.subscription_key: raise ValueError( "Subscription key for Google Search API is not provided. " "Please set it in the configuration or as an environment variable 'GOOGLE_SEARCH_KEY'." ) self.engine_id = cfg.search_engine_id or os.getenv("GOOGLE_SEARCH_ENGINE_ID") if not self.engine_id: raise ValueError( "Search engine ID for Google Search API is not provided. " "Please set it in the configuration or as an environment variable 'GOOGLE_SEARCH_ENGINE_ID'." ) self.client = httpx.Client( base_url=cfg.endpoint, timeout=cfg.timeout, proxy=cfg.proxy, follow_redirects=True, ) return
[docs] def seek( self, query: str, top_k: int = 10, **search_kwargs, ) -> list[WebResource]: params = { "key": self.subscription_key, "cx": self.engine_id, "q": query, "num": top_k, } response = self.client.get("", params=params) response.raise_for_status() result = response.json() result = [ WebResource( query=query, url=i["link"], metadata={ "engine": "Google", "title": i["title"], "snippet": i["snippet"], }, ) for i in result["items"] ] return result
[docs] @configure class SerpApiConfig: """The configuration for the ``SerpApi``. :param api_key: The API key for the SerpApi. If not provided, it will use the environment variable `SERPAPI_API_KEY`. Defaults to None. :type api_key: str :param engine: The search engine to use. Default is "google". Available choices are "google", "bing", "baidu", "yandex", "yahoo", "google_scholar", "duckduckgo". :type engine: str :param country: The country to search. Default is "us". :type country: str :param language: The language to search. Default is "en". :type language: str """ api_key: Optional[str] = None engine: Annotated[ str, Choices( "google", "bing", "baidu", "yandex", "yahoo", "google_scholar", "duckduckgo", ), ] = "google" country: str = "us" language: str = "en"
[docs] @SEARCH_ENGINES("serpapi", config_class=SerpApiConfig) @WEB_SEEKERS("serpapi", config_class=SerpApiConfig) class SerpApi(WebSeekerBase): """The SerpApi retrieves the web pages using the `SerpApi <https://serpapi.com/>_`.""" def __init__(self, cfg: SerpApiConfig): super().__init__() api_key = cfg.api_key or os.getenv("SERPAPI_API_KEY") if not api_key: raise ValueError( "API key for SerpApi is not provided. " "Please set it in the configuration or as an environment variable 'SERPAPI_API_KEY'." ) try: import serpapi self.client = serpapi.Client(api_key=api_key) except ImportError: raise ImportError("Please install serpapi with `pip install serpapi`.") self.api_key = api_key self.engine = cfg.engine self.gl = cfg.country self.hl = cfg.language return
[docs] def seek( self, query: str, top_k: int = 10, **search_kwargs, ) -> list[WebResource]: search_params = { "q": query, "engine": self.engine, "api_key": self.api_key, "gl": self.gl, "hl": self.hl, "num": top_k, } search_params.update(search_kwargs) data = self.client.search(search_params) contexts = [ WebResource( query=query, url=r["link"], metadata={ "engine": "SerpAPI", "title": r.get("title", None), "snippet": r.get("snippet", None), }, ) for r in data["organic_results"] ] return contexts
WebSeekerConfig = WEB_SEEKERS.make_config(config_name="WebSeekerConfig") SearchEngineConfig = SEARCH_ENGINES.make_config(config_name="SearchEngineConfig")