langchain/libs/partners/exa/langchain_exa/retrievers.py

from typing import (  # type: ignore[import-not-found, import-not-found]
    Any,
    Dict,
    List,
    Literal,
    Optional,
    Union,
)

from exa_py import Exa  # type: ignore
from exa_py.api import HighlightsContentsOptions, TextContentsOptions  # type: ignore
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import Field, SecretStr, root_validator
from langchain_core.retrievers import BaseRetriever

from langchain_exa._utilities import initialize_client


def _get_metadata(result: Any) -> Dict[str, Any]:
    """Get the metadata from a result object."""
    metadata = {
        "title": result.title,
        "url": result.url,
        "id": result.id,
        "score": result.score,
        "published_date": result.published_date,
        "author": result.author,
    }
    if getattr(result, "highlights"):
        metadata["highlights"] = result.highlights
    if getattr(result, "highlight_scores"):
        metadata["highlight_scores"] = result.highlight_scores
    return metadata


class ExaSearchRetriever(BaseRetriever):
    """Retriever that turns Exa search results into LangChain documents.

    Each query is sent to ``client.search_and_contents`` together with the
    filter options configured on this instance. Every returned result becomes
    a ``Document`` whose ``page_content`` is the result text and whose
    metadata is produced by ``_get_metadata``.
    """

    k: int = 10  # num_results
    """The number of search results to return."""
    include_domains: Optional[List[str]] = None
    """A list of domains to include in the search."""
    exclude_domains: Optional[List[str]] = None
    """A list of domains to exclude from the search."""
    start_crawl_date: Optional[str] = None
    """The start date for the crawl (in YYYY-MM-DD format)."""
    end_crawl_date: Optional[str] = None
    """The end date for the crawl (in YYYY-MM-DD format)."""
    start_published_date: Optional[str] = None
    """The start date for when the document was published (in YYYY-MM-DD format)."""
    end_published_date: Optional[str] = None
    """The end date for when the document was published (in YYYY-MM-DD format)."""
    use_autoprompt: Optional[bool] = None
    """Whether to use autoprompt for the search."""
    type: str = "neural"
    """The type of search, 'keyword' or 'neural'. Default: neural"""
    highlights: Optional[Union[HighlightsContentsOptions, bool]] = None
    """Highlight options forwarded to the search request.

    Page content is always the result text; any highlights returned are
    exposed through the document metadata instead.
    """
    text_contents_options: Union[TextContentsOptions, Literal[True]] = True
    """How to set the page content of the results"""

    client: Exa = Field(default=None)
    exa_api_key: SecretStr = Field(default=None)
    exa_base_url: Optional[str] = None

    @root_validator(pre=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate the environment.

        Runs before field validation and delegates to ``initialize_client``,
        returning whatever values mapping that helper produces.
        """
        values = initialize_client(values)
        return values

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Search Exa for ``query`` and wrap each result in a ``Document``.

        Args:
            query: The search query string.
            run_manager: Callback manager for the retriever run (not used here).

        Returns:
            One ``Document`` per search result, in the order returned by the
            client.
        """
        response = self.client.search_and_contents(  # type: ignore[misc]
            query,
            num_results=self.k,
            text=self.text_contents_options,
            highlights=self.highlights,  # type: ignore
            include_domains=self.include_domains,
            exclude_domains=self.exclude_domains,
            start_crawl_date=self.start_crawl_date,
            end_crawl_date=self.end_crawl_date,
            start_published_date=self.start_published_date,
            end_published_date=self.end_published_date,
            use_autoprompt=self.use_autoprompt,
            # Forward the configured search type; without this the ``type``
            # field on the retriever has no effect on the request.
            type=self.type,
        )

        results = response.results

        return [
            Document(
                page_content=(result.text),
                metadata=_get_metadata(result),
            )
            for result in results
        ]
infra: update mypy 1.10, ruff 0.5 (#23721) ```python """python scripts/update_mypy_ruff.py""" import glob import tomllib from pathlib import Path import toml import subprocess import re ROOT_DIR = Path(__file__).parents[1] def main(): for path in glob.glob(str(ROOT_DIR / "libs/*/pyproject.toml"), recursive=True): print(path) with open(path, "rb") as f: pyproject = tomllib.load(f) try: pyproject["tool"]["poetry"]["group"]["typing"]["dependencies"]["mypy"] = ( "^1.10" ) pyproject["tool"]["poetry"]["group"]["lint"]["dependencies"]["ruff"] = ( "^0.5" ) except KeyError: continue with open(path, "w") as f: toml.dump(pyproject, f) cwd = "/".join(path.split("/")[:-1]) completed = subprocess.run( "poetry lock --no-update; poetry install --with typing; poetry run mypy . --no-color", cwd=cwd, shell=True, capture_output=True, text=True, ) logs = completed.stdout.split("\n") to_ignore = {} for l in logs: if re.match("^(.)\:(\d+)\: error:.\[(.)\]", l): path, line_no, error_type = re.match( "^(.)\:(\d+)\: error:.\[(.*)\]", l ).groups() if (path, line_no) in to_ignore: to_ignore[(path, line_no)].append(error_type) else: to_ignore[(path, line_no)] = [error_type] print(len(to_ignore)) for (error_path, line_no), error_types in to_ignore.items(): all_errors = ", ".join(error_types) full_path = f"{cwd}/{error_path}" try: with open(full_path, "r") as f: file_lines = f.readlines() except FileNotFoundError: continue file_lines[int(line_no) - 1] = ( file_lines[int(line_no) - 1][:-1] + f" # type: ignore[{all_errors}]\n" ) with open(full_path, "w") as f: f.write("".join(file_lines)) subprocess.run( "poetry run ruff format .; poetry run ruff --select I --fix .", cwd=cwd, shell=True, capture_output=True, text=True, ) if __name__ == "__main__": main() ``` 2024-07-03 17:33:27 +00:00			`from typing import ( # type: ignore[import-not-found, import-not-found]`
			`Any,`
			`Dict,`
			`List,`
			`Literal,`
			`Optional,`
			`Union,`
			`)`
exa: init pkg (#16553) 2024-01-25 03:57:17 +00:00
			`from exa_py import Exa # type: ignore`
			`from exa_py.api import HighlightsContentsOptions, TextContentsOptions # type: ignore`
			`from langchain_core.callbacks import CallbackManagerForRetrieverRun`
			`from langchain_core.documents import Document`
exa[patch]: fix lint (#17610) 2024-02-16 04:45:16 +00:00			`from langchain_core.pydantic_v1 import Field, SecretStr, root_validator`
exa: init pkg (#16553) 2024-01-25 03:57:17 +00:00			`from langchain_core.retrievers import BaseRetriever`

			`from langchain_exa._utilities import initialize_client`


			`def _get_metadata(result: Any) -> Dict[str, Any]:`
			`"""Get the metadata from a result object."""`
			`metadata = {`
			`"title": result.title,`
			`"url": result.url,`
			`"id": result.id,`
			`"score": result.score,`
			`"published_date": result.published_date,`
			`"author": result.author,`
			`}`
			`if getattr(result, "highlights"):`
			`metadata["highlights"] = result.highlights`
			`if getattr(result, "highlight_scores"):`
			`metadata["highlight_scores"] = result.highlight_scores`
			`return metadata`


			`class ExaSearchRetriever(BaseRetriever):`
			`"""Exa Search retriever."""`

			`k: int = 10 # num_results`
			`"""The number of search results to return."""`
			`include_domains: Optional[List[str]] = None`
			`"""A list of domains to include in the search."""`
			`exclude_domains: Optional[List[str]] = None`
			`"""A list of domains to exclude from the search."""`
			`start_crawl_date: Optional[str] = None`
			`"""The start date for the crawl (in YYYY-MM-DD format)."""`
			`end_crawl_date: Optional[str] = None`
			`"""The end date for the crawl (in YYYY-MM-DD format)."""`
			`start_published_date: Optional[str] = None`
			`"""The start date for when the document was published (in YYYY-MM-DD format)."""`
			`end_published_date: Optional[str] = None`
			`"""The end date for when the document was published (in YYYY-MM-DD format)."""`
			`use_autoprompt: Optional[bool] = None`
			`"""Whether to use autoprompt for the search."""`
			`type: str = "neural"`
			`"""The type of search, 'keyword' or 'neural'. Default: neural"""`
			`highlights: Optional[Union[HighlightsContentsOptions, bool]] = None`
			`"""Whether to set the page content to the highlights of the results."""`
			`text_contents_options: Union[TextContentsOptions, Literal[True]] = True`
			`"""How to set the page content of the results"""`

exa[patch]: fix lint (#17610) 2024-02-16 04:45:16 +00:00			`client: Exa = Field(default=None)`
			`exa_api_key: SecretStr = Field(default=None)`
exa: init pkg (#16553) 2024-01-25 03:57:17 +00:00			`exa_base_url: Optional[str] = None`

			`@root_validator(pre=True)`
			`def validate_environment(cls, values: Dict) -> Dict:`
			`"""Validate the environment."""`
			`values = initialize_client(values)`
			`return values`

			`def _get_relevant_documents(`
			`self, query: str, *, run_manager: CallbackManagerForRetrieverRun`
			`) -> List[Document]:`
exa[patch]: fix lint (#17610) 2024-02-16 04:45:16 +00:00			`response = self.client.search_and_contents( # type: ignore[misc]`
exa: init pkg (#16553) 2024-01-25 03:57:17 +00:00			`query,`
			`num_results=self.k,`
			`text=self.text_contents_options,`
exa[patch]: fix lint (#17610) 2024-02-16 04:45:16 +00:00			`highlights=self.highlights, # type: ignore`
exa: init pkg (#16553) 2024-01-25 03:57:17 +00:00			`include_domains=self.include_domains,`
			`exclude_domains=self.exclude_domains,`
			`start_crawl_date=self.start_crawl_date,`
			`end_crawl_date=self.end_crawl_date,`
			`start_published_date=self.start_published_date,`
			`end_published_date=self.end_published_date,`
			`use_autoprompt=self.use_autoprompt,`
			`)`

			`results = response.results`

			`return [`
			`Document(`
			`page_content=(result.text),`
			`metadata=_get_metadata(result),`
			`)`
			`for result in results`
			`]`