langchain/libs/community/langchain_community/vectorstores/llm_rails.py

"""Wrapper around LLMRails vector database."""
from __future__ import annotations

import json
import logging
import os
import uuid
from typing import Any, Iterable, List, Optional, Tuple

import requests
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import Field
from langchain_core.vectorstores import VectorStore, VectorStoreRetriever


class LLMRails(VectorStore):
    """Implementation of Vector Store using LLMRails.

    See https://llmrails.com/

    All requests go through a single ``requests.Session`` and are
    authenticated with the ``X-API-KEY`` header.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import LLMRails

            vectorstore = LLMRails(
                api_key=llm_rails_api_key,
                datastore_id=datastore_id
            )
    """

    def __init__(
        self,
        datastore_id: Optional[str] = None,
        api_key: Optional[str] = None,
    ):
        """Initialize with LLMRails API.

        Args:
            datastore_id: Id of the datastore to use. When omitted (or empty),
                the ``LLM_RAILS_DATASTORE_ID`` environment variable is used.
            api_key: LLMRails API key. When omitted (or empty), the
                ``LLM_RAILS_API_KEY`` environment variable is used. A warning
                is logged if no key can be found.
        """
        self._datastore_id = datastore_id or os.environ.get("LLM_RAILS_DATASTORE_ID")
        self._api_key = api_key or os.environ.get("LLM_RAILS_API_KEY")
        if self._api_key is None:
            logging.warning("Can't find Rails credentials in environment.")

        self._session = requests.Session()  # to reuse connections
        # Expose the *resolved* id so the public attribute agrees with the one
        # used in request URLs when the value came from the environment.
        self.datastore_id = self._datastore_id
        self.base_url = "https://api.llmrails.com/v1"

    def _get_post_headers(self) -> dict:
        """Returns headers that should be attached to each post request."""
        return {"X-API-KEY": self._api_key}

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Upload texts to the datastore, one request per text.

        Each text is stored under a freshly generated UUID name. Uploading
        stops at the first request that does not return HTTP 200; the failure
        is logged and the names uploaded so far are returned.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Accepted for interface compatibility; not sent to the
                API by this implementation.
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            List of generated document names for the texts that were
            successfully added.
        """
        names: List[str] = []
        for text in texts:
            doc_name = str(uuid.uuid4())
            response = self._session.post(
                f"{self.base_url}/datastores/{self._datastore_id}/text",
                json={"name": doc_name, "text": text},
                verify=True,
                headers=self._get_post_headers(),
            )

            if response.status_code != 200:
                logging.error(
                    f"Create request failed for doc_name = {doc_name} with status code "
                    f"{response.status_code}, reason {response.reason}, text "
                    f"{response.text}"
                )

                # Abort on the first failure; report only what was stored.
                return names

            names.append(doc_name)

        return names

    def add_files(
        self,
        files_list: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> bool:
        """Upload local files to the datastore in a single multipart request.

        LLMRails provides a way to add documents directly via its API, where
        pre-processing and chunking occur on the service side. This method
        exposes that API in LangChain.

        Args:
            files_list: Iterable of strings, each representing a local file path.
                    Files could be text, HTML, PDF, markdown, doc/docx, ppt/pptx, etc.
                    see API docs for full list. Paths that do not exist are
                    logged and skipped.
            metadatas: Accepted for interface compatibility; not sent to the
                API by this implementation.
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            True if the upload request returned HTTP 200, False otherwise.
        """
        files = []
        handles = []

        try:
            for file in files_list:
                if not os.path.exists(file):
                    logging.error(f"File {file} does not exist, skipping")
                    continue

                handle = open(file, "rb")
                # Track the handle immediately so it is closed even if a later
                # open() or the request itself raises.
                handles.append(handle)
                files.append(("file", (os.path.basename(file), handle)))

            response = self._session.post(
                f"{self.base_url}/datastores/{self._datastore_id}/file",
                files=files,
                verify=True,
                headers=self._get_post_headers(),
            )
        finally:
            # The request body has been sent (or the attempt failed) by now, so
            # the file handles are no longer needed.
            for handle in handles:
                handle.close()

        if response.status_code != 200:
            logging.error(
                f"Create request failed for datastore = {self._datastore_id} "
                f"with status code {response.status_code}, reason {response.reason}, "
                f"text {response.text}"
            )

            return False

        return True

    def similarity_search_with_score(
        self, query: str, k: int = 5
    ) -> List[Tuple[Document, float]]:
        """Return LLMRails documents most similar to query, along with scores.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 5 Max 10.

        Returns:
            List of (Document, score) pairs. The score is read from each
            result's ``metadata["score"]`` entry and removed from the
            Document's metadata. An empty list is returned (and the failure
            logged) if the search request does not return HTTP 200.
        """
        response = self._session.post(
            headers=self._get_post_headers(),
            url=f"{self.base_url}/datastores/{self._datastore_id}/search",
            data=json.dumps({"k": k, "text": query}),
            timeout=10,
        )

        if response.status_code != 200:
            logging.error(
                "Query failed %s",
                f"(code {response.status_code}, reason {response.reason}, details "
                f"{response.text})",
            )
            return []

        results = response.json()["results"]
        docs = [
            (
                Document(
                    page_content=x["text"],
                    # The score is returned separately, so keep it out of the
                    # document metadata.
                    metadata={
                        key: value
                        for key, value in x["metadata"].items()
                        if key != "score"
                    },
                ),
                x["metadata"]["score"],
            )
            for x in results
        ]

        return docs

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return LLMRails documents most similar to query, without scores.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            List of Documents most similar to the query
        """
        docs_and_scores = self.similarity_search_with_score(query, k=k)

        return [doc for doc, _ in docs_and_scores]

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Optional[Embeddings] = None,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> LLMRails:
        """Construct LLMRails wrapper from raw documents.

        This is intended to be a quick way to get started.

        Args:
            texts: Texts to upload to the datastore.
            embedding: Ignored; present only because the interface requires it.
            metadatas: Accepted for interface compatibility; not used.
            **kwargs: Forwarded to the constructor (``datastore_id``,
                ``api_key``).

        Returns:
            The LLMRails instance the texts were added through.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import LLMRails
                llm_rails = LLMRails.from_texts(
                    texts,
                    datastore_id=datastore_id,
                    api_key=llm_rails_api_key
                )
        """
        # Note: LLMRails generates its own embeddings, so we ignore the provided
        # embeddings (required by interface)
        llm_rails = cls(**kwargs)
        llm_rails.add_texts(texts)
        return llm_rails

    def as_retriever(self, **kwargs: Any) -> LLMRailsRetriever:
        """Return an LLMRailsRetriever backed by this vector store.

        Args:
            **kwargs: Forwarded to the LLMRailsRetriever constructor.
        """
        return LLMRailsRetriever(vectorstore=self, **kwargs)


class LLMRailsRetriever(VectorStoreRetriever):
    """Retriever that reads from (and can write to) an LLMRails vector store."""

    vectorstore: LLMRails
    # Search params handed to the vector store.
    #   k: number of Documents to return; the default dict sets it to 5.
    # NOTE(review): earlier notes also listed an `alpha` hybrid-search
    # parameter; the LLMRails search methods in this file do not read it.
    search_kwargs: dict = Field(default_factory=lambda: dict(k=5))

    def add_texts(self, texts: List[str]) -> None:
        """Add text to the datastore.

        Delegates to the underlying vector store; the names it returns are
        discarded.

        Args:
            texts (List[str]): The text
        """
        store = self.vectorstore
        store.add_texts(texts)
community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463) Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv 
langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes 2023-12-11 21:53:30 +00:00			`"""Wrapper around LLMRails vector database."""`
			`from __future__ import annotations`

			`import json`
			`import logging`
			`import os`
			`import uuid`
			`from typing import Any, Iterable, List, Optional, Tuple`

			`import requests`
			`from langchain_core.documents import Document`
			`from langchain_core.embeddings import Embeddings`
			`from langchain_core.pydantic_v1 import Field`
			`from langchain_core.vectorstores import VectorStore, VectorStoreRetriever`


			`class LLMRails(VectorStore):`
			`"""Implementation of Vector Store using LLMRails.`

			`See https://llmrails.com/`

			`Example:`
			`.. code-block:: python`

			`from langchain_community.vectorstores import LLMRails`

			`vectorstore = LLMRails(`
			`api_key=llm_rails_api_key,`
			`datastore_id=datastore_id`
			`)`
			`"""`

			`def __init__(`
			`self,`
			`datastore_id: Optional[str] = None,`
			`api_key: Optional[str] = None,`
			`):`
			`"""Initialize with LLMRails API."""`
			`self._datastore_id = datastore_id or os.environ.get("LLM_RAILS_DATASTORE_ID")`
			`self._api_key = api_key or os.environ.get("LLM_RAILS_API_KEY")`
			`if self._api_key is None:`
			`logging.warning("Can't find Rails credentials in environment.")`

			`self._session = requests.Session() # to reuse connections`
			`self.datastore_id = datastore_id`
			`self.base_url = "https://api.llmrails.com/v1"`

			`def _get_post_headers(self) -> dict:`
			`"""Returns headers that should be attached to each post request."""`
			`return {"X-API-KEY": self._api_key}`

			`def add_texts(`
			`self,`
			`texts: Iterable[str],`
			`metadatas: Optional[List[dict]] = None,`
			`**kwargs: Any,`
			`) -> List[str]:`
			`"""Run more texts through the embeddings and add to the vectorstore.`

			`Args:`
			`texts: Iterable of strings to add to the vectorstore.`

			`Returns:`
			`List of ids from adding the texts into the vectorstore.`

			`"""`
			`names: List[str] = []`
			`for text in texts:`
			`doc_name = str(uuid.uuid4())`
			`response = self._session.post(`
			`f"{self.base_url}/datastores/{self._datastore_id}/text",`
			`json={"name": doc_name, "text": text},`
			`verify=True,`
			`headers=self._get_post_headers(),`
			`)`

			`if response.status_code != 200:`
			`logging.error(`
			`f"Create request failed for doc_name = {doc_name} with status code "`
			`f"{response.status_code}, reason {response.reason}, text "`
			`f"{response.text}"`
			`)`

			`return names`

			`names.append(doc_name)`

			`return names`

			`def add_files(`
			`self,`
			`files_list: Iterable[str],`
			`metadatas: Optional[List[dict]] = None,`
			`**kwargs: Any,`
			`) -> bool:`
			`"""`
			`LLMRails provides a way to add documents directly via our API where`
			`pre-processing and chunking occurs internally in an optimal way`
			`This method provides a way to use that API in LangChain`

			`Args:`
			`files_list: Iterable of strings, each representing a local file path.`
			`Files could be text, HTML, PDF, markdown, doc/docx, ppt/pptx, etc.`
			`see API docs for full list`

			`Returns:`
			`List of ids associated with each of the files indexed`
			`"""`
			`files = []`

			`for file in files_list:`
			`if not os.path.exists(file):`
			`logging.error(f"File {file} does not exist, skipping")`
			`continue`

			`files.append(("file", (os.path.basename(file), open(file, "rb"))))`

			`response = self._session.post(`
			`f"{self.base_url}/datastores/{self._datastore_id}/file",`
			`files=files,`
			`verify=True,`
			`headers=self._get_post_headers(),`
			`)`

			`if response.status_code != 200:`
			`logging.error(`
			`f"Create request failed for datastore = {self._datastore_id} "`
			`f"with status code {response.status_code}, reason {response.reason}, "`
			`f"text {response.text}"`
			`)`

			`return False`

			`return True`

			`def similarity_search_with_score(`
			`self, query: str, k: int = 5`
			`) -> List[Tuple[Document, float]]:`
			`"""Return LLMRails documents most similar to query, along with scores.`

			`Args:`
			`query: Text to look up documents similar to.`
			`k: Number of Documents to return. Defaults to 5 Max 10.`
			`alpha: parameter for hybrid search .`

			`Returns:`
			`List of Documents most similar to the query and score for each.`
			`"""`
			`response = self._session.post(`
			`headers=self._get_post_headers(),`
			`url=f"{self.base_url}/datastores/{self._datastore_id}/search",`
			`data=json.dumps({"k": k, "text": query}),`
			`timeout=10,`
			`)`

			`if response.status_code != 200:`
			`logging.error(`
			`"Query failed %s",`
			`f"(code {response.status_code}, reason {response.reason}, details "`
			`f"{response.text})",`
			`)`
			`return []`

			`results = response.json()["results"]`
			`docs = [`
			`(`
			`Document(`
			`page_content=x["text"],`
			`metadata={`
			`key: value`
			`for key, value in x["metadata"].items()`
			`if key != "score"`
			`},`
			`),`
			`x["metadata"]["score"],`
			`)`
			`for x in results`
			`]`

			`return docs`

			`def similarity_search(`
			`self, query: str, k: int = 4, **kwargs: Any`
			`) -> List[Document]:`
			`"""Return LLMRails documents most similar to query, along with scores.`

			`Args:`
			`query: Text to look up documents similar to.`
			`k: Number of Documents to return. Defaults to 5.`

			`Returns:`
			`List of Documents most similar to the query`
			`"""`
			`docs_and_scores = self.similarity_search_with_score(query, k=k)`

			`return [doc for doc, _ in docs_and_scores]`

			`@classmethod`
			`def from_texts(`
			`cls,`
			`texts: List[str],`
			`embedding: Optional[Embeddings] = None,`
			`metadatas: Optional[List[dict]] = None,`
			`**kwargs: Any,`
			`) -> LLMRails:`
			`"""Construct LLMRails wrapper from raw documents.`
			`This is intended to be a quick way to get started.`
			`Example:`
			`.. code-block:: python`

			`from langchain_community.vectorstores import LLMRails`
			`llm_rails = LLMRails.from_texts(`
			`texts,`
			`datastore_id=datastore_id,`
			`api_key=llm_rails_api_key`
			`)`
			`"""`
			`# Note: LLMRails generates its own embeddings, so we ignore the provided`
			`# embeddings (required by interface)`
			`llm_rails = cls(**kwargs)`
			`llm_rails.add_texts(texts)`
			`return llm_rails`

			`def as_retriever(self, **kwargs: Any) -> LLMRailsRetriever:`
			`return LLMRailsRetriever(vectorstore=self, **kwargs)`


			`class LLMRailsRetriever(VectorStoreRetriever):`
			`"""Retriever for LLMRails."""`

			`vectorstore: LLMRails`
			`search_kwargs: dict = Field(default_factory=lambda: {"k": 5})`
			`"""Search params.`
			`k: Number of Documents to return. Defaults to 5.`
			`alpha: parameter for hybrid search .`
			`"""`

			`def add_texts(self, texts: List[str]) -> None:`
			`"""Add text to the datastore.`

			`Args:`
			`texts (List[str]): The text`
			`"""`
			`self.vectorstore.add_texts(texts)`