langchain/libs/community/langchain_community/retrievers/google_cloud_documentai_war...

"""Retriever wrapper for Google Cloud Document AI Warehouse."""
from typing import TYPE_CHECKING, Any, Dict, List, Optional

from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import root_validator
from langchain_core.retrievers import BaseRetriever
from langchain_core.utils import get_from_dict_or_env

from langchain_community.utilities.vertexai import get_client_info

if TYPE_CHECKING:
    from google.cloud.contentwarehouse_v1 import (
        DocumentServiceClient,
        RequestMetadata,
        SearchDocumentsRequest,
    )
    from google.cloud.contentwarehouse_v1.services.document_service.pagers import (
        SearchDocumentsPager,
    )


class GoogleDocumentAIWarehouseRetriever(BaseRetriever):
    """A retriever based on Document AI Warehouse.

    Documents should be created and uploaded in a separate flow; this
    retriever only issues search requests, optionally restricted to the
    Document AI Warehouse schema identified by ``schema_id``, to find
    relevant documents.

    Every search must be given a ``user_ldap`` keyword argument, which is
    forwarded to the service as the identity of the requesting user.

    More info: https://cloud.google.com/document-ai-warehouse.
    """

    location: str = "us"
    """Google Cloud location where Document AI Warehouse is placed."""
    project_number: str
    """Google Cloud project number, should contain digits only."""
    schema_id: Optional[str] = None
    """Document AI Warehouse schema to query against.
    If nothing is provided, all documents in the project will be searched."""
    qa_size_limit: int = 5
    """The limit on the number of documents returned."""
    client: "DocumentServiceClient" = None  #: :meta private:

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate the environment and build the Document AI Warehouse client.

        Args:
            values: Field values collected by pydantic for this model.

        Returns:
            The same mapping with ``project_number`` resolved (from the
            provided value or the ``PROJECT_NUMBER`` environment variable)
            and ``client`` set to a new ``DocumentServiceClient``.

        Raises:
            ImportError: If ``google-cloud-contentwarehouse`` is not installed.
        """
        try:
            from google.cloud.contentwarehouse_v1 import DocumentServiceClient
        except ImportError as exc:
            # Note the trailing space: adjacent literals are concatenated
            # verbatim, so without it the two sentences run together.
            raise ImportError(
                "google.cloud.contentwarehouse is not installed. "
                "Please install it with pip install google-cloud-contentwarehouse"
            ) from exc

        values["project_number"] = get_from_dict_or_env(
            values, "project_number", "PROJECT_NUMBER"
        )
        values["client"] = DocumentServiceClient(
            client_info=get_client_info(module="document-ai-warehouse")
        )
        return values

    def _prepare_request_metadata(self, user_ldap: str) -> "RequestMetadata":
        """Build the request metadata identifying the calling user.

        Args:
            user_ldap: Identifier of the user; it is sent to the service
                as ``user:<user_ldap>``.

        Returns:
            A ``RequestMetadata`` carrying the corresponding ``UserInfo``.
        """
        from google.cloud.contentwarehouse_v1 import RequestMetadata, UserInfo

        user_info = UserInfo(id=f"user:{user_ldap}")
        return RequestMetadata(user_info=user_info)

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun, **kwargs: Any
    ) -> List[Document]:
        """Search Document AI Warehouse and return the matches as Documents.

        Args:
            query: Natural-language search query.
            run_manager: Callback manager for this retriever run (unused here).
            **kwargs: Must contain ``user_ldap``.

        Returns:
            One ``Document`` per matching document in the search response.

        Raises:
            ValueError: If ``user_ldap`` is not supplied in ``kwargs``.
        """
        request = self._prepare_search_request(query, **kwargs)
        response = self.client.search_documents(request=request)
        return self._parse_search_response(response=response)

    def _prepare_search_request(
        self, query: str, **kwargs: Any
    ) -> "SearchDocumentsRequest":
        """Assemble the ``SearchDocumentsRequest`` for a query.

        Args:
            query: Natural-language search query.
            **kwargs: Must contain ``user_ldap``; other keys are ignored.

        Returns:
            A request scoped to this retriever's project and location,
            restricted to ``schema_id`` when one is configured.

        Raises:
            ValueError: If ``user_ldap`` is not supplied in ``kwargs``.
        """
        from google.cloud.contentwarehouse_v1 import (
            DocumentQuery,
            SearchDocumentsRequest,
        )

        try:
            user_ldap = kwargs["user_ldap"]
        except KeyError as exc:
            # Chain explicitly so the traceback shows this as a deliberate
            # translation of the missing key, not an error during handling.
            raise ValueError("Argument user_ldap should be provided!") from exc

        request_metadata = self._prepare_request_metadata(user_ldap=user_ldap)

        # An empty schema list means the search is not restricted by schema.
        schemas: List[str] = []
        if self.schema_id:
            schemas.append(
                self.client.document_schema_path(
                    project=self.project_number,
                    location=self.location,
                    document_schema=self.schema_id,
                )
            )
        return SearchDocumentsRequest(
            parent=self.client.common_location_path(self.project_number, self.location),
            request_metadata=request_metadata,
            document_query=DocumentQuery(
                query=query, is_nl_query=True, document_schema_names=schemas
            ),
            qa_size_limit=self.qa_size_limit,
        )

    def _parse_search_response(
        self, response: "SearchDocumentsPager"
    ) -> List[Document]:
        """Convert a search response into LangChain Documents.

        Args:
            response: The object returned by ``client.search_documents``.

        Returns:
            One ``Document`` per entry in ``response.matching_documents``,
            using the search text snippet as ``page_content`` and the
            document's title and raw document path as ``title`` / ``source``
            metadata.
        """
        return [
            Document(
                page_content=match.search_text_snippet,
                metadata={
                    "title": match.document.title,
                    "source": match.document.raw_document_path,
                },
            )
            for match in response.matching_documents
        ]
community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463) Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv 
langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes 9 months ago			`"""Retriever wrapper for Google Cloud Document AI Warehouse."""`
			`from typing import TYPE_CHECKING, Any, Dict, List, Optional`

			`from langchain_core.callbacks import CallbackManagerForRetrieverRun`
			`from langchain_core.documents import Document`
			`from langchain_core.pydantic_v1 import root_validator`
			`from langchain_core.retrievers import BaseRetriever`
			`from langchain_core.utils import get_from_dict_or_env`

			`from langchain_community.utilities.vertexai import get_client_info`

			`if TYPE_CHECKING:`
			`from google.cloud.contentwarehouse_v1 import (`
			`DocumentServiceClient,`
			`RequestMetadata,`
			`SearchDocumentsRequest,`
			`)`
			`from google.cloud.contentwarehouse_v1.services.document_service.pagers import (`
			`SearchDocumentsPager,`
			`)`


			`class GoogleDocumentAIWarehouseRetriever(BaseRetriever):`
			`"""A retriever based on Document AI Warehouse.`

			`Documents should be created and documents should be uploaded`
			`in a separate flow, and this retriever uses only Document AI`
			`schema_id provided to search for relevant documents.`

			`More info: https://cloud.google.com/document-ai-warehouse.`
			`"""`

			`location: str = "us"`
			`"""Google Cloud location where Document AI Warehouse is placed."""`
			`project_number: str`
			`"""Google Cloud project number, should contain digits only."""`
			`schema_id: Optional[str] = None`
			`"""Document AI Warehouse schema to query against.`
			`If nothing is provided, all documents in the project will be searched."""`
			`qa_size_limit: int = 5`
			`"""The limit on the number of documents returned."""`
			`client: "DocumentServiceClient" = None #: :meta private:`

			`@root_validator()`
			`def validate_environment(cls, values: Dict) -> Dict:`
			`"""Validates the environment."""`
			`try: # noqa: F401`
			`from google.cloud.contentwarehouse_v1 import DocumentServiceClient`
			`except ImportError as exc:`
			`raise ImportError(`
			`"google.cloud.contentwarehouse is not installed."`
			`"Please install it with pip install google-cloud-contentwarehouse"`
			`) from exc`

			`values["project_number"] = get_from_dict_or_env(`
			`values, "project_number", "PROJECT_NUMBER"`
			`)`
			`values["client"] = DocumentServiceClient(`
			`client_info=get_client_info(module="document-ai-warehouse")`
			`)`
			`return values`

			`def _prepare_request_metadata(self, user_ldap: str) -> "RequestMetadata":`
			`from google.cloud.contentwarehouse_v1 import RequestMetadata, UserInfo`

			`user_info = UserInfo(id=f"user:{user_ldap}")`
			`return RequestMetadata(user_info=user_info)`

			`def _get_relevant_documents(`
			`self, query: str, *, run_manager: CallbackManagerForRetrieverRun, **kwargs: Any`
			`) -> List[Document]:`
			`request = self._prepare_search_request(query, **kwargs)`
			`response = self.client.search_documents(request=request)`
			`return self._parse_search_response(response=response)`

			`def _prepare_search_request(`
			`self, query: str, **kwargs: Any`
			`) -> "SearchDocumentsRequest":`
			`from google.cloud.contentwarehouse_v1 import (`
			`DocumentQuery,`
			`SearchDocumentsRequest,`
			`)`

			`try:`
			`user_ldap = kwargs["user_ldap"]`
			`except KeyError:`
			`raise ValueError("Argument user_ldap should be provided!")`

			`request_metadata = self._prepare_request_metadata(user_ldap=user_ldap)`
			`schemas = []`
			`if self.schema_id:`
			`schemas.append(`
			`self.client.document_schema_path(`
			`project=self.project_number,`
			`location=self.location,`
			`document_schema=self.schema_id,`
			`)`
			`)`
			`return SearchDocumentsRequest(`
			`parent=self.client.common_location_path(self.project_number, self.location),`
			`request_metadata=request_metadata,`
			`document_query=DocumentQuery(`
			`query=query, is_nl_query=True, document_schema_names=schemas`
			`),`
			`qa_size_limit=self.qa_size_limit,`
			`)`

			`def _parse_search_response(`
			`self, response: "SearchDocumentsPager"`
			`) -> List[Document]:`
			`documents = []`
			`for doc in response.matching_documents:`
			`metadata = {`
			`"title": doc.document.title,`
			`"source": doc.document.raw_document_path,`
			`}`
			`documents.append(`
			`Document(page_content=doc.search_text_snippet, metadata=metadata)`
			`)`
			`return documents`