mirror of
https://github.com/hwchase17/langchain
synced 2024-11-08 07:10:35 +00:00
ed58eeb9c5
Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories 
community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes
121 lines
4.5 KiB
Python
121 lines
4.5 KiB
Python
"""Retriever wrapper for Google Cloud Document AI Warehouse."""
|
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional
|
|
|
|
from langchain_core.callbacks import CallbackManagerForRetrieverRun
|
|
from langchain_core.documents import Document
|
|
from langchain_core.pydantic_v1 import root_validator
|
|
from langchain_core.retrievers import BaseRetriever
|
|
from langchain_core.utils import get_from_dict_or_env
|
|
|
|
from langchain_community.utilities.vertexai import get_client_info
|
|
|
|
if TYPE_CHECKING:
|
|
from google.cloud.contentwarehouse_v1 import (
|
|
DocumentServiceClient,
|
|
RequestMetadata,
|
|
SearchDocumentsRequest,
|
|
)
|
|
from google.cloud.contentwarehouse_v1.services.document_service.pagers import (
|
|
SearchDocumentsPager,
|
|
)
|
|
|
|
|
|
class GoogleDocumentAIWarehouseRetriever(BaseRetriever):
    """A retriever based on Document AI Warehouse.

    Documents should be created and uploaded in a separate flow; this
    retriever only uses the Document AI Warehouse ``schema_id`` provided
    to search for relevant documents.

    More info: https://cloud.google.com/document-ai-warehouse.
    """

    location: str = "us"
    """Google Cloud location where Document AI Warehouse is placed."""
    project_number: str
    """Google Cloud project number, should contain digits only."""
    schema_id: Optional[str] = None
    """Document AI Warehouse schema to query against.
    If nothing is provided, all documents in the project will be searched."""
    qa_size_limit: int = 5
    """The limit on the number of documents returned."""
    client: "DocumentServiceClient" = None  #: :meta private:

    # NOTE(review): this validator runs after field validation; confirm that a
    # missing ``project_number`` can really be recovered from the
    # ``PROJECT_NUMBER`` environment variable at this stage.
    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate the environment and build the Document AI Warehouse client.

        Args:
            values: Field values collected for the model.

        Returns:
            ``values`` with ``project_number`` resolved (from the passed values
            or the ``PROJECT_NUMBER`` environment variable) and ``client`` set
            to a freshly created ``DocumentServiceClient``.

        Raises:
            ImportError: If ``google-cloud-contentwarehouse`` is not installed.
        """
        # Imported lazily so that the package stays an optional dependency.
        try:
            from google.cloud.contentwarehouse_v1 import DocumentServiceClient
        except ImportError as exc:
            raise ImportError(
                "google.cloud.contentwarehouse is not installed. "
                "Please install it with pip install google-cloud-contentwarehouse"
            ) from exc

        values["project_number"] = get_from_dict_or_env(
            values, "project_number", "PROJECT_NUMBER"
        )
        values["client"] = DocumentServiceClient(
            client_info=get_client_info(module="document-ai-warehouse")
        )
        return values

    def _prepare_request_metadata(self, user_ldap: str) -> "RequestMetadata":
        """Build the request metadata identifying the calling user.

        Args:
            user_ldap: User identifier; it is sent as ``user:<user_ldap>``.

        Returns:
            A ``RequestMetadata`` carrying the corresponding ``UserInfo``.
        """
        from google.cloud.contentwarehouse_v1 import RequestMetadata, UserInfo

        user_info = UserInfo(id=f"user:{user_ldap}")
        return RequestMetadata(user_info=user_info)

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun, **kwargs: Any
    ) -> List[Document]:
        """Search Document AI Warehouse and return the matches as documents.

        Args:
            query: Query text to search for.
            run_manager: Callback manager for the retriever run (not used here).
            **kwargs: Must contain ``user_ldap``, the identifier of the user on
                whose behalf the search is performed.

        Returns:
            One ``Document`` per matching document in the search response.

        Raises:
            ValueError: If ``user_ldap`` is not supplied in ``kwargs``.
        """
        request = self._prepare_search_request(query, **kwargs)
        response = self.client.search_documents(request=request)
        return self._parse_search_response(response=response)

    def _prepare_search_request(
        self, query: str, **kwargs: Any
    ) -> "SearchDocumentsRequest":
        """Build the ``SearchDocumentsRequest`` for ``query``.

        Args:
            query: Query text; it is sent with ``is_nl_query=True``.
            **kwargs: Must contain ``user_ldap``.

        Returns:
            A request scoped to this retriever's project and location and,
            when ``schema_id`` is set, restricted to that document schema.

        Raises:
            ValueError: If ``user_ldap`` is not supplied in ``kwargs``.
        """
        from google.cloud.contentwarehouse_v1 import (
            DocumentQuery,
            SearchDocumentsRequest,
        )

        try:
            user_ldap = kwargs["user_ldap"]
        except KeyError as exc:
            raise ValueError("Argument user_ldap should be provided!") from exc

        request_metadata = self._prepare_request_metadata(user_ldap=user_ldap)

        # An empty schema list leaves the search unrestricted by schema.
        schemas = (
            [
                self.client.document_schema_path(
                    project=self.project_number,
                    location=self.location,
                    document_schema=self.schema_id,
                )
            ]
            if self.schema_id
            else []
        )
        parent = self.client.common_location_path(self.project_number, self.location)
        return SearchDocumentsRequest(
            parent=parent,
            request_metadata=request_metadata,
            document_query=DocumentQuery(
                query=query, is_nl_query=True, document_schema_names=schemas
            ),
            qa_size_limit=self.qa_size_limit,
        )

    def _parse_search_response(
        self, response: "SearchDocumentsPager"
    ) -> List[Document]:
        """Convert a search response into a list of ``Document`` objects.

        Args:
            response: The object returned by ``client.search_documents``.

        Returns:
            One ``Document`` per entry of ``response.matching_documents``: the
            search text snippet becomes ``page_content``, and the document
            title and raw document path are stored in the metadata under
            ``"title"`` and ``"source"``.
        """
        # NOTE(review): only ``response.matching_documents`` is read here;
        # confirm whether further result pages need to be consumed as well.
        return [
            Document(
                page_content=match.search_text_snippet,
                metadata={
                    "title": match.document.title,
                    "source": match.document.raw_document_path,
                },
            )
            for match in response.matching_documents
        ]
|