diff --git a/docs/modules/indexes/document_loaders/examples/embaas.ipynb b/docs/modules/indexes/document_loaders/examples/embaas.ipynb new file mode 100644 index 00000000..0c8c19d7 --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/embaas.ipynb @@ -0,0 +1,169 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Embaas\n", + "[embaas](https://embaas.io) is a fully managed NLP API service that offers features like embedding generation, document text extraction, document to embeddings and more. You can choose from a [variety of pre-trained models](https://embaas.io/docs/models/embeddings).\n", + "\n", + "### Prerequisites\n", + "Create a free embaas account at [https://embaas.io/register](https://embaas.io/register) and generate an [API key](https://embaas.io/dashboard/api-keys).\n", + "\n", + "### Document Text Extraction API\n", + "The document text extraction API allows you to extract the text from a given document. The API supports a variety of document formats, including PDF, mp3, mp4 and more. For a full list of supported formats, check out the API docs (link below)." 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Set API key\n", + "embaas_api_key = \"YOUR_API_KEY\"\n", + "# or set environment variable\n", + "os.environ[\"EMBAAS_API_KEY\"] = \"YOUR_API_KEY\"" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Using a blob (bytes)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "from langchain.document_loaders.embaas import EmbaasBlobLoader\n", + "from langchain.document_loaders.blob_loaders import Blob" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "blob_loader = EmbaasBlobLoader()\n", + "blob = Blob.from_path(\"example.pdf\")\n", + "documents = blob_loader.parse(blob)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# You can also directly create embeddings with your preferred embeddings model\n", + "blob_loader = EmbaasBlobLoader(params={\"model\": \"e5-large-v2\", \"should_embed\": True})\n", + "blob = Blob.from_path(\"example.pdf\")\n", + "documents = blob_loader.parse(blob)\n", + "\n", + "print(documents[0].metadata[\"embedding\"])" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "start_time": "2023-06-12T22:19:48.366886Z", + "end_time": "2023-06-12T22:19:48.380467Z" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Using a file" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "from langchain.document_loaders.embaas import EmbaasLoader" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "file_loader = 
EmbaasLoader(file_path=\"example.pdf\")\n", + "documents = file_loader.load()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 15, + "outputs": [], + "source": [ + "# Disable automatic text splitting\n", + "file_loader = EmbaasLoader(file_path=\"example.mp3\", params={\"should_chunk\": False})\n", + "documents = file_loader.load()" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "start_time": "2023-06-12T22:24:31.880857Z", + "end_time": "2023-06-12T22:24:31.894665Z" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "For more detailed information about the embaas document text extraction API, please refer to [the official embaas API documentation](https://embaas.io/api-reference)." + ], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 87d2335e..17f764b1 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -31,6 +31,7 @@ from langchain.document_loaders.email import ( OutlookMessageLoader, UnstructuredEmailLoader, ) +from langchain.document_loaders.embaas import EmbaasBlobLoader, EmbaasLoader from langchain.document_loaders.epub import UnstructuredEPubLoader from langchain.document_loaders.evernote import EverNoteLoader from langchain.document_loaders.excel import UnstructuredExcelLoader @@ -250,4 +251,6 @@ __all__ = [ "WikipediaLoader", "YoutubeLoader", "SnowflakeLoader", + "EmbaasLoader", + "EmbaasBlobLoader", ] diff --git 
a/langchain/document_loaders/embaas.py b/langchain/document_loaders/embaas.py
new file mode 100644
index 00000000..5dc4071e
--- /dev/null
+++ b/langchain/document_loaders/embaas.py
@@ -0,0 +1,257 @@
+import base64
+import warnings
+from typing import Any, Dict, Iterator, List, Optional
+
+import requests
+from pydantic import BaseModel, root_validator, validator
+from typing_extensions import NotRequired, TypedDict
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseBlobParser, BaseLoader
+from langchain.document_loaders.blob_loaders import Blob
+from langchain.text_splitter import TextSplitter
+from langchain.utils import get_from_dict_or_env
+
+EMBAAS_DOC_API_URL = "https://api.embaas.io/v1/document/extract-text/bytes/"
+
+
+class EmbaasDocumentExtractionParameters(TypedDict):
+    """Parameters for the embaas document extraction API."""
+
+    mime_type: NotRequired[str]
+    """The mime type of the document."""
+    file_extension: NotRequired[str]
+    """The file extension of the document."""
+    file_name: NotRequired[str]
+    """The file name of the document."""
+
+    should_chunk: NotRequired[bool]
+    """Whether to chunk the document into pages."""
+    chunk_size: NotRequired[int]
+    """The maximum size of the text chunks."""
+    chunk_overlap: NotRequired[int]
+    """The maximum overlap allowed between chunks."""
+    chunk_splitter: NotRequired[str]
+    """The text splitter class name for creating chunks."""
+    separators: NotRequired[List[str]]
+    """The separators for chunks."""
+
+    should_embed: NotRequired[bool]
+    """Whether to create embeddings for the document in the response."""
+    model: NotRequired[str]
+    """The model to pass to the Embaas document extraction API."""
+    instruction: NotRequired[str]
+    """The instruction to pass to the Embaas document extraction API."""
+
+
+class EmbaasDocumentExtractionPayload(EmbaasDocumentExtractionParameters):
+    """Request body for the embaas document extraction API."""
+
+    bytes: str
+    """The base64 encoded bytes of the document to extract text from."""
+
+
+class BaseEmbaasLoader(BaseModel):
+    """Settings shared by the embaas document loaders."""
+
+    embaas_api_key: Optional[str] = None
+    """The embaas API key; read from ``EMBAAS_API_KEY`` when not passed."""
+    api_url: str = EMBAAS_DOC_API_URL
+    """The URL of the embaas document extraction API."""
+    params: EmbaasDocumentExtractionParameters = EmbaasDocumentExtractionParameters()
+    """Additional parameters to pass to the embaas document extraction API."""
+
+    @root_validator(pre=True)
+    def validate_environment(cls, values: Dict) -> Dict:
+        """Validate that the API key is passed in or set in the environment."""
+        embaas_api_key = get_from_dict_or_env(
+            values, "embaas_api_key", "EMBAAS_API_KEY"
+        )
+        values["embaas_api_key"] = embaas_api_key
+        return values
+
+
+class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
+    """Wrapper around embaas's document byte loader service.
+
+    To use, you should have the
+    environment variable ``EMBAAS_API_KEY`` set with your API key, or pass
+    it as a named parameter to the constructor.
+
+    Example:
+        .. code-block:: python
+
+            # Default parsing
+            from langchain.document_loaders.blob_loaders import Blob
+            from langchain.document_loaders.embaas import EmbaasBlobLoader
+            loader = EmbaasBlobLoader()
+            blob = Blob.from_path(path="example.mp3")
+            documents = loader.parse(blob=blob)
+
+            # Custom api parameters (create embeddings automatically)
+            from langchain.document_loaders.embaas import EmbaasBlobLoader
+            loader = EmbaasBlobLoader(
+                params={
+                    "should_embed": True,
+                    "model": "e5-large-v2",
+                    "chunk_size": 256,
+                    "chunk_splitter": "CharacterTextSplitter"
+                }
+            )
+            blob = Blob.from_path(path="example.pdf")
+            documents = loader.parse(blob=blob)
+    """
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Lazily parse the blob into documents through the embaas API."""
+        yield from self._get_documents(blob=blob)
+
+    @staticmethod
+    def _api_response_to_documents(chunks: List[Dict[str, Any]]) -> List[Document]:
+        """Convert the API response to a list of documents."""
+        docs = []
+        for chunk in chunks:
+            metadata = chunk["metadata"]
+            # The embedding is optional in the response (see ``should_embed``);
+            # when present it is stored alongside the chunk's metadata.
+            if chunk.get("embedding") is not None:
+                metadata["embedding"] = chunk["embedding"]
+            docs.append(Document(page_content=chunk["text"], metadata=metadata))
+
+        return docs
+
+    def _generate_payload(self, blob: Blob) -> EmbaasDocumentExtractionPayload:
+        """Generates payload for the API request."""
+        base64_byte_str = base64.b64encode(blob.as_bytes()).decode()
+        payload: EmbaasDocumentExtractionPayload = EmbaasDocumentExtractionPayload(
+            bytes=base64_byte_str,
+            # Workaround for mypy issue: https://github.com/python/mypy/issues/9408
+            # type: ignore
+            **self.params,
+        )
+
+        if blob.mimetype is not None and payload.get("mime_type", None) is None:
+            payload["mime_type"] = blob.mimetype
+
+        return payload
+
+    def _handle_request(
+        self, payload: EmbaasDocumentExtractionPayload
+    ) -> List[Document]:
+        """Sends a request to the embaas API and handles the response."""
+        headers = {
+            "Authorization": f"Bearer {self.embaas_api_key}",
+            "Content-Type": "application/json",
+        }
+
+        response = requests.post(self.api_url, headers=headers, json=payload)
+        response.raise_for_status()
+
+        parsed_response = response.json()
+        return EmbaasBlobLoader._api_response_to_documents(
+            chunks=parsed_response["data"]["chunks"]
+        )
+
+    def _get_documents(self, blob: Blob) -> Iterator[Document]:
+        """Get the documents from the blob.
+
+        Raises:
+            ValueError: If the request fails without a JSON error body, or
+                the API reports a validation error through ``message``.
+            requests.exceptions.RequestException: If the JSON error body
+                contains no ``message``.
+        """
+        payload = self._generate_payload(blob=blob)
+
+        try:
+            documents = self._handle_request(payload=payload)
+        except requests.exceptions.RequestException as e:
+            if e.response is None or not e.response.text:
+                raise ValueError(
+                    f"Error raised by embaas document text extraction API: {e}"
+                ) from e
+
+            try:
+                parsed_response = e.response.json()
+            except ValueError:
+                # A non-JSON error body must not replace the HTTP failure with
+                # an unrelated decoding error; report the request error instead.
+                raise ValueError(
+                    f"Error raised by embaas document text extraction API: {e}"
+                ) from e
+
+            if isinstance(parsed_response, dict) and "message" in parsed_response:
+                raise ValueError(
+                    f"Validation Error raised by embaas document text extraction API:"
+                    f" {parsed_response['message']}"
+                ) from e
+            raise
+
+        yield from documents
+
+
+class EmbaasLoader(BaseEmbaasLoader, BaseLoader):
+    """Wrapper around embaas's document loader service.
+
+    To use, you should have the
+    environment variable ``EMBAAS_API_KEY`` set with your API key, or pass
+    it as a named parameter to the constructor.
+
+    Example:
+        .. code-block:: python
+
+            # Default parsing
+            from langchain.document_loaders.embaas import EmbaasLoader
+            loader = EmbaasLoader(file_path="example.mp3")
+            documents = loader.load()
+
+            # Custom api parameters (create embeddings automatically)
+            from langchain.document_loaders.embaas import EmbaasLoader
+            loader = EmbaasLoader(
+                file_path="example.pdf",
+                params={
+                    "should_embed": True,
+                    "model": "e5-large-v2",
+                    "chunk_size": 256,
+                    "chunk_splitter": "CharacterTextSplitter"
+                }
+            )
+            documents = loader.load()
+    """
+
+    file_path: str
+    """The path to the file to load."""
+    blob_loader: Optional[EmbaasBlobLoader]
+    """The blob loader to use. If not provided, a default one will be created."""
+
+    @validator("blob_loader", always=True)
+    def validate_blob_loader(
+        cls, v: EmbaasBlobLoader, values: Dict
+    ) -> EmbaasBlobLoader:
+        return v or EmbaasBlobLoader(
+            embaas_api_key=values["embaas_api_key"],
+            api_url=values["api_url"],
+            params=values["params"],
+        )
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Load the documents from the file path lazily."""
+        blob = Blob.from_path(path=self.file_path)
+
+        assert self.blob_loader is not None
+        # Should never be None, but mypy doesn't know that.
+        yield from self.blob_loader.lazy_parse(blob=blob)
+
+    def load(self) -> List[Document]:
+        return list(self.lazy_load())
+
+    def load_and_split(
+        self, text_splitter: Optional[TextSplitter] = None
+    ) -> List[Document]:
+        if self.params.get("should_embed", False):
+            warnings.warn(
+                "Embeddings are not supported with load_and_split."
+                " Use the API splitter to properly generate embeddings."
+                " For more information see embaas.io docs."
+            )
+        return super().load_and_split(text_splitter=text_splitter)
diff --git a/langchain/embeddings/embaas.py b/langchain/embeddings/embaas.py
index 8a9134f7..e0a42e46 100644
--- a/langchain/embeddings/embaas.py
+++ b/langchain/embeddings/embaas.py
@@ -32,17 +32,16 @@ class EmbaasEmbeddings(BaseModel, Embeddings):
         .. code-block:: python
 
             # Initialise with default model and instruction
-            from langchain.llms import EmbaasEmbeddings
+            from langchain.embeddings import EmbaasEmbeddings
             emb = EmbaasEmbeddings()
 
             # Initialise with custom model and instruction
-            from langchain.llms import EmbaasEmbeddings
+            from langchain.embeddings import EmbaasEmbeddings
             emb_model = "instructor-large"
             emb_inst = "Represent the Wikipedia document for retrieval"
             emb = EmbaasEmbeddings(
                 model=emb_model,
-                instruction=emb_inst,
-                embaas_api_key="your-api-key"
+                instruction=emb_inst
             )
     """
 
diff --git a/tests/integration_tests/document_loaders/test_embaas.py b/tests/integration_tests/document_loaders/test_embaas.py
new file mode 100644
index 00000000..2170a143
--- /dev/null
+++ b/tests/integration_tests/document_loaders/test_embaas.py
@@ -0,0 +1,67 @@
+from typing import Any
+from unittest.mock import MagicMock, patch
+
+import responses
+
+from langchain.document_loaders import EmbaasBlobLoader, EmbaasLoader
+from langchain.document_loaders.blob_loaders import Blob
+from langchain.document_loaders.embaas import EMBAAS_DOC_API_URL
+
+
+@responses.activate
+def test_handle_request() -> None:
+    """A successful response becomes documents that carry the embedding."""
+    responses.add(
+        responses.POST,
+        EMBAAS_DOC_API_URL,
+        json={
+            "data": {
+                "chunks": [
+                    {
+                        "text": "Hello",
+                        "metadata": {"start_page": 1, "end_page": 2},
+                        "embedding": [0.0],
+                    }
+                ]
+            }
+        },
+        status=200,
+    )
+
+    loader = EmbaasBlobLoader(embaas_api_key="api_key", params={"should_embed": True})
+    documents = loader.parse(blob=Blob.from_data(data="Hello"))
+    assert len(documents) == 1
+    assert documents[0].page_content == "Hello"
+    assert documents[0].metadata["start_page"] == 1
+    assert documents[0].metadata["end_page"] == 2
+    assert documents[0].metadata["embedding"] == [0.0]
+
+
+@responses.activate
+def test_handle_request_exception() -> None:
+    """An API error carrying a ``message`` is surfaced as a ValueError."""
+    responses.add(
+        responses.POST,
+        EMBAAS_DOC_API_URL,
+        json={"message": "Invalid request"},
+        status=400,
+    )
+    loader = EmbaasBlobLoader(embaas_api_key="api_key")
+    try:
+        loader.parse(blob=Blob.from_data(data="Hello"))
+    except ValueError as e:
+        assert "Invalid request" in str(e)
+    else:
+        # Without this branch the test passes even when nothing is raised.
+        raise AssertionError("Expected the 400 response to raise a ValueError")
+
+
+@patch.object(EmbaasBlobLoader, "_handle_request")
+def test_load(mock_handle_request: Any) -> None:
+    """EmbaasLoader.load returns the documents produced by the blob loader."""
+    mock_handle_request.return_value = [MagicMock()]
+    # Use this file as the input: the payload is built from the blob's bytes
+    # before the mocked request, so a bare relative name depends on the cwd.
+    loader = EmbaasLoader(file_path=__file__, embaas_api_key="api_key")
+    documents = loader.load()
+    assert len(documents) == 1