mirror of
https://github.com/hwchase17/langchain
synced 2024-11-16 06:13:16 +00:00
Bagatur/lakefs loader2 (#12524)
Co-authored-by: Jonathan Rosenberg <96974219+Jonathan-Rosenberg@users.noreply.github.com>
This commit is contained in:
parent
3243dcc83e
commit
9bedda50f2
103
docs/docs/integrations/document_loaders/lakefs.ipynb
Normal file
103
docs/docs/integrations/document_loaders/lakefs.ipynb
Normal file
@ -0,0 +1,103 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# lakeFS\n",
|
||||||
|
"\n",
|
||||||
|
">[lakeFS](https://docs.lakefs.io/) provides scalable version control over the data lake, and uses Git-like semantics to create and access those versions.\n",
|
||||||
|
"\n",
|
||||||
|
"This notebooks covers how to load document objects from a `lakeFS` path (whether it's an object or a prefix).\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"## Initializing the lakeFS loader\n",
|
||||||
|
"\n",
|
||||||
|
"Replace `ENDPOINT`, `LAKEFS_ACCESS_KEY`, and `LAKEFS_SECRET_KEY` values with your own."
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import LakeFSLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"ENDPOINT = \"\"\n",
|
||||||
|
"LAKEFS_ACCESS_KEY = \"\"\n",
|
||||||
|
"LAKEFS_SECRET_KEY = \"\"\n",
|
||||||
|
"\n",
|
||||||
|
"lakefs_loader = LakeFSLoader(\n",
|
||||||
|
" lakefs_access_key=LAKEFS_ACCESS_KEY,\n",
|
||||||
|
" lakefs_secret_key=LAKEFS_SECRET_KEY,\n",
|
||||||
|
" lakefs_endpoint=ENDPOINT,\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"## Specifying a path\n",
|
||||||
|
"You can specify a prefix or a complete object path to control which files to load.\n",
|
||||||
|
"\n",
|
||||||
|
"Specify the repository, reference (branch, commit id, or tag), and path in the corresponding `REPO`, `REF`, and `PATH` to load the documents from:"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"REPO = \"\"\n",
|
||||||
|
"REF = \"\"\n",
|
||||||
|
"PATH = \"\"\n",
|
||||||
|
"\n",
|
||||||
|
"lakefs_loader.set_repo(REPO)\n",
|
||||||
|
"lakefs_loader.set_ref(REF)\n",
|
||||||
|
"lakefs_loader.set_path(PATH)\n",
|
||||||
|
"\n",
|
||||||
|
"docs = lakefs_loader.load()\n",
|
||||||
|
"docs"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
@ -101,6 +101,7 @@ from langchain.document_loaders.imsdb import IMSDbLoader
|
|||||||
from langchain.document_loaders.iugu import IuguLoader
|
from langchain.document_loaders.iugu import IuguLoader
|
||||||
from langchain.document_loaders.joplin import JoplinLoader
|
from langchain.document_loaders.joplin import JoplinLoader
|
||||||
from langchain.document_loaders.json_loader import JSONLoader
|
from langchain.document_loaders.json_loader import JSONLoader
|
||||||
|
from langchain.document_loaders.lakefs import LakeFSLoader
|
||||||
from langchain.document_loaders.larksuite import LarkSuiteDocLoader
|
from langchain.document_loaders.larksuite import LarkSuiteDocLoader
|
||||||
from langchain.document_loaders.markdown import UnstructuredMarkdownLoader
|
from langchain.document_loaders.markdown import UnstructuredMarkdownLoader
|
||||||
from langchain.document_loaders.mastodon import MastodonTootsLoader
|
from langchain.document_loaders.mastodon import MastodonTootsLoader
|
||||||
@ -280,6 +281,7 @@ __all__ = [
|
|||||||
"JSONLoader",
|
"JSONLoader",
|
||||||
"JoplinLoader",
|
"JoplinLoader",
|
||||||
"LarkSuiteDocLoader",
|
"LarkSuiteDocLoader",
|
||||||
|
"LakeFSLoader",
|
||||||
"MHTMLLoader",
|
"MHTMLLoader",
|
||||||
"MWDumpLoader",
|
"MWDumpLoader",
|
||||||
"MastodonTootsLoader",
|
"MastodonTootsLoader",
|
||||||
|
179
libs/langchain/langchain/document_loaders/lakefs.py
Normal file
179
libs/langchain/langchain/document_loaders/lakefs.py
Normal file
@ -0,0 +1,179 @@
|
|||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
import urllib.parse
|
||||||
|
from typing import Any, List, Optional
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from requests.auth import HTTPBasicAuth
|
||||||
|
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
from langchain.document_loaders.unstructured import UnstructuredBaseLoader
|
||||||
|
from langchain.schema import Document
|
||||||
|
|
||||||
|
|
||||||
|
class LakeFSClient:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
lakefs_access_key: str,
|
||||||
|
lakefs_secret_key: str,
|
||||||
|
lakefs_endpoint: str,
|
||||||
|
):
|
||||||
|
self.__endpoint = "/".join([lakefs_endpoint, "api", "v1/"])
|
||||||
|
self.__auth = HTTPBasicAuth(lakefs_access_key, lakefs_secret_key)
|
||||||
|
try:
|
||||||
|
health_check = requests.get(
|
||||||
|
urljoin(self.__endpoint, "healthcheck"), auth=self.__auth
|
||||||
|
)
|
||||||
|
health_check.raise_for_status()
|
||||||
|
except Exception:
|
||||||
|
raise ValueError(
|
||||||
|
"lakeFS server isn't accessible. Make sure lakeFS is running."
|
||||||
|
)
|
||||||
|
|
||||||
|
def ls_objects(
|
||||||
|
self, repo: str, ref: str, path: str, presign: Optional[bool]
|
||||||
|
) -> List:
|
||||||
|
qp = {"prefix": path, "presign": presign}
|
||||||
|
eqp = urllib.parse.urlencode(qp)
|
||||||
|
objects_ls_endpoint = urljoin(
|
||||||
|
self.__endpoint, f"repositories/{repo}/refs/{ref}/objects/ls?{eqp}"
|
||||||
|
)
|
||||||
|
olsr = requests.get(objects_ls_endpoint, auth=self.__auth)
|
||||||
|
olsr.raise_for_status()
|
||||||
|
olsr_json = olsr.json()
|
||||||
|
return list(
|
||||||
|
map(
|
||||||
|
lambda res: (res["path"], res["physical_address"]), olsr_json["results"]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
def is_presign_supported(self) -> bool:
|
||||||
|
config_endpoint = self.__endpoint + "config"
|
||||||
|
response = requests.get(config_endpoint, auth=self.__auth)
|
||||||
|
response.raise_for_status()
|
||||||
|
config = response.json()
|
||||||
|
return config["storage_config"]["pre_sign_support"]
|
||||||
|
|
||||||
|
|
||||||
|
class LakeFSLoader(BaseLoader):
|
||||||
|
"""Load from `lakeFS`."""
|
||||||
|
|
||||||
|
repo: str
|
||||||
|
ref: str
|
||||||
|
path: str
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
lakefs_access_key: str,
|
||||||
|
lakefs_secret_key: str,
|
||||||
|
lakefs_endpoint: str,
|
||||||
|
repo: Optional[str] = None,
|
||||||
|
ref: Optional[str] = "main",
|
||||||
|
path: Optional[str] = "",
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
|
||||||
|
:param lakefs_access_key: [required] lakeFS server's access key
|
||||||
|
:param lakefs_secret_key: [required] lakeFS server's secret key
|
||||||
|
:param lakefs_endpoint: [required] lakeFS server's endpoint address,
|
||||||
|
ex: https://example.my-lakefs.com
|
||||||
|
:param repo: [optional, default = ''] target repository
|
||||||
|
:param ref: [optional, default = 'main'] target ref (branch name,
|
||||||
|
tag, or commit ID)
|
||||||
|
:param path: [optional, default = ''] target path
|
||||||
|
"""
|
||||||
|
|
||||||
|
self.__lakefs_client = LakeFSClient(
|
||||||
|
lakefs_access_key, lakefs_secret_key, lakefs_endpoint
|
||||||
|
)
|
||||||
|
self.repo = "" if repo is None or repo == "" else str(repo)
|
||||||
|
self.ref = "main" if ref is None or ref == "" else str(ref)
|
||||||
|
self.path = "" if path is None else str(path)
|
||||||
|
|
||||||
|
def set_path(self, path: str) -> None:
|
||||||
|
self.path = path
|
||||||
|
|
||||||
|
def set_ref(self, ref: str) -> None:
|
||||||
|
self.ref = ref
|
||||||
|
|
||||||
|
def set_repo(self, repo: str) -> None:
|
||||||
|
self.repo = repo
|
||||||
|
|
||||||
|
def load(self) -> List[Document]:
|
||||||
|
self.__validate_instance()
|
||||||
|
presigned = self.__lakefs_client.is_presign_supported()
|
||||||
|
docs: List[Document] = []
|
||||||
|
objs = self.__lakefs_client.ls_objects(
|
||||||
|
repo=self.repo, ref=self.ref, path=self.path, presign=presigned
|
||||||
|
)
|
||||||
|
for obj in objs:
|
||||||
|
lakefs_unstructured_loader = UnstructuredLakeFSLoader(
|
||||||
|
obj[1], self.repo, self.ref, obj[0], presigned
|
||||||
|
)
|
||||||
|
docs.extend(lakefs_unstructured_loader.load())
|
||||||
|
return docs
|
||||||
|
|
||||||
|
def __validate_instance(self) -> None:
|
||||||
|
if self.repo is None or self.repo == "":
|
||||||
|
raise ValueError(
|
||||||
|
"no repository was provided. use `set_repo` to specify a repository"
|
||||||
|
)
|
||||||
|
if self.ref is None or self.ref == "":
|
||||||
|
raise ValueError("no ref was provided. use `set_ref` to specify a ref")
|
||||||
|
if self.path is None:
|
||||||
|
raise ValueError("no path was provided. use `set_path` to specify a path")
|
||||||
|
|
||||||
|
|
||||||
|
class UnstructuredLakeFSLoader(UnstructuredBaseLoader):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
url: str,
|
||||||
|
repo: str,
|
||||||
|
ref: str = "main",
|
||||||
|
path: str = "",
|
||||||
|
presign: bool = True,
|
||||||
|
**unstructured_kwargs: Any,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
|
||||||
|
Args:
|
||||||
|
|
||||||
|
:param lakefs_access_key:
|
||||||
|
:param lakefs_secret_key:
|
||||||
|
:param lakefs_endpoint:
|
||||||
|
:param repo:
|
||||||
|
:param ref:
|
||||||
|
"""
|
||||||
|
|
||||||
|
super().__init__(**unstructured_kwargs)
|
||||||
|
self.url = url
|
||||||
|
self.repo = repo
|
||||||
|
self.ref = ref
|
||||||
|
self.path = path
|
||||||
|
self.presign = presign
|
||||||
|
|
||||||
|
def _get_metadata(self) -> dict:
|
||||||
|
return {"repo": self.repo, "ref": self.ref, "path": self.path}
|
||||||
|
|
||||||
|
def _get_elements(self) -> List:
|
||||||
|
from unstructured.partition.auto import partition
|
||||||
|
|
||||||
|
local_prefix = "local://"
|
||||||
|
|
||||||
|
if self.presign:
|
||||||
|
with tempfile.TemporaryDirectory() as temp_dir:
|
||||||
|
file_path = f"{temp_dir}/{self.path.split('/')[-1]}"
|
||||||
|
os.makedirs(os.path.dirname(file_path), exist_ok=True)
|
||||||
|
response = requests.get(self.url)
|
||||||
|
response.raise_for_status()
|
||||||
|
with open(file_path, mode="wb") as file:
|
||||||
|
file.write(response.content)
|
||||||
|
return partition(filename=file_path)
|
||||||
|
elif not self.url.startswith(local_prefix):
|
||||||
|
raise ValueError(
|
||||||
|
"Non pre-signed URLs are supported only with 'local' blockstore"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
local_path = self.url[len(local_prefix) :]
|
||||||
|
return partition(filename=local_path)
|
41
libs/langchain/poetry.lock
generated
41
libs/langchain/poetry.lock
generated
@ -4605,6 +4605,16 @@ files = [
|
|||||||
{file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"},
|
{file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"},
|
||||||
{file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"},
|
{file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"},
|
||||||
{file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"},
|
{file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"},
|
||||||
|
{file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"},
|
||||||
|
{file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"},
|
||||||
|
{file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"},
|
||||||
|
{file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"},
|
||||||
|
{file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"},
|
||||||
|
{file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"},
|
||||||
|
{file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"},
|
||||||
|
{file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"},
|
||||||
|
{file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"},
|
||||||
|
{file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"},
|
||||||
{file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"},
|
{file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"},
|
||||||
{file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"},
|
{file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"},
|
||||||
{file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"},
|
{file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"},
|
||||||
@ -7707,6 +7717,7 @@ files = [
|
|||||||
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"},
|
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"},
|
||||||
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"},
|
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"},
|
||||||
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"},
|
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"},
|
||||||
|
{file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"},
|
||||||
{file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"},
|
{file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"},
|
||||||
{file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"},
|
{file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"},
|
||||||
{file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"},
|
{file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"},
|
||||||
@ -7714,8 +7725,15 @@ files = [
|
|||||||
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"},
|
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"},
|
||||||
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"},
|
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"},
|
||||||
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"},
|
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"},
|
||||||
|
{file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"},
|
||||||
{file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"},
|
{file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"},
|
||||||
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
|
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
|
||||||
|
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
|
||||||
|
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
|
||||||
|
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
|
||||||
|
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
|
||||||
|
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
|
||||||
|
{file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"},
|
||||||
{file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"},
|
{file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"},
|
||||||
{file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"},
|
{file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"},
|
||||||
{file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"},
|
{file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"},
|
||||||
@ -7732,6 +7750,7 @@ files = [
|
|||||||
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"},
|
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"},
|
||||||
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"},
|
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"},
|
||||||
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"},
|
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"},
|
||||||
|
{file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"},
|
||||||
{file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"},
|
{file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"},
|
||||||
{file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"},
|
{file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"},
|
||||||
{file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"},
|
{file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"},
|
||||||
@ -7739,6 +7758,7 @@ files = [
|
|||||||
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"},
|
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"},
|
||||||
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"},
|
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"},
|
||||||
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"},
|
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"},
|
||||||
|
{file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"},
|
||||||
{file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"},
|
{file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"},
|
||||||
{file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"},
|
{file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"},
|
||||||
{file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"},
|
{file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"},
|
||||||
@ -8274,6 +8294,25 @@ files = [
|
|||||||
requests = ">=1.0.0"
|
requests = ">=1.0.0"
|
||||||
six = "*"
|
six = "*"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "requests-mock"
|
||||||
|
version = "1.11.0"
|
||||||
|
description = "Mock out responses from the requests package"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
files = [
|
||||||
|
{file = "requests-mock-1.11.0.tar.gz", hash = "sha256:ef10b572b489a5f28e09b708697208c4a3b2b89ef80a9f01584340ea357ec3c4"},
|
||||||
|
{file = "requests_mock-1.11.0-py2.py3-none-any.whl", hash = "sha256:f7fae383f228633f6bececebdab236c478ace2284d6292c6e7e2867b9ab74d15"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
requests = ">=2.3,<3"
|
||||||
|
six = "*"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
fixture = ["fixtures"]
|
||||||
|
test = ["fixtures", "mock", "purl", "pytest", "requests-futures", "sphinx", "testtools"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "requests-oauthlib"
|
name = "requests-oauthlib"
|
||||||
version = "1.3.1"
|
version = "1.3.1"
|
||||||
@ -11047,4 +11086,4 @@ text-helpers = ["chardet"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = ">=3.8.1,<4.0"
|
python-versions = ">=3.8.1,<4.0"
|
||||||
content-hash = "1da427450c9a9c6b20082b98e6a1371d9cc0fb48da2e45831070587c27449b40"
|
content-hash = "9345cd37346e9f369702f51b7e10dde8da91d5f7b659c8c204e5b46c360cd028"
|
||||||
|
@ -159,6 +159,8 @@ pandas = "^2.0.0"
|
|||||||
pytest-mock = "^3.10.0"
|
pytest-mock = "^3.10.0"
|
||||||
pytest-socket = "^0.6.0"
|
pytest-socket = "^0.6.0"
|
||||||
syrupy = "^4.0.2"
|
syrupy = "^4.0.2"
|
||||||
|
requests-mock = "^1.11.0"
|
||||||
|
|
||||||
|
|
||||||
[tool.poetry.group.codespell.dependencies]
|
[tool.poetry.group.codespell.dependencies]
|
||||||
codespell = "^2.2.0"
|
codespell = "^2.2.0"
|
||||||
|
@ -0,0 +1,71 @@
|
|||||||
|
import unittest
|
||||||
|
from typing import Any
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import requests_mock
|
||||||
|
from requests_mock.mocker import Mocker
|
||||||
|
|
||||||
|
from langchain.document_loaders.lakefs import LakeFSLoader
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_lakefs_client() -> Any:
|
||||||
|
with patch("langchain.document_loaders.lakefs.LakeFSClient") as mock_lakefs_client:
|
||||||
|
mock_lakefs_client.return_value.ls_objects.return_value = [
|
||||||
|
("path_bla.txt", "https://physical_address_bla")
|
||||||
|
]
|
||||||
|
mock_lakefs_client.return_value.is_presign_supported.return_value = True
|
||||||
|
yield mock_lakefs_client.return_value
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_lakefs_client_no_presign_not_local() -> Any:
|
||||||
|
with patch("langchain.document_loaders.lakefs.LakeFSClient") as mock_lakefs_client:
|
||||||
|
mock_lakefs_client.return_value.ls_objects.return_value = [
|
||||||
|
("path_bla.txt", "https://physical_address_bla")
|
||||||
|
]
|
||||||
|
mock_lakefs_client.return_value.is_presign_supported.return_value = False
|
||||||
|
yield mock_lakefs_client.return_value
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_unstructured_local() -> Any:
|
||||||
|
with patch(
|
||||||
|
"langchain.document_loaders.lakefs.UnstructuredLakeFSLoader"
|
||||||
|
) as mock_unstructured_lakefs:
|
||||||
|
mock_unstructured_lakefs.return_value.load.return_value = [
|
||||||
|
("text content", "pdf content")
|
||||||
|
]
|
||||||
|
yield mock_unstructured_lakefs.return_value
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_lakefs_client_no_presign_local() -> Any:
|
||||||
|
with patch("langchain.document_loaders.lakefs.LakeFSClient") as mock_lakefs_client:
|
||||||
|
mock_lakefs_client.return_value.ls_objects.return_value = [
|
||||||
|
("path_bla.txt", "local:///physical_address_bla")
|
||||||
|
]
|
||||||
|
mock_lakefs_client.return_value.is_presign_supported.return_value = False
|
||||||
|
yield mock_lakefs_client.return_value
|
||||||
|
|
||||||
|
|
||||||
|
class TestLakeFSLoader(unittest.TestCase):
|
||||||
|
lakefs_access_key = "lakefs_access_key"
|
||||||
|
lakefs_secret_key = "lakefs_secret_key"
|
||||||
|
endpoint = "endpoint"
|
||||||
|
repo = "repo"
|
||||||
|
ref = "ref"
|
||||||
|
path = "path"
|
||||||
|
|
||||||
|
@requests_mock.Mocker()
|
||||||
|
@pytest.mark.usefixtures("mock_lakefs_client")
|
||||||
|
def test_presigned_loading(self, mocker: Mocker) -> None:
|
||||||
|
mocker.register_uri("GET", requests_mock.ANY, text="data")
|
||||||
|
loader = LakeFSLoader(
|
||||||
|
self.lakefs_access_key, self.lakefs_secret_key, self.endpoint
|
||||||
|
)
|
||||||
|
loader.set_repo(self.repo)
|
||||||
|
loader.set_ref(self.ref)
|
||||||
|
loader.set_path(self.path)
|
||||||
|
loader.load()
|
@ -0,0 +1,88 @@
|
|||||||
|
import unittest
|
||||||
|
from typing import Any
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import requests_mock
|
||||||
|
from requests_mock.mocker import Mocker
|
||||||
|
|
||||||
|
from langchain.document_loaders.lakefs import LakeFSLoader
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_lakefs_client() -> Any:
|
||||||
|
with patch("langchain.document_loaders.lakefs.LakeFSClient") as mock_lakefs_client:
|
||||||
|
mock_lakefs_client.return_value.ls_objects.return_value = [
|
||||||
|
("path_bla.txt", "https://physical_address_bla")
|
||||||
|
]
|
||||||
|
mock_lakefs_client.return_value.is_presign_supported.return_value = True
|
||||||
|
yield mock_lakefs_client.return_value
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_lakefs_client_no_presign_not_local() -> Any:
|
||||||
|
with patch("langchain.document_loaders.lakefs.LakeFSClient") as mock_lakefs_client:
|
||||||
|
mock_lakefs_client.return_value.ls_objects.return_value = [
|
||||||
|
("path_bla.txt", "https://physical_address_bla")
|
||||||
|
]
|
||||||
|
mock_lakefs_client.return_value.is_presign_supported.return_value = False
|
||||||
|
yield mock_lakefs_client.return_value
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_unstructured_local() -> Any:
|
||||||
|
with patch(
|
||||||
|
"langchain.document_loaders.lakefs.UnstructuredLakeFSLoader"
|
||||||
|
) as mock_unstructured_lakefs:
|
||||||
|
mock_unstructured_lakefs.return_value.load.return_value = [
|
||||||
|
("text content", "pdf content")
|
||||||
|
]
|
||||||
|
yield mock_unstructured_lakefs.return_value
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_lakefs_client_no_presign_local() -> Any:
|
||||||
|
with patch("langchain.document_loaders.lakefs.LakeFSClient") as mock_lakefs_client:
|
||||||
|
mock_lakefs_client.return_value.ls_objects.return_value = [
|
||||||
|
("path_bla.txt", "local:///physical_address_bla")
|
||||||
|
]
|
||||||
|
mock_lakefs_client.return_value.is_presign_supported.return_value = False
|
||||||
|
yield mock_lakefs_client.return_value
|
||||||
|
|
||||||
|
|
||||||
|
class TestLakeFSLoader(unittest.TestCase):
|
||||||
|
lakefs_access_key = "lakefs_access_key"
|
||||||
|
lakefs_secret_key = "lakefs_secret_key"
|
||||||
|
endpoint = "endpoint"
|
||||||
|
repo = "repo"
|
||||||
|
ref = "ref"
|
||||||
|
path = "path"
|
||||||
|
|
||||||
|
@requests_mock.Mocker()
|
||||||
|
@pytest.mark.usefixtures("mock_lakefs_client_no_presign_not_local")
|
||||||
|
def test_non_presigned_loading_fail(self, mocker: Mocker) -> None:
|
||||||
|
mocker.register_uri(requests_mock.ANY, requests_mock.ANY, status_code=200)
|
||||||
|
loader = LakeFSLoader(
|
||||||
|
self.lakefs_access_key, self.lakefs_secret_key, self.endpoint
|
||||||
|
)
|
||||||
|
loader.set_repo(self.repo)
|
||||||
|
loader.set_ref(self.ref)
|
||||||
|
loader.set_path(self.path)
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
loader.load()
|
||||||
|
|
||||||
|
@requests_mock.Mocker()
|
||||||
|
@pytest.mark.usefixtures(
|
||||||
|
"mock_lakefs_client_no_presign_local", "mock_unstructured_local"
|
||||||
|
)
|
||||||
|
def test_non_presigned_loading(self, mocker: Mocker) -> None:
|
||||||
|
mocker.register_uri(requests_mock.ANY, requests_mock.ANY, status_code=200)
|
||||||
|
loader = LakeFSLoader(
|
||||||
|
lakefs_access_key="lakefs_access_key",
|
||||||
|
lakefs_secret_key="lakefs_secret_key",
|
||||||
|
lakefs_endpoint=self.endpoint,
|
||||||
|
)
|
||||||
|
loader.set_repo(self.repo)
|
||||||
|
loader.set_ref(self.ref)
|
||||||
|
loader.set_path(self.path)
|
||||||
|
loader.load()
|
@ -69,21 +69,24 @@ def test_test_group_dependencies(poetry_conf: Mapping[str, Any]) -> None:
|
|||||||
|
|
||||||
test_group_deps = sorted(poetry_conf["group"]["test"]["dependencies"])
|
test_group_deps = sorted(poetry_conf["group"]["test"]["dependencies"])
|
||||||
|
|
||||||
assert test_group_deps == [
|
assert test_group_deps == sorted(
|
||||||
"duckdb-engine",
|
[
|
||||||
"freezegun",
|
"duckdb-engine",
|
||||||
"lark",
|
"freezegun",
|
||||||
"pandas",
|
"lark",
|
||||||
"pytest",
|
"pandas",
|
||||||
"pytest-asyncio",
|
"pytest",
|
||||||
"pytest-cov",
|
"pytest-asyncio",
|
||||||
"pytest-dotenv",
|
"pytest-cov",
|
||||||
"pytest-mock",
|
"pytest-dotenv",
|
||||||
"pytest-socket",
|
"pytest-mock",
|
||||||
"pytest-watcher",
|
"pytest-socket",
|
||||||
"responses",
|
"pytest-watcher",
|
||||||
"syrupy",
|
"responses",
|
||||||
]
|
"syrupy",
|
||||||
|
"requests-mock",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_imports() -> None:
|
def test_imports() -> None:
|
||||||
|
Loading…
Reference in New Issue
Block a user