mirror of
https://github.com/hwchase17/langchain
synced 2024-11-04 06:00:26 +00:00
Fix GitLoader to handle repeated load calls (#8412)
**Description: a description of the change** In this pull request, GitLoader has been updated to handle multiple load calls, provided the same repository is being cloned. Previously, calling `load` multiple times would raise an error if a clone URL was provided. Additionally, a check has been added to raise a ValueError when attempting to clone a different repository into an existing path. New tests have also been introduced to verify the correct behavior of the GitLoader class when `load` is called multiple times. Lastly, the GitPython package, a dependency for the GitLoader class, has been added to the project dependencies (pyproject.toml and poetry.lock). **Issue: the issue # it fixes (if applicable)** None **Dependencies: any dependencies required for this change** GitPython **Tag maintainer: for a quicker response, tag the relevant maintainer (see below)** - DataLoaders / VectorStores / Retrievers: @rlancemartin, @eyurtsev
This commit is contained in:
parent
9975ba4124
commit
a795c3d860
@ -49,7 +49,18 @@ class GitLoader(BaseLoader):
|
||||
if not os.path.exists(self.repo_path) and self.clone_url is None:
|
||||
raise ValueError(f"Path {self.repo_path} does not exist")
|
||||
elif self.clone_url:
|
||||
repo = Repo.clone_from(self.clone_url, self.repo_path)
|
||||
# If the repo_path already contains a git repository, verify that it's the
|
||||
# same repository as the one we're trying to clone.
|
||||
if os.path.isdir(os.path.join(self.repo_path, ".git")):
|
||||
repo = Repo(self.repo_path)
|
||||
# If the existing repository is not the same as the one we're trying to
|
||||
# clone, raise an error.
|
||||
if repo.remotes.origin.url != self.clone_url:
|
||||
raise ValueError(
|
||||
"A different repository is already cloned at this path."
|
||||
)
|
||||
else:
|
||||
repo = Repo.clone_from(self.clone_url, self.repo_path)
|
||||
repo.git.checkout(self.branch)
|
||||
else:
|
||||
repo = Repo(self.repo_path)
|
||||
|
11
libs/langchain/poetry.lock
generated
11
libs/langchain/poetry.lock
generated
@ -3269,14 +3269,14 @@ smmap = ">=3.0.1,<6"
|
||||
|
||||
[[package]]
|
||||
name = "gitpython"
|
||||
version = "3.1.31"
|
||||
version = "3.1.32"
|
||||
description = "GitPython is a Python library used to interact with Git repositories"
|
||||
category = "main"
|
||||
optional = true
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "GitPython-3.1.31-py3-none-any.whl", hash = "sha256:f04893614f6aa713a60cbbe1e6a97403ef633103cdd0ef5eb6efe0deb98dbe8d"},
|
||||
{file = "GitPython-3.1.31.tar.gz", hash = "sha256:8ce3bcf69adfdf7c7d503e78fd3b1c492af782d58893b650adb2ac8912ddd573"},
|
||||
{file = "GitPython-3.1.32-py3-none-any.whl", hash = "sha256:e3d59b1c2c6ebb9dfa7a184daf3b6dd4914237e7488a1730a6d8f6f5d0b4187f"},
|
||||
{file = "GitPython-3.1.32.tar.gz", hash = "sha256:8d9b8cb1e80b9735e8717c9362079d3ce4c6e5ddeebedd0361b228c3a67a62f6"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@ -4652,6 +4652,7 @@ optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
|
||||
files = [
|
||||
{file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"},
|
||||
{file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -13229,7 +13230,7 @@ clarifai = ["clarifai"]
|
||||
cohere = ["cohere"]
|
||||
docarray = ["docarray"]
|
||||
embeddings = ["sentence-transformers"]
|
||||
extended-testing = ["atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "geopandas", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xinference", "zep-python"]
|
||||
extended-testing = ["atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xinference", "zep-python"]
|
||||
javascript = ["esprima"]
|
||||
llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers", "xinference"]
|
||||
openai = ["openai", "tiktoken"]
|
||||
@ -13239,4 +13240,4 @@ text-helpers = ["chardet"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.8.1,<4.0"
|
||||
content-hash = "5b1c718874d76c0e3b4023b2bceebe11a5e26e5e05d6797acf91b01b0438b2f7"
|
||||
content-hash = "ef2b1d30e0fa872ce764c8a4cbc6e0a460bc9391a6465ee29d657e83b5459391"
|
||||
|
@ -126,6 +126,7 @@ amadeus = {version = ">=8.1.0", optional = true}
|
||||
geopandas = {version = "^0.13.1", optional = true}
|
||||
xinference = {version = "^0.0.6", optional = true}
|
||||
python-arango = {version = "^7.5.9", optional = true}
|
||||
gitpython = {version = "^3.1.32", optional = true}
|
||||
|
||||
[tool.poetry.group.test.dependencies]
|
||||
# The only dependencies that should be added are
|
||||
@ -359,6 +360,7 @@ extended_testing = [
|
||||
"geopandas",
|
||||
"jinja2",
|
||||
"xinference",
|
||||
"gitpython",
|
||||
]
|
||||
|
||||
[tool.ruff]
|
||||
|
65
libs/langchain/tests/unit_tests/document_loaders/test_git.py
Normal file
65
libs/langchain/tests/unit_tests/document_loaders/test_git.py
Normal file
@ -0,0 +1,65 @@
|
||||
import os
|
||||
|
||||
import py
|
||||
import pytest
|
||||
|
||||
from langchain.document_loaders import GitLoader
|
||||
|
||||
|
||||
def init_repo(tmpdir: py.path.local, dir_name: str) -> str:
    """Create a one-commit git repository under ``tmpdir`` and return its path.

    The repository is initialised in a new ``dir_name`` subdirectory, switched
    to a new ``main`` branch, and given a single committed file, ``file.txt``.

    Args:
        tmpdir: Directory in which the repository subdirectory is created.
        dir_name: Name of the subdirectory that will hold the repository.

    Returns:
        The repository directory as a plain string, suitable for passing as
        a ``clone_url``.
    """
    # Imported inside the function: GitPython is an optional dependency.
    from git import Repo

    repo_dir = tmpdir.mkdir(dir_name)
    repo = Repo.init(repo_dir)
    git = repo.git
    git.checkout(b="main")

    # Configure an identity so the commit below does not depend on whatever
    # git configuration the host machine happens to have.
    git.config("user.name", "Test User")
    git.config("user.email", "test@example.com")

    sample_file = "file.txt"
    with open(os.path.join(repo_dir, sample_file), "w") as f:
        f.write("content")
    git.add([sample_file])
    git.commit(m="Initial commit")

    # Honour the ``-> str`` annotation: hand back a string rather than the
    # ``py.path.local`` object returned by ``tmpdir.mkdir``.
    return str(repo_dir)
|
||||
|
||||
|
||||
@pytest.mark.requires("git")
def test_load_twice(tmpdir: py.path.local) -> None:
    """
    Calling ``load`` a second time on the same loader must succeed and yield
    the same single document as the first call.
    """
    remote = init_repo(tmpdir, "remote_repo")
    local_checkout = tmpdir.mkdir("local_repo").strpath

    loader = GitLoader(repo_path=local_checkout, clone_url=remote)

    # First pass clones the repository; second pass reuses the existing clone.
    for _ in range(2):
        documents = loader.load()
        assert len(documents) == 1
|
||||
|
||||
|
||||
@pytest.mark.requires("git")
def test_clone_different_repo(tmpdir: py.path.local) -> None:
    """
    A loader pointed at a directory that already holds a clone of some other
    repository must fail with a ValueError.
    """
    first_remote = init_repo(tmpdir, "remote_repo")
    checkout_dir = tmpdir.mkdir("local_repo").strpath

    # Populate the checkout directory with a clone of the first repository.
    first_loader = GitLoader(repo_path=checkout_dir, clone_url=first_remote)
    loaded = first_loader.load()
    assert len(loaded) == 1

    # Now aim a second loader at the same directory but a different remote.
    second_remote = init_repo(tmpdir, "other_remote_repo")
    second_loader = GitLoader(repo_path=checkout_dir, clone_url=second_remote)
    with pytest.raises(ValueError):
        second_loader.load()
|
Loading…
Reference in New Issue
Block a user