Fix GitLoader to handle repeated load calls (#8412)

**Description: a description of the change**

In this pull request, GitLoader has been updated to handle multiple load
calls, provided the same repository is being cloned. Previously, calling
`load` multiple times would raise an error if a clone URL was provided.

Additionally, a check has been added to raise a ValueError when
attempting to clone a different repository into an existing path.

New tests have also been introduced to verify the correct behavior of
the GitLoader class when `load` is called multiple times.

Lastly, the GitPython package, a dependency for the GitLoader class, has
been added to the project dependencies (pyproject.toml and poetry.lock).

**Issue: the issue # it fixes (if applicable)**

None

**Dependencies: any dependencies required for this change**

GitPython

**Tag maintainer: for a quicker response, tag the relevant maintainer
(see below)**

- DataLoaders / VectorStores / Retrievers: @rlancemartin, @eyurtsev
This commit is contained in:
os1ma 2023-07-31 13:27:20 +09:00 committed by GitHub
parent 9975ba4124
commit a795c3d860
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 85 additions and 6 deletions

View File

@ -49,7 +49,18 @@ class GitLoader(BaseLoader):
if not os.path.exists(self.repo_path) and self.clone_url is None:
raise ValueError(f"Path {self.repo_path} does not exist")
elif self.clone_url:
repo = Repo.clone_from(self.clone_url, self.repo_path)
# If the repo_path already contains a git repository, verify that it's the
# same repository as the one we're trying to clone.
if os.path.isdir(os.path.join(self.repo_path, ".git")):
repo = Repo(self.repo_path)
# If the existing repository is not the same as the one we're trying to
# clone, raise an error.
if repo.remotes.origin.url != self.clone_url:
raise ValueError(
"A different repository is already cloned at this path."
)
else:
repo = Repo.clone_from(self.clone_url, self.repo_path)
repo.git.checkout(self.branch)
else:
repo = Repo(self.repo_path)

View File

@ -3269,14 +3269,14 @@ smmap = ">=3.0.1,<6"
[[package]]
name = "gitpython"
version = "3.1.31"
version = "3.1.32"
description = "GitPython is a Python library used to interact with Git repositories"
category = "main"
optional = true
python-versions = ">=3.7"
files = [
{file = "GitPython-3.1.31-py3-none-any.whl", hash = "sha256:f04893614f6aa713a60cbbe1e6a97403ef633103cdd0ef5eb6efe0deb98dbe8d"},
{file = "GitPython-3.1.31.tar.gz", hash = "sha256:8ce3bcf69adfdf7c7d503e78fd3b1c492af782d58893b650adb2ac8912ddd573"},
{file = "GitPython-3.1.32-py3-none-any.whl", hash = "sha256:e3d59b1c2c6ebb9dfa7a184daf3b6dd4914237e7488a1730a6d8f6f5d0b4187f"},
{file = "GitPython-3.1.32.tar.gz", hash = "sha256:8d9b8cb1e80b9735e8717c9362079d3ce4c6e5ddeebedd0361b228c3a67a62f6"},
]
[package.dependencies]
@ -4652,6 +4652,7 @@ optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
files = [
{file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"},
{file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"},
]
[[package]]
@ -13229,7 +13230,7 @@ clarifai = ["clarifai"]
cohere = ["cohere"]
docarray = ["docarray"]
embeddings = ["sentence-transformers"]
extended-testing = ["atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "geopandas", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xinference", "zep-python"]
extended-testing = ["atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xinference", "zep-python"]
javascript = ["esprima"]
llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers", "xinference"]
openai = ["openai", "tiktoken"]
@ -13239,4 +13240,4 @@ text-helpers = ["chardet"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "5b1c718874d76c0e3b4023b2bceebe11a5e26e5e05d6797acf91b01b0438b2f7"
content-hash = "ef2b1d30e0fa872ce764c8a4cbc6e0a460bc9391a6465ee29d657e83b5459391"

View File

@ -126,6 +126,7 @@ amadeus = {version = ">=8.1.0", optional = true}
geopandas = {version = "^0.13.1", optional = true}
xinference = {version = "^0.0.6", optional = true}
python-arango = {version = "^7.5.9", optional = true}
gitpython = {version = "^3.1.32", optional = true}
[tool.poetry.group.test.dependencies]
# The only dependencies that should be added are
@ -359,6 +360,7 @@ extended_testing = [
"geopandas",
"jinja2",
"xinference",
"gitpython",
]
[tool.ruff]

View File

@ -0,0 +1,65 @@
import os
import py
import pytest
from langchain.document_loaders import GitLoader
def init_repo(tmpdir: py.path.local, dir_name: str) -> str:
from git import Repo
repo_dir = tmpdir.mkdir(dir_name)
repo = Repo.init(repo_dir)
git = repo.git
git.checkout(b="main")
git.config("user.name", "Test User")
git.config("user.email", "test@example.com")
sample_file = "file.txt"
with open(os.path.join(repo_dir, sample_file), "w") as f:
f.write("content")
git.add([sample_file])
git.commit(m="Initial commit")
return repo_dir
@pytest.mark.requires("git")
def test_load_twice(tmpdir: py.path.local) -> None:
"""
Test that loading documents twice from the same repository does not raise an error.
"""
clone_url = init_repo(tmpdir, "remote_repo")
repo_path = tmpdir.mkdir("local_repo").strpath
loader = GitLoader(repo_path=repo_path, clone_url=clone_url)
documents = loader.load()
assert len(documents) == 1
documents = loader.load()
assert len(documents) == 1
@pytest.mark.requires("git")
def test_clone_different_repo(tmpdir: py.path.local) -> None:
"""
Test that trying to clone a different repository into a directory already
containing a clone raises a ValueError.
"""
clone_url = init_repo(tmpdir, "remote_repo")
repo_path = tmpdir.mkdir("local_repo").strpath
loader = GitLoader(repo_path=repo_path, clone_url=clone_url)
documents = loader.load()
assert len(documents) == 1
other_clone_url = init_repo(tmpdir, "other_remote_repo")
other_loader = GitLoader(repo_path=repo_path, clone_url=other_clone_url)
with pytest.raises(ValueError):
other_loader.load()