From a795c3d860a939d18fb66a57e23545438a96b0c1 Mon Sep 17 00:00:00 2001 From: os1ma <39944763+os1ma@users.noreply.github.com> Date: Mon, 31 Jul 2023 13:27:20 +0900 Subject: [PATCH] Fix GitLoader to handle repeated load calls (#8412) **Description: a description of the change** In this pull request, GitLoader has been updated to handle multiple load calls, provided the same repository is being cloned. Previously, calling `load` multiple times would raise an error if a clone URL was provided. Additionally, a check has been added to raise a ValueError when attempting to clone a different repository into an existing path. New tests have also been introduced to verify the correct behavior of the GitLoader class when `load` is called multiple times. Lastly, the GitPython package, a dependency for the GitLoader class, has been added to the project dependencies (pyproject.toml and poetry.lock). **Issue: the issue # it fixes (if applicable)** None **Dependencies: any dependencies required for this change** GitPython **Tag maintainer: for a quicker response, tag the relevant maintainer (see below)** - DataLoaders / VectorStores / Retrievers: @rlancemartin, @eyurtsev --- .../langchain/document_loaders/git.py | 13 +++- libs/langchain/poetry.lock | 11 ++-- libs/langchain/pyproject.toml | 2 + .../unit_tests/document_loaders/test_git.py | 65 +++++++++++++++++++ 4 files changed, 85 insertions(+), 6 deletions(-) create mode 100644 libs/langchain/tests/unit_tests/document_loaders/test_git.py diff --git a/libs/langchain/langchain/document_loaders/git.py b/libs/langchain/langchain/document_loaders/git.py index 3898381170..14b2676a45 100644 --- a/libs/langchain/langchain/document_loaders/git.py +++ b/libs/langchain/langchain/document_loaders/git.py @@ -49,7 +49,18 @@ class GitLoader(BaseLoader): if not os.path.exists(self.repo_path) and self.clone_url is None: raise ValueError(f"Path {self.repo_path} does not exist") elif self.clone_url: - repo = Repo.clone_from(self.clone_url, self.repo_path) + # If the repo_path already contains a git repository, verify that it's the + # same repository as the one we're trying to clone. + if os.path.isdir(os.path.join(self.repo_path, ".git")): + repo = Repo(self.repo_path) + # If the existing repository is not the same as the one we're trying to + # clone, raise an error. + if repo.remotes.origin.url != self.clone_url: + raise ValueError( + "A different repository is already cloned at this path." + ) + else: + repo = Repo.clone_from(self.clone_url, self.repo_path) repo.git.checkout(self.branch) else: repo = Repo(self.repo_path) diff --git a/libs/langchain/poetry.lock b/libs/langchain/poetry.lock index 0ece5987b8..e0135c3cdb 100644 --- a/libs/langchain/poetry.lock +++ b/libs/langchain/poetry.lock @@ -3269,14 +3269,14 @@ smmap = ">=3.0.1,<6" [[package]] name = "gitpython" -version = "3.1.31" +version = "3.1.32" description = "GitPython is a Python library used to interact with Git repositories" category = "main" optional = true python-versions = ">=3.7" files = [ - {file = "GitPython-3.1.31-py3-none-any.whl", hash = "sha256:f04893614f6aa713a60cbbe1e6a97403ef633103cdd0ef5eb6efe0deb98dbe8d"}, - {file = "GitPython-3.1.31.tar.gz", hash = "sha256:8ce3bcf69adfdf7c7d503e78fd3b1c492af782d58893b650adb2ac8912ddd573"}, + {file = "GitPython-3.1.32-py3-none-any.whl", hash = "sha256:e3d59b1c2c6ebb9dfa7a184daf3b6dd4914237e7488a1730a6d8f6f5d0b4187f"}, + {file = "GitPython-3.1.32.tar.gz", hash = "sha256:8d9b8cb1e80b9735e8717c9362079d3ce4c6e5ddeebedd0361b228c3a67a62f6"}, ] [package.dependencies] @@ -4652,6 +4652,7 @@ optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" files = [ {file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"}, + {file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"}, ] [[package]] @@ -13229,7 +13230,7 @@ clarifai = ["clarifai"] cohere = ["cohere"] docarray = ["docarray"] embeddings = ["sentence-transformers"] -extended-testing = ["atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "geopandas", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xinference", "zep-python"] +extended-testing = ["atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xinference", "zep-python"] javascript = ["esprima"] llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers", "xinference"] openai = ["openai", "tiktoken"] @@ -13239,4 +13240,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "5b1c718874d76c0e3b4023b2bceebe11a5e26e5e05d6797acf91b01b0438b2f7" +content-hash = "ef2b1d30e0fa872ce764c8a4cbc6e0a460bc9391a6465ee29d657e83b5459391" diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml index c5c224f180..06c4476738 100644 --- a/libs/langchain/pyproject.toml +++ b/libs/langchain/pyproject.toml @@ -126,6 +126,7 @@ amadeus = {version = ">=8.1.0", optional = true} geopandas = {version = "^0.13.1", optional = true} xinference = {version = "^0.0.6", optional = true} python-arango = {version = "^7.5.9", optional = true} +gitpython = {version = "^3.1.32", optional = true} [tool.poetry.group.test.dependencies] # The only dependencies that should be added are @@ -359,6 +360,7 @@ extended_testing = [ "geopandas", "jinja2", "xinference", + "gitpython", ] [tool.ruff] diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_git.py b/libs/langchain/tests/unit_tests/document_loaders/test_git.py new file mode 100644 index 0000000000..c667cbdc85 --- /dev/null +++ b/libs/langchain/tests/unit_tests/document_loaders/test_git.py @@ -0,0 +1,65 @@ +import os + +import py +import pytest + +from langchain.document_loaders import GitLoader + + +def init_repo(tmpdir: py.path.local, dir_name: str) -> str: + from git import Repo + + repo_dir = tmpdir.mkdir(dir_name) + repo = Repo.init(repo_dir) + git = repo.git + git.checkout(b="main") + + git.config("user.name", "Test User") + git.config("user.email", "test@example.com") + + sample_file = "file.txt" + with open(os.path.join(repo_dir, sample_file), "w") as f: + f.write("content") + git.add([sample_file]) + git.commit(m="Initial commit") + + return repo_dir + + +@pytest.mark.requires("git") +def test_load_twice(tmpdir: py.path.local) -> None: + """ + Test that loading documents twice from the same repository does not raise an error. + """ + + clone_url = init_repo(tmpdir, "remote_repo") + + repo_path = tmpdir.mkdir("local_repo").strpath + loader = GitLoader(repo_path=repo_path, clone_url=clone_url) + + documents = loader.load() + assert len(documents) == 1 + + documents = loader.load() + assert len(documents) == 1 + + +@pytest.mark.requires("git") +def test_clone_different_repo(tmpdir: py.path.local) -> None: + """ + Test that trying to clone a different repository into a directory already + containing a clone raises a ValueError. + """ + + clone_url = init_repo(tmpdir, "remote_repo") + + repo_path = tmpdir.mkdir("local_repo").strpath + loader = GitLoader(repo_path=repo_path, clone_url=clone_url) + + documents = loader.load() + assert len(documents) == 1 + + other_clone_url = init_repo(tmpdir, "other_remote_repo") + other_loader = GitLoader(repo_path=repo_path, clone_url=other_clone_url) + with pytest.raises(ValueError): + other_loader.load()