Add file filter param to Git loader (#2904)

Allows users to specify which files should be loaded instead of indiscriminately loading the entire repo. Extends #2851. NOTE for reviewers: the `hide whitespace` option is recommended, since I inverted an if-block to use `continue` (which changed the indentation of its body) so it looks less like a Christmas tree :)
1 year ago · a508afa91c
parent 7e525a3b91
commit a508afa91c
3 changed files with 65 additions and 47 deletions
--- a/docs/modules/indexes/document_loaders/examples/example_data/test_repo1
+++ b/docs/modules/indexes/document_loaders/examples/example_data/test_repo1
@@ -0,0 +1 @@
+Subproject commit 7e525a3b91ddf2cc0d9b3add5928460449c1d600
--- a/docs/modules/indexes/document_loaders/examples/git.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/git.ipynb
@@ -18,7 +18,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -32,7 +32,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -41,7 +41,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -50,7 +50,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -59,27 +59,16 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "1040"
-      ]
-     },
-     "execution_count": 25,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
@@ -103,7 +92,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -112,7 +101,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -125,7 +114,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -134,16 +123,16 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "1040"
+       "1074"
      ]
     },
-     "execution_count": 30,
+     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -152,6 +141,25 @@
    "len(data)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Filtering files to load"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import GitLoader\n",
+    "\n",
+    "# eg. loading only python files\n",
+    "loader = GitLoader(repo_path=\"./example_data/test_repo1/\", file_filter=lambda file_path: file_path.endswith(\".py\"))"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -176,7 +184,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.11.3"
  }
 },
 "nbformat": 4,
--- a/langchain/document_loaders/git.py
+++ b/langchain/document_loaders/git.py
@@ -1,5 +1,5 @@
 import os
-from typing import List, Optional
+from typing import Callable, List, Optional

 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
@@ -21,10 +21,12 @@ class GitLoader(BaseLoader):
        repo_path: str,
        clone_url: Optional[str] = None,
        branch: Optional[str] = "main",
+        file_filter: Optional[Callable[[str], bool]] = None,
    ):
        self.repo_path = repo_path
        self.clone_url = clone_url
        self.branch = branch
+        self.file_filter = file_filter

    def load(self) -> List[Document]:
        try:
@@ -47,28 +49,35 @@ class GitLoader(BaseLoader):
        docs: List[Document] = []

        for item in repo.tree().traverse():
-            if isinstance(item, Blob):
-                file_path = os.path.join(self.repo_path, item.path)
-                rel_file_path = os.path.relpath(file_path, self.repo_path)
-                try:
-                    with open(file_path, "rb") as f:
-                        content = f.read()
-                        file_type = os.path.splitext(item.name)[1]
+            if not isinstance(item, Blob):
+                continue

-                        # loads only text files
-                        try:
-                            text_content = content.decode("utf-8")
-                        except UnicodeDecodeError:
-                            continue
+            file_path = os.path.join(self.repo_path, item.path)

-                        metadata = {
-                            "file_path": rel_file_path,
-                            "file_name": item.name,
-                            "file_type": file_type,
-                        }
-                        doc = Document(page_content=text_content, metadata=metadata)
-                        docs.append(doc)
-                except Exception as e:
-                    print(f"Error reading file {file_path}: {e}")
+            # uses filter to skip files
+            if self.file_filter and not self.file_filter(file_path):
+                continue
+
+            rel_file_path = os.path.relpath(file_path, self.repo_path)
+            try:
+                with open(file_path, "rb") as f:
+                    content = f.read()
+                    file_type = os.path.splitext(item.name)[1]
+
+                    # loads only text files
+                    try:
+                        text_content = content.decode("utf-8")
+                    except UnicodeDecodeError:
+                        continue
+
+                    metadata = {
+                        "file_path": rel_file_path,
+                        "file_name": item.name,
+                        "file_type": file_type,
+                    }
+                    doc = Document(page_content=text_content, metadata=metadata)
+                    docs.append(doc)
+            except Exception as e:
+                print(f"Error reading file {file_path}: {e}")

        return docs
				`@ -0,0 +1 @@`
				`Subproject commit 7e525a3b91ddf2cc0d9b3add5928460449c1d600`