diff --git a/docs/modules/indexes/document_loaders/examples/example_data/test_repo1 b/docs/modules/indexes/document_loaders/examples/example_data/test_repo1 new file mode 160000 index 00000000..7e525a3b --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/example_data/test_repo1 @@ -0,0 +1 @@ +Subproject commit 7e525a3b91ddf2cc0d9b3add5928460449c1d600 diff --git a/docs/modules/indexes/document_loaders/examples/git.ipynb b/docs/modules/indexes/document_loaders/examples/git.ipynb index 306a68fa..d2ac89fb 100644 --- a/docs/modules/indexes/document_loaders/examples/git.ipynb +++ b/docs/modules/indexes/document_loaders/examples/git.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -41,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -59,27 +59,16 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1040" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "len(data)" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -103,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -112,7 +101,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -125,7 +114,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -134,16 +123,16 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "1040" + "1074" ] }, - "execution_count": 30, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -152,6 +141,25 @@ "len(data)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Filtering files to load" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import GitLoader\n", + "\n", + "# eg. loading only python files\n", + "loader = GitLoader(repo_path=\"./example_data/test_repo1/\", file_filter=lambda file_path: file_path.endswith(\".py\"))" + ] + }, { "cell_type": "code", "execution_count": null, @@ -176,7 +184,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.11.3" } }, "nbformat": 4, diff --git a/langchain/document_loaders/git.py b/langchain/document_loaders/git.py index 15576762..eb10dcde 100644 --- a/langchain/document_loaders/git.py +++ b/langchain/document_loaders/git.py @@ -1,5 +1,5 @@ import os -from typing import List, Optional +from typing import Callable, List, Optional from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader @@ -21,10 +21,12 @@ class GitLoader(BaseLoader): repo_path: str, clone_url: Optional[str] = None, branch: Optional[str] = "main", + file_filter: Optional[Callable[[str], bool]] = None, ): self.repo_path = repo_path self.clone_url = clone_url self.branch = branch + self.file_filter = file_filter def load(self) -> List[Document]: try: @@ -47,28 +49,35 @@ class GitLoader(BaseLoader): docs: List[Document] = [] for item in repo.tree().traverse(): - if isinstance(item, Blob): - file_path = os.path.join(self.repo_path, item.path) - rel_file_path = os.path.relpath(file_path, self.repo_path) - try: - with open(file_path, "rb") as f: - content = f.read() - file_type = os.path.splitext(item.name)[1] + if not isinstance(item, Blob): + continue - # loads only text files - try: - text_content = content.decode("utf-8") - except UnicodeDecodeError: - continue + file_path = os.path.join(self.repo_path, item.path) - metadata = { - "file_path": rel_file_path, - "file_name": item.name, - "file_type": file_type, - } - doc = Document(page_content=text_content, metadata=metadata) - docs.append(doc) - except Exception as e: - print(f"Error reading file {file_path}: {e}") + # uses filter to skip files + if self.file_filter and not self.file_filter(file_path): + continue + + rel_file_path = os.path.relpath(file_path, self.repo_path) + try: + with open(file_path, "rb") as f: + content = f.read() + file_type = os.path.splitext(item.name)[1] + + # loads only text files + try: + text_content = content.decode("utf-8") + except UnicodeDecodeError: + continue + + metadata = { + "file_path": rel_file_path, + "file_name": item.name, + "file_type": file_type, + } + doc = Document(page_content=text_content, metadata=metadata) + docs.append(doc) + except Exception as e: + print(f"Error reading file {file_path}: {e}") return docs