From 016738e676a83fc1794e0551d85561fc870ce378 Mon Sep 17 00:00:00 2001 From: ecneladis Date: Fri, 14 Apr 2023 06:39:20 +0200 Subject: [PATCH] Add GitLoader (#2851) --- .../document_loaders/examples/git.ipynb | 199 ++++++++++++++++++ langchain/document_loaders/git.py | 74 +++++++ 2 files changed, 273 insertions(+) create mode 100644 docs/modules/indexes/document_loaders/examples/git.ipynb create mode 100644 langchain/document_loaders/git.py diff --git a/docs/modules/indexes/document_loaders/examples/git.ipynb b/docs/modules/indexes/document_loaders/examples/git.ipynb new file mode 100644 index 00000000..ffebd95d --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/git.ipynb @@ -0,0 +1,199 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Git\n", + "\n", + "This notebook shows how to load text files from Git repository." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load existing repository from disk" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from git import Repo\n", + "\n", + "repo = Repo.clone_from(\n", + " \"https://github.com/hwchase17/langchain\", to_path=\"./example_data/test_repo1\"\n", + ")\n", + "branch = repo.head.reference" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders.git import GitLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "__init__() got an unexpected keyword argument 'path'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m loader 
\u001b[39m=\u001b[39m GitLoader(path\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m./example_data/test_repo1/\u001b[39;49m\u001b[39m\"\u001b[39;49m, branch\u001b[39m=\u001b[39;49mbranch)\n", + "\u001b[0;31mTypeError\u001b[0m: __init__() got an unexpected keyword argument 'path'" + ] + } + ], + "source": [ + "loader = GitLoader(repo_path=\"./example_data/test_repo1/\", branch=branch)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1040" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content='.venv\\n.github\\n.git\\n.mypy_cache\\n.pytest_cache\\nDockerfile' metadata={'file_path': '.dockerignore', 'file_name': '.dockerignore', 'file_type': ''}\n" + ] + } + ], + "source": [ + "print(data[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clone repository from url" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders.git import GitLoader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loader = GitLoader(\n", + " clone_url=\"https://github.com/hwchase17/langchain\",\n", + " repo_path=\"./example_data/test_repo2/\",\n", + " branch=\"master\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": 
class GitLoader(BaseLoader):
    """Load the text files of a Git repository as documents.

    The repository is either an existing local checkout at `repo_path`, or a
    remote repository at `clone_url` that is cloned into `repo_path`. The
    requested `branch` is checked out before any file is read.

    Each document represents one file of the repository. Only files whose
    content decodes as UTF-8 are loaded; everything else is skipped.
    """

    def __init__(
        self,
        repo_path: str,
        clone_url: Optional[str] = None,
        branch: Optional[str] = "main",
    ):
        """Store the loader configuration; no Git operation happens here.

        Args:
            repo_path: Local directory of the repository. When `clone_url` is
                given, this is the directory the repository is cloned into.
            clone_url: URL of a remote repository to clone. When omitted,
                `repo_path` must already exist on disk.
            branch: Branch to check out before loading. Defaults to `main`.
        """
        self.repo_path = repo_path
        self.clone_url = clone_url
        self.branch = branch

    def load(self) -> List[Document]:
        """Check out the configured branch and load its text files.

        Returns:
            One document per UTF-8 decodable file, with `file_path` (relative
            to `repo_path`), `file_name` and `file_type` (the file extension)
            in its metadata.

        Raises:
            ImportError: If the GitPython package is not installed.
            ValueError: If `repo_path` does not exist and no `clone_url` was
                given, or if `repo_path` already holds a clone of a
                repository other than `clone_url`.
        """
        try:
            from git import Blob, Repo
        except ImportError as ex:
            raise ImportError(
                "Could not import git python package. "
                "Please install it with `pip install GitPython`."
            ) from ex

        if self.clone_url is None and not os.path.exists(self.repo_path):
            raise ValueError(f"Path {self.repo_path} does not exist")

        if not self.clone_url:
            repo = Repo(self.repo_path)
        elif os.path.isdir(os.path.join(self.repo_path, ".git")):
            # A previous run already cloned into repo_path. Cloning again
            # would fail because the target directory is not empty, so reuse
            # the existing clone after checking it is the same repository.
            repo = Repo(self.repo_path)
            if self.clone_url not in {remote.url for remote in repo.remotes}:
                raise ValueError(
                    f"Path {self.repo_path} already contains a different "
                    "Git repository"
                )
        else:
            repo = Repo.clone_from(self.clone_url, self.repo_path)
        repo.git.checkout(self.branch)

        docs: List[Document] = []

        for item in repo.tree().traverse():
            # Trees and submodules are traversed too; only blobs are files.
            if not isinstance(item, Blob):
                continue

            file_path = os.path.join(self.repo_path, item.path)
            rel_file_path = os.path.relpath(file_path, self.repo_path)

            # Best effort: one unreadable file must not abort the whole load.
            try:
                with open(file_path, "rb") as f:
                    content = f.read()
            except Exception as e:
                print(f"Error reading file {file_path}: {e}")
                continue

            # loads only text files
            try:
                text_content = content.decode("utf-8")
            except UnicodeDecodeError:
                continue

            metadata = {
                "file_path": rel_file_path,
                "file_name": item.name,
                "file_type": os.path.splitext(item.name)[1],
            }
            docs.append(Document(page_content=text_content, metadata=metadata))

        return docs