Add GitLoader (#2851)

This commit is contained in:
ecneladis 2023-04-14 06:39:20 +02:00 committed by GitHub
parent 8cfec2c5fe
commit 016738e676
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 273 additions and 0 deletions

View File

@ -0,0 +1,199 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Git\n",
"\n",
"This notebook shows how to load text files from Git repository."
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load existing repository from disk"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from git import Repo\n",
"\n",
"repo = Repo.clone_from(\n",
" \"https://github.com/hwchase17/langchain\", to_path=\"./example_data/test_repo1\"\n",
")\n",
"branch = repo.head.reference"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders.git import GitLoader"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "__init__() got an unexpected keyword argument 'path'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m loader \u001b[39m=\u001b[39m GitLoader(path\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m./example_data/test_repo1/\u001b[39;49m\u001b[39m\"\u001b[39;49m, branch\u001b[39m=\u001b[39;49mbranch)\n",
"\u001b[0;31mTypeError\u001b[0m: __init__() got an unexpected keyword argument 'path'"
]
}
],
"source": [
"loader = GitLoader(repo_path=\"./example_data/test_repo1/\", branch=branch)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1040"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"page_content='.venv\\n.github\\n.git\\n.mypy_cache\\n.pytest_cache\\nDockerfile' metadata={'file_path': '.dockerignore', 'file_name': '.dockerignore', 'file_type': ''}\n"
]
}
],
"source": [
"print(data[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Clone repository from url"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders.git import GitLoader"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"loader = GitLoader(\n",
" clone_url=\"https://github.com/hwchase17/langchain\",\n",
" repo_path=\"./example_data/test_repo2/\",\n",
" branch=\"master\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1040"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "ai",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -0,0 +1,74 @@
import os
from typing import List, Optional
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
class GitLoader(BaseLoader):
"""Loads files from a Git repository into a list of documents.
Repository can be local on disk available at `repo_path`,
or remote at `clone_url` that will be cloned to `repo_path`.
Currently supports only text files.
Each document represents one file in the repository. The `path` points to
the local Git repository, and the `branch` specifies the branch to load
files from. By default, it loads from the `main` branch.
"""
def __init__(
self,
repo_path: str,
clone_url: Optional[str] = None,
branch: Optional[str] = "main",
):
self.repo_path = repo_path
self.clone_url = clone_url
self.branch = branch
def load(self) -> List[Document]:
try:
from git import Blob, Repo
except ImportError as ex:
raise ImportError(
"Could not import git python package. "
"Please install it with `pip install GitPython`."
) from ex
if not os.path.exists(self.repo_path) and self.clone_url is None:
raise ValueError(f"Path {self.repo_path} does not exist")
elif self.clone_url:
repo = Repo.clone_from(self.clone_url, self.repo_path)
repo.git.checkout(self.branch)
else:
repo = Repo(self.repo_path)
repo.git.checkout(self.branch)
docs: List[Document] = []
for item in repo.tree().traverse():
if isinstance(item, Blob):
file_path = os.path.join(self.repo_path, item.path)
rel_file_path = os.path.relpath(file_path, self.repo_path)
try:
with open(file_path, "rb") as f:
content = f.read()
file_type = os.path.splitext(item.name)[1]
# loads only text files
try:
text_content = content.decode("utf-8")
except UnicodeDecodeError:
continue
metadata = {
"file_path": rel_file_path,
"file_name": item.name,
"file_type": file_type,
}
doc = Document(page_content=text_content, metadata=metadata)
docs.append(doc)
except Exception as e:
print(f"Error reading file {file_path}: {e}")
return docs