forked from Archives/langchain
Add GitLoader (#2851)
This commit is contained in:
parent
8cfec2c5fe
commit
016738e676
199
docs/modules/indexes/document_loaders/examples/git.ipynb
Normal file
199
docs/modules/indexes/document_loaders/examples/git.ipynb
Normal file
@ -0,0 +1,199 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Git\n",
|
||||
"\n",
|
||||
"This notebook shows how to load text files from a Git repository."
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load existing repository from disk"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from git import Repo\n",
|
||||
"\n",
|
||||
"repo = Repo.clone_from(\n",
|
||||
" \"https://github.com/hwchase17/langchain\", to_path=\"./example_data/test_repo1\"\n",
|
||||
")\n",
|
||||
"branch = repo.head.reference"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders.git import GitLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "TypeError",
|
||||
"evalue": "__init__() got an unexpected keyword argument 'path'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m loader \u001b[39m=\u001b[39m GitLoader(path\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m./example_data/test_repo1/\u001b[39;49m\u001b[39m\"\u001b[39;49m, branch\u001b[39m=\u001b[39;49mbranch)\n",
|
||||
"\u001b[0;31mTypeError\u001b[0m: __init__() got an unexpected keyword argument 'path'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"loader = GitLoader(repo_path=\"./example_data/test_repo1/\", branch=branch)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"1040"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"page_content='.venv\\n.github\\n.git\\n.mypy_cache\\n.pytest_cache\\nDockerfile' metadata={'file_path': '.dockerignore', 'file_name': '.dockerignore', 'file_type': ''}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(data[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Clone repository from url"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders.git import GitLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = GitLoader(\n",
|
||||
" clone_url=\"https://github.com/hwchase17/langchain\",\n",
|
||||
" repo_path=\"./example_data/test_repo2/\",\n",
|
||||
" branch=\"master\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"1040"
|
||||
]
|
||||
},
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "ai",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.6"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
74
langchain/document_loaders/git.py
Normal file
74
langchain/document_loaders/git.py
Normal file
@ -0,0 +1,74 @@
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class GitLoader(BaseLoader):
    """Loads files from a Git repository into a list of documents.

    The repository can be local on disk, available at `repo_path`,
    or remote at `clone_url`, in which case it is cloned to `repo_path`.
    Currently supports only text files.

    Each document represents one file in the repository. The `repo_path`
    points to the local Git repository, and the `branch` specifies the branch
    to load files from. By default, it loads from the `main` branch.
    """

    def __init__(
        self,
        repo_path: str,
        clone_url: Optional[str] = None,
        branch: Optional[str] = "main",
    ):
        """Record where the repository lives; no Git work happens here.

        Args:
            repo_path: Local directory of the repository, or the directory
                to clone into when `clone_url` is given.
            clone_url: Optional URL of a remote repository to clone.
            branch: Branch that `load` checks out before reading files.
        """
        self.repo_path = repo_path
        self.clone_url = clone_url
        self.branch = branch

    def load(self) -> List[Document]:
        """Check out `branch` and return one document per UTF-8 text file.

        Returns:
            A list of documents whose `page_content` is the decoded file text
            and whose metadata holds `file_path` (relative to `repo_path`),
            `file_name` and `file_type` (the file extension).

        Raises:
            ImportError: If the GitPython package is not installed.
            ValueError: If `repo_path` does not exist and no `clone_url` was
                given, or if `repo_path` already holds a repository that has
                no remote pointing at `clone_url`.
        """
        try:
            from git import Blob, Repo
        except ImportError as ex:
            raise ImportError(
                "Could not import git python package. "
                "Please install it with `pip install GitPython`."
            ) from ex

        if not os.path.exists(self.repo_path) and self.clone_url is None:
            raise ValueError(f"Path {self.repo_path} does not exist")
        elif self.clone_url:
            if os.path.isdir(os.path.join(self.repo_path, ".git")):
                # A previous load() (or an earlier run) already cloned here.
                # Cloning again into a non-empty directory would fail, so
                # reuse the checkout -- but only if it is the same repository.
                repo = Repo(self.repo_path)
                remote_urls = {
                    url for remote in repo.remotes for url in remote.urls
                }
                if self.clone_url not in remote_urls:
                    raise ValueError(
                        f"A different repository is already cloned at "
                        f"{self.repo_path}"
                    )
            else:
                repo = Repo.clone_from(self.clone_url, self.repo_path)
        else:
            repo = Repo(self.repo_path)
        repo.git.checkout(self.branch)

        docs: List[Document] = []

        for item in repo.tree().traverse():
            # The traversal also yields non-file entries; only blobs are read.
            if not isinstance(item, Blob):
                continue

            file_path = os.path.join(self.repo_path, item.path)
            rel_file_path = os.path.relpath(file_path, self.repo_path)

            # Best effort: an unreadable entry is reported and skipped so a
            # single bad file does not abort the whole load.
            try:
                with open(file_path, "rb") as f:
                    content = f.read()
            except Exception as e:
                print(f"Error reading file {file_path}: {e}")
                continue

            # loads only text files
            try:
                text_content = content.decode("utf-8")
            except UnicodeDecodeError:
                continue

            metadata = {
                "file_path": rel_file_path,
                "file_name": item.name,
                "file_type": os.path.splitext(item.name)[1],
            }
            docs.append(Document(page_content=text_content, metadata=metadata))

        return docs
|
Loading…
Reference in New Issue
Block a user