From 8259f9b7facae95236dd5156e2a14d87a0e1f90c Mon Sep 17 00:00:00 2001 From: UmerHA <40663591+UmerHA@users.noreply.github.com> Date: Tue, 30 May 2023 05:11:21 +0200 Subject: [PATCH] DocumentLoader for GitHub (#5408) # Creates GitHubLoader (#5257) GitHubLoader is a DocumentLoader that loads issues and PRs from GitHub. Fixes #5257 --------- Co-authored-by: Dev 2049 --- .../document_loaders/examples/github.ipynb | 261 ++++++++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/github.py | 182 ++++++++++++ .../document_loaders/test_github.py | 12 + .../document_loaders/test_github.py | 114 ++++++++ 5 files changed, 571 insertions(+) create mode 100644 docs/modules/indexes/document_loaders/examples/github.ipynb create mode 100644 langchain/document_loaders/github.py create mode 100644 tests/integration_tests/document_loaders/test_github.py create mode 100644 tests/unit_tests/document_loaders/test_github.py diff --git a/docs/modules/indexes/document_loaders/examples/github.ipynb b/docs/modules/indexes/document_loaders/examples/github.ipynb new file mode 100644 index 00000000..b9639dc9 --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/github.ipynb @@ -0,0 +1,261 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GitHub\n", + "\n", + "This notebook shows how you can load issues and pull requests (PRs) for a given repository on [GitHub](https://github.com/). We will use the LangChain Python repository as an example." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup access token" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To access the GitHub API, you need a personal access token - you can set up yours here: https://github.com/settings/tokens?type=beta. 
You can either set this token as the environment variable ``GITHUB_PERSONAL_ACCESS_TOKEN`` and it will be automatically pulled in, or you can pass it in directly at initialization as the ``access_token`` named parameter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# If you haven't set your access token as an environment variable, pass it in here.\n", + "from getpass import getpass\n", + "\n", + "ACCESS_TOKEN = getpass()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Issues and PRs" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.document_loaders import GitHubIssuesLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "loader = GitHubIssuesLoader(\n", + " repo=\"hwchase17/langchain\",\n", + " access_token=ACCESS_TOKEN, # delete/comment out this argument if you've set the access token as an env var.\n", + " creator=\"UmerHA\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's load all issues and PRs created by \"UmerHA\".\n", + "\n", + "Here's a list of all filters you can use:\n", + "- include_prs\n", + "- milestone\n", + "- state\n", + "- assignee\n", + "- creator\n", + "- mentioned\n", + "- labels\n", + "- sort\n", + "- direction\n", + "- since\n", + "\n", + "For more info, see https://docs.github.com/en/rest/issues/issues?apiVersion=2022-11-28#list-repository-issues." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Creates GitHubLoader (#5257)\r\n", + "\r\n", + "GitHubLoader is a DocumentLoader that loads issues and PRs from GitHub.\r\n", + "\r\n", + "Fixes #5257\r\n", + "\r\n", + "Community members can review the PR once tests pass. Tag maintainers/contributors who might be interested:\r\n", + "DataLoaders\r\n", + "- @eyurtsev\r\n", + "\n", + "{'url': 'https://github.com/hwchase17/langchain/pull/5408', 'title': 'DocumentLoader for GitHub', 'creator': 'UmerHA', 'created_at': '2023-05-29T14:50:53Z', 'comments': 0, 'state': 'open', 'labels': ['enhancement', 'lgtm', 'doc loader'], 'assignee': None, 'milestone': None, 'locked': False, 'number': 5408, 'is_pull_request': True}\n" + ] + } + ], + "source": [ + "print(docs[0].page_content)\n", + "print(docs[0].metadata)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Only load issues" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, the GitHub API considers pull requests to also be issues. 
To only get 'pure' issues (i.e., no pull requests), use `include_prs=False`" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "loader = GitHubIssuesLoader(\n", + " repo=\"hwchase17/langchain\",\n", + " access_token=ACCESS_TOKEN, # delete/comment out this argument if you've set the access token as an env var.\n", + " creator=\"UmerHA\",\n", + " include_prs=False,\n", + ")\n", + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "### System Info\n", + "\n", + "LangChain version = 0.0.167\r\n", + "Python version = 3.11.0\r\n", + "System = Windows 11 (using Jupyter)\n", + "\n", + "### Who can help?\n", + "\n", + "- @hwchase17\r\n", + "- @agola11\r\n", + "- @UmerHA (I have a fix ready, will submit a PR)\n", + "\n", + "### Information\n", + "\n", + "- [ ] The official example notebooks/scripts\n", + "- [X] My own modified scripts\n", + "\n", + "### Related Components\n", + "\n", + "- [X] LLMs/Chat Models\n", + "- [ ] Embedding Models\n", + "- [X] Prompts / Prompt Templates / Prompt Selectors\n", + "- [ ] Output Parsers\n", + "- [ ] Document Loaders\n", + "- [ ] Vector Stores / Retrievers\n", + "- [ ] Memory\n", + "- [ ] Agents / Agent Executors\n", + "- [ ] Tools / Toolkits\n", + "- [ ] Chains\n", + "- [ ] Callbacks/Tracing\n", + "- [ ] Async\n", + "\n", + "### Reproduction\n", + "\n", + "```\r\n", + "import os\r\n", + "os.environ[\"OPENAI_API_KEY\"] = \"...\"\r\n", + "\r\n", + "from langchain.chains import LLMChain\r\n", + "from langchain.chat_models import ChatOpenAI\r\n", + "from langchain.prompts import PromptTemplate\r\n", + "from langchain.prompts.chat import ChatPromptTemplate\r\n", + "from langchain.schema import messages_from_dict\r\n", + "\r\n", + "role_strings = [\r\n", + " (\"system\", \"you are a bird expert\"), \r\n", + " (\"human\", \"which bird has a point 
beak?\")\r\n", + "]\r\n", + "prompt = ChatPromptTemplate.from_role_strings(role_strings)\r\n", + "chain = LLMChain(llm=ChatOpenAI(), prompt=prompt)\r\n", + "chain.run({})\r\n", + "```\n", + "\n", + "### Expected behavior\n", + "\n", + "Chain should run\n", + "{'url': 'https://github.com/hwchase17/langchain/issues/5027', 'title': \"ChatOpenAI models don't work with prompts created via ChatPromptTemplate.from_role_strings\", 'creator': 'UmerHA', 'created_at': '2023-05-20T10:39:18Z', 'comments': 1, 'state': 'open', 'labels': [], 'assignee': None, 'milestone': None, 'locked': False, 'number': 5027, 'is_pull_request': False}\n" + ] + } + ], + "source": [ + "print(docs[0].page_content)\n", + "print(docs[0].metadata)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 3155fe24..e96c4efe 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -37,6 +37,7 @@ from langchain.document_loaders.gcs_directory import GCSDirectoryLoader from langchain.document_loaders.gcs_file import GCSFileLoader from langchain.document_loaders.git import GitLoader from langchain.document_loaders.gitbook import GitbookLoader +from langchain.document_loaders.github import GitHubIssuesLoader from langchain.document_loaders.googledrive import GoogleDriveLoader from langchain.document_loaders.gutenberg import GutenbergLoader from langchain.document_loaders.hn import 
from abc import ABC
from datetime import datetime
from typing import Dict, Iterator, List, Literal, Optional, Union
from urllib.parse import quote

import requests
from pydantic import BaseModel, root_validator, validator

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.utils import get_from_dict_or_env

# Characters left unescaped in query-string values. They are legal inside a
# URL query component and keeping them literal preserves the URLs this loader
# has always produced for ordinary values ("*", "bug,ui,@high", ISO timestamps).
_QUERY_SAFE_CHARS = "/,@*:"


class BaseGitHubLoader(BaseLoader, BaseModel, ABC):
    """Shared configuration for loaders that talk to the GitHub REST API.

    Holds the target repository and the personal access token, and builds the
    HTTP headers that concrete loaders send with every request.
    """

    repo: str
    """Name of repository, e.g. ``"hwchase17/langchain"``."""
    access_token: str
    """Personal access token - see https://github.com/settings/tokens?type=beta"""

    @root_validator(pre=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Resolve ``access_token`` from the arguments or the environment.

        Runs before field validation so that a token provided through the
        ``GITHUB_PERSONAL_ACCESS_TOKEN`` environment variable can satisfy the
        required ``access_token`` field.
        """
        values["access_token"] = get_from_dict_or_env(
            values, "access_token", "GITHUB_PERSONAL_ACCESS_TOKEN"
        )
        return values

    @property
    def headers(self) -> Dict[str, str]:
        """HTTP headers sent with every API request (media type + bearer token)."""
        return {
            "Accept": "application/vnd.github+json",
            "Authorization": f"Bearer {self.access_token}",
        }


class GitHubIssuesLoader(BaseGitHubLoader):
    """Load the issues (and optionally pull requests) of a GitHub repository.

    Every optional field below is forwarded as a query parameter of the
    "list repository issues" endpoint when it is set; fields left as ``None``
    are omitted from the request.
    """

    include_prs: bool = True
    """If True include Pull Requests in results, otherwise ignore them."""
    milestone: Union[int, Literal["*", "none"], None] = None
    """If integer is passed, it should be a milestone's number field.
        If the string '*' is passed, issues with any milestone are accepted.
        If the string 'none' is passed, issues without milestones are returned.
    """
    state: Optional[Literal["open", "closed", "all"]] = None
    """Filter on issue state. Can be one of: 'open', 'closed', 'all'."""
    assignee: Optional[str] = None
    """Filter on assigned user. Pass 'none' for no user and '*' for any user."""
    creator: Optional[str] = None
    """Filter on the user that created the issue."""
    mentioned: Optional[str] = None
    """Filter on a user that's mentioned in the issue."""
    labels: Optional[List[str]] = None
    """Label names to filter on. Example: ["bug", "ui", "@high"]."""
    sort: Optional[Literal["created", "updated", "comments"]] = None
    """What to sort results by. Can be one of: 'created', 'updated', 'comments'.
        Default is 'created'."""
    direction: Optional[Literal["asc", "desc"]] = None
    """The direction to sort the results by. Can be one of: 'asc', 'desc'."""
    since: Optional[str] = None
    """Only show issues updated after the given time.
        This is a timestamp in ISO 8601 format: YYYY-MM-DDTHH:MM:SSZ."""

    @validator("since")
    def validate_since(cls, v: Optional[str]) -> Optional[str]:
        """Check that ``since`` is a ``YYYY-MM-DDTHH:MM:SSZ`` timestamp.

        Raises:
            ValueError: if a non-empty value does not match the format.
        """
        if v:
            try:
                datetime.strptime(v, "%Y-%m-%dT%H:%M:%SZ")
            except ValueError as err:
                # Chain the parsing error so the root cause stays visible.
                raise ValueError(
                    "Invalid value for 'since'. Expected a date string in "
                    f"YYYY-MM-DDTHH:MM:SSZ format. Received: {v}"
                ) from err
        return v

    def lazy_load(self) -> Iterator[Document]:
        """Lazily fetch the repository's issues, one result page at a time.

        Starts at ``self.url`` and keeps following the ``next`` link of each
        response until no further page is advertised.

        Yields:
            One Document per issue. ``page_content`` is the issue body and
            ``metadata`` holds the keys ``url``, ``title``, ``creator``,
            ``created_at``, ``comments``, ``state``, ``labels``, ``assignee``,
            ``milestone``, ``locked``, ``number`` and ``is_pull_request``.
            Pull requests are skipped when ``include_prs`` is False.

        Raises:
            requests.HTTPError: if a response carries an error status code.
        """
        url: Optional[str] = self.url
        while url:
            response = requests.get(url, headers=self.headers)
            response.raise_for_status()
            for issue in response.json():
                doc = self.parse_issue(issue)
                if not self.include_prs and doc.metadata["is_pull_request"]:
                    continue
                yield doc
            # ``links`` can be empty (or None on a stubbed response) on the
            # last page; in that case pagination stops.
            next_link = response.links.get("next") if response.links else None
            url = next_link["url"] if next_link else None

    def load(self) -> List[Document]:
        """Fetch all matching issues eagerly.

        Returns:
            Every Document produced by ``lazy_load``, collected into a list.
        """
        return list(self.lazy_load())

    def parse_issue(self, issue: dict) -> Document:
        """Convert a single issue object from the GitHub API into a Document.

        Args:
            issue: One element of the JSON array returned by the issues
                endpoint.

        Returns:
            A Document whose ``page_content`` is the issue body (empty string
            when the issue has no body) and whose ``metadata`` summarises the
            issue.
        """
        assignee = issue["assignee"]
        milestone = issue["milestone"]
        metadata = {
            "url": issue["html_url"],
            "title": issue["title"],
            "creator": issue["user"]["login"],
            "created_at": issue["created_at"],
            "comments": issue["comments"],
            "state": issue["state"],
            "labels": [label["name"] for label in issue["labels"]],
            "assignee": assignee["login"] if assignee else None,
            "milestone": milestone["title"] if milestone else None,
            "locked": issue["locked"],
            "number": issue["number"],
            # Only pull requests carry a "pull_request" key.
            "is_pull_request": "pull_request" in issue,
        }
        content = issue["body"] if issue["body"] is not None else ""
        return Document(page_content=content, metadata=metadata)

    @property
    def query_params(self) -> str:
        """Query string (without the leading ``?``) built from the set filters.

        Filters that are ``None`` are left out. An empty ``labels`` list is
        treated like ``None`` so that no ``labels=[]`` parameter is sent.
        Values are percent-encoded so characters such as ``&``, ``#`` or
        spaces in a label or user name cannot corrupt the URL.
        """
        labels = ",".join(self.labels) if self.labels else None
        candidates = {
            "milestone": self.milestone,
            "state": self.state,
            "assignee": self.assignee,
            "creator": self.creator,
            "mentioned": self.mentioned,
            "labels": labels,
            "sort": self.sort,
            "direction": self.direction,
            "since": self.since,
        }
        return "&".join(
            f"{key}={quote(str(value), safe=_QUERY_SAFE_CHARS)}"
            for key, value in candidates.items()
            if value is not None
        )

    @property
    def url(self) -> str:
        """URL of the first result page, including the query string.

        The ``?`` separator is always present, even when no filter is set.
        """
        return f"https://api.github.com/repos/{self.repo}/issues?{self.query_params}"
parameter + with pytest.raises(ValueError): + GitHubIssuesLoader(invalid="parameter") + + # Invalid value for valid parameter + with pytest.raises(ValueError): + GitHubIssuesLoader(state="invalid_state") + + # Invalid type for labels + with pytest.raises(ValueError): + GitHubIssuesLoader(labels="not_a_list") + + # Invalid date format for since + with pytest.raises(ValueError): + GitHubIssuesLoader(since="not_a_date") + + +def test_load(mocker: MockerFixture) -> None: + mocker.patch( + "requests.get", return_value=mocker.MagicMock(json=lambda: [], links=None) + ) + loader = GitHubIssuesLoader(repo="repo", access_token="access_token") + documents = loader.load() + assert documents == [] + + +def test_parse_issue() -> None: + issue = { + "html_url": "https://github.com/repo/issue/1", + "title": "Example Issue 1", + "user": {"login": "username1"}, + "created_at": "2023-01-01T00:00:00Z", + "comments": 1, + "state": "open", + "labels": [{"name": "bug"}], + "assignee": {"login": "username2"}, + "milestone": {"title": "v1.0"}, + "locked": "False", + "number": "1", + "body": "This is an example issue 1", + } + expected_document = Document( + page_content=issue["body"], # type: ignore + metadata={ + "url": issue["html_url"], + "title": issue["title"], + "creator": issue["user"]["login"], # type: ignore + "created_at": issue["created_at"], + "comments": issue["comments"], + "state": issue["state"], + "labels": [label["name"] for label in issue["labels"]], # type: ignore + "assignee": issue["assignee"]["login"], # type: ignore + "milestone": issue["milestone"]["title"], # type: ignore + "locked": issue["locked"], + "number": issue["number"], + "is_pull_request": False, + }, + ) + loader = GitHubIssuesLoader(repo="repo", access_token="access_token") + document = loader.parse_issue(issue) + assert document == expected_document + + +def test_url() -> None: + # No parameters + loader = GitHubIssuesLoader(repo="repo", access_token="access_token") + assert loader.url == 
"https://api.github.com/repos/repo/issues?" + + # parameters: state, sort + loader = GitHubIssuesLoader( + repo="repo", access_token="access_token", state="open", sort="created" + ) + assert ( + loader.url == "https://api.github.com/repos/repo/issues?state=open&sort=created" + ) + + # parameters: milestone, state, assignee, creator, mentioned, labels, sort, + # direction, since + loader = GitHubIssuesLoader( + repo="repo", + access_token="access_token", + milestone="*", + state="closed", + assignee="user1", + creator="user2", + mentioned="user3", + labels=["bug", "ui", "@high"], + sort="comments", + direction="asc", + since="2023-05-26T00:00:00Z", + ) + assert loader.url == ( + "https://api.github.com/repos/repo/issues?milestone=*&state=closed" + "&assignee=user1&creator=user2&mentioned=user3&labels=bug,ui,@high" + "&sort=comments&direction=asc&since=2023-05-26T00:00:00Z" + )