community[patch]: Add Pagination to GitHubIssuesLoader for Efficient GitHub Issues Retrieval (#16934)

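- **Description:** Add optional `page` and `per_page` parameters to
GitHubIssuesLoader so callers can retrieve a single page of GitHub issues.
Both values are passed through as query parameters, and when either one is
set the loader no longer follows the API's `next` links automatically.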
- **Issue:**
[#16864](https://github.com/langchain-ai/langchain/issues/16864)

---------

Co-authored-by: root <root@ip-172-31-46-160.ap-southeast-1.compute.internal>
Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
yin1991 2024-02-13 10:30:36 +08:00 committed by GitHub
parent b87d6f9f48
commit 37ef6ac113
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 21 additions and 3 deletions

View File

@ -65,6 +65,12 @@ class GitHubIssuesLoader(BaseGitHubLoader):
since: Optional[str] = None since: Optional[str] = None
"""Only show notifications updated after the given time. """Only show notifications updated after the given time.
This is a timestamp in ISO 8601 format: YYYY-MM-DDTHH:MM:SSZ.""" This is a timestamp in ISO 8601 format: YYYY-MM-DDTHH:MM:SSZ."""
page: Optional[int] = None
"""The page number for paginated results.
Defaults to 1 in the GitHub API."""
per_page: Optional[int] = None
"""Number of items per page.
Defaults to 30 in the GitHub API."""
@validator("since", allow_reuse=True) @validator("since", allow_reuse=True)
def validate_since(cls, v: Optional[str]) -> Optional[str]: def validate_since(cls, v: Optional[str]) -> Optional[str]:
@ -112,7 +118,11 @@ class GitHubIssuesLoader(BaseGitHubLoader):
if not self.include_prs and doc.metadata["is_pull_request"]: if not self.include_prs and doc.metadata["is_pull_request"]:
continue continue
yield doc yield doc
if response.links and response.links.get("next"): if (
response.links
and response.links.get("next")
and (not self.page and not self.per_page)
):
url = response.links["next"]["url"] url = response.links["next"]["url"]
else: else:
url = None url = None
@ -176,6 +186,8 @@ class GitHubIssuesLoader(BaseGitHubLoader):
"sort": self.sort, "sort": self.sort,
"direction": self.direction, "direction": self.direction,
"since": self.since, "since": self.since,
"page": self.page,
"per_page": self.per_page,
} }
query_params_list = [ query_params_list = [
f"{k}={v}" for k, v in query_params_dict.items() if v is not None f"{k}={v}" for k, v in query_params_dict.items() if v is not None

View File

@ -2,11 +2,17 @@ from langchain_community.document_loaders.github import GitHubIssuesLoader
def test_issues_load() -> None: def test_issues_load() -> None:
title = "DocumentLoader for GitHub" title = " Add caching to BaseChatModel (issue #1644)"
loader = GitHubIssuesLoader( loader = GitHubIssuesLoader(
repo="langchain-ai/langchain", creator="UmerHA", state="all" repo="langchain-ai/langchain",
creator="UmerHA",
state="all",
per_page=3,
page=2,
access_token="""""",
) )
docs = loader.load() docs = loader.load()
titles = [d.metadata["title"] for d in docs] titles = [d.metadata["title"] for d in docs]
assert title in titles assert title in titles
assert all(doc.metadata["creator"] == "UmerHA" for doc in docs) assert all(doc.metadata["creator"] == "UmerHA" for doc in docs)
assert len(docs) == 3