mirror of
https://github.com/hwchase17/langchain
synced 2024-11-16 06:13:16 +00:00
375051a64e
👥 Update LangChain people data
---------
Co-authored-by: github-actions <github-actions@github.com>
641 lines
18 KiB
Python
641 lines
18 KiB
Python
# Adapted from https://github.com/tiangolo/fastapi/blob/master/.github/actions/people/app/main.py
|
|
|
|
import logging
|
|
import subprocess
|
|
import sys
|
|
from collections import Counter
|
|
from datetime import datetime, timedelta, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Container, Dict, List, Set, Union
|
|
|
|
import httpx
|
|
import yaml
|
|
from github import Github
|
|
from pydantic import BaseModel, SecretStr
|
|
from pydantic_settings import BaseSettings
|
|
|
|
# Endpoint that every GraphQL request in this script is POSTed to.
github_graphql_url = "https://api.github.com/graphql"
# Discussion category ID; only referenced by the currently commented-out
# discussions helper, which passes it as the `category_id` query variable.
questions_category_id = "DIC_kwDOIPDwls4CS6Ve"
|
|
|
|
# discussions_query = """
|
|
# query Q($after: String, $category_id: ID) {
|
|
# repository(name: "langchain", owner: "langchain-ai") {
|
|
# discussions(first: 100, after: $after, categoryId: $category_id) {
|
|
# edges {
|
|
# cursor
|
|
# node {
|
|
# number
|
|
# author {
|
|
# login
|
|
# avatarUrl
|
|
# url
|
|
# }
|
|
# title
|
|
# createdAt
|
|
# comments(first: 100) {
|
|
# nodes {
|
|
# createdAt
|
|
# author {
|
|
# login
|
|
# avatarUrl
|
|
# url
|
|
# }
|
|
# isAnswer
|
|
# replies(first: 10) {
|
|
# nodes {
|
|
# createdAt
|
|
# author {
|
|
# login
|
|
# avatarUrl
|
|
# url
|
|
# }
|
|
# }
|
|
# }
|
|
# }
|
|
# }
|
|
# }
|
|
# }
|
|
# }
|
|
# }
|
|
# }
|
|
# """
|
|
|
|
# issues_query = """
|
|
# query Q($after: String) {
|
|
# repository(name: "langchain", owner: "langchain-ai") {
|
|
# issues(first: 100, after: $after) {
|
|
# edges {
|
|
# cursor
|
|
# node {
|
|
# number
|
|
# author {
|
|
# login
|
|
# avatarUrl
|
|
# url
|
|
# }
|
|
# title
|
|
# createdAt
|
|
# state
|
|
# comments(first: 100) {
|
|
# nodes {
|
|
# createdAt
|
|
# author {
|
|
# login
|
|
# avatarUrl
|
|
# url
|
|
# }
|
|
# }
|
|
# }
|
|
# }
|
|
# }
|
|
# }
|
|
# }
|
|
# }
|
|
# """
|
|
|
|
# GraphQL query for one page (up to 100) of merged pull requests of
# langchain-ai/langchain. `$after` is the pagination cursor; for each PR it
# selects size stats, labels, the author and up to 100 reviews.
prs_query = """
query Q($after: String) {
  repository(name: "langchain", owner: "langchain-ai") {
    pullRequests(first: 100, after: $after, states: MERGED) {
      edges {
        cursor
        node {
          changedFiles
          additions
          deletions
          number
          labels(first: 100) {
            nodes {
              name
            }
          }
          author {
            login
            avatarUrl
            url
            ... on User {
              twitterUsername
            }
          }
          title
          createdAt
          state
          reviews(first:100) {
            nodes {
              author {
                login
                avatarUrl
                url
                ... on User {
                  twitterUsername
                }
              }
              state
            }
          }
        }
      }
    }
  }
}
"""
|
|
|
|
|
|
class Author(BaseModel):
    """GitHub account data, matching the `author { ... }` GraphQL selections."""

    login: str
    avatarUrl: str
    url: str
    # Only requested through the `... on User` fragment, so it may be missing.
    twitterUsername: Union[str, None] = None
|
|
|
|
|
|
# Issues and Discussions
|
|
|
|
|
|
class CommentsNode(BaseModel):
    """A single comment node: its creation time and (optional) author."""

    createdAt: datetime
    # Defaults to None when the response carries no author for the comment.
    author: Union[Author, None] = None
|
|
|
|
|
|
class Replies(BaseModel):
    """Wrapper for the `replies { nodes }` list of a discussion comment."""

    nodes: List[CommentsNode]
|
|
|
|
|
|
class DiscussionsCommentsNode(CommentsNode):
    """A discussion comment: a `CommentsNode` that also carries its replies."""

    replies: Replies
|
|
|
|
|
|
class Comments(BaseModel):
    """Wrapper for a `comments { nodes }` list of plain comment nodes."""

    nodes: List[CommentsNode]
|
|
|
|
|
|
class DiscussionsComments(BaseModel):
    """Wrapper for a discussion's `comments { nodes }` list (with replies)."""

    nodes: List[DiscussionsCommentsNode]
|
|
|
|
|
|
class IssuesNode(BaseModel):
    """One issue: number, optional author, title, timestamp, state, comments."""

    number: int
    author: Union[Author, None] = None
    title: str
    createdAt: datetime
    state: str
    comments: Comments
|
|
|
|
|
|
class DiscussionsNode(BaseModel):
    """One discussion: number, optional author, title, timestamp, comments."""

    number: int
    author: Union[Author, None] = None
    title: str
    createdAt: datetime
    comments: DiscussionsComments
|
|
|
|
|
|
class IssuesEdge(BaseModel):
    """Pagination edge pairing an issue node with its cursor."""

    cursor: str
    node: IssuesNode
|
|
|
|
|
|
class DiscussionsEdge(BaseModel):
    """Pagination edge pairing a discussion node with its cursor."""

    cursor: str
    node: DiscussionsNode
|
|
|
|
|
|
class Issues(BaseModel):
    """The `issues { edges }` connection of a repository."""

    edges: List[IssuesEdge]
|
|
|
|
|
|
class Discussions(BaseModel):
    """The `discussions { edges }` connection of a repository."""

    edges: List[DiscussionsEdge]
|
|
|
|
|
|
class IssuesRepository(BaseModel):
    """The `repository` object of an issues response."""

    issues: Issues
|
|
|
|
|
|
class DiscussionsRepository(BaseModel):
    """The `repository` object of a discussions response."""

    discussions: Discussions
|
|
|
|
|
|
class IssuesResponseData(BaseModel):
    """The `data` payload of an issues response."""

    repository: IssuesRepository
|
|
|
|
|
|
class DiscussionsResponseData(BaseModel):
    """The `data` payload of a discussions response."""

    repository: DiscussionsRepository
|
|
|
|
|
|
class IssuesResponse(BaseModel):
    """Top-level envelope of an issues GraphQL response."""

    data: IssuesResponseData
|
|
|
|
|
|
class DiscussionsResponse(BaseModel):
    """Top-level envelope of a discussions GraphQL response."""

    data: DiscussionsResponseData
|
|
|
|
|
|
# PRs
|
|
|
|
|
|
class LabelNode(BaseModel):
    """A single pull-request label, identified by its name."""

    name: str
|
|
|
|
|
|
class Labels(BaseModel):
    """Wrapper for a pull request's `labels { nodes }` list."""

    nodes: List[LabelNode]
|
|
|
|
|
|
class ReviewNode(BaseModel):
    """A single pull-request review: its (optional) author and review state."""

    # Defaults to None when the response carries no author for the review.
    author: Union[Author, None] = None
    state: str
|
|
|
|
|
|
class Reviews(BaseModel):
    """Wrapper for a pull request's `reviews { nodes }` list."""

    nodes: List[ReviewNode]
|
|
|
|
|
|
class PullRequestNode(BaseModel):
    """One pull request, with the fields selected by the `node` part of `prs_query`."""

    number: int
    labels: Labels
    author: Union[Author, None] = None
    # Size statistics; used to compute contributor scores.
    changedFiles: int
    additions: int
    deletions: int
    title: str
    createdAt: datetime
    state: str
    reviews: Reviews
    # comments: Comments
|
|
|
|
|
|
class PullRequestEdge(BaseModel):
    """Pagination edge pairing a pull-request node with its cursor."""

    # Passed back as `after` to request the following page.
    cursor: str
    node: PullRequestNode
|
|
|
|
|
|
class PullRequests(BaseModel):
    """The `pullRequests { edges }` connection of a repository."""

    edges: List[PullRequestEdge]
|
|
|
|
|
|
class PRsRepository(BaseModel):
    """The `repository` object of a pull-requests response."""

    pullRequests: PullRequests
|
|
|
|
|
|
class PRsResponseData(BaseModel):
    """The `data` payload of a pull-requests response."""

    repository: PRsRepository
|
|
|
|
|
|
class PRsResponse(BaseModel):
    """Top-level envelope of a pull-requests GraphQL response."""

    data: PRsResponseData
|
|
|
|
|
|
class Settings(BaseSettings):
    """Runtime configuration for the script.

    NOTE(review): as a pydantic-settings ``BaseSettings`` subclass the fields
    are presumably filled from environment variables of the same name --
    confirm against the workflow that runs this script.
    """

    # GitHub token; sent in the GraphQL Authorization header and to PyGithub.
    input_token: SecretStr
    # Repository name handed to `Github.get_repo` when opening the PR.
    github_repository: str
    # Passed as the `timeout` argument of `httpx.post`.
    httpx_timeout: int = 30
|
|
|
|
|
|
def get_graphql_response(
    *,
    settings: Settings,
    query: str,
    after: Union[str, None] = None,
    category_id: Union[str, None] = None,
) -> Dict[str, Any]:
    """POST a GraphQL query to GitHub and return the decoded JSON body.

    Args:
        settings: Supplies the API token and the HTTP timeout.
        query: GraphQL document containing an operation named ``Q``.
        after: Pagination cursor sent as the ``after`` variable.
        category_id: Sent as the ``category_id`` variable.

    Returns:
        The parsed JSON response.

    Raises:
        RuntimeError: If the HTTP status is not 200 or the body contains an
            ``errors`` key; the message is the raw response text.
    """
    auth_headers = {
        "Authorization": f"token {settings.input_token.get_secret_value()}"
    }
    payload = {
        "query": query,
        # category_id is only used by one query, but GraphQL allows unused
        # variables, so it is always sent for simplicity.
        "variables": {"after": after, "category_id": category_id},
        "operationName": "Q",
    }
    response = httpx.post(
        github_graphql_url,
        headers=auth_headers,
        timeout=settings.httpx_timeout,
        json=payload,
    )

    # Transport-level failure: log the request context and the body, then abort.
    if response.status_code != 200:
        logging.error(
            f"Response was not 200, after: {after}, category_id: {category_id}"
        )
        logging.error(response.text)
        raise RuntimeError(response.text)

    body = response.json()
    # GraphQL-level failure: a 200 response can still carry an "errors" list.
    if "errors" in body:
        logging.error(f"Errors in response, after: {after}, category_id: {category_id}")
        logging.error(body["errors"])
        logging.error(response.text)
        raise RuntimeError(response.text)

    return body
|
|
|
|
|
|
# def get_graphql_issue_edges(*, settings: Settings, after: Union[str, None] = None):
|
|
# data = get_graphql_response(settings=settings, query=issues_query, after=after)
|
|
# graphql_response = IssuesResponse.model_validate(data)
|
|
# return graphql_response.data.repository.issues.edges
|
|
|
|
|
|
# def get_graphql_question_discussion_edges(
|
|
# *,
|
|
# settings: Settings,
|
|
# after: Union[str, None] = None,
|
|
# ):
|
|
# data = get_graphql_response(
|
|
# settings=settings,
|
|
# query=discussions_query,
|
|
# after=after,
|
|
# category_id=questions_category_id,
|
|
# )
|
|
# graphql_response = DiscussionsResponse.model_validate(data)
|
|
# return graphql_response.data.repository.discussions.edges
|
|
|
|
|
|
def get_graphql_pr_edges(*, settings: Settings, after: Union[str, None] = None):
    """Fetch one page of merged pull-request edges.

    Args:
        settings: Forwarded to ``get_graphql_response``.
        after: Cursor of the last edge already fetched, or ``None`` for the
            first page.

    Returns:
        The list of ``PullRequestEdge`` objects validated from the response.
    """
    progress = (
        "Querying PRs..."
        if after is None
        else f"Querying PRs with cursor {after}..."
    )
    print(progress)

    raw = get_graphql_response(settings=settings, query=prs_query, after=after)
    validated = PRsResponse.model_validate(raw)
    return validated.data.repository.pullRequests.edges
|
|
|
|
|
|
# def get_issues_experts(settings: Settings):
|
|
# issue_nodes: List[IssuesNode] = []
|
|
# issue_edges = get_graphql_issue_edges(settings=settings)
|
|
|
|
# while issue_edges:
|
|
# for edge in issue_edges:
|
|
# issue_nodes.append(edge.node)
|
|
# last_edge = issue_edges[-1]
|
|
# issue_edges = get_graphql_issue_edges(settings=settings, after=last_edge.cursor)
|
|
|
|
# commentors = Counter()
|
|
# last_month_commentors = Counter()
|
|
# authors: Dict[str, Author] = {}
|
|
|
|
# now = datetime.now(tz=timezone.utc)
|
|
# one_month_ago = now - timedelta(days=30)
|
|
|
|
# for issue in issue_nodes:
|
|
# issue_author_name = None
|
|
# if issue.author:
|
|
# authors[issue.author.login] = issue.author
|
|
# issue_author_name = issue.author.login
|
|
# issue_commentors = set()
|
|
# for comment in issue.comments.nodes:
|
|
# if comment.author:
|
|
# authors[comment.author.login] = comment.author
|
|
# if comment.author.login != issue_author_name:
|
|
# issue_commentors.add(comment.author.login)
|
|
# for author_name in issue_commentors:
|
|
# commentors[author_name] += 1
|
|
# if issue.createdAt > one_month_ago:
|
|
# last_month_commentors[author_name] += 1
|
|
|
|
# return commentors, last_month_commentors, authors
|
|
|
|
|
|
# def get_discussions_experts(settings: Settings):
|
|
# discussion_nodes: List[DiscussionsNode] = []
|
|
# discussion_edges = get_graphql_question_discussion_edges(settings=settings)
|
|
|
|
# while discussion_edges:
|
|
# for discussion_edge in discussion_edges:
|
|
# discussion_nodes.append(discussion_edge.node)
|
|
# last_edge = discussion_edges[-1]
|
|
# discussion_edges = get_graphql_question_discussion_edges(
|
|
# settings=settings, after=last_edge.cursor
|
|
# )
|
|
|
|
# commentors = Counter()
|
|
# last_month_commentors = Counter()
|
|
# authors: Dict[str, Author] = {}
|
|
|
|
# now = datetime.now(tz=timezone.utc)
|
|
# one_month_ago = now - timedelta(days=30)
|
|
|
|
# for discussion in discussion_nodes:
|
|
# discussion_author_name = None
|
|
# if discussion.author:
|
|
# authors[discussion.author.login] = discussion.author
|
|
# discussion_author_name = discussion.author.login
|
|
# discussion_commentors = set()
|
|
# for comment in discussion.comments.nodes:
|
|
# if comment.author:
|
|
# authors[comment.author.login] = comment.author
|
|
# if comment.author.login != discussion_author_name:
|
|
# discussion_commentors.add(comment.author.login)
|
|
# for reply in comment.replies.nodes:
|
|
# if reply.author:
|
|
# authors[reply.author.login] = reply.author
|
|
# if reply.author.login != discussion_author_name:
|
|
# discussion_commentors.add(reply.author.login)
|
|
# for author_name in discussion_commentors:
|
|
# commentors[author_name] += 1
|
|
# if discussion.createdAt > one_month_ago:
|
|
# last_month_commentors[author_name] += 1
|
|
# return commentors, last_month_commentors, authors
|
|
|
|
|
|
# def get_experts(settings: Settings):
|
|
# (
|
|
# discussions_commentors,
|
|
# discussions_last_month_commentors,
|
|
# discussions_authors,
|
|
# ) = get_discussions_experts(settings=settings)
|
|
# commentors = discussions_commentors
|
|
# last_month_commentors = discussions_last_month_commentors
|
|
# authors = {**discussions_authors}
|
|
# return commentors, last_month_commentors, authors
|
|
|
|
|
|
def _logistic(x, k):
    """Return the saturating ratio ``x / (x + k)``.

    For non-negative ``x`` and positive ``k`` the result is 0 at ``x == 0``,
    0.5 at ``x == k`` and approaches 1 as ``x`` grows. Despite the name, this
    is a rational curve, not the logistic (sigmoid) function.

    Raises:
        ZeroDivisionError: If ``x + k`` is zero.
    """
    denominator = x + k
    return x / denominator
|
|
|
|
|
|
def get_contributors(settings: Settings):
    """Collect contribution and review statistics from all merged PRs.

    Pages through ``get_graphql_pr_edges`` until an empty page is returned,
    then tallies per-login statistics.

    Args:
        settings: Forwarded to ``get_graphql_pr_edges``.

    Returns:
        A 5-tuple ``(contributors, contributor_scores,
        recent_contributor_scores, reviewers, authors)``:

        * ``contributors`` -- number of PRs per author login.
        * ``contributor_scores`` -- summed size-based score per author login.
        * ``recent_contributor_scores`` -- same score, restricted to PRs
          created within the last 90 days.
        * ``reviewers`` -- number of distinct PRs reviewed per login.
        * ``authors`` -- login -> ``Author`` for everyone seen.
    """
    pr_nodes: List[PullRequestNode] = []
    pr_edges = get_graphql_pr_edges(settings=settings)
    while pr_edges:
        pr_nodes.extend(edge.node for edge in pr_edges)
        # The cursor of the last edge marks where the next page starts.
        pr_edges = get_graphql_pr_edges(settings=settings, after=pr_edges[-1].cursor)

    contributors = Counter()
    contributor_scores = Counter()
    recent_contributor_scores = Counter()
    reviewers = Counter()
    authors: Dict[str, Author] = {}

    # Computed once so that every PR is compared against the same instant.
    three_months_ago = datetime.now(timezone.utc) - timedelta(days=3 * 30)

    for pr in pr_nodes:
        # A set, so several reviews by one person on one PR count only once.
        pr_reviewers: Set[str] = set()
        for review in pr.reviews.nodes:
            if review.author:
                authors[review.author.login] = review.author
                pr_reviewers.add(review.author.login)
        for reviewer in pr_reviewers:
            reviewers[reviewer] += 1

        if not pr.author:
            continue

        login = pr.author.login
        authors[login] = pr.author
        contributors[login] += 1

        # Each component saturates below 1, so a single PR scores less than 2
        # no matter how large it is.
        files_changed = pr.changedFiles
        lines_changed = pr.additions + pr.deletions
        score = _logistic(files_changed, 20) + _logistic(lines_changed, 100)
        contributor_scores[login] += score
        if pr.createdAt > three_months_ago:
            recent_contributor_scores[login] += score

    return contributors, contributor_scores, recent_contributor_scores, reviewers, authors
|
|
|
|
|
|
def get_top_users(
    *,
    counter: Counter,
    min_count: int,
    authors: Dict[str, Author],
    skip_users: Container[str],
):
    """Build the ranked list of users whose tally reaches ``min_count``.

    Args:
        counter: Login -> count (or score).
        min_count: Minimum value a login needs to be listed.
        authors: Login -> author record providing ``avatarUrl``,
            ``twitterUsername`` and ``url``.
        skip_users: Logins that are never listed.

    Returns:
        A list of dicts with keys ``login``, ``count``, ``avatarUrl``,
        ``twitterUsername`` and ``url``, in ``counter.most_common()`` order.
    """

    def as_entry(login, total):
        # Flatten the author record into a plain serialisable dict.
        profile = authors[login]
        return {
            "login": login,
            "count": total,
            "avatarUrl": profile.avatarUrl,
            "twitterUsername": profile.twitterUsername,
            "url": profile.url,
        }

    return [
        as_entry(login, total)
        for login, total in counter.most_common()
        if login not in skip_users and total >= min_count
    ]
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: rebuild docs/data/people.yml from GitHub data and,
    # when its content changed, commit it on a branch and open a pull request.
    logging.basicConfig(level=logging.INFO)
    settings = Settings()
    logging.info(f"Using config: {settings.model_dump_json()}")
    g = Github(settings.input_token.get_secret_value())
    repo = g.get_repo(settings.github_repository)
    # question_commentors, question_last_month_commentors, question_authors = get_experts(
    #     settings=settings
    # )
    (
        contributors,
        contributor_scores,
        recent_contributor_scores,
        reviewers,
        pr_authors,
    ) = get_contributors(settings=settings)
    # authors = {**question_authors, **pr_authors}
    authors = {**pr_authors}
    maintainers_logins = {
        "hwchase17",
        "agola11",
        "baskaryan",
        "hinthornw",
        "nfcampos",
        "efriis",
        "eyurtsev",
        "rlancemartin",
    }
    hidden_logins = {
        "dev2049",
        "vowelparrot",
        "obi1kenobi",
        "langchain-infra",
        "jacoblee93",
        "dqbd",
        "bracesproul",
        "akira",
    }
    bot_names = {"dosubot", "github-actions", "CodiumAI-Agent"}

    maintainers = []
    # Iterate in sorted order: the iteration order of a set of strings changes
    # between interpreter runs (hash randomization), which would reshuffle the
    # generated YAML and defeat the "unchanged" check further down.
    # NOTE(review): authors[login] raises KeyError for a maintainer who appears
    # in no fetched PR or review -- confirm that failing loudly is intended.
    for login in sorted(maintainers_logins):
        user = authors[login]
        maintainers.append(
            {
                "login": login,
                "count": contributors[login],  # + question_commentors[login],
                "avatarUrl": user.avatarUrl,
                "twitterUsername": user.twitterUsername,
                "url": user.url,
            }
        )

    # min_count_expert = 10
    # min_count_last_month = 3
    min_score_contributor = 1
    min_count_reviewer = 5
    # Maintainers, bots and hidden accounts are excluded from every ranking.
    skip_users = maintainers_logins | bot_names | hidden_logins
    # experts = get_top_users(
    #     counter=question_commentors,
    #     min_count=min_count_expert,
    #     authors=authors,
    #     skip_users=skip_users,
    # )
    # last_month_active = get_top_users(
    #     counter=question_last_month_commentors,
    #     min_count=min_count_last_month,
    #     authors=authors,
    #     skip_users=skip_users,
    # )
    top_recent_contributors = get_top_users(
        counter=recent_contributor_scores,
        min_count=min_score_contributor,
        authors=authors,
        skip_users=skip_users,
    )
    top_contributors = get_top_users(
        counter=contributor_scores,
        min_count=min_score_contributor,
        authors=authors,
        skip_users=skip_users,
    )
    top_reviewers = get_top_users(
        counter=reviewers,
        min_count=min_count_reviewer,
        authors=authors,
        skip_users=skip_users,
    )

    people = {
        "maintainers": maintainers,
        # "experts": experts,
        # "last_month_active": last_month_active,
        "top_recent_contributors": top_recent_contributors,
        "top_contributors": top_contributors,
        "top_reviewers": top_reviewers,
    }
    people_path = Path("./docs/data/people.yml")
    people_old_content = people_path.read_text(encoding="utf-8")
    new_people_content = yaml.dump(
        people, sort_keys=False, width=200, allow_unicode=True
    )
    # Nothing to commit when the regenerated file is identical to the old one.
    if people_old_content == new_people_content:
        logging.info("The LangChain People data hasn't changed, finishing.")
        sys.exit(0)
    people_path.write_text(new_people_content, encoding="utf-8")

    logging.info("Setting up GitHub Actions git user")
    subprocess.run(["git", "config", "user.name", "github-actions"], check=True)
    subprocess.run(
        ["git", "config", "user.email", "github-actions@github.com"], check=True
    )
    branch_name = "langchain/langchain-people"
    logging.info(f"Creating a new branch {branch_name}")
    # -B creates the branch or resets it if it already exists locally.
    subprocess.run(["git", "checkout", "-B", branch_name], check=True)
    logging.info("Adding updated file")
    subprocess.run(["git", "add", str(people_path)], check=True)
    logging.info("Committing updated file")
    message = "👥 Update LangChain people data"
    subprocess.run(["git", "commit", "-m", message], check=True)
    logging.info("Pushing branch")
    # Force-push so a stale remote branch from an earlier run is overwritten.
    subprocess.run(["git", "push", "origin", branch_name, "-f"], check=True)
    logging.info("Creating PR")
    pr = repo.create_pull(title=message, body=message, base="master", head=branch_name)
    logging.info(f"Created PR: {pr.number}")
    logging.info("Finished")