langchain/.github/scripts/check_diff.py
Bagatur 255ad39ae3
infra: run CI on large diffs (#23192)
currently we skip CI on diffs >= 300 files. think we should just run it
on all packages instead

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
2024-06-19 19:30:56 +00:00

134 lines
4.7 KiB
Python

import json
import sys
import os
from typing import Dict, List, Set
import tomllib
from collections import defaultdict
import glob
LANGCHAIN_DIRS = [
"libs/core",
"libs/text-splitters",
"libs/langchain",
"libs/community",
"libs/experimental",
]
def all_package_dirs() -> Set[str]:
return {"/".join(path.split("/")[:-1]) for path in glob.glob("./libs/**/pyproject.toml", recursive=True)}
def dependents_graph() -> dict:
dependents = defaultdict(set)
for path in glob.glob("./libs/**/pyproject.toml", recursive=True):
if "template" in path:
continue
with open(path, "rb") as f:
pyproject = tomllib.load(f)['tool']['poetry']
pkg_dir = "libs" + "/".join(path.split("libs")[1].split("/")[:-1])
for dep in pyproject['dependencies']:
if "langchain" in dep:
dependents[dep].add(pkg_dir)
return dependents
def add_dependents(dirs_to_eval: Set[str], dependents: dict) -> List[str]:
updated = set()
for dir_ in dirs_to_eval:
# handle core manually because it has so many dependents
if "core" in dir_:
updated.add(dir_)
continue
pkg = "langchain-" + dir_.split("/")[-1]
updated.update(dependents[pkg])
updated.add(dir_)
return list(updated)
if __name__ == "__main__":
files = sys.argv[1:]
dirs_to_run: Dict[str, set] = {
"lint": set(),
"test": set(),
"extended-test": set(),
}
docs_edited = False
if len(files) >= 300:
# max diff length is 300 files - there are likely files missing
dirs_to_run["lint"] = all_package_dirs()
dirs_to_run["test"] = all_package_dirs()
dirs_to_run["extended-test"] = set(LANGCHAIN_DIRS)
for file in files:
if any(
file.startswith(dir_)
for dir_ in (
".github/workflows",
".github/tools",
".github/actions",
".github/scripts/check_diff.py",
)
):
# add all LANGCHAIN_DIRS for infra changes
dirs_to_run["extended-test"].update(LANGCHAIN_DIRS)
dirs_to_run["lint"].add(".")
if any(file.startswith(dir_) for dir_ in LANGCHAIN_DIRS):
# add that dir and all dirs after in LANGCHAIN_DIRS
# for extended testing
found = False
for dir_ in LANGCHAIN_DIRS:
if file.startswith(dir_):
found = True
if found:
dirs_to_run["extended-test"].add(dir_)
elif file.startswith("libs/standard-tests"):
# TODO: update to include all packages that rely on standard-tests (all partner packages)
# note: won't run on external repo partners
dirs_to_run["lint"].add("libs/standard-tests")
dirs_to_run["test"].add("libs/partners/mistralai")
dirs_to_run["test"].add("libs/partners/openai")
dirs_to_run["test"].add("libs/partners/anthropic")
dirs_to_run["test"].add("libs/partners/ai21")
dirs_to_run["test"].add("libs/partners/fireworks")
dirs_to_run["test"].add("libs/partners/groq")
elif file.startswith("libs/cli"):
# todo: add cli makefile
pass
elif file.startswith("libs/partners"):
partner_dir = file.split("/")[2]
if os.path.isdir(f"libs/partners/{partner_dir}") and [
filename
for filename in os.listdir(f"libs/partners/{partner_dir}")
if not filename.startswith(".")
] != ["README.md"]:
dirs_to_run["test"].add(f"libs/partners/{partner_dir}")
# Skip if the directory was deleted or is just a tombstone readme
elif file.startswith("libs/"):
raise ValueError(
f"Unknown lib: {file}. check_diff.py likely needs "
"an update for this new library!"
)
elif any(file.startswith(p) for p in ["docs/", "templates/", "cookbook/"]):
if file.startswith("docs/"):
docs_edited = True
dirs_to_run["lint"].add(".")
dependents = dependents_graph()
outputs = {
"dirs-to-lint": add_dependents(
dirs_to_run["lint"] | dirs_to_run["test"] | dirs_to_run["extended-test"], dependents
),
"dirs-to-test": add_dependents(dirs_to_run["test"] | dirs_to_run["extended-test"], dependents),
"dirs-to-extended-test": list(dirs_to_run["extended-test"]),
"docs-edited": "true" if docs_edited else "",
}
for key, value in outputs.items():
json_output = json.dumps(value)
print(f"{key}={json_output}")