langchain/docs/scripts/check_templates.py

96 lines
2.8 KiB
Python

import json
import re
import sys
from functools import cache
from pathlib import Path
from typing import Dict, Iterable, List, Union
CURR_DIR = Path(__file__).parent.absolute()
CLI_TEMPLATE_DIR = (
CURR_DIR.parent.parent / "libs/cli/langchain_cli/integration_template/docs"
)
INFO_BY_DIR: Dict[str, Dict[str, Union[int, str]]] = {
"chat": {
"issue_number": 22296,
},
"document_loaders": {
"issue_number": 22866,
},
"stores": {"issue_number": 24888},
"llms": {
"issue_number": 24803,
},
"text_embedding": {"issue_number": 14856},
"toolkits": {"issue_number": 24820},
"tools": {"issue_number": "TODO"},
"vectorstores": {"issue_number": 24800},
"retrievers": {"issue_number": 24908},
}
@cache
def _get_headers(doc_dir: str) -> Iterable[str]:
"""Gets all markdown headers ## and below from the integration template.
Ignores headers that contain "TODO"."""
ipynb_name = f"{doc_dir}.ipynb"
if not (CLI_TEMPLATE_DIR / ipynb_name).exists():
raise FileNotFoundError(f"Could not find {ipynb_name} in {CLI_TEMPLATE_DIR}")
with open(CLI_TEMPLATE_DIR / ipynb_name, "r") as f:
nb = json.load(f)
headers: List[str] = []
for cell in nb["cells"]:
if cell["cell_type"] == "markdown":
for line in cell["source"]:
if not line.startswith("##") or "TODO" in line:
continue
header = line.strip()
headers.append(header)
return headers
def check_header_order(path: Path) -> None:
if path.name.startswith("index."):
# skip index pages
return
doc_dir = path.parent.name
if doc_dir not in INFO_BY_DIR:
# Skip if not a directory we care about
return
headers = _get_headers(doc_dir)
issue_number = INFO_BY_DIR[doc_dir].get("issue_number", "nonexistent")
print(f"Checking {doc_dir} page {path}")
with open(path, "r") as f:
doc = f.read()
regex = r".*".join(headers)
if not re.search(regex, doc, re.DOTALL):
issueline = (
(
" Please see https://github.com/langchain-ai/langchain/issues/"
f"{issue_number} for instructions on how to correctly format a "
f"{doc_dir} integration page."
)
if isinstance(issue_number, int)
else ""
)
raise ValueError(
f"Document {path} does not match the expected header order.{issueline}"
)
def main(*new_doc_paths: Union[str, Path]) -> None:
for path in new_doc_paths:
path = Path(path).resolve().absolute()
if CURR_DIR.parent / "docs" / "integrations" in path.parents:
check_header_order(path)
else:
continue
if __name__ == "__main__":
main(*sys.argv[1:])