# langchain/docs/scripts/check_templates.py

import json
import re
import sys
from functools import cache
from pathlib import Path
from typing import Dict, Iterable, List, Union

# Directory that contains this script.
CURR_DIR = Path(__file__).parent.absolute()

# Location of the integration template notebooks, two levels above this
# script's directory.
CLI_TEMPLATE_DIR = CURR_DIR.parent.parent.joinpath(
    "libs", "cli", "langchain_cli", "integration_template", "docs"
)

# Per-directory metadata, keyed by the integration docs directory name. Only
# pages whose parent directory appears here are checked. ``issue_number`` is
# the GitHub issue linked from the error message raised by
# ``check_header_order``; a non-integer value (e.g. "TODO") means no link is
# included.
INFO_BY_DIR: Dict[str, Dict[str, Union[int, str]]] = {
    "chat": {"issue_number": 22296},
    "document_loaders": {"issue_number": 22866},
    "stores": {"issue_number": 24888},
    "llms": {"issue_number": 24803},
    "text_embedding": {"issue_number": 14856},
    "toolkits": {"issue_number": 24820},
    "tools": {"issue_number": "TODO"},
    "vectorstores": {"issue_number": 24800},
    "retrievers": {"issue_number": 24908},
}


@cache
def _get_headers(doc_dir: str) -> Iterable[str]:
    """Gets all markdown headers ## and below from the integration template.

    Reads ``<doc_dir>.ipynb`` from ``CLI_TEMPLATE_DIR`` and collects, in
    notebook order, every markdown-cell line that starts with ``##``.
    Ignores headers that contain "TODO". The result is cached per ``doc_dir``.

    Args:
        doc_dir: Name of the integration docs directory; it is also the stem
            of the template notebook file.

    Returns:
        The whitespace-stripped header lines, as a tuple so the cached value
        cannot be mutated by one caller and observed by another.

    Raises:
        FileNotFoundError: If the template notebook does not exist.
    """
    ipynb_name = f"{doc_dir}.ipynb"
    template_path = CLI_TEMPLATE_DIR / ipynb_name
    if not template_path.exists():
        raise FileNotFoundError(f"Could not find {ipynb_name} in {CLI_TEMPLATE_DIR}")
    # Notebooks are UTF-8 JSON; do not depend on the locale's default encoding.
    with open(template_path, "r", encoding="utf-8") as f:
        nb = json.load(f)

    headers: List[str] = []
    for cell in nb["cells"]:
        if cell["cell_type"] != "markdown":
            continue
        source = cell["source"]
        # The notebook format allows ``source`` to be one multi-line string
        # instead of a list of lines; iterating a bare string would yield
        # single characters and silently find no headers.
        if isinstance(source, str):
            source = source.splitlines()
        headers.extend(
            line.strip()
            for line in source
            if line.startswith("##") and "TODO" not in line
        )
    return tuple(headers)


def check_header_order(path: Path) -> None:
    """Check that a docs page contains its template's headers in order.

    The name of the page's parent directory selects the template: it must be
    a key of ``INFO_BY_DIR`` and is passed to ``_get_headers``. Index pages
    (file name starting with ``index.``) and pages in directories that are
    not listed in ``INFO_BY_DIR`` are skipped without being read.

    Args:
        path: Path of the documentation page to check.

    Raises:
        ValueError: If the template headers do not all occur in the page in
            template order. When the directory's ``issue_number`` is an int,
            the message links to the corresponding GitHub issue.
        FileNotFoundError: Propagated from ``_get_headers`` when the template
            notebook for the directory is missing.
    """
    if path.name.startswith("index."):
        # skip index pages
        return
    doc_dir = path.parent.name
    if doc_dir not in INFO_BY_DIR:
        # Skip if not a directory we care about
        return
    headers = _get_headers(doc_dir)
    issue_number = INFO_BY_DIR[doc_dir].get("issue_number", "nonexistent")

    print(f"Checking {doc_dir} page {path}")

    # Pages may contain non-ASCII text; do not depend on the locale's
    # default encoding.
    with open(path, "r", encoding="utf-8") as f:
        doc = f.read()
    # Headers are literal text. Escape them so characters such as "(", "["
    # or "?" are matched verbatim instead of being read as regex syntax.
    regex = r".*".join(re.escape(header) for header in headers)
    # DOTALL lets ".*" span the lines between two consecutive headers.
    if not re.search(regex, doc, re.DOTALL):
        issueline = (
            (
                " Please see https://github.com/langchain-ai/langchain/issues/"
                f"{issue_number} for instructions on how to correctly format a "
                f"{doc_dir} integration page."
            )
            if isinstance(issue_number, int)
            else ""
        )
        raise ValueError(
            f"Document {path} does not match the expected header order.{issueline}"
        )


def main(*new_doc_paths: Union[str, Path]) -> None:
    """Run the header-order check on every given page under docs/integrations.

    Args:
        *new_doc_paths: Paths of documentation files to consider. Paths that
            are not located below ``CURR_DIR.parent / "docs" / "integrations"``
            are ignored.

    Raises:
        ValueError: Propagated from ``check_header_order`` for the first page
            whose headers do not match the template order.
    """
    # Resolve the integrations root the same way each candidate path is
    # resolved below; otherwise a checkout reached through a symlink never
    # matches and every page is silently skipped.
    integrations_dir = (CURR_DIR.parent / "docs" / "integrations").resolve()
    for raw_path in new_doc_paths:
        path = Path(raw_path).resolve().absolute()
        if integrations_dir in path.parents:
            check_header_order(path)


if __name__ == "__main__":
    # Every command-line argument is treated as a path of a page to check.
    cli_paths = sys.argv[1:]
    main(*cli_paths)