langchain/docs/scripts/check_templates.py

import json
import re
import sys
from functools import cache
from pathlib import Path
from typing import Dict, Iterable, List, Union

# Absolute path of the directory that contains this script.
CURR_DIR = Path(__file__).parent.absolute()
# Directory holding the integration template notebooks, addressed relative to
# the directory two levels above this script.
CLI_TEMPLATE_DIR = CURR_DIR.parent.parent.joinpath(
    "libs/cli/langchain_cli/integration_template/docs"
)

# Integration doc directories that are checked, keyed by directory name.
# "issue_number" is the GitHub issue linked from the error message; a value
# that is not an int (such as "TODO") means no link is added to the message.
INFO_BY_DIR: Dict[str, Dict[str, Union[int, str]]] = {
    "chat": {"issue_number": 22296},
    "document_loaders": {"issue_number": 22866},
    "stores": {"issue_number": 24888},
    "llms": {"issue_number": 24803},
    "text_embedding": {"issue_number": 14856},
    "toolkits": {"issue_number": 24820},
    "tools": {"issue_number": "TODO"},
    "vectorstores": {"issue_number": 24800},
    "retrievers": {"issue_number": 24908},
}


@cache
def _get_headers(doc_dir: str) -> Iterable[str]:
    """Collect the markdown headers (``##`` and deeper) of an integration template.

    The template notebook ``<doc_dir>.ipynb`` is read from ``CLI_TEMPLATE_DIR``.
    Within markdown cells, every source line that starts with ``##`` is kept
    (with surrounding whitespace stripped) unless it contains ``"TODO"``.
    Results are memoized per ``doc_dir`` by ``functools.cache``.

    Args:
        doc_dir: Name of the integration docs directory, e.g. ``"chat"``.

    Returns:
        The headers in template order, as a tuple so that the cached value
        cannot be modified by a caller.

    Raises:
        FileNotFoundError: If no template notebook exists for ``doc_dir``.
    """
    ipynb_name = f"{doc_dir}.ipynb"
    template_path = CLI_TEMPLATE_DIR / ipynb_name
    if not template_path.exists():
        raise FileNotFoundError(f"Could not find {ipynb_name} in {CLI_TEMPLATE_DIR}")
    # Notebook files are JSON; read them as UTF-8 instead of relying on the
    # platform's locale encoding.
    with open(template_path, "r", encoding="utf-8") as f:
        nb = json.load(f)

    headers: List[str] = []
    for cell in nb["cells"]:
        if cell["cell_type"] != "markdown":
            continue
        source = cell["source"]
        # The notebook format allows ``source`` to be either one string or a
        # list of lines; iterating a bare string would yield single characters.
        lines = source.splitlines() if isinstance(source, str) else source
        for line in lines:
            if line.startswith("##") and "TODO" not in line:
                headers.append(line.strip())
    return tuple(headers)


def check_header_order(path: Path) -> None:
    """Verify that a doc page contains its template's headers in order.

    Pages named ``index.*`` and pages whose parent directory is not a key of
    ``INFO_BY_DIR`` are skipped. Otherwise every header returned by
    ``_get_headers`` for that directory must occur in the file as literal
    text, in template order; any other content may appear between them.

    Args:
        path: Path of the documentation file to check.

    Raises:
        ValueError: If the headers are missing or out of order. The message
            links to the formatting issue when ``issue_number`` is an int.
        FileNotFoundError: Propagated from ``_get_headers`` when the template
            notebook for the directory does not exist.
    """
    if path.name.startswith("index."):
        # skip index pages
        return
    doc_dir = path.parent.name
    if doc_dir not in INFO_BY_DIR:
        # Skip if not a directory we care about
        return
    headers = _get_headers(doc_dir)
    issue_number = INFO_BY_DIR[doc_dir].get("issue_number", "nonexistent")

    print(f"Checking {doc_dir} page {path}")

    # Read as UTF-8 explicitly rather than with the locale's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        doc = f.read()
    if _headers_in_order(doc, headers):
        return

    # Only link to an issue when a real (integer) issue number is configured.
    issueline = (
        (
            " Please see https://github.com/langchain-ai/langchain/issues/"
            f"{issue_number} for instructions on how to correctly format a "
            f"{doc_dir} integration page."
        )
        if isinstance(issue_number, int)
        else ""
    )
    raise ValueError(
        f"Document {path} does not match the expected header order.{issueline}"
    )


def _headers_in_order(doc: str, headers: Iterable[str]) -> bool:
    """Return True if each header occurs in ``doc`` as literal text, in order.

    Headers are matched literally, so characters such as ``(``, ``?`` or ``.``
    in a header are not treated as regex syntax. Each search resumes after the
    previous match, so the scan is a single forward pass over ``doc``.
    """
    pos = 0
    for header in headers:
        found = doc.find(header, pos)
        if found == -1:
            return False
        pos = found + len(header)
    return True


def main(*new_doc_paths: Union[str, Path]) -> None:
    """Run the header-order check on each given path under docs/integrations.

    Every path is resolved to an absolute path first. Paths that are not
    located below ``CURR_DIR.parent / "docs" / "integrations"`` are ignored.

    Args:
        *new_doc_paths: Paths of documentation files to consider.
    """
    for raw_path in new_doc_paths:
        resolved = Path(raw_path).resolve().absolute()
        integrations_root = CURR_DIR.parent / "docs" / "integrations"
        if integrations_root not in resolved.parents:
            # Not an integration doc page; nothing to check.
            continue
        check_header_order(resolved)


if __name__ == "__main__":
    # Every command-line argument is passed to main() as a doc path.
    cli_paths = sys.argv[1:]
    main(*cli_paths)
infra: check templates based on integration (#24857) instead of hardcoding a linter for each, iterate through the lines of the template notebook and find lines that start with `##` (includes lower headings), and enforce that those headings are found in new docs that are contributed 2024-07-31 20:19:50 +00:00			`import json`
infra: lint new docs to match templates (#22786) 2024-06-11 20:26:35 +00:00			`import re`
			`import sys`
infra: check templates based on integration (#24857) instead of hardcoding a linter for each, iterate through the lines of the template notebook and find lines that start with `##` (includes lower headings), and enforce that those headings are found in new docs that are contributed 2024-07-31 20:19:50 +00:00			`from functools import cache`
infra: lint new docs to match templates (#22786) 2024-06-11 20:26:35 +00:00			`from pathlib import Path`
infra: check templates based on integration (#24857) instead of hardcoding a linter for each, iterate through the lines of the template notebook and find lines that start with `##` (includes lower headings), and enforce that those headings are found in new docs that are contributed 2024-07-31 20:19:50 +00:00			`from typing import Dict, Iterable, List, Union`
infra: lint new docs to match templates (#22786) 2024-06-11 20:26:35 +00:00
			`CURR_DIR = Path(__file__).parent.absolute()`
infra: check templates based on integration (#24857) instead of hardcoding a linter for each, iterate through the lines of the template notebook and find lines that start with `##` (includes lower headings), and enforce that those headings are found in new docs that are contributed 2024-07-31 20:19:50 +00:00			`CLI_TEMPLATE_DIR = (`
			`CURR_DIR.parent.parent / "libs/cli/langchain_cli/integration_template/docs"`
infra: lint new docs to match templates (#22786) 2024-06-11 20:26:35 +00:00			`)`

infra: check templates based on integration (#24857) instead of hardcoding a linter for each, iterate through the lines of the template notebook and find lines that start with `##` (includes lower headings), and enforce that those headings are found in new docs that are contributed 2024-07-31 20:19:50 +00:00			`INFO_BY_DIR: Dict[str, Dict[str, Union[int, str]]] = {`
			`"chat": {`
			`"issue_number": 22296,`
			`},`
			`"document_loaders": {`
			`"issue_number": 22866,`
			`},`
infra,cli: template matching registration (#25110) 2024-08-06 22:29:55 +00:00			`"stores": {"issue_number": 24888},`
infra: check templates based on integration (#24857) instead of hardcoding a linter for each, iterate through the lines of the template notebook and find lines that start with `##` (includes lower headings), and enforce that those headings are found in new docs that are contributed 2024-07-31 20:19:50 +00:00			`"llms": {`
			`"issue_number": 24803,`
			`},`
			`"text_embedding": {"issue_number": 14856},`
infra,cli: template matching registration (#25110) 2024-08-06 22:29:55 +00:00			`"toolkits": {"issue_number": 24820},`
infra: check templates based on integration (#24857) instead of hardcoding a linter for each, iterate through the lines of the template notebook and find lines that start with `##` (includes lower headings), and enforce that those headings are found in new docs that are contributed 2024-07-31 20:19:50 +00:00			`"tools": {"issue_number": "TODO"},`
			`"vectorstores": {"issue_number": 24800},`
infra,cli: template matching registration (#25110) 2024-08-06 22:29:55 +00:00			`"retrievers": {"issue_number": 24908},`
infra: check templates based on integration (#24857) instead of hardcoding a linter for each, iterate through the lines of the template notebook and find lines that start with `##` (includes lower headings), and enforce that those headings are found in new docs that are contributed 2024-07-31 20:19:50 +00:00			`}`
infra: lint new docs to match doc loader template (#22867) 2024-06-14 02:34:50 +00:00
infra: lint new docs to match templates (#22786) 2024-06-11 20:26:35 +00:00
infra: check templates based on integration (#24857) instead of hardcoding a linter for each, iterate through the lines of the template notebook and find lines that start with `##` (includes lower headings), and enforce that those headings are found in new docs that are contributed 2024-07-31 20:19:50 +00:00			`@cache`
			`def _get_headers(doc_dir: str) -> Iterable[str]:`
			`"""Gets all markdown headers ## and below from the integration template.`
infra: lint new docs to match templates (#22786) 2024-06-11 20:26:35 +00:00
infra: check templates based on integration (#24857) instead of hardcoding a linter for each, iterate through the lines of the template notebook and find lines that start with `##` (includes lower headings), and enforce that those headings are found in new docs that are contributed 2024-07-31 20:19:50 +00:00			`Ignores headers that contain "TODO"."""`
			`ipynb_name = f"{doc_dir}.ipynb"`
			`if not (CLI_TEMPLATE_DIR / ipynb_name).exists():`
			`raise FileNotFoundError(f"Could not find {ipynb_name} in {CLI_TEMPLATE_DIR}")`
			`with open(CLI_TEMPLATE_DIR / ipynb_name, "r") as f:`
			`nb = json.load(f)`

			`headers: List[str] = []`
			`for cell in nb["cells"]:`
			`if cell["cell_type"] == "markdown":`
			`for line in cell["source"]:`
			`if not line.startswith("##") or "TODO" in line:`
			`continue`
			`header = line.strip()`
			`headers.append(header)`
			`return headers`


			`def check_header_order(path: Path) -> None:`
infra: check doc script skip index page (#25088) 2024-08-05 23:38:30 +00:00			`if path.name.startswith("index."):`
			`# skip index pages`
			`return`
infra: check templates based on integration (#24857) instead of hardcoding a linter for each, iterate through the lines of the template notebook and find lines that start with `##` (includes lower headings), and enforce that those headings are found in new docs that are contributed 2024-07-31 20:19:50 +00:00			`doc_dir = path.parent.name`
			`if doc_dir not in INFO_BY_DIR:`
			`# Skip if not a directory we care about`
			`return`
			`headers = _get_headers(doc_dir)`
			`issue_number = INFO_BY_DIR[doc_dir].get("issue_number", "nonexistent")`

			`print(f"Checking {doc_dir} page {path}")`
infra: lint new docs to match templates (#22786) 2024-06-11 20:26:35 +00:00
infra: lint new docs to match doc loader template (#22867) 2024-06-14 02:34:50 +00:00			`with open(path, "r") as f:`
			`doc = f.read()`
infra: check templates based on integration (#24857) instead of hardcoding a linter for each, iterate through the lines of the template notebook and find lines that start with `##` (includes lower headings), and enforce that those headings are found in new docs that are contributed 2024-07-31 20:19:50 +00:00			`regex = r".*".join(headers)`
			`if not re.search(regex, doc, re.DOTALL):`
			`issueline = (`
			`(`
			`" Please see https://github.com/langchain-ai/langchain/issues/"`
			`f"{issue_number} for instructions on how to correctly format a "`
			`f"{doc_dir} integration page."`
			`)`
			`if isinstance(issue_number, int)`
			`else ""`
			`)`
infra: lint new docs to match doc loader template (#22867) 2024-06-14 02:34:50 +00:00			`raise ValueError(`
infra: check templates based on integration (#24857) instead of hardcoding a linter for each, iterate through the lines of the template notebook and find lines that start with `##` (includes lower headings), and enforce that those headings are found in new docs that are contributed 2024-07-31 20:19:50 +00:00			`f"Document {path} does not match the expected header order.{issueline}"`
infra: lint new docs to match doc loader template (#22867) 2024-06-14 02:34:50 +00:00			`)`


infra: lint new docs to match templates (#22786) 2024-06-11 20:26:35 +00:00			`def main(*new_doc_paths: Union[str, Path]) -> None:`
			`for path in new_doc_paths:`
			`path = Path(path).resolve().absolute()`
infra: check templates based on integration (#24857) instead of hardcoding a linter for each, iterate through the lines of the template notebook and find lines that start with `##` (includes lower headings), and enforce that those headings are found in new docs that are contributed 2024-07-31 20:19:50 +00:00			`if CURR_DIR.parent / "docs" / "integrations" in path.parents:`
			`check_header_order(path)`
infra: lint new docs to match templates (#22786) 2024-06-11 20:26:35 +00:00			`else:`
			`continue`


			`if __name__ == "__main__":`
			`main(*sys.argv[1:])`