infra: check templates based on integration (#24857)

instead of hardcoding a linter for each, iterate through the lines of the template notebook and find lines that start with `##` (includes lower headings), and enforce that those headings are found in new docs that are contributed
2024-11-10 01:10:59 +00:00 · 2024-07-31 13:19:50 -07:00 · 2024-07-31 13:19:50 -07:00 · 17a06cb7a6
commit 17a06cb7a6
parent a7380dd531
1 changed files with 67 additions and 47 deletions
--- a/docs/scripts/check_templates.py
+++ b/docs/scripts/check_templates.py
@ -1,69 +1,89 @@
+import json
 import re
 import sys
+from functools import cache
 from pathlib import Path
-from typing import Union
+from typing import Dict, Iterable, List, Union

 CURR_DIR = Path(__file__).parent.absolute()
-
-CHAT_MODEL_HEADERS = (
-    "## Overview",
-    "### Integration details",
-    "### Model features",
-    "## Setup",
-    "## Instantiation",
-    "## Invocation",
-    "## Chaining",
-    "## API reference",
+CLI_TEMPLATE_DIR = (
+    CURR_DIR.parent.parent / "libs/cli/langchain_cli/integration_template/docs"
 )
-CHAT_MODEL_REGEX = r".*".join(CHAT_MODEL_HEADERS)

-DOCUMENT_LOADER_HEADERS = (
-    "## Overview",
-    "### Integration details",
-    "### Loader features",
-    "## Setup",
-    "## Instantiation",
-    "## Load",
-    "## Lazy Load",
-    "## API reference",
-)
-DOCUMENT_LOADER_REGEX = r".*".join(DOCUMENT_LOADER_HEADERS)
+INFO_BY_DIR: Dict[str, Dict[str, Union[int, str]]] = {
+    "chat": {
+        "issue_number": 22296,
+    },
+    "document_loaders": {
+        "issue_number": 22866,
+    },
+    "stores": {},
+    "llms": {
+        "issue_number": 24803,
+    },
+    "text_embedding": {"issue_number": 14856},
+    "toolkits": {"issue_number": "TODO"},
+    "tools": {"issue_number": "TODO"},
+    "vectorstores": {"issue_number": 24800},
+    "retrievers": {"issue_number": "TODO"},
+}


-def check_chat_model(path: Path) -> None:
+@cache
+def _get_headers(doc_dir: str) -> Iterable[str]:
+    """Gets all markdown headers ## and below from the integration template.
+
+    Ignores headers that contain "TODO"."""
+    ipynb_name = f"{doc_dir}.ipynb"
+    if not (CLI_TEMPLATE_DIR / ipynb_name).exists():
+        raise FileNotFoundError(f"Could not find {ipynb_name} in {CLI_TEMPLATE_DIR}")
+    with open(CLI_TEMPLATE_DIR / ipynb_name, "r") as f:
+        nb = json.load(f)
+
+    headers: List[str] = []
+    for cell in nb["cells"]:
+        if cell["cell_type"] == "markdown":
+            for line in cell["source"]:
+                if not line.startswith("##") or "TODO" in line:
+                    continue
+                header = line.strip()
+                headers.append(header)
+    return headers
+
+
+def check_header_order(path: Path) -> None:
+    doc_dir = path.parent.name
+    if doc_dir not in INFO_BY_DIR:
+        # Skip if not a directory we care about
+        return
+    headers = _get_headers(doc_dir)
+    issue_number = INFO_BY_DIR[doc_dir].get("issue_number", "nonexistent")
+
+    print(f"Checking {doc_dir} page {path}")
+
    with open(path, "r") as f:
        doc = f.read()
-    if not re.search(CHAT_MODEL_REGEX, doc, re.DOTALL):
-        raise ValueError(
-            f"Document {path} does not match the ChatModel Integration page template. "
-            f"Please see https://github.com/langchain-ai/langchain/issues/22296 for "
-            f"instructions on how to correctly format a ChatModel Integration page."
+    regex = r".*".join(headers)
+    if not re.search(regex, doc, re.DOTALL):
+        issueline = (
+            (
+                " Please see https://github.com/langchain-ai/langchain/issues/"
+                f"{issue_number} for instructions on how to correctly format a "
+                f"{doc_dir} integration page."
+            )
+            if isinstance(issue_number, int)
+            else ""
        )
-
-
-def check_document_loader(path: Path) -> None:
-    with open(path, "r") as f:
-        doc = f.read()
-    if not re.search(DOCUMENT_LOADER_REGEX, doc, re.DOTALL):
        raise ValueError(
-            f"Document {path} does not match the DocumentLoader Integration page template. "
-            f"Please see https://github.com/langchain-ai/langchain/issues/22866 for "
-            f"instructions on how to correctly format a DocumentLoader Integration page."
+            f"Document {path} does not match the expected header order.{issueline}"
        )


 def main(*new_doc_paths: Union[str, Path]) -> None:
    for path in new_doc_paths:
        path = Path(path).resolve().absolute()
-        if CURR_DIR.parent / "docs" / "integrations" / "chat" in path.parents:
-            print(f"Checking chat model page {path}")
-            check_chat_model(path)
-        elif (
-            CURR_DIR.parent / "docs" / "integrations" / "document_loaders"
-            in path.parents
-        ):
-            print(f"Checking document loader page {path}")
-            check_document_loader(path)
+        if CURR_DIR.parent / "docs" / "integrations" in path.parents:
+            check_header_order(path)
        else:
            continue