# langchain/docs/scripts/check_templates.py

import re
import sys
from pathlib import Path
from typing import Union

# Absolute path of the directory that contains this script; main() walks up
# from here to locate the docs/integrations tree.
CURR_DIR = Path(__file__).absolute().parent

# Section headers a chat model integration page must contain, in this order.
CHAT_MODEL_HEADERS = (
    "## Overview",
    "### Integration details",
    "### Model features",
    "## Setup",
    "## Instantiation",
    "## Invocation",
    "## Chaining",
    "## API reference",
)
# Headers joined by ".*" gaps so arbitrary content may sit between consecutive
# sections; search with re.DOTALL so the gaps can span newlines. Each header is
# escaped so it is always matched literally, even if a header is later added
# that contains regex metacharacters such as "(", "+" or "?".
CHAT_MODEL_REGEX = r".*".join(map(re.escape, CHAT_MODEL_HEADERS))

# Section headers a document loader integration page must contain, in this
# order.
DOCUMENT_LOADER_HEADERS = (
    "## Overview",
    "### Integration details",
    "### Loader features",
    "## Setup",
    "## Instantiation",
    "## Load",
    "## Lazy Load",
    "## API reference",
)
# Headers joined by ".*" gaps so arbitrary content may sit between consecutive
# sections; search with re.DOTALL so the gaps can span newlines. Each header is
# escaped so it is always matched literally, even if a header is later added
# that contains regex metacharacters such as "(", "+" or "?".
DOCUMENT_LOADER_REGEX = r".*".join(map(re.escape, DOCUMENT_LOADER_HEADERS))


def check_chat_model(path: Path) -> None:
    """Validate that a chat model integration page follows the template.

    The file is read as text and searched with ``CHAT_MODEL_REGEX``, i.e. for
    the required section headers in order with arbitrary content (including
    newlines) between them.

    Args:
        path: Location of the documentation page to check.

    Raises:
        ValueError: If the page does not contain the required headers in the
            required order.
    """
    # Decode explicitly as UTF-8 (pages are assumed to be UTF-8) instead of
    # relying on the platform's locale encoding, so the check behaves the same
    # on every machine.
    with open(path, encoding="utf-8") as f:
        doc = f.read()
    if not re.search(CHAT_MODEL_REGEX, doc, re.DOTALL):
        raise ValueError(
            f"Document {path} does not match the ChatModel Integration page template. "
            f"Please see https://github.com/langchain-ai/langchain/issues/22296 for "
            f"instructions on how to correctly format a ChatModel Integration page."
        )


def check_document_loader(path: Path) -> None:
    """Validate that a document loader integration page follows the template.

    The file is read as text and searched with ``DOCUMENT_LOADER_REGEX``, i.e.
    for the required section headers in order with arbitrary content
    (including newlines) between them.

    Args:
        path: Location of the documentation page to check.

    Raises:
        ValueError: If the page does not contain the required headers in the
            required order.
    """
    # Decode explicitly as UTF-8 (pages are assumed to be UTF-8) instead of
    # relying on the platform's locale encoding, so the check behaves the same
    # on every machine.
    with open(path, encoding="utf-8") as f:
        doc = f.read()
    if not re.search(DOCUMENT_LOADER_REGEX, doc, re.DOTALL):
        raise ValueError(
            f"Document {path} does not match the DocumentLoader Integration page template. "
            f"Please see https://github.com/langchain-ai/langchain/issues/22866 for "
            f"instructions on how to correctly format a DocumentLoader Integration page."
        )


def main(*new_doc_paths: Union[str, Path]) -> None:
    """Run the matching template check on each given documentation page.

    Pages located anywhere under ``docs/integrations/chat`` are checked with
    ``check_chat_model``; pages under ``docs/integrations/document_loaders``
    are checked with ``check_document_loader``. Both directories are looked up
    relative to the parent of ``CURR_DIR``. Any other path is ignored.

    Args:
        *new_doc_paths: Paths of the pages to check.

    Raises:
        ValueError: Propagated from the individual checks when a page does not
            match its template.
    """
    # Incoming paths are resolved (symlinks followed) below, so the reference
    # directories must be resolved as well; otherwise a checkout reached
    # through a symlink would never match and every page would be skipped.
    integrations_dir = CURR_DIR.parent / "docs" / "integrations"
    chat_dir = (integrations_dir / "chat").resolve()
    document_loader_dir = (integrations_dir / "document_loaders").resolve()

    for raw_path in new_doc_paths:
        path = Path(raw_path).resolve().absolute()
        ancestors = path.parents
        if chat_dir in ancestors:
            print(f"Checking chat model page {path}")
            check_chat_model(path)
        elif document_loader_dir in ancestors:
            print(f"Checking document loader page {path}")
            check_document_loader(path)


# Script entry point: every command-line argument is treated as the path of a
# documentation page to check.
if __name__ == "__main__":
    cli_paths = sys.argv[1:]
    main(*cli_paths)