# NOTE(review): this span was a flattened diff of docs/scripts/check_templates.py.
# Below are the definitions the patch adds or shows in full, laid out as normal
# Python. CURR_DIR, CHAT_MODEL_HEADERS / CHAT_MODEL_REGEX and check_chat_model
# are only partially visible in the diff context; they are referenced here but
# deliberately not redefined.

# Section headers a document loader integration page must contain, in this order.
DOCUMENT_LOADER_HEADERS = (
    "## Overview",
    "### Integration details",
    "### Loader features",
    "## Setup",
    "## Instantiation",
    "## Load",
    "## Lazy Load",
    "## API reference",
)
# The headers joined by ".*": searched with re.DOTALL, this matches when every
# header occurs in the listed order with arbitrary text (newlines included)
# between consecutive headers.
DOCUMENT_LOADER_REGEX = r".*".join(DOCUMENT_LOADER_HEADERS)


def check_document_loader(path: Path) -> None:
    """Validate that the page at *path* follows the document loader template.

    The whole file is read and searched for every entry of
    ``DOCUMENT_LOADER_HEADERS`` in order. Headers are matched as plain
    substrings, so they are not required to start a line.

    Args:
        path: Location of the integration page to check.

    Raises:
        ValueError: If the headers are not all present in the expected order.
    """
    # Decode explicitly as UTF-8 rather than with the locale's default codec,
    # so the result does not depend on the machine running the check.
    # NOTE(review): assumes integration pages are stored as UTF-8 - confirm.
    with open(path, "r", encoding="utf-8") as f:
        doc = f.read()
    if not re.search(DOCUMENT_LOADER_REGEX, doc, re.DOTALL):
        raise ValueError(
            f"Document {path} does not match the DocumentLoader Integration page template. "
            f"Please see https://github.com/langchain-ai/langchain/issues/22866 for "
            f"instructions on how to correctly format a DocumentLoader Integration page."
        )


def main(*new_doc_paths: Union[str, Path]) -> None:
    """Run the matching template check on each of *new_doc_paths*.

    A path is checked as a chat model page when it lies under
    ``docs/integrations/chat`` and as a document loader page when it lies
    under ``docs/integrations/document_loaders`` (both relative to
    ``CURR_DIR.parent``). Paths under neither directory are skipped.

    Args:
        *new_doc_paths: Paths of the pages to check.

    Raises:
        ValueError: Propagated from the first checker that rejects a page.
    """
    integrations_dir = CURR_DIR.parent / "docs" / "integrations"
    # (directory, label used in the progress line, checker); the first
    # directory that contains the path wins.
    page_checks = (
        (integrations_dir / "chat", "chat model", check_chat_model),
        (
            integrations_dir / "document_loaders",
            "document loader",
            check_document_loader,
        ),
    )
    for path in new_doc_paths:
        path = Path(path).resolve().absolute()
        for directory, label, check in page_checks:
            if directory in path.parents:
                print(f"Checking {label} page {path}")
                check(path)
                break