From f9a6d5c8455e524c01df17d02dccfba2be0b997a Mon Sep 17 00:00:00 2001
From: Isaac Francisco <78627776+isahers1@users.noreply.github.com>
Date: Thu, 13 Jun 2024 19:34:50 -0700
Subject: [PATCH] infra: lint new docs to match doc loader template (#22867)

---
Notes (ignored by `git am`; everything between `---` and `diff --git` is commentary):
  * Adds DOCUMENT_LOADER_HEADERS, the ordered section headings a document
    loader page must contain, and DOCUMENT_LOADER_REGEX, which joins them
    with `.*` so the headings must appear in that order.
  * Adds check_document_loader(path), which reads the file and raises
    ValueError when re.search(DOCUMENT_LOADER_REGEX, doc, re.DOTALL) finds
    no match.
  * main() now routes paths located under docs/integrations/document_loaders
    to check_document_loader(); other non-chat paths are still skipped.
  * NOTE(review): the file is opened with open(path, "r") and no explicit
    encoding, so decoding follows the platform default.
  * NOTE(review): the final context line of the second hunk is reconstructed
    as a blank line to satisfy the `-29,12` count in the hunk header; confirm
    against the target file.

 docs/scripts/check_templates.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/docs/scripts/check_templates.py b/docs/scripts/check_templates.py
index ca551dc95c..4344cba283 100644
--- a/docs/scripts/check_templates.py
+++ b/docs/scripts/check_templates.py
@@ -17,6 +17,18 @@ CHAT_MODEL_HEADERS = (
 )
 CHAT_MODEL_REGEX = r".*".join(CHAT_MODEL_HEADERS)
 
+DOCUMENT_LOADER_HEADERS = (
+    "## Overview",
+    "### Integration details",
+    "### Loader features",
+    "## Setup",
+    "## Instantiation",
+    "## Load",
+    "## Lazy Load",
+    "## API reference",
+)
+DOCUMENT_LOADER_REGEX = r".*".join(DOCUMENT_LOADER_HEADERS)
+
 
 def check_chat_model(path: Path) -> None:
     with open(path, "r") as f:
@@ -29,12 +41,29 @@ def check_chat_model(path: Path) -> None:
         )
 
 
+def check_document_loader(path: Path) -> None:
+    with open(path, "r") as f:
+        doc = f.read()
+    if not re.search(DOCUMENT_LOADER_REGEX, doc, re.DOTALL):
+        raise ValueError(
+            f"Document {path} does not match the DocumentLoader Integration page template. "
+            f"Please see https://github.com/langchain-ai/langchain/issues/22866 for "
+            f"instructions on how to correctly format a DocumentLoader Integration page."
+        )
+
+
 def main(*new_doc_paths: Union[str, Path]) -> None:
     for path in new_doc_paths:
         path = Path(path).resolve().absolute()
         if CURR_DIR.parent / "docs" / "integrations" / "chat" in path.parents:
             print(f"Checking chat model page {path}")
             check_chat_model(path)
+        elif (
+            CURR_DIR.parent / "docs" / "integrations" / "document_loaders"
+            in path.parents
+        ):
+            print(f"Checking document loader page {path}")
+            check_document_loader(path)
         else:
             continue
 