2024-07-31 20:19:50 +00:00
|
|
|
import json
|
2024-06-11 20:26:35 +00:00
|
|
|
import re
|
|
|
|
import sys
|
2024-07-31 20:19:50 +00:00
|
|
|
from functools import cache
|
2024-06-11 20:26:35 +00:00
|
|
|
from pathlib import Path
|
2024-07-31 20:19:50 +00:00
|
|
|
from typing import Dict, Iterable, List, Union
|
2024-06-11 20:26:35 +00:00
|
|
|
|
|
|
|
# Absolute path of the directory that contains this script; the other paths
# used in this file are derived from it.
CURR_DIR = Path(__file__).parent.absolute()
# Directory holding the integration template notebooks, located two levels
# above CURR_DIR. _get_headers() reads "<doc_dir>.ipynb" files from here.
CLI_TEMPLATE_DIR = (
    CURR_DIR.parent.parent / "libs/cli/langchain_cli/integration_template/docs"
)
|
|
|
|
|
2024-07-31 20:19:50 +00:00
|
|
|
# Per-directory metadata, keyed by the name of a doc page's parent directory.
# Only directories listed here are checked by check_header_order().
# "issue_number" is interpolated into a GitHub issue URL in the error message
# when it is an int; any other value (such as the "TODO" placeholder below)
# causes the link sentence to be left out of the message.
INFO_BY_DIR: Dict[str, Dict[str, Union[int, str]]] = {
    "chat": {
        "issue_number": 22296,
    },
    "document_loaders": {
        "issue_number": 22866,
    },
    "stores": {"issue_number": 24888},
    "llms": {
        "issue_number": 24803,
    },
    "text_embedding": {"issue_number": 14856},
    "toolkits": {"issue_number": 24820},
    "tools": {"issue_number": "TODO"},
    "vectorstores": {"issue_number": 24800},
    "retrievers": {"issue_number": 24908},
}
|
2024-06-14 02:34:50 +00:00
|
|
|
|
2024-06-11 20:26:35 +00:00
|
|
|
|
2024-07-31 20:19:50 +00:00
|
|
|
@cache
def _get_headers(doc_dir: str) -> Iterable[str]:
    """Gets all markdown headers ## and below from the integration template.

    Reads ``<doc_dir>.ipynb`` from ``CLI_TEMPLATE_DIR`` and scans the source
    of every markdown cell line by line. A line is kept when it starts with
    ``##``. Ignores headers that contain "TODO".

    Args:
        doc_dir: Name of the integration directory; it is also the stem of
            the template notebook file name.

    Returns:
        The matching header lines, stripped of surrounding whitespace, in the
        order they appear in the notebook. The result is memoized per
        ``doc_dir`` by ``functools.cache``, so callers should not mutate it.

    Raises:
        FileNotFoundError: If the template notebook does not exist.
    """
    ipynb_name = f"{doc_dir}.ipynb"
    template_path = CLI_TEMPLATE_DIR / ipynb_name
    if not template_path.exists():
        raise FileNotFoundError(f"Could not find {ipynb_name} in {CLI_TEMPLATE_DIR}")

    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default locale encoding.
    with open(template_path, "r", encoding="utf-8") as f:
        nb = json.load(f)

    headers: List[str] = []
    for cell in nb["cells"]:
        if cell["cell_type"] != "markdown":
            continue
        source = cell["source"]
        if isinstance(source, str):
            # The notebook format allows a cell's source to be one multi-line
            # string instead of a list of lines; iterating the raw string
            # would yield single characters.
            source = source.splitlines()
        for line in source:
            if line.startswith("##") and "TODO" not in line:
                headers.append(line.strip())
    return headers
|
|
|
|
|
|
|
|
|
|
|
|
def check_header_order(path: Path) -> None:
    """Check that a doc page contains its template's headers in order.

    The page's parent directory name selects the template (via
    ``_get_headers``) and the metadata entry in ``INFO_BY_DIR``. Each template
    header must occur in the page as literal text, in template order; any
    amount of other content may appear between consecutive headers.

    Pages whose file name starts with ``index.`` and pages whose parent
    directory is not a key of ``INFO_BY_DIR`` are skipped without being read.

    Args:
        path: Path of the doc page to check.

    Raises:
        ValueError: If the headers are missing or out of order. The message
            includes a link to the formatting issue when the directory's
            ``issue_number`` is an int.
        FileNotFoundError: Propagated from ``_get_headers`` when the template
            notebook for the directory is missing.
    """
    if path.name.startswith("index."):
        # skip index pages
        return

    doc_dir = path.parent.name
    if doc_dir not in INFO_BY_DIR:
        # Skip if not a directory we care about
        return

    headers = _get_headers(doc_dir)
    issue_number = INFO_BY_DIR[doc_dir].get("issue_number", "nonexistent")

    print(f"Checking {doc_dir} page {path}")

    # Read as UTF-8 explicitly so the check does not depend on the platform's
    # default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        doc = f.read()

    # Headers are literal text, so escape them: otherwise characters such as
    # "(", "?" or "+" in a header would be interpreted as regex syntax and a
    # correctly formatted page could be rejected.
    regex = r".*".join(re.escape(header) for header in headers)
    if re.search(regex, doc, re.DOTALL):
        return

    # Only point at the GitHub issue when a real issue number is configured.
    issueline = (
        (
            " Please see https://github.com/langchain-ai/langchain/issues/"
            f"{issue_number} for instructions on how to correctly format a "
            f"{doc_dir} integration page."
        )
        if isinstance(issue_number, int)
        else ""
    )
    raise ValueError(
        f"Document {path} does not match the expected header order.{issueline}"
    )
|
|
|
|
|
|
|
|
|
2024-06-11 20:26:35 +00:00
|
|
|
def main(*new_doc_paths: Union[str, Path]) -> None:
    """Run the header-order check on every given path under docs/integrations.

    Paths that are not located below ``CURR_DIR.parent / "docs" /
    "integrations"`` are ignored.

    Args:
        *new_doc_paths: Paths of doc files to check; relative paths are
            resolved against the current working directory.

    Raises:
        ValueError: Propagated from ``check_header_order`` for the first page
            that fails the check.
    """
    # Resolve the root the same way each candidate path is resolved below.
    # Comparing a symlink-resolved path against an unresolved root would
    # silently skip every page when the checkout is reached through a symlink.
    integrations_dir = (CURR_DIR.parent / "docs" / "integrations").resolve()
    for raw_path in new_doc_paths:
        doc_path = Path(raw_path).resolve().absolute()
        if integrations_dir in doc_path.parents:
            check_header_order(doc_path)
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: every command-line argument is passed to main() as a
# doc path to check.
if __name__ == "__main__":
    main(*sys.argv[1:])
|