langchain/docs/scripts/generate_api_reference_links.py

import argparse
import importlib
import inspect
import json
import logging
import os
import re
from pathlib import Path

# Configure root logging at import time so the warnings emitted while
# resolving imports are visible when the script runs.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Base URL for all class documentation; per-class URLs are built by
# appending "<package section>/<module path>.<name>.html" to it.
_BASE_URL = "https://api.python.langchain.com/en/latest/"

# Regular expression to match Python code blocks.
# Group 1: opening fence (three backticks, an optional single whitespace
#          character, "python" and a newline).
# Group 2: the code between the fences (non-greedy, may span lines).
# Group 3: the closing fence.
code_block_re = re.compile(r"^(```\s?python\n)(.*?)(```)", re.DOTALL | re.MULTILINE)
# Regular expression to match langchain import lines.
# Group 1: the module path, which must start with "langchain", optionally
#          followed by a "_suffix" and dotted sub-modules.
# Group 2: everything that is imported (bare names and/or a parenthesised block).
_IMPORT_RE = re.compile(
    r"from\s+(langchain(?:_\w+)?(?:\.\w+)*?)\s+import\s+"
    r"((?:\w+(?:,\s*)?)*"  # Match zero or more words separated by a comma+optional ws
    r"(?:\s*\(.*?\))?)",  # Match optional parentheses block
    re.DOTALL,  # Let "." match newlines so multi-line parenthesised imports match
)

# Directory containing this script.
_CURRENT_PATH = Path(__file__).parent.absolute()
# Directory where generated markdown files are stored; used as the default
# value of the --docs_dir command-line option.
_DOCS_DIR = _CURRENT_PATH / "docs"


def find_files(path):
    """Yield the Markdown documents (``.md`` / ``.mdx``) located under ``path``.

    If ``path`` points at a regular file, that file is yielded as-is,
    whatever its extension. Otherwise the directory tree rooted at
    ``path`` is walked recursively and every file whose name ends in
    ``.mdx`` or ``.md`` is yielded as a joined path.
    """
    # A single file short-circuits the directory walk.
    if os.path.isfile(path):
        yield path
        return

    wanted_suffixes = (".mdx", ".md")
    for dirpath, _subdirs, filenames in os.walk(path):
        yield from (
            os.path.join(dirpath, name)
            for name in filenames
            if name.endswith(wanted_suffixes)
        )


def get_full_module_name(module_path, class_name):
    """Return the dotted name of the module that defines ``class_name``.

    ``module_path`` is imported, ``class_name`` is looked up on it, and
    ``inspect.getmodule`` is used to find the module the object actually
    belongs to. That module can differ from ``module_path`` when the name
    is merely re-exported there.

    Raises:
        ImportError: if ``module_path`` cannot be imported.
        AttributeError: if ``class_name`` is not an attribute of the
            imported module, or if no defining module can be determined.
    """
    imported_module = importlib.import_module(module_path)
    target = getattr(imported_module, class_name)
    defining_module = inspect.getmodule(target)
    return defining_module.__name__


def get_args():
    """Parse and return the command-line arguments of the script.

    Recognised options:
        --docs_dir: directory (or single file) to scan; defaults to
            ``_DOCS_DIR``.
        --json_path: where to write the JSON summary; defaults to ``None``.
    """
    parser = argparse.ArgumentParser()
    option_specs = (
        (
            "--docs_dir",
            _DOCS_DIR,
            "Directory where generated markdown files are stored",
        ),
        (
            "--json_path",
            None,
            "Path to store the generated JSON file",
        ),
    )
    for flag, default_value, help_text in option_specs:
        parser.add_argument(flag, type=str, default=default_value, help=help_text)
    return parser.parse_args()


def main():
    """Annotate every doc file and optionally dump a name -> docs index.

    Each file returned by ``find_files(args.docs_dir)`` is rewritten in
    place by ``replace_imports``. For every import that was linked, the
    page that uses it is recorded as
    ``{imported name: {page title: page URL}}``. When ``--json_path`` is
    given, that mapping is written there as JSON, creating parent
    directories as needed.
    """
    args = get_args()
    global_imports = {}

    for file in find_files(args.docs_dir):
        file_imports = replace_imports(file)
        if not file_imports:
            continue

        # Use relative file path as key. Only the trailing extension is
        # turned into "/": a blanket str.replace would also rewrite
        # ".md"/".mdx" occurring earlier in the path (e.g. inside a
        # directory name) and corrupt the URL.
        relative_path = os.path.relpath(file, args.docs_dir)
        for suffix in (".mdx", ".md"):
            if relative_path.endswith(suffix):
                relative_path = relative_path[: -len(suffix)] + "/"
                break

        doc_url = f"https://python.langchain.com/docs/{relative_path}"
        for import_info in file_imports:
            doc_title = import_info["title"]
            class_name = import_info["imported"]
            global_imports.setdefault(class_name, {})[doc_title] = doc_url

    # Write the global imports information to a JSON file
    if args.json_path:
        json_path = Path(args.json_path)
        json_path.parent.mkdir(parents=True, exist_ok=True)
        with json_path.open("w") as f:
            json.dump(global_imports, f)


def _get_doc_title(data: str, file_name: str) -> str:
    """Return the title of a document, falling back to ``file_name``.

    The first Markdown level-one heading (``# Title``) wins. If there is
    none, the first rst-style title (a line underlined with ``=``) is
    used. If neither is present, ``file_name`` is returned unchanged.
    """
    title_patterns = (
        r"^#\s+(.*)",  # Markdown "# Title" heading
        r"^(.*)\n=+\n",  # rst-style title underlined with "="
    )
    for pattern in title_patterns:
        found = re.search(pattern, data, re.MULTILINE)
        if found is not None:
            return found.group(1)
    return file_name


def replace_imports(file):
    """Inject API-reference metadata into the Python code blocks of ``file``.

    For every fenced Python code block, each ``from langchain... import ...``
    statement is resolved to the module that really defines the imported
    name, and a ``<!--IMPORTS:[...]-->`` comment holding that information is
    inserted as the first line of the block. Any such comment from an
    earlier run is removed first. The file is rewritten in place.

    Args:
        file: path of the Markdown/MDX file to process.

    Returns:
        A list with one dict per resolved import, with the keys
        ``"imported"``, ``"source"``, ``"docs"`` and ``"title"``.
    """
    all_imports = []
    # Read and write explicitly as UTF-8 so the result does not depend on
    # the platform's default locale encoding.
    with open(file, "r", encoding="utf-8") as f:
        data = f.read()

    file_name = os.path.basename(file)
    doc_title = _get_doc_title(data, file_name)

    # TODO: Use our own custom <code> component rather than this
    # injection method
    # Compiled once per file instead of once per code block.
    existing_comment_re = re.compile(r"^<!--IMPORTS:.*?-->\n", re.MULTILINE)

    def replacer(match):
        # Extract the code block content, dropping any import comment left
        # by a previous run.
        code = existing_comment_re.sub("", match.group(2))

        # Process imports in the code block
        imports = []
        for import_match in _IMPORT_RE.finditer(code):
            module = import_match.group(1)
            if "pydantic_v1" in module:
                continue
            # Strip the parentheses and newlines of a parenthesised import.
            # Removing every "(" / ")" (not only those next to a newline)
            # also handles the single-line form
            # ``from x import (A, B)``.
            imports_str = (
                import_match.group(2)
                .replace("(", "")
                .replace(")", "")
                .replace("\n", "")
            )
            # Split by comma and drop empty entries (e.g. trailing comma)
            imported_classes = [
                imp.strip() for imp in re.split(r",\s*", imports_str) if imp.strip()
            ]
            for class_name in imported_classes:
                try:
                    module_path = get_full_module_name(module, class_name)
                except AttributeError as e:
                    logger.warning(f"Could not find module for {class_name}, {e}")
                    continue
                except ImportError as e:
                    logger.warning(f"Failed to load for class {class_name}, {e}")
                    continue
                module_parts = module_path.split(".")
                # The URL needs the second path component; skip names
                # defined in a top-level module.
                if len(module_parts) < 2:
                    continue
                url = (
                    _BASE_URL
                    + module_parts[1]
                    + "/"
                    + module_path
                    + "."
                    + class_name
                    + ".html"
                )

                # Add the import information to our list
                imports.append(
                    {
                        "imported": class_name,
                        "source": module,
                        "docs": url,
                        "title": doc_title,
                    }
                )

        if not imports:
            # If there are no imports, return the original match
            return match.group(0)

        all_imports.extend(imports)
        # Create a unique comment containing the import information
        import_comment = f"<!--IMPORTS:{json.dumps(imports)}-->"
        # Inject the import comment at the start of the code block
        return match.group(1) + import_comment + "\n" + code + match.group(3)

    # Use re.sub to replace each Python code block
    data = code_block_re.sub(replacer, data)

    with open(file, "w", encoding="utf-8") as f:
        f.write(data)
    return all_imports


# Run the link generation only when executed as a script, not on import.
if __name__ == "__main__":
    main()
notebook fmt (#12498) 2023-10-29 22:50:09 +00:00			`import argparse`
Add api cross ref linking (#8275) Example of how it would show up in our python docs: ![image](https://github.com/langchain-ai/langchain/assets/13333726/0f0a88cc-ba4a-4778-bc47-118c66807f15) Examples added to the reference docs: https://api.python.langchain.com/en/wfh-api_crosslink/vectorstores/langchain.vectorstores.chroma.Chroma.html#langchain.vectorstores.chroma.Chroma ![image](https://github.com/langchain-ai/langchain/assets/13333726/dcd150de-cb56-4d42-b49a-a76a002a5a52) 2023-07-26 19:38:58 +00:00			`import importlib`
			`import inspect`
			`import json`
			`import logging`
			`import os`
			`import re`
			`from pathlib import Path`

			`logging.basicConfig(level=logging.INFO)`
			`logger = logging.getLogger(__name__)`
			`# Base URL for all class documentation`
fix links generation (#8471) 2023-07-30 01:31:33 +00:00			`_BASE_URL = "https://api.python.langchain.com/en/latest/"`
Add api cross ref linking (#8275) Example of how it would show up in our python docs: ![image](https://github.com/langchain-ai/langchain/assets/13333726/0f0a88cc-ba4a-4778-bc47-118c66807f15) Examples added to the reference docs: https://api.python.langchain.com/en/wfh-api_crosslink/vectorstores/langchain.vectorstores.chroma.Chroma.html#langchain.vectorstores.chroma.Chroma ![image](https://github.com/langchain-ai/langchain/assets/13333726/dcd150de-cb56-4d42-b49a-a76a002a5a52) 2023-07-26 19:38:58 +00:00
			`# Regular expression to match Python code blocks`
docs: fix api ref link autogeneration (#20766) 2024-04-23 00:36:41 +00:00			code_block_re = re.compile(r"^(```\s?python\n)(.*?)(```)", re.DOTALL \| re.MULTILINE)
Add api cross ref linking (#8275) Example of how it would show up in our python docs: ![image](https://github.com/langchain-ai/langchain/assets/13333726/0f0a88cc-ba4a-4778-bc47-118c66807f15) Examples added to the reference docs: https://api.python.langchain.com/en/wfh-api_crosslink/vectorstores/langchain.vectorstores.chroma.Chroma.html#langchain.vectorstores.chroma.Chroma ![image](https://github.com/langchain-ai/langchain/assets/13333726/dcd150de-cb56-4d42-b49a-a76a002a5a52) 2023-07-26 19:38:58 +00:00			`# Regular expression to match langchain import lines`
Wfh/ref links (#8454) 2023-07-29 15:44:32 +00:00			`_IMPORT_RE = re.compile(`
docs: fix api ref link autogeneration (#20766) 2024-04-23 00:36:41 +00:00			`r"from\s+(langchain(?:_\w+)?(?:\.\w+)*?)\s+import\s+"`
Wfh/ref links (#8454) 2023-07-29 15:44:32 +00:00			`r"((?:\w+(?:,\s)?)" # Match zero or more words separated by a comma+optional ws`
			`r"(?:\s\(.?\))?)", # Match optional parentheses block`
			`re.DOTALL, # Match newlines as well`
			`)`
Add api cross ref linking (#8275) Example of how it would show up in our python docs: ![image](https://github.com/langchain-ai/langchain/assets/13333726/0f0a88cc-ba4a-4778-bc47-118c66807f15) Examples added to the reference docs: https://api.python.langchain.com/en/wfh-api_crosslink/vectorstores/langchain.vectorstores.chroma.Chroma.html#langchain.vectorstores.chroma.Chroma ![image](https://github.com/langchain-ai/langchain/assets/13333726/dcd150de-cb56-4d42-b49a-a76a002a5a52) 2023-07-26 19:38:58 +00:00
			`_CURRENT_PATH = Path(__file__).parent.absolute()`
			`# Directory where generated markdown files are stored`
Restructure docs (#11620) 2023-10-10 19:55:19 +00:00			`_DOCS_DIR = _CURRENT_PATH / "docs"`
Add api cross ref linking (#8275) Example of how it would show up in our python docs: ![image](https://github.com/langchain-ai/langchain/assets/13333726/0f0a88cc-ba4a-4778-bc47-118c66807f15) Examples added to the reference docs: https://api.python.langchain.com/en/wfh-api_crosslink/vectorstores/langchain.vectorstores.chroma.Chroma.html#langchain.vectorstores.chroma.Chroma ![image](https://github.com/langchain-ai/langchain/assets/13333726/dcd150de-cb56-4d42-b49a-a76a002a5a52) 2023-07-26 19:38:58 +00:00

			`def find_files(path):`
			`"""Find all MDX files in the given path"""`
Wfh/ref links (#8454) 2023-07-29 15:44:32 +00:00			`# Check if is file first`
			`if os.path.isfile(path):`
			`yield path`
			`return`
Add api cross ref linking (#8275) Example of how it would show up in our python docs: ![image](https://github.com/langchain-ai/langchain/assets/13333726/0f0a88cc-ba4a-4778-bc47-118c66807f15) Examples added to the reference docs: https://api.python.langchain.com/en/wfh-api_crosslink/vectorstores/langchain.vectorstores.chroma.Chroma.html#langchain.vectorstores.chroma.Chroma ![image](https://github.com/langchain-ai/langchain/assets/13333726/dcd150de-cb56-4d42-b49a-a76a002a5a52) 2023-07-26 19:38:58 +00:00			`for root, _, files in os.walk(path):`
			`for file in files:`
			`if file.endswith(".mdx") or file.endswith(".md"):`
			`yield os.path.join(root, file)`


			`def get_full_module_name(module_path, class_name):`
			`"""Get full module name using inspect"""`
			`module = importlib.import_module(module_path)`
			`class_ = getattr(module, class_name)`
			`return inspect.getmodule(class_).__name__`


Wfh/ref links (#8454) 2023-07-29 15:44:32 +00:00			`def get_args():`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument(`
			`"--docs_dir",`
			`type=str,`
			`default=_DOCS_DIR,`
			`help="Directory where generated markdown files are stored",`
			`)`
infra: cleanup docs build (#21134) Refactors the docs build in order to: - run the same `make build` command in both vercel and local build - incrementally build artifacts in 2 distinct steps, instead of building all docs in-place (in vercel) or in a _dist dir (locally) Highlights: - introduces `make build` in order to build the docs - collects and generates all files for the build in `docs/build/intermediate` - renders those jupyter notebook + markdown files into `docs/build/outputs` And now the outputs to host are in `docs/build/outputs`, which will need a vercel settings change. Todo: - [ ] figure out how to point the right directory (right now deleting and moving docs dir in vercel_build.sh isn't great) 2024-05-02 00:34:05 +00:00			`parser.add_argument(`
			`"--json_path",`
			`type=str,`
			`default=None,`
			`help="Path to store the generated JSON file",`
			`)`
Wfh/ref links (#8454) 2023-07-29 15:44:32 +00:00			`return parser.parse_args()`


Add api cross ref linking (#8275) Example of how it would show up in our python docs: ![image](https://github.com/langchain-ai/langchain/assets/13333726/0f0a88cc-ba4a-4778-bc47-118c66807f15) Examples added to the reference docs: https://api.python.langchain.com/en/wfh-api_crosslink/vectorstores/langchain.vectorstores.chroma.Chroma.html#langchain.vectorstores.chroma.Chroma ![image](https://github.com/langchain-ai/langchain/assets/13333726/dcd150de-cb56-4d42-b49a-a76a002a5a52) 2023-07-26 19:38:58 +00:00			`def main():`
			`"""Main function"""`
Wfh/ref links (#8454) 2023-07-29 15:44:32 +00:00			`args = get_args()`
Add api cross ref linking (#8275) Example of how it would show up in our python docs: ![image](https://github.com/langchain-ai/langchain/assets/13333726/0f0a88cc-ba4a-4778-bc47-118c66807f15) Examples added to the reference docs: https://api.python.langchain.com/en/wfh-api_crosslink/vectorstores/langchain.vectorstores.chroma.Chroma.html#langchain.vectorstores.chroma.Chroma ![image](https://github.com/langchain-ai/langchain/assets/13333726/dcd150de-cb56-4d42-b49a-a76a002a5a52) 2023-07-26 19:38:58 +00:00			`global_imports = {}`

Wfh/ref links (#8454) 2023-07-29 15:44:32 +00:00			`for file in find_files(args.docs_dir):`
Add api cross ref linking (#8275) Example of how it would show up in our python docs: ![image](https://github.com/langchain-ai/langchain/assets/13333726/0f0a88cc-ba4a-4778-bc47-118c66807f15) Examples added to the reference docs: https://api.python.langchain.com/en/wfh-api_crosslink/vectorstores/langchain.vectorstores.chroma.Chroma.html#langchain.vectorstores.chroma.Chroma ![image](https://github.com/langchain-ai/langchain/assets/13333726/dcd150de-cb56-4d42-b49a-a76a002a5a52) 2023-07-26 19:38:58 +00:00			`file_imports = replace_imports(file)`

			`if file_imports:`
			`# Use relative file path as key`
Wfh/ref links (#8454) 2023-07-29 15:44:32 +00:00			`relative_path = (`
docs: automatically add api ref links (#20755) ![Screenshot 2024-04-22 at 1 51 13 PM](https://github.com/langchain-ai/langchain/assets/22008038/b8b09fec-3800-4b97-bd26-5571b8308f4a) 2024-04-22 21:05:29 +00:00			`os.path.relpath(file, args.docs_dir)`
			`.replace(".mdx", "/")`
			`.replace(".md", "/")`
Wfh/ref links (#8454) 2023-07-29 15:44:32 +00:00			`)`

			`doc_url = f"https://python.langchain.com/docs/{relative_path}"`
Add api cross ref linking (#8275) Example of how it would show up in our python docs: ![image](https://github.com/langchain-ai/langchain/assets/13333726/0f0a88cc-ba4a-4778-bc47-118c66807f15) Examples added to the reference docs: https://api.python.langchain.com/en/wfh-api_crosslink/vectorstores/langchain.vectorstores.chroma.Chroma.html#langchain.vectorstores.chroma.Chroma ![image](https://github.com/langchain-ai/langchain/assets/13333726/dcd150de-cb56-4d42-b49a-a76a002a5a52) 2023-07-26 19:38:58 +00:00			`for import_info in file_imports:`
			`doc_title = import_info["title"]`
			`class_name = import_info["imported"]`
			`if class_name not in global_imports:`
			`global_imports[class_name] = {}`
			`global_imports[class_name][doc_title] = doc_url`

			`# Write the global imports information to a JSON file`
infra: cleanup docs build (#21134) Refactors the docs build in order to: - run the same `make build` command in both vercel and local build - incrementally build artifacts in 2 distinct steps, instead of building all docs in-place (in vercel) or in a _dist dir (locally) Highlights: - introduces `make build` in order to build the docs - collects and generates all files for the build in `docs/build/intermediate` - renders those jupyter notebook + markdown files into `docs/build/outputs` And now the outputs to host are in `docs/build/outputs`, which will need a vercel settings change. Todo: - [ ] figure out how to point the right directory (right now deleting and moving docs dir in vercel_build.sh isn't great) 2024-05-02 00:34:05 +00:00			`if args.json_path:`
			`json_path = Path(args.json_path)`
			`json_path.parent.mkdir(parents=True, exist_ok=True)`
			`with json_path.open("w") as f:`
			`json.dump(global_imports, f)`
Add api cross ref linking (#8275) Example of how it would show up in our python docs: ![image](https://github.com/langchain-ai/langchain/assets/13333726/0f0a88cc-ba4a-4778-bc47-118c66807f15) Examples added to the reference docs: https://api.python.langchain.com/en/wfh-api_crosslink/vectorstores/langchain.vectorstores.chroma.Chroma.html#langchain.vectorstores.chroma.Chroma ![image](https://github.com/langchain-ai/langchain/assets/13333726/dcd150de-cb56-4d42-b49a-a76a002a5a52) 2023-07-26 19:38:58 +00:00

			`def _get_doc_title(data: str, file_name: str) -> str:`
			`try:`
			`return re.findall(r"^#\s+(.*)", data, re.MULTILINE)[0]`
			`except IndexError:`
			`pass`
			`# Parse the rst-style titles`
			`try:`
			`return re.findall(r"^(.*)\n=+\n", data, re.MULTILINE)[0]`
			`except IndexError:`
			`return file_name`


			`def replace_imports(file):`
Wfh/ref links (#8454) 2023-07-29 15:44:32 +00:00			`"""Replace imports in each Python code block with links to their`
			`documentation and append the import info in a comment"""`
Add api cross ref linking (#8275) Example of how it would show up in our python docs: ![image](https://github.com/langchain-ai/langchain/assets/13333726/0f0a88cc-ba4a-4778-bc47-118c66807f15) Examples added to the reference docs: https://api.python.langchain.com/en/wfh-api_crosslink/vectorstores/langchain.vectorstores.chroma.Chroma.html#langchain.vectorstores.chroma.Chroma ![image](https://github.com/langchain-ai/langchain/assets/13333726/dcd150de-cb56-4d42-b49a-a76a002a5a52) 2023-07-26 19:38:58 +00:00			`all_imports = []`
			`with open(file, "r") as f:`
			`data = f.read()`

			`file_name = os.path.basename(file)`
			`_DOC_TITLE = _get_doc_title(data, file_name)`

			`def replacer(match):`
			`# Extract the code block content`
			`code = match.group(2)`
			`# Replace if any import comment exists`
			`# TODO: Use our own custom <code> component rather than this`
			`# injection method`
			`existing_comment_re = re.compile(r"^<!--IMPORTS:.*?-->\n", re.MULTILINE)`
			`code = existing_comment_re.sub("", code)`

			`# Process imports in the code block`
			`imports = []`
			`for import_match in _IMPORT_RE.finditer(code):`
Wfh/ref links (#8454) 2023-07-29 15:44:32 +00:00			`module = import_match.group(1)`
docs: automatically add api ref links (#20755) ![Screenshot 2024-04-22 at 1 51 13 PM](https://github.com/langchain-ai/langchain/assets/22008038/b8b09fec-3800-4b97-bd26-5571b8308f4a) 2024-04-22 21:05:29 +00:00			`if "pydantic_v1" in module:`
			`continue`
Wfh/ref links (#8454) 2023-07-29 15:44:32 +00:00			`imports_str = (`
docs: automatically add api ref links (#20755) ![Screenshot 2024-04-22 at 1 51 13 PM](https://github.com/langchain-ai/langchain/assets/22008038/b8b09fec-3800-4b97-bd26-5571b8308f4a) 2024-04-22 21:05:29 +00:00			`import_match.group(2).replace("(\n", "").replace("\n)", "")`
Wfh/ref links (#8454) 2023-07-29 15:44:32 +00:00			`) # Handle newlines within parentheses`
			`# remove any newline and spaces, then split by comma`
			`imported_classes = [`
			`imp.strip()`
			`for imp in re.split(r",\s*", imports_str.replace("\n", ""))`
			`if imp.strip()`
			`]`
			`for class_name in imported_classes:`
			`try:`
			`module_path = get_full_module_name(module, class_name)`
			`except AttributeError as e:`
			`logger.warning(f"Could not find module for {class_name}, {e}")`
			`continue`
			`except ImportError as e:`
			`logger.warning(f"Failed to load for class {class_name}, {e}")`
			`continue`
docs: automatically add api ref links (#20755) ![Screenshot 2024-04-22 at 1 51 13 PM](https://github.com/langchain-ai/langchain/assets/22008038/b8b09fec-3800-4b97-bd26-5571b8308f4a) 2024-04-22 21:05:29 +00:00			`if len(module_path.split(".")) < 2:`
			`continue`
Wfh/ref links (#8454) 2023-07-29 15:44:32 +00:00			`url = (`
			`_BASE_URL`
			`+ module_path.split(".")[1]`
			`+ "/"`
			`+ module_path`
			`+ "."`
			`+ class_name`
			`+ ".html"`
			`)`

			`# Add the import information to our list`
			`imports.append(`
			`{`
			`"imported": class_name,`
			`"source": module,`
			`"docs": url,`
			`"title": _DOC_TITLE,`
			`}`
			`)`
Add api cross ref linking (#8275) Example of how it would show up in our python docs: ![image](https://github.com/langchain-ai/langchain/assets/13333726/0f0a88cc-ba4a-4778-bc47-118c66807f15) Examples added to the reference docs: https://api.python.langchain.com/en/wfh-api_crosslink/vectorstores/langchain.vectorstores.chroma.Chroma.html#langchain.vectorstores.chroma.Chroma ![image](https://github.com/langchain-ai/langchain/assets/13333726/dcd150de-cb56-4d42-b49a-a76a002a5a52) 2023-07-26 19:38:58 +00:00
			`if imports:`
			`all_imports.extend(imports)`
			`# Create a unique comment containing the import information`
			`import_comment = f"<!--IMPORTS:{json.dumps(imports)}-->"`
			`# Inject the import comment at the start of the code block`
			`return match.group(1) + import_comment + "\n" + code + match.group(3)`
			`else:`
			`# If there are no imports, return the original match`
			`return match.group(0)`

			`# Use re.sub to replace each Python code block`
			`data = code_block_re.sub(replacer, data)`

infra: use nbconvert for docs build (#21135) todo - [x] remove quarto build semantics - [x] remove quarto download/install - [x] make `uv` not verbose 2024-05-07 19:30:17 +00:00			`# if all_imports:`
			`# print(f"Adding {len(all_imports)} links for imports in {file}") # noqa: T201`
Add api cross ref linking (#8275) Example of how it would show up in our python docs: ![image](https://github.com/langchain-ai/langchain/assets/13333726/0f0a88cc-ba4a-4778-bc47-118c66807f15) Examples added to the reference docs: https://api.python.langchain.com/en/wfh-api_crosslink/vectorstores/langchain.vectorstores.chroma.Chroma.html#langchain.vectorstores.chroma.Chroma ![image](https://github.com/langchain-ai/langchain/assets/13333726/dcd150de-cb56-4d42-b49a-a76a002a5a52) 2023-07-26 19:38:58 +00:00			`with open(file, "w") as f:`
			`f.write(data)`
			`return all_imports`


			`if __name__ == "__main__":`
			`main()`