"""Annotate Python code blocks in Markdown/MDX docs with API reference links.

For every fenced ``python`` code block, the ``from langchain... import ...``
statements are resolved to the module that actually defines each imported
name, an API reference URL is built for it, and the collected information is
injected as a comment on the first line of the code block. Optionally a JSON
index mapping each imported name to the doc pages that use it is written.
"""

import argparse
import importlib
import inspect
import json
import logging
import os
import re
from pathlib import Path

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Base URL for all class documentation
_BASE_URL = "https://api.python.langchain.com/en/latest/"

# Base URL of the rendered documentation pages
_DOCS_URL = "https://python.langchain.com/v0.2/docs/"

# Regular expression to match Python code blocks
code_block_re = re.compile(r"^(```\s?python\n)(.*?)(```)", re.DOTALL | re.MULTILINE)

# Regular expression to match langchain import lines
_IMPORT_RE = re.compile(
    r"from\s+(langchain(?:_\w+)?(?:\.\w+)*?)\s+import\s+"
    r"((?:\w+(?:,\s*)?)*"  # Match zero or more words separated by a comma+optional ws
    r"(?:\s*\(.*?\))?)",  # Match optional parentheses block
    re.DOTALL,  # Match newlines as well
)

# Regular expression to match an import comment injected by a previous run.
# NOTE(review): the file as received had an empty comment string and a pattern
# that removed every blank line of the code block; the marker format used here
# is an assumption -- confirm it matches whatever consumes these comments.
_EXISTING_COMMENT_RE = re.compile(r"^<!--IMPORTS:.*?-->\n", re.MULTILINE)

# File extensions treated as documentation pages
_DOC_EXTENSIONS = (".mdx", ".md")

_CURRENT_PATH = Path(__file__).parent.absolute()
# Directory where generated markdown files are stored
_DOCS_DIR = _CURRENT_PATH.parent.parent / "docs"


def find_files(path):
    """Yield the documentation files found at ``path``.

    Args:
        path: A single file, or a directory that is walked recursively.

    Yields:
        ``path`` itself when it is a file (whatever its extension), otherwise
        every ``.md``/``.mdx`` file below it, in sorted traversal order.
        Any path containing ``.ipynb_checkpoints`` is skipped.
    """
    if ".ipynb_checkpoints" in str(path):
        return
    # Check if is file first
    if os.path.isfile(path):
        yield path
        return
    for root, dirs, files in os.walk(path):
        # Sort in place so the traversal order does not depend on the filesystem
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(_DOC_EXTENSIONS):
                continue
            full = os.path.join(root, file)
            if ".ipynb_checkpoints" in str(full):
                continue
            yield full


def get_full_module_name(module_path, class_name):
    """Return the name of the module that defines ``class_name``.

    Args:
        module_path: Dotted path of the module the name is imported from.
        class_name: Attribute looked up on that module.

    Returns:
        ``__name__`` of the module reported by ``inspect.getmodule``.

    Raises:
        ImportError: If ``module_path`` cannot be imported.
        AttributeError: If the module has no such attribute, or if
            ``inspect.getmodule`` cannot determine the defining module.
    """
    module = importlib.import_module(module_path)
    class_ = getattr(module, class_name)
    return inspect.getmodule(class_).__name__


def get_args():
    """Parse the command line options (``--docs_dir`` and ``--json_path``)."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--docs_dir",
        type=str,
        default=_DOCS_DIR,
        help="Directory where generated markdown files are stored",
    )
    parser.add_argument(
        "--json_path",
        type=str,
        default=None,
        help="Path to store the generated JSON file",
    )
    return parser.parse_args()


def _doc_url(file, docs_dir):
    """Build the documentation page URL for ``file`` relative to ``docs_dir``."""
    # URLs always use forward slashes, whatever the OS path separator is
    relative_path = os.path.relpath(file, docs_dir).replace(os.sep, "/")
    # Only a trailing extension is turned into "/"; the same characters
    # appearing elsewhere in the path are left untouched
    for extension in _DOC_EXTENSIONS:
        if relative_path.endswith(extension):
            relative_path = relative_path[: -len(extension)] + "/"
            break
    return _DOCS_URL + relative_path


def main():
    """Annotate every doc file and optionally write the global JSON index.

    The index maps each imported name to ``{doc title: doc URL}`` for every
    page in which the name is imported.
    """
    args = get_args()
    global_imports = {}

    for file in find_files(args.docs_dir):
        file_imports = replace_imports(file)
        print(file)
        if not file_imports:
            continue
        doc_url = _doc_url(file, args.docs_dir)
        for import_info in file_imports:
            titles = global_imports.setdefault(import_info["imported"], {})
            titles[import_info["title"]] = doc_url

    # Write the global imports information to a JSON file
    if args.json_path:
        json_path = Path(args.json_path)
        json_path.parent.mkdir(parents=True, exist_ok=True)
        with json_path.open("w", encoding="utf-8") as f:
            json.dump(global_imports, f)


def _get_doc_title(data: str, file_name: str) -> str:
    """Return the document title, falling back to ``file_name``.

    The first ``#``-prefixed line wins; otherwise the first line underlined
    with ``=`` (rst-style title) is used.
    """
    markdown_title = re.search(r"^#\s*(.*)", data, re.MULTILINE)
    if markdown_title:
        return markdown_title.group(1)
    # Parse the rst-style titles
    rst_title = re.search(r"^(.*)\n=+\n", data, re.MULTILINE)
    if rst_title:
        return rst_title.group(1)
    return file_name


def _split_imported_names(imports_str):
    """Split the imported-names part of an import statement into names."""
    # Drop parentheses and newlines so that single-line and multi-line
    # parenthesised imports are handled the same way
    flattened = re.sub(r"[()\n]", "", imports_str)
    return [name.strip() for name in flattened.split(",") if name.strip()]


def _collect_imports(code, doc_title):
    """Resolve the langchain imports of one code block to API reference URLs."""
    imports = []
    for import_match in _IMPORT_RE.finditer(code):
        module = import_match.group(1)
        if "pydantic_v1" in module:
            continue
        for class_name in _split_imported_names(import_match.group(2)):
            try:
                module_path = get_full_module_name(module, class_name)
            except AttributeError as e:
                logger.warning("Could not find module for %s, %s", class_name, e)
                continue
            except ImportError as e:
                logger.warning("Failed to load for class %s, %s", class_name, e)
                continue
            module_parts = module_path.split(".")
            # The URL needs the second component of the dotted module path
            if len(module_parts) < 2:
                continue
            url = f"{_BASE_URL}{module_parts[1]}/{module_path}.{class_name}.html"
            # Add the import information to our list
            imports.append(
                {
                    "imported": class_name,
                    "source": module,
                    "docs": url,
                    "title": doc_title,
                }
            )
    return imports


def replace_imports(file):
    """Replace imports in each Python code block with links to their
    documentation and append the import info in a comment.

    Args:
        file: Path of the Markdown/MDX file to process in place.

    Returns:
        A list with one dict per resolved import, holding the keys
        ``imported``, ``source``, ``docs`` and ``title``.
    """
    # Explicit encoding: do not depend on the platform default
    with open(file, "r", encoding="utf-8") as f:
        original = f.read()

    doc_title = _get_doc_title(original, os.path.basename(file))
    all_imports = []

    def replacer(match):
        # Drop the comment of a previous run so repeated runs do not stack up
        # TODO: Use our own custom component rather than this
        # injection method
        code = _EXISTING_COMMENT_RE.sub("", match.group(2))
        imports = _collect_imports(code, doc_title)
        if not imports:
            # If there are no imports, return the original match
            return match.group(0)
        all_imports.extend(imports)
        # Create a unique comment containing the import information
        import_comment = f"<!--IMPORTS:{json.dumps(imports)}-->"
        # Inject the import comment at the start of the code block
        return match.group(1) + import_comment + "\n" + code + match.group(3)

    # Use re.sub to replace each Python code block
    updated = code_block_re.sub(replacer, original)

    # Leave untouched files alone instead of rewriting identical content
    if updated != original:
        with open(file, "w", encoding="utf-8") as f:
            f.write(updated)
    return all_imports


if __name__ == "__main__":
    main()