diff --git a/docs/modules/document_loaders/examples/example_data/notebook.ipynb b/docs/modules/document_loaders/examples/example_data/notebook.ipynb new file mode 100644 index 00000000..db1a4fdc --- /dev/null +++ b/docs/modules/document_loaders/examples/example_data/notebook.ipynb @@ -0,0 +1,83 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Notebook\n", + "\n", + "This notebook covers how to load data from an .ipynb notebook into a format suitable for use by LangChain." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import NotebookLoader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loader = NotebookLoader(\"example_data/notebook.ipynb\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`NotebookLoader.load()` loads the `.ipynb` notebook file into a `Document` object.\n", + "\n", + "**Parameters**:\n", + "\n", + "* `include_outputs` (bool): whether to include cell outputs in the resulting document (default is False).\n", + "* `max_output_length` (int): the maximum number of characters to include from each cell output (default is 10).\n", + "* `remove_newline` (bool): whether to remove newline characters from the cell sources and outputs (default is False).\n", + "* `traceback` (bool): whether to include full traceback (default is False)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loader.load(include_outputs=True, max_output_length=20, remove_newline=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.1" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "981b6680a42bdb5eb22187741e1607b3aae2cf73db800d1af1f268d1de6a1f70" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/modules/document_loaders/examples/notebook.ipynb b/docs/modules/document_loaders/examples/notebook.ipynb new file mode 100644 index 00000000..db1a4fdc --- /dev/null +++ b/docs/modules/document_loaders/examples/notebook.ipynb @@ -0,0 +1,83 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Notebook\n", + "\n", + "This notebook covers how to load data from an .ipynb notebook into a format suitable for use by LangChain." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import NotebookLoader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loader = NotebookLoader(\"example_data/notebook.ipynb\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`NotebookLoader.load()` loads the `.ipynb` notebook file into a `Document` object.\n", + "\n", + "**Parameters**:\n", + "\n", + "* `include_outputs` (bool): whether to include cell outputs in the resulting document (default is False).\n", + "* `max_output_length` (int): the maximum number of characters to include from each cell output (default is 10).\n", + "* `remove_newline` (bool): whether to remove newline characters from the cell sources and outputs (default is False).\n", + "* `traceback` (bool): whether to include full traceback (default is False)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loader.load(include_outputs=True, max_output_length=20, remove_newline=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.1" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "981b6680a42bdb5eb22187741e1607b3aae2cf73db800d1af1f268d1de6a1f70" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 2ca0f1a9..3b170f09 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -15,6 +15,7 @@ 
"""Loader that loads .ipynb notebook files.

The loader is re-exported as ``langchain.document_loaders.NotebookLoader``
(imported in the package ``__init__`` and listed in its ``__all__``).
"""
import json
from pathlib import Path
from typing import Any, List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

# pandas is deliberately imported inside the functions that need it rather
# than at module level: this module is imported by the package ``__init__``,
# so a top-level import would make pandas mandatory for every document loader
# and would fail before ``NotebookLoader.load`` could raise its install hint.


def concatenate_cells(
    cell: dict, include_outputs: bool, max_output_length: int, traceback: bool
) -> str:
    """Combine one cell's information in a readable format ready to be used.

    Only the first entry of the cell's outputs is inspected.

    Args:
        cell: Mapping (a ``dict`` or a pandas row) providing ``"cell_type"``
            and ``"source"`` and, optionally, an ``"outputs"`` list.
        include_outputs: Whether to describe the first output of a code cell.
        max_output_length: Upper bound used to slice a stream output's
            ``text`` value.
        traceback: Whether to append the traceback of an error output.

    Returns:
        The formatted description of the cell, terminated by a blank line.
    """
    cell_type = cell["cell_type"]
    source = cell["source"]
    # ``.get`` works for dicts and pandas rows alike; markdown cells have no
    # "outputs" entry (pandas rows hold NaN there instead of a list).
    outputs = cell.get("outputs")
    header = f"'{cell_type}' cell: '{source}'\n"

    has_outputs = isinstance(outputs, (list, tuple)) and len(outputs) > 0
    if not (include_outputs and cell_type == "code" and has_outputs):
        return f"{header}\n"

    first_output = outputs[0]

    if "ename" in first_output:
        error_name = first_output["ename"]
        error_value = first_output["evalue"]
        message = (
            f"{header}, gives error '{error_name}',"
            f" with description '{error_value}'\n"
        )
        if traceback:
            # Read into a separate name so the boolean flag is not clobbered.
            error_traceback = first_output["traceback"]
            return f"{message}and traceback '{error_traceback}'\n\n"
        return f"{message}\n"

    if first_output.get("output_type") == "stream":
        stream_text = first_output["text"]
        # NOTE(review): if ``text`` is stored as a list of lines, this slice
        # limits the number of list items rather than characters -- confirm
        # which of the two is intended.
        return f"{header} with output: '{stream_text[:max_output_length]}'\n\n"

    # Any other kind of output: still report the cell's source instead of
    # silently dropping the whole cell.
    return f"{header}\n"


def remove_newlines(x: Any) -> Any:
    """Recursively remove newlines, whatever structure they are stored in.

    Strings are stripped of ``"\\n"``; lists, dict values and pandas
    DataFrame elements are processed recursively; any other value is returned
    unchanged.

    Args:
        x: The value to clean.

    Returns:
        A cleaned copy of ``x`` (or ``x`` itself for unsupported types).
    """
    if isinstance(x, str):
        return x.replace("\n", "")
    if isinstance(x, list):
        return [remove_newlines(elem) for elem in x]
    if isinstance(x, dict):
        # Cell outputs are dicts nested in a list; descend so their text is
        # cleaned as well.
        return {key: remove_newlines(value) for key, value in x.items()}

    try:
        import pandas as pd
    except ImportError:
        # Without pandas ``x`` cannot be a DataFrame; nothing left to do.
        return x

    if isinstance(x, pd.DataFrame):
        # Column-wise ``Series.map`` is the element-wise equivalent of the
        # deprecated ``DataFrame.applymap``.
        return x.apply(lambda column: column.map(remove_newlines))
    return x


class NotebookLoader(BaseLoader):
    """Loader that loads .ipynb notebook files."""

    def __init__(self, path: str):
        """Initialize with path."""
        self.file_path = path

    def load(
        self,
        include_outputs: bool = False,
        max_output_length: int = 10,
        remove_newline: bool = False,
        traceback: bool = False,
    ) -> List[Document]:
        """Load the notebook into a single document.

        Args:
            include_outputs: Whether to include cell outputs in the text.
            max_output_length: Upper bound used to slice each stream output.
            remove_newline: Whether to remove newline characters from the
                cell sources and outputs.
            traceback: Whether to include the traceback of error outputs.

        Returns:
            A one-element list holding a ``Document`` whose content is the
            space-joined description of every cell and whose metadata records
            the notebook path under ``"source"``.

        Raises:
            ValueError: If pandas is not installed.
        """
        try:
            import pandas as pd
        except ImportError:
            raise ValueError(
                "pandas is needed for Notebook Loader, "
                "please install with `pip install pandas`"
            )
        p = Path(self.file_path)

        with open(p, encoding="utf8") as f:
            d = json.load(f)

        data = pd.json_normalize(d["cells"])
        # ``reindex`` creates missing columns (filled with NaN), so notebooks
        # without any code cell -- hence without an "outputs" column -- and
        # notebooks without cells no longer raise KeyError.
        filtered_data = data.reindex(columns=["cell_type", "source", "outputs"])
        if remove_newline:
            filtered_data = remove_newlines(filtered_data)

        # Joining row by row also copes with an empty frame, for which
        # ``DataFrame.apply(axis=1)`` does not return a Series.
        text = " ".join(
            concatenate_cells(row, include_outputs, max_output_length, traceback)
            for _, row in filtered_data.iterrows()
        )

        metadata = {"source": str(p)}

        return [Document(page_content=text, metadata=metadata)]