adding .ipynb loader and documentation Fixes #1248 (#1252)

`NotebookLoader.load()` loads the `.ipynb` notebook file into a `Document` object. **Parameters**: * `include_outputs` (bool): whether to include cell outputs in the resulting document (default is False). * `max_output_length` (int): the maximum number of characters to include from each cell output (default is 10). * `remove_newline` (bool): whether to remove newline characters from the cell sources and outputs (default is False). * `traceback` (bool): whether to include full traceback (default is False).
1 year ago · 8a0751dadd
parent 4b5d427421
commit 8a0751dadd
4 changed files with 270 additions and 0 deletions
--- a/docs/modules/document_loaders/examples/example_data/notebook.ipynb
+++ b/docs/modules/document_loaders/examples/example_data/notebook.ipynb
@ -0,0 +1,83 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Notebook\n",
+    "\n",
+    "This notebook covers how to load data from an .ipynb notebook into a format suitable for use by LangChain."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import NotebookLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = NotebookLoader(\"example_data/notebook.ipynb\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`NotebookLoader.load()` loads the `.ipynb` notebook file into a `Document` object.\n",
+    "\n",
+    "**Parameters**:\n",
+    "\n",
+    "* `include_outputs` (bool): whether to include cell outputs in the resulting document (default is False).\n",
+    "* `max_output_length` (int): the maximum number of characters to include from each cell output (default is 10).\n",
+    "* `remove_newline` (bool): whether to remove newline characters from the cell sources and outputs (default is False).\n",
+    "* `traceback` (bool): whether to include full traceback (default is False)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader.load(include_outputs=True, max_output_length=20, remove_newline=True)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.1"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "981b6680a42bdb5eb22187741e1607b3aae2cf73db800d1af1f268d1de6a1f70"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/docs/modules/document_loaders/examples/notebook.ipynb
+++ b/docs/modules/document_loaders/examples/notebook.ipynb
@ -0,0 +1,83 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Notebook\n",
+    "\n",
+    "This notebook covers how to load data from an .ipynb notebook into a format suitable for use by LangChain."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import NotebookLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = NotebookLoader(\"example_data/notebook.ipynb\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`NotebookLoader.load()` loads the `.ipynb` notebook file into a `Document` object.\n",
+    "\n",
+    "**Parameters**:\n",
+    "\n",
+    "* `include_outputs` (bool): whether to include cell outputs in the resulting document (default is False).\n",
+    "* `max_output_length` (int): the maximum number of characters to include from each cell output (default is 10).\n",
+    "* `remove_newline` (bool): whether to remove newline characters from the cell sources and outputs (default is False).\n",
+    "* `traceback` (bool): whether to include full traceback (default is False)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader.load(include_outputs=True, max_output_length=20, remove_newline=True)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.1"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "981b6680a42bdb5eb22187741e1607b3aae2cf73db800d1af1f268d1de6a1f70"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@ -15,6 +15,7 @@ from langchain.document_loaders.gutenberg import GutenbergLoader
 from langchain.document_loaders.hn import HNLoader
 from langchain.document_loaders.html import UnstructuredHTMLLoader
 from langchain.document_loaders.imsdb import IMSDbLoader
+from langchain.document_loaders.notebook import NotebookLoader
 from langchain.document_loaders.notion import NotionDirectoryLoader
 from langchain.document_loaders.obsidian import ObsidianLoader
 from langchain.document_loaders.online_pdf import OnlinePDFLoader
@ -71,4 +72,5 @@ __all__ = [
    "PDFMinerLoader",
    "TelegramChatLoader",
    "SRTLoader",
+    "NotebookLoader",
 ]
--- a/langchain/document_loaders/notebook.py
+++ b/langchain/document_loaders/notebook.py
@ -0,0 +1,102 @@
+"""Loader that loads .ipynb notebook files."""
+import json
+from pathlib import Path
+from typing import Any, List
+
+import pandas as pd
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
def concatenate_cells(
    cell: dict, include_outputs: bool, max_output_length: int, traceback: bool
) -> str:
    """Render one notebook cell as a readable text fragment.

    Args:
        cell: Mapping (a dict or a pandas row) with ``cell_type`` and
            ``source`` entries and, for code cells, an ``outputs`` list.
        include_outputs: Whether to append the first output of a code cell.
        max_output_length: Maximum number of characters kept from a stream
            output.
        traceback: Whether to append the traceback when the first output is
            an error.

    Returns:
        The cell source, followed by its error or stream output when
        requested and available. Only the first output of a cell is
        inspected; any other output type yields the source alone.
    """
    cell_type = cell["cell_type"]
    source = cell["source"]
    header = f"'{cell_type}' cell: '{source}'\n"

    # Markdown/raw cells carry no "outputs" entry at all, so do not index it.
    outputs = cell.get("outputs")
    wants_output = include_outputs and cell_type == "code"
    if not (wants_output and isinstance(outputs, list) and outputs):
        return f"{header}\n"

    first = outputs[0]
    if "ename" in first:
        error_name = first["ename"]
        error_value = first["evalue"]
        message = (
            f"{header}, gives error '{error_name}',"
            f" with description '{error_value}'\n"
        )
        if traceback:
            # Read into the message directly so the boolean flag is not
            # overwritten by the traceback payload.
            message += f"and traceback '{first['traceback']}'\n"
        return f"{message}\n"

    if first.get("output_type") == "stream":
        text = first["text"]
        if isinstance(text, list):
            # The text may be stored as a list of lines; join it so that the
            # limit below counts characters rather than lines.
            text = "".join(text)
        return f"{header} with output: '{text[:max_output_length]}'\n\n"

    # Other output types (e.g. execute_result, display_data) are not
    # rendered, but the cell source must still be kept.
    return f"{header}\n"
+
+
def remove_newlines(x: Any) -> Any:
    """Recursively remove newline characters from ``x``.

    Strings have every ``"\\n"`` removed. Lists and dicts are rebuilt with
    their elements / values cleaned (dict keys are left untouched). A pandas
    ``DataFrame`` is cleaned element-wise. Any other value is returned
    unchanged.

    Args:
        x: The value to clean.

    Returns:
        A cleaned copy for strings, lists, dicts and DataFrames; otherwise
        ``x`` itself.
    """
    if isinstance(x, str):
        return x.replace("\n", "")
    if isinstance(x, list):
        return [remove_newlines(elem) for elem in x]
    if isinstance(x, dict):
        # Cell outputs are lists of dicts; without this branch their text
        # would keep its newlines.
        return {key: remove_newlines(value) for key, value in x.items()}
    if isinstance(x, pd.DataFrame):
        # Newer pandas exposes the element-wise apply as DataFrame.map and
        # deprecates applymap; use whichever this pandas version provides.
        elementwise = x.map if hasattr(x, "map") else x.applymap
        return elementwise(remove_newlines)
    return x
+
+
class NotebookLoader(BaseLoader):
    """Loader that loads .ipynb notebook files."""

    def __init__(self, path: str):
        """Initialize with the path of the notebook file to load."""
        self.file_path = path

    def load(
        self,
        include_outputs: bool = False,
        max_output_length: int = 10,
        remove_newline: bool = False,
        traceback: bool = False,
    ) -> List[Document]:
        """Load the notebook into a single document.

        Args:
            include_outputs: Whether to include cell outputs in the
                resulting document.
            max_output_length: Maximum number of characters to include from
                each cell output.
            remove_newline: Whether to remove newline characters from the
                cell sources and outputs.
            traceback: Whether to include the full traceback of errors.

        Returns:
            A one-element list holding a ``Document`` whose content is the
            text of every cell joined by single spaces, and whose metadata
            records the notebook path under ``"source"``.
        """
        p = Path(self.file_path)

        with open(p, encoding="utf8") as f:
            d = json.load(f)

        fragments = []
        for raw_cell in d["cells"]:
            # Keep only the fields that are rendered. ``.get`` tolerates
            # cells without outputs (markdown/raw), so notebooks with no
            # code cells, or no cells at all, load without error.
            cell = {
                key: raw_cell.get(key)
                for key in ("cell_type", "source", "outputs")
            }
            if remove_newline:
                cell = {key: remove_newlines(value) for key, value in cell.items()}
            fragments.append(
                concatenate_cells(
                    cell, include_outputs, max_output_length, traceback
                )
            )

        text = " ".join(fragments)
        metadata = {"source": str(p)}

        return [Document(page_content=text, metadata=metadata)]