diff --git a/libs/community/langchain_community/document_loaders/notebook.py b/libs/community/langchain_community/document_loaders/notebook.py
index aa3e1c38a3..f2cbe7be1f 100644
--- a/libs/community/langchain_community/document_loaders/notebook.py
+++ b/libs/community/langchain_community/document_loaders/notebook.py
@@ -25,7 +25,9 @@ def concatenate_cells(
     """
     cell_type = cell["cell_type"]
     source = cell["source"]
-    output = cell["outputs"]
+    # "outputs" may be absent from a cell; fall back to None so the check
+    # below never reads an unbound name.
+    output = cell.get("outputs")
 
     if include_outputs and cell_type == "code" and output:
         if "ename" in output[0].keys():
@@ -58,14 +60,13 @@ def concatenate_cells(
 
 def remove_newlines(x: Any) -> Any:
     """Recursively remove newlines, no matter the data structure they are stored in."""
-    import pandas as pd
 
     if isinstance(x, str):
         return x.replace("\n", "")
     elif isinstance(x, list):
         return [remove_newlines(elem) for elem in x]
-    elif isinstance(x, pd.DataFrame):
-        return x.applymap(remove_newlines)
+    elif isinstance(x, dict):
+        return {k: remove_newlines(v) for (k, v) in x.items()}
     else:
         return x
 
@@ -104,29 +105,26 @@ class NotebookLoader(BaseLoader):
         self,
     ) -> List[Document]:
         """Load documents."""
-        try:
-            import pandas as pd
-        except ImportError:
-            raise ImportError(
-                "pandas is needed for Notebook Loader, "
-                "please install with `pip install pandas`"
-            )
         p = Path(self.file_path)
 
         with open(p, encoding="utf8") as f:
             d = json.load(f)
 
-        data = pd.json_normalize(d["cells"])
-        filtered_data = data[["cell_type", "source", "outputs"]]
+        # Keep only the fields that concatenate_cells reads.
+        filtered_data = [
+            {k: v for (k, v) in cell.items() if k in ["cell_type", "source", "outputs"]}
+            for cell in d["cells"]
+        ]
+
         if self.remove_newline:
-            filtered_data = filtered_data.applymap(remove_newlines)
-
-        text = filtered_data.apply(
-            lambda x: concatenate_cells(
-                x, self.include_outputs, self.max_output_length, self.traceback
-            ),
-            axis=1,
-        ).str.cat(sep=" ")
+            filtered_data = [remove_newlines(cell) for cell in filtered_data]
+
+        text = "".join(
+            concatenate_cells(
+                cell, self.include_outputs, self.max_output_length, self.traceback
+            )
+            for cell in filtered_data
+        )
 
         metadata = {"source": str(p)}
 
diff --git a/libs/community/tests/unit_tests/document_loaders/test_notebook.py b/libs/community/tests/unit_tests/document_loaders/test_notebook.py
new file mode 100644
index 0000000000..ddcb0947b7
--- /dev/null
+++ b/libs/community/tests/unit_tests/document_loaders/test_notebook.py
@@ -0,0 +1,117 @@
+import json
+
+from pytest_mock import MockerFixture
+
+from langchain_community.document_loaders.notebook import NotebookLoader
+
+
+def test_initialization() -> None:
+    """The loader keeps the notebook path it was constructed with."""
+    loader = NotebookLoader(path="./testfile.ipynb")
+    assert loader.file_path == "./testfile.ipynb"
+
+
+def test_load_no_outputs(mocker: MockerFixture) -> None:
+    """No output text is rendered for a markdown cell loaded with default options."""
+    mock_notebook_content = {
+        "cells": [
+            {
+                "cell_type": "markdown",
+                "metadata": {},
+                "source": ["# Test notebook\n", "This is a test notebook."],
+                "outputs": [
+                    {
+                        "name": "stdout",
+                        "output_type": "stream",
+                        "text": ["Hello World!\n"],
+                    }
+                ],
+            }
+        ]
+    }
+    mocked_cell_type = mock_notebook_content["cells"][0]["cell_type"]
+    mocked_source = mock_notebook_content["cells"][0]["source"]
+
+    # Convert the mock notebook content to a JSON string
+    mock_notebook_content_str = json.dumps(mock_notebook_content)
+
+    # Mock the open function & json.load functions
+    mocker.patch("builtins.open", mocker.mock_open(read_data=mock_notebook_content_str))
+    mocker.patch("json.load", return_value=mock_notebook_content)
+
+    loader = NotebookLoader(path="./testfile.ipynb")
+    docs = loader.load()
+
+    assert len(docs) == 1
+    assert docs[0].page_content == f"'{mocked_cell_type}' cell: '{mocked_source}'\n\n"
+    assert docs[0].metadata == {"source": "testfile.ipynb"}
+
+
+def test_load_with_outputs(mocker: MockerFixture) -> None:
+    """A code cell's stream output is rendered when `include_outputs=True`."""
+    mock_notebook_content: dict = {
+        "cells": [
+            {
+                "cell_type": "code",
+                "metadata": {},
+                "source": ["# Test notebook\n", "This is a test notebook."],
+                "outputs": [
+                    {
+                        "name": "stdout",
+                        "output_type": "stream",
+                        "text": ["Hello World!\n"],
+                    }
+                ],
+            }
+        ]
+    }
+    mocked_cell_type = mock_notebook_content["cells"][0]["cell_type"]
+    mocked_source = mock_notebook_content["cells"][0]["source"]
+    mocked_output = mock_notebook_content["cells"][0]["outputs"][0]["text"]
+
+    # Convert the mock notebook content to a JSON string
+    mock_notebook_content_str = json.dumps(mock_notebook_content)
+
+    # Mock the open function & json.load functions
+    mocker.patch("builtins.open", mocker.mock_open(read_data=mock_notebook_content_str))
+    mocker.patch("json.load", return_value=mock_notebook_content)
+
+    loader = NotebookLoader(path="./testfile.ipynb", include_outputs=True)
+    docs = loader.load()
+
+    assert len(docs) == 1
+    expected_content = (
+        f"'{mocked_cell_type}' cell: '{mocked_source}'\n"
+        f" with output: '{mocked_output}'\n\n"
+    )
+    assert docs[0].page_content == expected_content
+    assert docs[0].metadata == {"source": "testfile.ipynb"}
+
+
+def test_load_code_cell_without_outputs(mocker: MockerFixture) -> None:
+    """A code cell lacking the "outputs" key loads without raising."""
+    mock_notebook_content: dict = {
+        "cells": [
+            {
+                "cell_type": "code",
+                "metadata": {},
+                "source": ["x = 1"],
+            }
+        ]
+    }
+    mocked_cell_type = mock_notebook_content["cells"][0]["cell_type"]
+    mocked_source = mock_notebook_content["cells"][0]["source"]
+
+    # Convert the mock notebook content to a JSON string
+    mock_notebook_content_str = json.dumps(mock_notebook_content)
+
+    # Mock the open function & json.load functions
+    mocker.patch("builtins.open", mocker.mock_open(read_data=mock_notebook_content_str))
+    mocker.patch("json.load", return_value=mock_notebook_content)
+
+    loader = NotebookLoader(path="./testfile.ipynb", include_outputs=True)
+    docs = loader.load()
+
+    assert len(docs) == 1
+    assert docs[0].page_content == f"'{mocked_cell_type}' cell: '{mocked_source}'\n\n"
+    assert docs[0].metadata == {"source": "testfile.ipynb"}