langchain/libs/community/langchain_community/document_loaders/notebook.py

"""Loads .ipynb notebook files."""
import json
from pathlib import Path
from typing import Any, List

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader


def concatenate_cells(
    cell: dict, include_outputs: bool, max_output_length: int, traceback: bool
) -> str:
    """Render one notebook cell as a human-readable text snippet.

    Only the first entry of the cell's ``outputs`` list is inspected. An
    error output is rendered with its name and description (plus traceback
    when requested); a ``stream`` output is rendered truncated to
    ``max_output_length``. Every other situation (outputs not requested,
    non-code cell, no outputs, or an output type that is neither error nor
    stream) renders the cell type and source only, so the cell's source is
    never dropped.

    Args:
        cell: Mapping with the keys ``cell_type`` and ``source``; ``outputs``
            is optional (markdown/raw cells do not carry it).
        include_outputs: Whether to include the outputs of the cell.
        max_output_length: Maximum length of the stream output to be displayed.
        traceback: Whether to include the traceback of an error output.

    Returns:
        A string with the cell information.

    """
    cell_type = cell["cell_type"]
    source = cell["source"]
    # Only code cells are guaranteed to have "outputs"; .get() works for both
    # plain dicts and pandas rows and avoids a KeyError on markdown/raw cells.
    outputs = cell.get("outputs")

    source_only = f"'{cell_type}' cell: '{source}'\n\n"

    if not (include_outputs and cell_type == "code"):
        return source_only
    # A missing value (None / NaN placeholder) or an empty list means there
    # is nothing to render besides the source.
    if not isinstance(outputs, (list, tuple)) or not outputs:
        return source_only

    first_output = outputs[0]

    if "ename" in first_output:
        error_name = first_output["ename"]
        error_value = first_output["evalue"]
        if traceback:
            # Kept in its own local so the boolean parameter is not rebound.
            error_traceback = first_output["traceback"]
            return (
                f"'{cell_type}' cell: '{source}'\n, gives error '{error_name}',"
                f" with description '{error_value}'\n"
                f"and traceback '{error_traceback}'\n\n"
            )
        return (
            f"'{cell_type}' cell: '{source}'\n, gives error '{error_name}',"
            f"with description '{error_value}'\n\n"
        )

    if first_output.get("output_type") == "stream":
        stream_text = first_output["text"]
        # Slicing already clamps to the available length.
        return (
            f"'{cell_type}' cell: '{source}'\n with "
            f"output: '{stream_text[:max_output_length]}'\n\n"
        )

    # Other output types (execute_result, display_data, ...) are not rendered,
    # but the source itself must still be kept.
    return source_only


def remove_newlines(x: Any) -> Any:
    """Recursively remove newlines, no matter the data structure they are stored in.

    Args:
        x: A string, a (possibly nested) list, a pandas DataFrame, or any
            other value.

    Returns:
        A copy of ``x`` with every ``"\\n"`` stripped from the strings it
        contains. Values that are not a str, list or DataFrame are returned
        unchanged.
    """
    import pandas as pd

    if isinstance(x, str):
        return x.replace("\n", "")
    if isinstance(x, list):
        return [remove_newlines(elem) for elem in x]
    if isinstance(x, pd.DataFrame):
        # DataFrame.applymap is deprecated since pandas 2.1 in favour of
        # DataFrame.map. The check is made on the class so that a column that
        # happens to be named "map" cannot be mistaken for the method.
        if hasattr(pd.DataFrame, "map"):
            return x.map(remove_newlines)
        return x.applymap(remove_newlines)
    return x


class NotebookLoader(BaseLoader):
    """Load `Jupyter notebook` (.ipynb) files.

    The whole notebook is rendered into a single ``Document`` whose
    ``page_content`` is the concatenation of the textual rendering of every
    cell and whose metadata holds the notebook path under ``"source"``.
    """

    def __init__(
        self,
        path: str,
        include_outputs: bool = False,
        max_output_length: int = 10,
        remove_newline: bool = False,
        traceback: bool = False,
    ):
        """Initialize with a path.

        Args:
            path: The path to load the notebook from.
            include_outputs: Whether to include the outputs of the cell.
                Defaults to False.
            max_output_length: Maximum length of the output to be displayed.
                Defaults to 10.
            remove_newline: Whether to remove newlines from the notebook.
                Defaults to False.
            traceback: Whether to return a traceback of the error.
                Defaults to False.
        """
        self.file_path = path
        self.include_outputs = include_outputs
        self.max_output_length = max_output_length
        self.remove_newline = remove_newline
        self.traceback = traceback

    def load(
        self,
    ) -> List[Document]:
        """Load the notebook and return it as a single-element document list.

        Returns:
            A list with one ``Document`` containing the rendered cells.

        Raises:
            ImportError: If pandas is not installed.
        """
        try:
            import pandas as pd
        except ImportError as err:
            raise ImportError(
                "pandas is needed for Notebook Loader, "
                "please install with `pip install pandas`"
            ) from err
        p = Path(self.file_path)

        with open(p, encoding="utf8") as f:
            d = json.load(f)

        data = pd.json_normalize(d["cells"])
        # reindex (instead of data[[...]]) tolerates notebooks without any
        # code cell, where no "outputs" column exists, and notebooks with no
        # cells at all; absent columns are simply filled with NaN.
        filtered_data = data.reindex(columns=["cell_type", "source", "outputs"])
        if self.remove_newline:
            # DataFrame.applymap is deprecated since pandas 2.1 in favour of
            # DataFrame.map; fall back to applymap on older versions.
            if hasattr(pd.DataFrame, "map"):
                filtered_data = filtered_data.map(remove_newlines)
            else:
                filtered_data = filtered_data.applymap(remove_newlines)

        # Explicit row iteration behaves predictably on an empty frame,
        # unlike DataFrame.apply(axis=1) followed by the .str accessor.
        rendered_cells = [
            concatenate_cells(
                row, self.include_outputs, self.max_output_length, self.traceback
            )
            for _, row in filtered_data.iterrows()
        ]
        text = " ".join(rendered_cells)

        metadata = {"source": str(p)}

        return [Document(page_content=text, metadata=metadata)]
community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463) Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv 
langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes 2023-12-11 21:53:30 +00:00			`"""Loads .ipynb notebook files."""`
			`import json`
			`from pathlib import Path`
			`from typing import Any, List`

			`from langchain_core.documents import Document`

			`from langchain_community.document_loaders.base import BaseLoader`


			`def concatenate_cells(`
			`cell: dict, include_outputs: bool, max_output_length: int, traceback: bool`
			`) -> str:`
			`"""Combine cells information in a readable format ready to be used.`

			`Args:`
			`cell: A dictionary`
			`include_outputs: Whether to include the outputs of the cell.`
			`max_output_length: Maximum length of the output to be displayed.`
			`traceback: Whether to return a traceback of the error.`

			`Returns:`
			`A string with the cell information.`

			`"""`
			`cell_type = cell["cell_type"]`
			`source = cell["source"]`
			`output = cell["outputs"]`

			`if include_outputs and cell_type == "code" and output:`
			`if "ename" in output[0].keys():`
			`error_name = output[0]["ename"]`
			`error_value = output[0]["evalue"]`
			`if traceback:`
			`traceback = output[0]["traceback"]`
			`return (`
			`f"'{cell_type}' cell: '{source}'\n, gives error '{error_name}',"`
			`f" with description '{error_value}'\n"`
			`f"and traceback '{traceback}'\n\n"`
			`)`
			`else:`
			`return (`
			`f"'{cell_type}' cell: '{source}'\n, gives error '{error_name}',"`
			`f"with description '{error_value}'\n\n"`
			`)`
			`elif output[0]["output_type"] == "stream":`
			`output = output[0]["text"]`
			`min_output = min(max_output_length, len(output))`
			`return (`
			`f"'{cell_type}' cell: '{source}'\n with "`
			`f"output: '{output[:min_output]}'\n\n"`
			`)`
			`else:`
			`return f"'{cell_type}' cell: '{source}'\n\n"`

			`return ""`


			`def remove_newlines(x: Any) -> Any:`
			`"""Recursively remove newlines, no matter the data structure they are stored in."""`
			`import pandas as pd`

			`if isinstance(x, str):`
			`return x.replace("\n", "")`
			`elif isinstance(x, list):`
			`return [remove_newlines(elem) for elem in x]`
			`elif isinstance(x, pd.DataFrame):`
			`return x.applymap(remove_newlines)`
			`else:`
			`return x`


			`class NotebookLoader(BaseLoader):`
			"""Load `Jupyter notebook` (.ipynb) files."""

			`def __init__(`
			`self,`
			`path: str,`
			`include_outputs: bool = False,`
			`max_output_length: int = 10,`
			`remove_newline: bool = False,`
			`traceback: bool = False,`
			`):`
			`"""Initialize with a path.`

			`Args:`
			`path: The path to load the notebook from.`
			`include_outputs: Whether to include the outputs of the cell.`
			`Defaults to False.`
			`max_output_length: Maximum length of the output to be displayed.`
			`Defaults to 10.`
			`remove_newline: Whether to remove newlines from the notebook.`
			`Defaults to False.`
			`traceback: Whether to return a traceback of the error.`
			`Defaults to False.`
			`"""`
			`self.file_path = path`
			`self.include_outputs = include_outputs`
			`self.max_output_length = max_output_length`
			`self.remove_newline = remove_newline`
			`self.traceback = traceback`

			`def load(`
			`self,`
			`) -> List[Document]:`
			`"""Load documents."""`
			`try:`
			`import pandas as pd`
			`except ImportError:`
			`raise ImportError(`
			`"pandas is needed for Notebook Loader, "`
			"please install with `pip install pandas`"
			`)`
			`p = Path(self.file_path)`

			`with open(p, encoding="utf8") as f:`
			`d = json.load(f)`

			`data = pd.json_normalize(d["cells"])`
			`filtered_data = data[["cell_type", "source", "outputs"]]`
			`if self.remove_newline:`
			`filtered_data = filtered_data.applymap(remove_newlines)`

			`text = filtered_data.apply(`
			`lambda x: concatenate_cells(`
			`x, self.include_outputs, self.max_output_length, self.traceback`
			`),`
			`axis=1,`
			`).str.cat(sep=" ")`

			`metadata = {"source": str(p)}`

			`return [Document(page_content=text, metadata=metadata)]`