Add JSON Lines support to JSONLoader (#6913)

**Description**:

The JSON Lines format is used by some services such as OpenAI and
HuggingFace. It's also a convenient alternative to CSV.

This PR adds JSON Lines support to `JSONLoader` and also updates related
tests.
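
A minimal usage sketch of the new flag (the file path here is hypothetical; the arguments mirror the docs changes below):

```python
from langchain.document_loaders.json_loader import JSONLoader

# Each line of the .jsonl file is parsed as its own JSON value,
# and `jq_schema` is applied to every line independently.
loader = JSONLoader(
    file_path="./chat_messages.jsonl",  # hypothetical example file
    jq_schema=".content",
    json_lines=True,
)
docs = loader.load()  # one Document per line, with source/seq_num metadata
```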

**Tag maintainer**: @rlancemartin, @eyurtsev.

P.S. I was not able to build the docs locally, so I didn't update the related section.
Sergey Kozlov committed 1 year ago via GitHub · commit 6d15854cda (parent 153b56d19b)

@ -2,6 +2,8 @@
>[JSON (JavaScript Object Notation)](https://en.wikipedia.org/wiki/JSON) is an open standard file format and data interchange format that uses human-readable text to store and transmit data objects consisting of attribute–value pairs and arrays (or other serializable values).

>[JSON Lines](https://jsonlines.org/) is a file format where each line is a valid JSON value.

import Example from "@snippets/modules/data_connection/document_loaders/how_to/json.mdx"

<Example/>

@ -0,0 +1,3 @@
{"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"}
{"sender_name": "User 1", "timestamp_ms": 1675597435669, "content": "Oh no worries! Bye"}
{"sender_name": "User 2", "timestamp_ms": 1675596277579, "content": "No Im sorry it was my mistake, the blue one is not for sale"}

@ -78,11 +78,14 @@ pprint(data)
</CodeOutputBlock>

## Using `JSONLoader`

Suppose we are interested in extracting the values under the `content` field within the `messages` key of the JSON data. This can easily be done through the `JSONLoader` as shown below.

### JSON file

```python
loader = JSONLoader(
    file_path='./example_data/facebook_chat.json',
@ -114,6 +117,81 @@ pprint(data)
</CodeOutputBlock>

### JSON Lines file
If you want to load documents from a JSON Lines file, you pass `json_lines=True`
and specify `jq_schema` to extract `page_content` from a single JSON object.
```python
file_path = './example_data/facebook_chat_messages.jsonl'
pprint(Path(file_path).read_text())
```
<CodeOutputBlock lang="python">
```
('{"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"}\n'
'{"sender_name": "User 1", "timestamp_ms": 1675597435669, "content": "Oh no '
'worries! Bye"}\n'
'{"sender_name": "User 2", "timestamp_ms": 1675596277579, "content": "No Im '
'sorry it was my mistake, the blue one is not for sale"}\n')
```
</CodeOutputBlock>
```python
loader = JSONLoader(
    file_path='./example_data/facebook_chat_messages.jsonl',
    jq_schema='.content',
    json_lines=True)
data = loader.load()
```
```python
pprint(data)
```
<CodeOutputBlock lang="python">
```
[Document(page_content='Bye!', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}),
Document(page_content='Oh no worries! Bye', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 2}),
Document(page_content='No Im sorry it was my mistake, the blue one is not for sale', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 3})]
```
</CodeOutputBlock>
Another option is to set `jq_schema='.'` and provide `content_key`:
```python
loader = JSONLoader(
    file_path='./example_data/facebook_chat_messages.jsonl',
    jq_schema='.',
    content_key='sender_name',
    json_lines=True)
data = loader.load()
```
```python
pprint(data)
```
<CodeOutputBlock lang="python">
```
[Document(page_content='User 2', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}),
Document(page_content='User 1', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 2}),
Document(page_content='User 2', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 3})]
```
</CodeOutputBlock>
## Extracting metadata

Generally, we want to include metadata available in the JSON file into the documents that we create from the content.
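
The metadata section of the docs is cut off by this hunk; as a rough sketch (field names taken from the example chat file above, loader arguments from the signature shown in the code below), a `metadata_func` for the JSON Lines file could look like:

```python
def metadata_func(record: dict, metadata: dict) -> dict:
    # Copy selected fields from each JSON Lines record into the document metadata.
    metadata["sender_name"] = record.get("sender_name")
    metadata["timestamp_ms"] = record.get("timestamp_ms")
    return metadata

loader = JSONLoader(
    file_path='./example_data/facebook_chat_messages.jsonl',
    jq_schema='.',
    content_key='content',
    metadata_func=metadata_func,
    json_lines=True)

data = loader.load()
```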

@ -23,11 +23,12 @@ class JSONLoader(BaseLoader):
        content_key: Optional[str] = None,
        metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
        text_content: bool = True,
        json_lines: bool = False,
    ):
        """Initialize the JSONLoader.

        Args:
            file_path (Union[str, Path]): The path to the JSON or JSON Lines file.
            jq_schema (str): The jq schema to use to extract the data or text from
                the JSON.
            content_key (str): The key to use to extract the content from the JSON if
@ -35,8 +36,10 @@ class JSONLoader(BaseLoader):
            metadata_func (Callable[Dict, Dict]): A function that takes in the JSON
                object extracted by the jq_schema and the default metadata and returns
                a dict of the updated metadata.
            text_content (bool): Boolean flag to indicate whether the content is in
                string format, default to True.
            json_lines (bool): Boolean flag to indicate whether the input is in
                JSON Lines format.
        """
        try:
            import jq  # noqa:F401
@ -50,10 +53,24 @@ class JSONLoader(BaseLoader):
        self._content_key = content_key
        self._metadata_func = metadata_func
        self._text_content = text_content
        self._json_lines = json_lines

    def load(self) -> List[Document]:
        """Load and return documents from the JSON file."""
        docs: List[Document] = []
        if self._json_lines:
            with self.file_path.open(encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if line:
                        self._parse(line, docs)
        else:
            self._parse(self.file_path.read_text(), docs)
        return docs

    def _parse(self, content: str, docs: List[Document]) -> None:
        """Convert given content to documents."""
        data = self._jq_schema.input(json.loads(content))

        # Perform some validation
        # This is not a perfect validation, but it should catch most cases
@ -61,8 +78,7 @@ class JSONLoader(BaseLoader):
        if self._content_key is not None:
            self._validate_content_key(data)

        for i, sample in enumerate(data, len(docs) + 1):
            metadata = dict(
                source=str(self.file_path),
                seq_num=i,
@ -70,8 +86,6 @@ class JSONLoader(BaseLoader):
            text = self._get_text(sample=sample, metadata=metadata)
            docs.append(Document(page_content=text, metadata=metadata))

    def _get_text(self, sample: Any, metadata: dict) -> str:
        """Convert sample to string format"""
        if self._content_key is not None:

@ -1,3 +1,6 @@
import io
from typing import Any, Dict

import pytest
from pytest import raises
from pytest_mock import MockerFixture
@ -5,8 +8,9 @@ from pytest_mock import MockerFixture
from langchain.docstore.document import Document
from langchain.document_loaders.json_loader import JSONLoader

pytestmark = pytest.mark.requires("jq")


def test_load_valid_string_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
@ -19,9 +23,12 @@ def test_load_valid_string_content(mocker: MockerFixture) -> None:
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]

    mocker.patch("builtins.open", mocker.mock_open())
    mocker.patch(
        "pathlib.Path.read_text",
        return_value='[{"text": "value1"}, {"text": "value2"}]',
    )

    loader = JSONLoader(file_path=file_path, jq_schema=".[].text", text_content=True)
    result = loader.load()
@ -29,7 +36,6 @@ def test_load_valid_string_content(mocker: MockerFixture) -> None:
    assert result == expected_docs


def test_load_valid_dict_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
@ -42,11 +48,14 @@ def test_load_valid_dict_content(mocker: MockerFixture) -> None:
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]

    mocker.patch("builtins.open", mocker.mock_open())
    mocker.patch(
        "pathlib.Path.read_text",
        return_value="""
        [{"text": "value1"}, {"text": "value2"}]
        """,
    )

    loader = JSONLoader(file_path=file_path, jq_schema=".[]", text_content=False)
    result = loader.load()
@ -54,7 +63,6 @@ def test_load_valid_dict_content(mocker: MockerFixture) -> None:
    assert result == expected_docs


def test_load_valid_bool_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
@ -67,13 +75,16 @@ def test_load_valid_bool_content(mocker: MockerFixture) -> None:
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]

    mocker.patch("builtins.open", mocker.mock_open())
    mocker.patch(
        "pathlib.Path.read_text",
        return_value="""
        [
            {"flag": false}, {"flag": true}
        ]
        """,
    )

    loader = JSONLoader(file_path=file_path, jq_schema=".[].flag", text_content=False)
    result = loader.load()
@ -81,7 +92,6 @@ def test_load_valid_bool_content(mocker: MockerFixture) -> None:
    assert result == expected_docs


def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
@ -94,13 +104,16 @@ def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]

    mocker.patch("builtins.open", mocker.mock_open())
    mocker.patch(
        "pathlib.Path.read_text",
        return_value="""
        [
            {"num": 99}, {"num": 99.5}
        ]
        """,
    )

    loader = JSONLoader(file_path=file_path, jq_schema=".[].num", text_content=False)
    result = loader.load()
@ -108,16 +121,152 @@ def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
    assert result == expected_docs


def test_load_invalid_test_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"

    mocker.patch("builtins.open", mocker.mock_open())
    mocker.patch(
        "pathlib.Path.read_text",
        return_value="""
        [{"text": "value1"}, {"text": "value2"}]
        """,
    )

    loader = JSONLoader(file_path=file_path, jq_schema=".[]", text_content=True)

    with raises(ValueError):
        loader.load()


def test_load_jsonlines(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
        Document(
            page_content="value1",
            metadata={"source": file_path, "seq_num": 1},
        ),
        Document(
            page_content="value2",
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]

    mocker.patch(
        "pathlib.Path.open",
        return_value=io.StringIO(
            """
            {"text": "value1"}
            {"text": "value2"}
            """
        ),
    )

    loader = JSONLoader(
        file_path=file_path, jq_schema=".", content_key="text", json_lines=True
    )

    result = loader.load()

    assert result == expected_docs


@pytest.mark.parametrize(
    "params",
    (
        {"jq_schema": ".[].text"},
        {"jq_schema": ".[]", "content_key": "text"},
    ),
)
def test_load_jsonlines_list(params: Dict, mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
        Document(
            page_content="value1",
            metadata={"source": file_path, "seq_num": 1},
        ),
        Document(
            page_content="value2",
            metadata={"source": file_path, "seq_num": 2},
        ),
        Document(
            page_content="value3",
            metadata={"source": file_path, "seq_num": 3},
        ),
        Document(
            page_content="value4",
            metadata={"source": file_path, "seq_num": 4},
        ),
    ]

    mocker.patch(
        "pathlib.Path.open",
        return_value=io.StringIO(
            """
            [{"text": "value1"}, {"text": "value2"}]
            [{"text": "value3"}, {"text": "value4"}]
            """
        ),
    )

    loader = JSONLoader(file_path=file_path, json_lines=True, **params)

    result = loader.load()

    assert result == expected_docs


def test_load_empty_jsonlines(mocker: MockerFixture) -> None:
    mocker.patch("pathlib.Path.open", return_value=io.StringIO(""))

    loader = JSONLoader(file_path="file_path", jq_schema=".[].text", json_lines=True)

    result = loader.load()

    assert result == []


@pytest.mark.parametrize(
    "patch_func,patch_func_value,kwargs",
    (
        # JSON content.
        (
            "pathlib.Path.read_text",
            '[{"text": "value1"}, {"text": "value2"}]',
            {"jq_schema": ".[]", "content_key": "text"},
        ),
        # JSON Lines content.
        (
            "pathlib.Path.open",
            io.StringIO(
                """
                {"text": "value1"}
                {"text": "value2"}
                """
            ),
            {"jq_schema": ".", "content_key": "text", "json_lines": True},
        ),
    ),
)
def test_json_meta(
    patch_func: str, patch_func_value: Any, kwargs: Dict, mocker: MockerFixture
) -> None:
    mocker.patch("builtins.open", mocker.mock_open())
    mocker.patch(patch_func, return_value=patch_func_value)

    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
        Document(
            page_content="value1",
            metadata={"source": file_path, "seq_num": 1, "x": "value1-meta"},
        ),
        Document(
            page_content="value2",
            metadata={"source": file_path, "seq_num": 2, "x": "value2-meta"},
        ),
    ]

    def metadata_func(record: Dict, metadata: Dict) -> Dict:
        metadata["x"] = f"{record['text']}-meta"
        return metadata

    loader = JSONLoader(file_path=file_path, metadata_func=metadata_func, **kwargs)

    result = loader.load()

    assert result == expected_docs
