Add JSON Lines support to JSONLoader (#6913)

**Description**: The JSON Lines format is used by some services such as OpenAI and HuggingFace. It's also a convenient alternative to CSV. This PR adds JSON Lines support to `JSONLoader` and also updates the related tests. **Tag maintainer**: @rlancemartin, @eyurtsev. P.S. I was not able to build the docs locally, so I didn't update the related section.
1 year ago · 6d15854cda
parent 153b56d19b
commit 6d15854cda
5 changed files with 281 additions and 35 deletions
--- a/docs/docs_skeleton/docs/modules/data_connection/document_loaders/how_to/json.mdx
+++ b/docs/docs_skeleton/docs/modules/data_connection/document_loaders/how_to/json.mdx
@ -2,6 +2,8 @@

 >[JSON (JavaScript Object Notation)](https://en.wikipedia.org/wiki/JSON) is an open standard file format and data interchange format that uses human-readable text to store and transmit data objects consisting of attribute–value pairs and arrays (or other serializable values).

+>[JSON Lines](https://jsonlines.org/) is a file format where each line is a valid JSON value.
+
 import Example from "@snippets/modules/data_connection/document_loaders/how_to/json.mdx"

 <Example/>
--- a/docs/extras/modules/data_connection/document_loaders/integrations/example_data/facebook_chat_messages.jsonl
+++ b/docs/extras/modules/data_connection/document_loaders/integrations/example_data/facebook_chat_messages.jsonl
@ -0,0 +1,3 @@
+{"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"}
+{"sender_name": "User 1", "timestamp_ms": 1675597435669, "content": "Oh no worries! Bye"}
+{"sender_name": "User 2", "timestamp_ms": 1675596277579, "content": "No Im sorry it was my mistake, the blue one is not for sale"}
--- a/docs/snippets/modules/data_connection/document_loaders/how_to/json.mdx
+++ b/docs/snippets/modules/data_connection/document_loaders/how_to/json.mdx
@ -78,11 +78,14 @@ pprint(data)

 </CodeOutputBlock>

+
 ## Using `JSONLoader`

 Suppose we are interested in extracting the values under the `content` field within the `messages` key of the JSON data. This can easily be done through the `JSONLoader` as shown below.


+### JSON file
+
 ```python
 loader = JSONLoader(
    file_path='./example_data/facebook_chat.json',
@ -114,6 +117,81 @@ pprint(data)

 </CodeOutputBlock>

+
+### JSON Lines file
+
+To load documents from a JSON Lines file, pass `json_lines=True` and specify a
+`jq_schema` that extracts `page_content` from the JSON value on each line.
+
+```python
+file_path = './example_data/facebook_chat_messages.jsonl'
+pprint(Path(file_path).read_text())
+```
+
+<CodeOutputBlock lang="python">
+
+```
+    ('{"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"}\n'
+     '{"sender_name": "User 1", "timestamp_ms": 1675597435669, "content": "Oh no '
+     'worries! Bye"}\n'
+     '{"sender_name": "User 2", "timestamp_ms": 1675596277579, "content": "No Im '
+     'sorry it was my mistake, the blue one is not for sale"}\n')
+```
+
+</CodeOutputBlock>
+
+
+```python
+loader = JSONLoader(
+    file_path='./example_data/facebook_chat_messages.jsonl',
+    jq_schema='.content',
+    json_lines=True)
+
+data = loader.load()
+```
+
+```python
+pprint(data)
+```
+
+<CodeOutputBlock lang="python">
+
+```
+    [Document(page_content='Bye!', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}),
+     Document(page_content='Oh no worries! Bye', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 2}),
+     Document(page_content='No Im sorry it was my mistake, the blue one is not for sale', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 3})]
+```
+
+</CodeOutputBlock>
+
+
+Another option is to set `jq_schema='.'` and provide `content_key`:
+
+```python
+loader = JSONLoader(
+    file_path='./example_data/facebook_chat_messages.jsonl',
+    jq_schema='.',
+    content_key='sender_name',
+    json_lines=True)
+
+data = loader.load()
+```
+
+```python
+pprint(data)
+```
+
+<CodeOutputBlock lang="python">
+
+```
+    [Document(page_content='User 2', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}),
+     Document(page_content='User 1', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 2}),
+     Document(page_content='User 2', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 3})]
+```
+
+</CodeOutputBlock>
+
+
 ## Extracting metadata

 Generally, we want to include metadata available in the JSON file into the documents that we create from the content.
--- a/langchain/document_loaders/json_loader.py
+++ b/langchain/document_loaders/json_loader.py
@ -23,11 +23,12 @@ class JSONLoader(BaseLoader):
        content_key: Optional[str] = None,
        metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
        text_content: bool = True,
+        json_lines: bool = False,
    ):
        """Initialize the JSONLoader.

        Args:
-            file_path (Union[str, Path]): The path to the JSON file.
+            file_path (Union[str, Path]): The path to the JSON or JSON Lines file.
            jq_schema (str): The jq schema to use to extract the data or text from
                the JSON.
            content_key (str): The key to use to extract the content from the JSON if
@ -35,8 +36,10 @@ class JSONLoader(BaseLoader):
            metadata_func (Callable[Dict, Dict]): A function that takes in the JSON
                object extracted by the jq_schema and the default metadata and returns
                a dict of the updated metadata.
-            text_content (bool): Boolean flag to indicates whether the content is in
-                string format, default to True
+            text_content (bool): Boolean flag to indicate whether the content is in
+                string format, default to True.
+            json_lines (bool): Boolean flag to indicate whether the input is in
+                JSON Lines format.
        """
        try:
            import jq  # noqa:F401
@ -50,10 +53,24 @@ class JSONLoader(BaseLoader):
        self._content_key = content_key
        self._metadata_func = metadata_func
        self._text_content = text_content
+        self._json_lines = json_lines

    def load(self) -> List[Document]:
        """Load and return documents from the JSON file."""
-        data = self._jq_schema.input(json.loads(self.file_path.read_text()))
+        docs: List[Document] = []
+        if self._json_lines:
+            with self.file_path.open(encoding="utf-8") as f:
+                for line in f:
+                    line = line.strip()
+                    if line:
+                        self._parse(line, docs)
+        else:
+            self._parse(self.file_path.read_text(), docs)
+        return docs
+
+    def _parse(self, content: str, docs: List[Document]) -> None:
+        """Convert given content to documents."""
+        data = self._jq_schema.input(json.loads(content))

        # Perform some validation
        # This is not a perfect validation, but it should catch most cases
@ -61,8 +78,7 @@ class JSONLoader(BaseLoader):
        if self._content_key is not None:
            self._validate_content_key(data)

-        docs = []
-        for i, sample in enumerate(data, 1):
+        for i, sample in enumerate(data, len(docs) + 1):
            metadata = dict(
                source=str(self.file_path),
                seq_num=i,
@ -70,8 +86,6 @@ class JSONLoader(BaseLoader):
            text = self._get_text(sample=sample, metadata=metadata)
            docs.append(Document(page_content=text, metadata=metadata))

-        return docs
-
    def _get_text(self, sample: Any, metadata: dict) -> str:
        """Convert sample to string format"""
        if self._content_key is not None:
--- a/tests/unit_tests/document_loaders/test_json_loader.py
+++ b/tests/unit_tests/document_loaders/test_json_loader.py
@ -1,3 +1,6 @@
+import io
+from typing import Any, Dict
+
 import pytest
 from pytest import raises
 from pytest_mock import MockerFixture
@ -5,8 +8,9 @@ from pytest_mock import MockerFixture
 from langchain.docstore.document import Document
 from langchain.document_loaders.json_loader import JSONLoader

+pytestmark = pytest.mark.requires("jq")
+

-@pytest.mark.requires("jq")
 def test_load_valid_string_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
@ -19,9 +23,12 @@ def test_load_valid_string_content(mocker: MockerFixture) -> None:
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]
+
    mocker.patch("builtins.open", mocker.mock_open())
-    mock_csv_reader = mocker.patch("pathlib.Path.read_text")
-    mock_csv_reader.return_value = '[{"text": "value1"}, {"text": "value2"}]'
+    mocker.patch(
+        "pathlib.Path.read_text",
+        return_value='[{"text": "value1"}, {"text": "value2"}]',
+    )

    loader = JSONLoader(file_path=file_path, jq_schema=".[].text", text_content=True)
    result = loader.load()
@ -29,7 +36,6 @@ def test_load_valid_string_content(mocker: MockerFixture) -> None:
    assert result == expected_docs


-@pytest.mark.requires("jq")
 def test_load_valid_dict_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
@ -42,11 +48,14 @@ def test_load_valid_dict_content(mocker: MockerFixture) -> None:
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]
+
    mocker.patch("builtins.open", mocker.mock_open())
-    mock_csv_reader = mocker.patch("pathlib.Path.read_text")
-    mock_csv_reader.return_value = """
+    mocker.patch(
+        "pathlib.Path.read_text",
+        return_value="""
            [{"text": "value1"}, {"text": "value2"}]
-    """
+        """,
+    )

    loader = JSONLoader(file_path=file_path, jq_schema=".[]", text_content=False)
    result = loader.load()
@ -54,7 +63,6 @@ def test_load_valid_dict_content(mocker: MockerFixture) -> None:
    assert result == expected_docs


-@pytest.mark.requires("jq")
 def test_load_valid_bool_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
@ -67,13 +75,16 @@ def test_load_valid_bool_content(mocker: MockerFixture) -> None:
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]
+
    mocker.patch("builtins.open", mocker.mock_open())
-    mock_csv_reader = mocker.patch("pathlib.Path.read_text")
-    mock_csv_reader.return_value = """
+    mocker.patch(
+        "pathlib.Path.read_text",
+        return_value="""
            [
                {"flag": false}, {"flag": true}
            ]
-    """
+        """,
+    )

    loader = JSONLoader(file_path=file_path, jq_schema=".[].flag", text_content=False)
    result = loader.load()
@ -81,7 +92,6 @@ def test_load_valid_bool_content(mocker: MockerFixture) -> None:
    assert result == expected_docs


-@pytest.mark.requires("jq")
 def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
@ -94,13 +104,16 @@ def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]
+
    mocker.patch("builtins.open", mocker.mock_open())
-    mock_csv_reader = mocker.patch("pathlib.Path.read_text")
-    mock_csv_reader.return_value = """
+    mocker.patch(
+        "pathlib.Path.read_text",
+        return_value="""
            [
                {"num": 99}, {"num": 99.5}
            ]
-    """
+        """,
+    )

    loader = JSONLoader(file_path=file_path, jq_schema=".[].num", text_content=False)
    result = loader.load()
@ -108,16 +121,152 @@ def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
    assert result == expected_docs


-@pytest.mark.requires("jq")
 def test_load_invalid_test_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
+
    mocker.patch("builtins.open", mocker.mock_open())
-    mock_csv_reader = mocker.patch("pathlib.Path.read_text")
-    mock_csv_reader.return_value = """
+    mocker.patch(
+        "pathlib.Path.read_text",
+        return_value="""
            [{"text": "value1"}, {"text": "value2"}]
-    """
+        """,
+    )

    loader = JSONLoader(file_path=file_path, jq_schema=".[]", text_content=True)

    with raises(ValueError):
        loader.load()
+
+
+def test_load_jsonlines(mocker: MockerFixture) -> None:
+    file_path = "/workspaces/langchain/test.json"
+    expected_docs = [
+        Document(
+            page_content="value1",
+            metadata={"source": file_path, "seq_num": 1},
+        ),
+        Document(
+            page_content="value2",
+            metadata={"source": file_path, "seq_num": 2},
+        ),
+    ]
+
+    mocker.patch(
+        "pathlib.Path.open",
+        return_value=io.StringIO(
+            """
+            {"text": "value1"}
+            {"text": "value2"}
+            """
+        ),
+    )
+
+    loader = JSONLoader(
+        file_path=file_path, jq_schema=".", content_key="text", json_lines=True
+    )
+    result = loader.load()
+
+    assert result == expected_docs
+
+
+@pytest.mark.parametrize(
+    "params",
+    (
+        {"jq_schema": ".[].text"},
+        {"jq_schema": ".[]", "content_key": "text"},
+    ),
+)
+def test_load_jsonlines_list(params: Dict, mocker: MockerFixture) -> None:
+    file_path = "/workspaces/langchain/test.json"
+    expected_docs = [
+        Document(
+            page_content="value1",
+            metadata={"source": file_path, "seq_num": 1},
+        ),
+        Document(
+            page_content="value2",
+            metadata={"source": file_path, "seq_num": 2},
+        ),
+        Document(
+            page_content="value3",
+            metadata={"source": file_path, "seq_num": 3},
+        ),
+        Document(
+            page_content="value4",
+            metadata={"source": file_path, "seq_num": 4},
+        ),
+    ]
+
+    mocker.patch(
+        "pathlib.Path.open",
+        return_value=io.StringIO(
+            """
+            [{"text": "value1"}, {"text": "value2"}]
+            [{"text": "value3"}, {"text": "value4"}]
+            """
+        ),
+    )
+
+    loader = JSONLoader(file_path=file_path, json_lines=True, **params)
+    result = loader.load()
+
+    assert result == expected_docs
+
+
+def test_load_empty_jsonlines(mocker: MockerFixture) -> None:
+    mocker.patch("pathlib.Path.open", return_value=io.StringIO(""))
+
+    loader = JSONLoader(file_path="file_path", jq_schema=".[].text", json_lines=True)
+    result = loader.load()
+
+    assert result == []
+
+
+@pytest.mark.parametrize(
+    "patch_func,patch_func_value,kwargs",
+    (
+        # JSON content.
+        (
+            "pathlib.Path.read_text",
+            '[{"text": "value1"}, {"text": "value2"}]',
+            {"jq_schema": ".[]", "content_key": "text"},
+        ),
+        # JSON Lines content.
+        (
+            "pathlib.Path.open",
+            io.StringIO(
+                """
+                {"text": "value1"}
+                {"text": "value2"}
+                """
+            ),
+            {"jq_schema": ".", "content_key": "text", "json_lines": True},
+        ),
+    ),
+)
+def test_json_meta(
+    patch_func: str, patch_func_value: Any, kwargs: Dict, mocker: MockerFixture
+) -> None:
+    mocker.patch("builtins.open", mocker.mock_open())
+    mocker.patch(patch_func, return_value=patch_func_value)
+
+    file_path = "/workspaces/langchain/test.json"
+    expected_docs = [
+        Document(
+            page_content="value1",
+            metadata={"source": file_path, "seq_num": 1, "x": "value1-meta"},
+        ),
+        Document(
+            page_content="value2",
+            metadata={"source": file_path, "seq_num": 2, "x": "value2-meta"},
+        ),
+    ]
+
+    def metadata_func(record: Dict, metadata: Dict) -> Dict:
+        metadata["x"] = f"{record['text']}-meta"
+        return metadata
+
+    loader = JSONLoader(file_path=file_path, metadata_func=metadata_func, **kwargs)
+    result = loader.load()
+
+    assert result == expected_docs