Add JSON Lines support to JSONLoader (#6913)

**Description**:

The JSON Lines format is used by some services such as OpenAI and
HuggingFace. It's also a convenient alternative to CSV.

This PR adds JSON Lines support to `JSONLoader` and also updates related
tests.

**Tag maintainer**: @rlancemartin, @eyurtsev.

PS I was not able to build docs locally so didn't update related
section.
harrison/split-schema-dir^2
Sergey Kozlov 1 year ago committed by GitHub
parent 153b56d19b
commit 6d15854cda
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -2,6 +2,8 @@
>[JSON (JavaScript Object Notation)](https://en.wikipedia.org/wiki/JSON) is an open standard file format and data interchange format that uses human-readable text to store and transmit data objects consisting of attribute–value pairs and arrays (or other serializable values).
>[JSON Lines](https://jsonlines.org/) is a file format where each line is a valid JSON value.
import Example from "@snippets/modules/data_connection/document_loaders/how_to/json.mdx"
<Example/>

@ -0,0 +1,3 @@
{"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"}
{"sender_name": "User 1", "timestamp_ms": 1675597435669, "content": "Oh no worries! Bye"}
{"sender_name": "User 2", "timestamp_ms": 1675596277579, "content": "No Im sorry it was my mistake, the blue one is not for sale"}

@ -78,11 +78,14 @@ pprint(data)
</CodeOutputBlock>
## Using `JSONLoader`
Suppose we are interested in extracting the values under the `content` field within the `messages` key of the JSON data. This can easily be done through the `JSONLoader` as shown below.
### JSON file
```python
loader = JSONLoader(
file_path='./example_data/facebook_chat.json',
@ -114,6 +117,81 @@ pprint(data)
</CodeOutputBlock>
### JSON Lines file
To load documents from a JSON Lines file, pass `json_lines=True` and specify a
`jq_schema` that extracts `page_content` from a single JSON object (that is, from one line of the file).
```python
file_path = './example_data/facebook_chat_messages.jsonl'
pprint(Path(file_path).read_text())
```
<CodeOutputBlock lang="python">
```
('{"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"}\n'
'{"sender_name": "User 1", "timestamp_ms": 1675597435669, "content": "Oh no '
'worries! Bye"}\n'
'{"sender_name": "User 2", "timestamp_ms": 1675596277579, "content": "No Im '
'sorry it was my mistake, the blue one is not for sale"}\n')
```
</CodeOutputBlock>
```python
loader = JSONLoader(
file_path='./example_data/facebook_chat_messages.jsonl',
jq_schema='.content',
json_lines=True)
data = loader.load()
```
```python
pprint(data)
```
<CodeOutputBlock lang="python">
```
[Document(page_content='Bye!', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}),
Document(page_content='Oh no worries! Bye', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 2}),
Document(page_content='No Im sorry it was my mistake, the blue one is not for sale', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 3})]
```
</CodeOutputBlock>
Another option is to set `jq_schema='.'` and provide `content_key`:
```python
loader = JSONLoader(
file_path='./example_data/facebook_chat_messages.jsonl',
jq_schema='.',
content_key='sender_name',
json_lines=True)
data = loader.load()
```
```python
pprint(data)
```
<CodeOutputBlock lang="python">
```
[Document(page_content='User 2', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}),
Document(page_content='User 1', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 2}),
Document(page_content='User 2', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 3})]
```
</CodeOutputBlock>
## Extracting metadata
Generally, we want to include metadata available in the JSON file in the documents that we create from the content.

@ -23,11 +23,12 @@ class JSONLoader(BaseLoader):
content_key: Optional[str] = None,
metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
text_content: bool = True,
json_lines: bool = False,
):
"""Initialize the JSONLoader.
Args:
file_path (Union[str, Path]): The path to the JSON file.
file_path (Union[str, Path]): The path to the JSON or JSON Lines file.
jq_schema (str): The jq schema to use to extract the data or text from
the JSON.
content_key (str): The key to use to extract the content from the JSON if
@ -35,8 +36,10 @@ class JSONLoader(BaseLoader):
metadata_func (Callable[Dict, Dict]): A function that takes in the JSON
object extracted by the jq_schema and the default metadata and returns
a dict of the updated metadata.
text_content (bool): Boolean flag to indicates whether the content is in
string format, default to True
text_content (bool): Boolean flag to indicate whether the content is in
string format, default to True.
json_lines (bool): Boolean flag to indicate whether the input is in
JSON Lines format.
"""
try:
import jq # noqa:F401
@ -50,10 +53,24 @@ class JSONLoader(BaseLoader):
self._content_key = content_key
self._metadata_func = metadata_func
self._text_content = text_content
self._json_lines = json_lines
def load(self) -> List[Document]:
    """Load and return documents from the JSON or JSON Lines file.

    When ``self._json_lines`` is true, the file is read line by line and
    every non-blank line is handed to ``_parse`` as its own JSON value.
    Otherwise the entire file content is handed to ``_parse`` in one call.

    Returns:
        List[Document]: The list that ``_parse`` was asked to fill, in file
        order. It is empty when the file contains no non-blank lines in
        JSON Lines mode.
    """
    docs: List[Document] = []
    if self._json_lines:
        # Do not parse the whole file as a single JSON document first:
        # a JSON Lines file is a sequence of values, not one value.
        with self.file_path.open(encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # Blank lines (including a trailing newline) are skipped.
                if line:
                    self._parse(line, docs)
    else:
        # Decode explicitly as UTF-8 so both branches read the file the
        # same way, independent of the platform's default encoding.
        self._parse(self.file_path.read_text(encoding="utf-8"), docs)
    return docs
def _parse(self, content: str, docs: List[Document]) -> None:
"""Convert given content to documents."""
data = self._jq_schema.input(json.loads(content))
# Perform some validation
# This is not a perfect validation, but it should catch most cases
@ -61,8 +78,7 @@ class JSONLoader(BaseLoader):
if self._content_key is not None:
self._validate_content_key(data)
docs = []
for i, sample in enumerate(data, 1):
for i, sample in enumerate(data, len(docs) + 1):
metadata = dict(
source=str(self.file_path),
seq_num=i,
@ -70,8 +86,6 @@ class JSONLoader(BaseLoader):
text = self._get_text(sample=sample, metadata=metadata)
docs.append(Document(page_content=text, metadata=metadata))
return docs
def _get_text(self, sample: Any, metadata: dict) -> str:
"""Convert sample to string format"""
if self._content_key is not None:

@ -1,3 +1,6 @@
import io
from typing import Any, Dict
import pytest
from pytest import raises
from pytest_mock import MockerFixture
@ -5,8 +8,9 @@ from pytest_mock import MockerFixture
from langchain.docstore.document import Document
from langchain.document_loaders.json_loader import JSONLoader
pytestmark = pytest.mark.requires("jq")
@pytest.mark.requires("jq")
def test_load_valid_string_content(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
expected_docs = [
@ -19,9 +23,12 @@ def test_load_valid_string_content(mocker: MockerFixture) -> None:
metadata={"source": file_path, "seq_num": 2},
),
]
mocker.patch("builtins.open", mocker.mock_open())
mock_csv_reader = mocker.patch("pathlib.Path.read_text")
mock_csv_reader.return_value = '[{"text": "value1"}, {"text": "value2"}]'
mocker.patch(
"pathlib.Path.read_text",
return_value='[{"text": "value1"}, {"text": "value2"}]',
)
loader = JSONLoader(file_path=file_path, jq_schema=".[].text", text_content=True)
result = loader.load()
@ -29,7 +36,6 @@ def test_load_valid_string_content(mocker: MockerFixture) -> None:
assert result == expected_docs
@pytest.mark.requires("jq")
def test_load_valid_dict_content(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
expected_docs = [
@ -42,11 +48,14 @@ def test_load_valid_dict_content(mocker: MockerFixture) -> None:
metadata={"source": file_path, "seq_num": 2},
),
]
mocker.patch("builtins.open", mocker.mock_open())
mock_csv_reader = mocker.patch("pathlib.Path.read_text")
mock_csv_reader.return_value = """
mocker.patch(
"pathlib.Path.read_text",
return_value="""
[{"text": "value1"}, {"text": "value2"}]
"""
""",
)
loader = JSONLoader(file_path=file_path, jq_schema=".[]", text_content=False)
result = loader.load()
@ -54,7 +63,6 @@ def test_load_valid_dict_content(mocker: MockerFixture) -> None:
assert result == expected_docs
@pytest.mark.requires("jq")
def test_load_valid_bool_content(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
expected_docs = [
@ -67,13 +75,16 @@ def test_load_valid_bool_content(mocker: MockerFixture) -> None:
metadata={"source": file_path, "seq_num": 2},
),
]
mocker.patch("builtins.open", mocker.mock_open())
mock_csv_reader = mocker.patch("pathlib.Path.read_text")
mock_csv_reader.return_value = """
mocker.patch(
"pathlib.Path.read_text",
return_value="""
[
{"flag": false}, {"flag": true}
]
"""
""",
)
loader = JSONLoader(file_path=file_path, jq_schema=".[].flag", text_content=False)
result = loader.load()
@ -81,7 +92,6 @@ def test_load_valid_bool_content(mocker: MockerFixture) -> None:
assert result == expected_docs
@pytest.mark.requires("jq")
def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
file_path = "/workspaces/langchain/test.json"
expected_docs = [
@ -94,13 +104,16 @@ def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
metadata={"source": file_path, "seq_num": 2},
),
]
mocker.patch("builtins.open", mocker.mock_open())
mock_csv_reader = mocker.patch("pathlib.Path.read_text")
mock_csv_reader.return_value = """
mocker.patch(
"pathlib.Path.read_text",
return_value="""
[
{"num": 99}, {"num": 99.5}
]
"""
""",
)
loader = JSONLoader(file_path=file_path, jq_schema=".[].num", text_content=False)
result = loader.load()
@ -108,16 +121,152 @@ def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
assert result == expected_docs
@pytest.mark.requires("jq")
def test_load_invalid_test_content(mocker: MockerFixture) -> None:
    """Loading must raise ``ValueError`` for this schema/flag combination.

    The loader is built with ``jq_schema=".[]"`` and ``text_content=True``
    over a JSON array of objects.
    NOTE(review): presumably the error comes from the extracted items being
    dicts rather than strings -- confirm against ``JSONLoader._get_text``.
    """
    file_path = "/workspaces/langchain/test.json"
    mocker.patch("builtins.open", mocker.mock_open())
    # Patch read_text exactly once; the payload is a single balanced
    # triple-quoted literal.
    mocker.patch(
        "pathlib.Path.read_text",
        return_value="""
[{"text": "value1"}, {"text": "value2"}]
""",
    )

    loader = JSONLoader(file_path=file_path, jq_schema=".[]", text_content=True)

    with raises(ValueError):
        loader.load()
def test_load_jsonlines(mocker: MockerFixture) -> None:
    """A JSON Lines input with one object per line yields one document per line.

    The fake file starts and ends with a blank line; only the two object
    lines are expected to produce documents, numbered 1 and 2.
    """
    source = "/workspaces/langchain/test.json"
    jsonl_stream = io.StringIO(
        """
{"text": "value1"}
{"text": "value2"}
"""
    )
    # The loader is expected to read JSON Lines input through Path.open.
    mocker.patch("pathlib.Path.open", return_value=jsonl_stream)

    loader = JSONLoader(
        file_path=source, jq_schema=".", content_key="text", json_lines=True
    )
    documents = loader.load()

    assert documents == [
        Document(page_content=f"value{n}", metadata={"source": source, "seq_num": n})
        for n in (1, 2)
    ]
@pytest.mark.parametrize(
    "params",
    (
        {"jq_schema": ".[].text"},
        {"jq_schema": ".[]", "content_key": "text"},
    ),
)
def test_load_jsonlines_list(params: Dict, mocker: MockerFixture) -> None:
    """JSON Lines input whose lines are arrays is flattened into documents.

    Two lines of two objects each must produce four documents, and
    ``seq_num`` must keep counting across lines (1 through 4). Both ways of
    selecting the text (directly via ``jq_schema`` or via ``content_key``)
    are exercised.
    """
    source = "/workspaces/langchain/test.json"
    mocker.patch(
        "pathlib.Path.open",
        return_value=io.StringIO(
            """
[{"text": "value1"}, {"text": "value2"}]
[{"text": "value3"}, {"text": "value4"}]
"""
        ),
    )

    documents = JSONLoader(file_path=source, json_lines=True, **params).load()

    expected = [
        Document(page_content=f"value{n}", metadata={"source": source, "seq_num": n})
        for n in range(1, 5)
    ]
    assert documents == expected
def test_load_empty_jsonlines(mocker: MockerFixture) -> None:
    """An empty JSON Lines input produces an empty document list."""
    empty_stream = io.StringIO("")
    mocker.patch("pathlib.Path.open", return_value=empty_stream)

    loader = JSONLoader(file_path="file_path", jq_schema=".[].text", json_lines=True)

    assert loader.load() == []
@pytest.mark.parametrize(
    "patch_func,patch_func_value,kwargs",
    (
        # Plain JSON input, served through Path.read_text.
        (
            "pathlib.Path.read_text",
            '[{"text": "value1"}, {"text": "value2"}]',
            {"jq_schema": ".[]", "content_key": "text"},
        ),
        # JSON Lines input, served through Path.open.
        (
            "pathlib.Path.open",
            io.StringIO(
                """
{"text": "value1"}
{"text": "value2"}
"""
            ),
            {"jq_schema": ".", "content_key": "text", "json_lines": True},
        ),
    ),
)
def test_json_meta(
    patch_func: str, patch_func_value: Any, kwargs: Dict, mocker: MockerFixture
) -> None:
    """``metadata_func`` output ends up in each document's metadata.

    Runs once for JSON and once for JSON Lines input; in both cases every
    document must carry the default ``source``/``seq_num`` entries plus the
    extra ``x`` entry derived from the record's ``text`` value.
    """

    def tag_with_text(record: Dict, metadata: Dict) -> Dict:
        # Extend the default metadata in place and hand it back.
        metadata["x"] = f"{record['text']}-meta"
        return metadata

    mocker.patch("builtins.open", mocker.mock_open())
    mocker.patch(patch_func, return_value=patch_func_value)

    source = "/workspaces/langchain/test.json"
    loader = JSONLoader(file_path=source, metadata_func=tag_with_text, **kwargs)
    documents = loader.load()

    assert documents == [
        Document(
            page_content=f"value{n}",
            metadata={"source": source, "seq_num": n, "x": f"value{n}-meta"},
        )
        for n in (1, 2)
    ]

Loading…
Cancel
Save