Add JSON Lines support to JSONLoader (#6913)

**Description**:

The JSON Lines format is used by some services such as OpenAI and
HuggingFace. It's also a convenient alternative to CSV.

This PR adds JSON Lines support to `JSONLoader` and also updates related
tests.
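
A minimal usage sketch of the new flag (the file path here is hypothetical; the arguments mirror the docs changes below):

```python
from langchain.document_loaders.json_loader import JSONLoader

# Each line of the .jsonl file is parsed as its own JSON value,
# and `jq_schema` is applied to every line independently.
loader = JSONLoader(
    file_path="./chat_messages.jsonl",  # hypothetical example file
    jq_schema=".content",
    json_lines=True,
)
docs = loader.load()  # one Document per line, with source/seq_num metadata
```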

**Tag maintainer**: @rlancemartin, @eyurtsev.

P.S. I was not able to build the docs locally, so I didn't update the related section.
Sergey Kozlov committed 1 year ago via GitHub · commit 6d15854cda (parent 153b56d19b)

@ -2,6 +2,8 @@
>[JSON (JavaScript Object Notation)](https://en.wikipedia.org/wiki/JSON) is an open standard file format and data interchange format that uses human-readable text to store and transmit data objects consisting of attribute–value pairs and arrays (or other serializable values).

>[JSON Lines](https://jsonlines.org/) is a file format where each line is a valid JSON value.

import Example from "@snippets/modules/data_connection/document_loaders/how_to/json.mdx"

<Example/>

@ -0,0 +1,3 @@
{"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"}
{"sender_name": "User 1", "timestamp_ms": 1675597435669, "content": "Oh no worries! Bye"}
{"sender_name": "User 2", "timestamp_ms": 1675596277579, "content": "No Im sorry it was my mistake, the blue one is not for sale"}

@ -78,11 +78,14 @@ pprint(data)
</CodeOutputBlock>

## Using `JSONLoader`

Suppose we are interested in extracting the values under the `content` field within the `messages` key of the JSON data. This can easily be done through the `JSONLoader` as shown below.

### JSON file

```python
loader = JSONLoader(
    file_path='./example_data/facebook_chat.json',
@ -114,6 +117,81 @@ pprint(data)
</CodeOutputBlock>

### JSON Lines file
If you want to load documents from a JSON Lines file, you pass `json_lines=True`
and specify `jq_schema` to extract `page_content` from a single JSON object.
```python
file_path = './example_data/facebook_chat_messages.jsonl'
pprint(Path(file_path).read_text())
```
<CodeOutputBlock lang="python">
```
('{"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"}\n'
'{"sender_name": "User 1", "timestamp_ms": 1675597435669, "content": "Oh no '
'worries! Bye"}\n'
'{"sender_name": "User 2", "timestamp_ms": 1675596277579, "content": "No Im '
'sorry it was my mistake, the blue one is not for sale"}\n')
```
</CodeOutputBlock>
```python
loader = JSONLoader(
    file_path='./example_data/facebook_chat_messages.jsonl',
    jq_schema='.content',
    json_lines=True)
data = loader.load()
```
```python
pprint(data)
```
<CodeOutputBlock lang="python">
```
[Document(page_content='Bye!', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}),
Document(page_content='Oh no worries! Bye', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 2}),
Document(page_content='No Im sorry it was my mistake, the blue one is not for sale', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 3})]
```
</CodeOutputBlock>
Another option is to set `jq_schema='.'` and provide `content_key`:
```python
loader = JSONLoader(
    file_path='./example_data/facebook_chat_messages.jsonl',
    jq_schema='.',
    content_key='sender_name',
    json_lines=True)
data = loader.load()
```
```python
pprint(data)
```
<CodeOutputBlock lang="python">
```
[Document(page_content='User 2', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 1}),
Document(page_content='User 1', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 2}),
Document(page_content='User 2', metadata={'source': 'langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl', 'seq_num': 3})]
```
</CodeOutputBlock>
## Extracting metadata

Generally, we want to include metadata available in the JSON file into the documents that we create from the content.
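
The metadata section of the docs is cut off by this hunk; as a rough sketch (field names taken from the example chat file above, loader arguments from the signature shown in the code below), a `metadata_func` for the JSON Lines file could look like:

```python
def metadata_func(record: dict, metadata: dict) -> dict:
    # Copy selected fields from each JSON Lines record into the document metadata.
    metadata["sender_name"] = record.get("sender_name")
    metadata["timestamp_ms"] = record.get("timestamp_ms")
    return metadata

loader = JSONLoader(
    file_path='./example_data/facebook_chat_messages.jsonl',
    jq_schema='.',
    content_key='content',
    metadata_func=metadata_func,
    json_lines=True)

data = loader.load()
```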

@ -23,11 +23,12 @@ class JSONLoader(BaseLoader):
        content_key: Optional[str] = None,
        metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
        text_content: bool = True,
        json_lines: bool = False,
    ):
        """Initialize the JSONLoader.

        Args:
            file_path (Union[str, Path]): The path to the JSON or JSON Lines file.
            jq_schema (str): The jq schema to use to extract the data or text from
                the JSON.
            content_key (str): The key to use to extract the content from the JSON if
@ -35,8 +36,10 @@ class JSONLoader(BaseLoader):
            metadata_func (Callable[Dict, Dict]): A function that takes in the JSON
                object extracted by the jq_schema and the default metadata and returns
                a dict of the updated metadata.
            text_content (bool): Boolean flag to indicate whether the content is in
                string format, default to True.
            json_lines (bool): Boolean flag to indicate whether the input is in
                JSON Lines format.
        """
        try:
            import jq  # noqa:F401
@ -50,10 +53,24 @@ class JSONLoader(BaseLoader):
        self._content_key = content_key
        self._metadata_func = metadata_func
        self._text_content = text_content
        self._json_lines = json_lines

    def load(self) -> List[Document]:
        """Load and return documents from the JSON file."""
        docs: List[Document] = []
        if self._json_lines:
            with self.file_path.open(encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if line:
                        self._parse(line, docs)
        else:
            self._parse(self.file_path.read_text(), docs)
        return docs

    def _parse(self, content: str, docs: List[Document]) -> None:
        """Convert given content to documents."""
        data = self._jq_schema.input(json.loads(content))

        # Perform some validation
        # This is not a perfect validation, but it should catch most cases
@ -61,8 +78,7 @@ class JSONLoader(BaseLoader):
        if self._content_key is not None:
            self._validate_content_key(data)

        for i, sample in enumerate(data, len(docs) + 1):
            metadata = dict(
                source=str(self.file_path),
                seq_num=i,
@ -70,8 +86,6 @@ class JSONLoader(BaseLoader):
            text = self._get_text(sample=sample, metadata=metadata)
            docs.append(Document(page_content=text, metadata=metadata))

    def _get_text(self, sample: Any, metadata: dict) -> str:
        """Convert sample to string format"""
        if self._content_key is not None:

@ -1,3 +1,6 @@
import io
from typing import Any, Dict

import pytest
from pytest import raises
from pytest_mock import MockerFixture
@ -5,8 +8,9 @@ from pytest_mock import MockerFixture
from langchain.docstore.document import Document
from langchain.document_loaders.json_loader import JSONLoader

pytestmark = pytest.mark.requires("jq")


def test_load_valid_string_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
@ -19,9 +23,12 @@ def test_load_valid_string_content(mocker: MockerFixture) -> None:
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]

    mocker.patch("builtins.open", mocker.mock_open())
    mocker.patch(
        "pathlib.Path.read_text",
        return_value='[{"text": "value1"}, {"text": "value2"}]',
    )

    loader = JSONLoader(file_path=file_path, jq_schema=".[].text", text_content=True)
    result = loader.load()
@ -29,7 +36,6 @@ def test_load_valid_string_content(mocker: MockerFixture) -> None:
    assert result == expected_docs


def test_load_valid_dict_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
@ -42,11 +48,14 @@ def test_load_valid_dict_content(mocker: MockerFixture) -> None:
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]

    mocker.patch("builtins.open", mocker.mock_open())
    mocker.patch(
        "pathlib.Path.read_text",
        return_value="""
        [{"text": "value1"}, {"text": "value2"}]
        """,
    )

    loader = JSONLoader(file_path=file_path, jq_schema=".[]", text_content=False)
    result = loader.load()
@ -54,7 +63,6 @@ def test_load_valid_dict_content(mocker: MockerFixture) -> None:
    assert result == expected_docs


def test_load_valid_bool_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
@ -67,13 +75,16 @@ def test_load_valid_bool_content(mocker: MockerFixture) -> None:
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]

    mocker.patch("builtins.open", mocker.mock_open())
    mocker.patch(
        "pathlib.Path.read_text",
        return_value="""
        [
            {"flag": false}, {"flag": true}
        ]
        """,
    )

    loader = JSONLoader(file_path=file_path, jq_schema=".[].flag", text_content=False)
    result = loader.load()
@ -81,7 +92,6 @@ def test_load_valid_bool_content(mocker: MockerFixture) -> None:
    assert result == expected_docs


def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
@ -94,13 +104,16 @@ def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]

    mocker.patch("builtins.open", mocker.mock_open())
    mocker.patch(
        "pathlib.Path.read_text",
        return_value="""
        [
            {"num": 99}, {"num": 99.5}
        ]
        """,
    )

    loader = JSONLoader(file_path=file_path, jq_schema=".[].num", text_content=False)
    result = loader.load()
@ -108,16 +121,152 @@ def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
    assert result == expected_docs


def test_load_invalid_test_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"

    mocker.patch("builtins.open", mocker.mock_open())
    mocker.patch(
        "pathlib.Path.read_text",
        return_value="""
        [{"text": "value1"}, {"text": "value2"}]
        """,
    )

    loader = JSONLoader(file_path=file_path, jq_schema=".[]", text_content=True)

    with raises(ValueError):
        loader.load()


def test_load_jsonlines(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
        Document(
            page_content="value1",
            metadata={"source": file_path, "seq_num": 1},
        ),
        Document(
            page_content="value2",
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]

    mocker.patch(
        "pathlib.Path.open",
        return_value=io.StringIO(
            """
            {"text": "value1"}
            {"text": "value2"}
            """
        ),
    )

    loader = JSONLoader(
        file_path=file_path, jq_schema=".", content_key="text", json_lines=True
    )

    result = loader.load()

    assert result == expected_docs


@pytest.mark.parametrize(
    "params",
    (
        {"jq_schema": ".[].text"},
        {"jq_schema": ".[]", "content_key": "text"},
    ),
)
def test_load_jsonlines_list(params: Dict, mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
        Document(
            page_content="value1",
            metadata={"source": file_path, "seq_num": 1},
        ),
        Document(
            page_content="value2",
            metadata={"source": file_path, "seq_num": 2},
        ),
        Document(
            page_content="value3",
            metadata={"source": file_path, "seq_num": 3},
        ),
        Document(
            page_content="value4",
            metadata={"source": file_path, "seq_num": 4},
        ),
    ]

    mocker.patch(
        "pathlib.Path.open",
        return_value=io.StringIO(
            """
            [{"text": "value1"}, {"text": "value2"}]
            [{"text": "value3"}, {"text": "value4"}]
            """
        ),
    )

    loader = JSONLoader(file_path=file_path, json_lines=True, **params)

    result = loader.load()

    assert result == expected_docs


def test_load_empty_jsonlines(mocker: MockerFixture) -> None:
    mocker.patch("pathlib.Path.open", return_value=io.StringIO(""))

    loader = JSONLoader(file_path="file_path", jq_schema=".[].text", json_lines=True)

    result = loader.load()

    assert result == []


@pytest.mark.parametrize(
    "patch_func,patch_func_value,kwargs",
    (
        # JSON content.
        (
            "pathlib.Path.read_text",
            '[{"text": "value1"}, {"text": "value2"}]',
            {"jq_schema": ".[]", "content_key": "text"},
        ),
        # JSON Lines content.
        (
            "pathlib.Path.open",
            io.StringIO(
                """
                {"text": "value1"}
                {"text": "value2"}
                """
            ),
            {"jq_schema": ".", "content_key": "text", "json_lines": True},
        ),
    ),
)
def test_json_meta(
    patch_func: str, patch_func_value: Any, kwargs: Dict, mocker: MockerFixture
) -> None:
    mocker.patch("builtins.open", mocker.mock_open())
    mocker.patch(patch_func, return_value=patch_func_value)

    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
        Document(
            page_content="value1",
            metadata={"source": file_path, "seq_num": 1, "x": "value1-meta"},
        ),
        Document(
            page_content="value2",
            metadata={"source": file_path, "seq_num": 2, "x": "value2-meta"},
        ),
    ]

    def metadata_func(record: Dict, metadata: Dict) -> Dict:
        metadata["x"] = f"{record['text']}-meta"
        return metadata

    loader = JSONLoader(file_path=file_path, metadata_func=metadata_func, **kwargs)

    result = loader.load()

    assert result == expected_docs
