Bugfix/jsonloader metadata (#9793)

Hi,

  - Description: 
    - Solves the issue #6478. 
    - Includes some additional rework on the `JSONLoader` class:
      - Getting metadata is decoupled from `_get_text`
- Validating metadata_func is perform now by `_validate_metadata_func`,
instead of `_validate_content_key`
  - Issue: #6478 
  - Dependencies: NA
  - Tag maintainer: @hwchase17
This commit is contained in:
mgvalverde 2023-09-04 01:01:43 +02:00 committed by GitHub
parent 7d1b0fbe79
commit 33f43cc1b0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 74 additions and 11 deletions

View File

@ -76,24 +76,20 @@ class JSONLoader(BaseLoader):
# and prevent the user from getting a cryptic error later on.
if self._content_key is not None:
self._validate_content_key(data)
if self._metadata_func is not None:
self._validate_metadata_func(data)
for i, sample in enumerate(data, len(docs) + 1):
metadata = dict(
source=str(self.file_path),
seq_num=i,
text = self._get_text(sample=sample)
metadata = self._get_metadata(
sample=sample, source=str(self.file_path), seq_num=i
)
text = self._get_text(sample=sample, metadata=metadata)
docs.append(Document(page_content=text, metadata=metadata))
def _get_text(self, sample: Any, metadata: dict) -> str:
def _get_text(self, sample: Any) -> str:
"""Convert sample to string format"""
if self._content_key is not None:
content = sample.get(self._content_key)
if self._metadata_func is not None:
# We pass in the metadata dict to the metadata_func
# so that the user can customize the default metadata
# based on the content of the JSON object.
metadata = self._metadata_func(sample, metadata)
else:
content = sample
@ -112,6 +108,20 @@ class JSONLoader(BaseLoader):
else:
return str(content) if content is not None else ""
def _get_metadata(
self, sample: Dict[str, Any], **additional_fields: Any
) -> Dict[str, Any]:
"""
Return a metadata dictionary base on the existence of metadata_func
:param sample: single data payload
:param additional_fields: key-word arguments to be added as metadata values
:return:
"""
if self._metadata_func is not None:
return self._metadata_func(sample, additional_fields)
else:
return additional_fields
def _validate_content_key(self, data: Any) -> None:
"""Check if a content key is valid"""
sample = data.first()
@ -127,6 +137,10 @@ class JSONLoader(BaseLoader):
with the key `{self._content_key}`"
)
def _validate_metadata_func(self, data: Any) -> None:
"""Check if the metadata_func output is valid"""
sample = data.first()
if self._metadata_func is not None:
sample_metadata = self._metadata_func(sample, {})
if not isinstance(sample_metadata, dict):

View File

@ -244,7 +244,7 @@ def test_load_empty_jsonlines(mocker: MockerFixture) -> None:
),
),
)
def test_json_meta(
def test_json_meta_01(
patch_func: str, patch_func_value: Any, kwargs: Dict, mocker: MockerFixture
) -> None:
mocker.patch("builtins.open", mocker.mock_open())
@ -270,3 +270,52 @@ def test_json_meta(
result = loader.load()
assert result == expected_docs
@pytest.mark.parametrize(
"patch_func,patch_func_value,kwargs",
(
# JSON content.
(
"pathlib.Path.read_text",
'[{"text": "value1"}, {"text": "value2"}]',
{"jq_schema": ".[]", "content_key": "text"},
),
# JSON Lines content.
(
"pathlib.Path.open",
io.StringIO(
"""
{"text": "value1"}
{"text": "value2"}
"""
),
{"jq_schema": ".", "content_key": "text", "json_lines": True},
),
),
)
def test_json_meta_02(
patch_func: str, patch_func_value: Any, kwargs: Dict, mocker: MockerFixture
) -> None:
mocker.patch("builtins.open", mocker.mock_open())
mocker.patch(patch_func, return_value=patch_func_value)
file_path = "/workspaces/langchain/test.json"
expected_docs = [
Document(
page_content="value1",
metadata={"source": file_path, "seq_num": 1, "x": "value1-meta"},
),
Document(
page_content="value2",
metadata={"source": file_path, "seq_num": 2, "x": "value2-meta"},
),
]
def metadata_func(record: Dict, metadata: Dict) -> Dict:
return {**metadata, "x": f"{record['text']}-meta"}
loader = JSONLoader(file_path=file_path, metadata_func=metadata_func, **kwargs)
result = loader.load()
assert result == expected_docs