mirror of
https://github.com/hwchase17/langchain
synced 2024-11-04 06:00:26 +00:00
Bugfix/jsonloader metadata (#9793)
Hi, - Description: - Solves issue #6478. - Includes some additional rework on the `JSONLoader` class: - Getting metadata is decoupled from `_get_text` - Validating `metadata_func` is now performed by `_validate_metadata_func` instead of `_validate_content_key` - Issue: #6478 - Dependencies: NA - Tag maintainer: @hwchase17
This commit is contained in:
parent
7d1b0fbe79
commit
33f43cc1b0
@ -76,24 +76,20 @@ class JSONLoader(BaseLoader):
|
||||
# and prevent the user from getting a cryptic error later on.
|
||||
if self._content_key is not None:
|
||||
self._validate_content_key(data)
|
||||
if self._metadata_func is not None:
|
||||
self._validate_metadata_func(data)
|
||||
|
||||
for i, sample in enumerate(data, len(docs) + 1):
|
||||
metadata = dict(
|
||||
source=str(self.file_path),
|
||||
seq_num=i,
|
||||
text = self._get_text(sample=sample)
|
||||
metadata = self._get_metadata(
|
||||
sample=sample, source=str(self.file_path), seq_num=i
|
||||
)
|
||||
text = self._get_text(sample=sample, metadata=metadata)
|
||||
docs.append(Document(page_content=text, metadata=metadata))
|
||||
|
||||
def _get_text(self, sample: Any, metadata: dict) -> str:
|
||||
def _get_text(self, sample: Any) -> str:
|
||||
"""Convert sample to string format"""
|
||||
if self._content_key is not None:
|
||||
content = sample.get(self._content_key)
|
||||
if self._metadata_func is not None:
|
||||
# We pass in the metadata dict to the metadata_func
|
||||
# so that the user can customize the default metadata
|
||||
# based on the content of the JSON object.
|
||||
metadata = self._metadata_func(sample, metadata)
|
||||
else:
|
||||
content = sample
|
||||
|
||||
@ -112,6 +108,20 @@ class JSONLoader(BaseLoader):
|
||||
else:
|
||||
return str(content) if content is not None else ""
|
||||
|
||||
def _get_metadata(
|
||||
self, sample: Dict[str, Any], **additional_fields: Any
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Return a metadata dictionary base on the existence of metadata_func
|
||||
:param sample: single data payload
|
||||
:param additional_fields: key-word arguments to be added as metadata values
|
||||
:return:
|
||||
"""
|
||||
if self._metadata_func is not None:
|
||||
return self._metadata_func(sample, additional_fields)
|
||||
else:
|
||||
return additional_fields
|
||||
|
||||
def _validate_content_key(self, data: Any) -> None:
|
||||
"""Check if a content key is valid"""
|
||||
sample = data.first()
|
||||
@ -127,6 +137,10 @@ class JSONLoader(BaseLoader):
|
||||
with the key `{self._content_key}`"
|
||||
)
|
||||
|
||||
def _validate_metadata_func(self, data: Any) -> None:
|
||||
"""Check if the metadata_func output is valid"""
|
||||
|
||||
sample = data.first()
|
||||
if self._metadata_func is not None:
|
||||
sample_metadata = self._metadata_func(sample, {})
|
||||
if not isinstance(sample_metadata, dict):
|
||||
|
@ -244,7 +244,7 @@ def test_load_empty_jsonlines(mocker: MockerFixture) -> None:
|
||||
),
|
||||
),
|
||||
)
|
||||
def test_json_meta(
|
||||
def test_json_meta_01(
|
||||
patch_func: str, patch_func_value: Any, kwargs: Dict, mocker: MockerFixture
|
||||
) -> None:
|
||||
mocker.patch("builtins.open", mocker.mock_open())
|
||||
@ -270,3 +270,52 @@ def test_json_meta(
|
||||
result = loader.load()
|
||||
|
||||
assert result == expected_docs
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"patch_func,patch_func_value,kwargs",
|
||||
(
|
||||
# JSON content.
|
||||
(
|
||||
"pathlib.Path.read_text",
|
||||
'[{"text": "value1"}, {"text": "value2"}]',
|
||||
{"jq_schema": ".[]", "content_key": "text"},
|
||||
),
|
||||
# JSON Lines content.
|
||||
(
|
||||
"pathlib.Path.open",
|
||||
io.StringIO(
|
||||
"""
|
||||
{"text": "value1"}
|
||||
{"text": "value2"}
|
||||
"""
|
||||
),
|
||||
{"jq_schema": ".", "content_key": "text", "json_lines": True},
|
||||
),
|
||||
),
|
||||
)
|
||||
def test_json_meta_02(
|
||||
patch_func: str, patch_func_value: Any, kwargs: Dict, mocker: MockerFixture
|
||||
) -> None:
|
||||
mocker.patch("builtins.open", mocker.mock_open())
|
||||
mocker.patch(patch_func, return_value=patch_func_value)
|
||||
|
||||
file_path = "/workspaces/langchain/test.json"
|
||||
expected_docs = [
|
||||
Document(
|
||||
page_content="value1",
|
||||
metadata={"source": file_path, "seq_num": 1, "x": "value1-meta"},
|
||||
),
|
||||
Document(
|
||||
page_content="value2",
|
||||
metadata={"source": file_path, "seq_num": 2, "x": "value2-meta"},
|
||||
),
|
||||
]
|
||||
|
||||
def metadata_func(record: Dict, metadata: Dict) -> Dict:
|
||||
return {**metadata, "x": f"{record['text']}-meta"}
|
||||
|
||||
loader = JSONLoader(file_path=file_path, metadata_func=metadata_func, **kwargs)
|
||||
result = loader.load()
|
||||
|
||||
assert result == expected_docs
|
||||
|
Loading…
Reference in New Issue
Block a user