From 33f43cc1b07346de7d5cd6db79552ccf21c32262 Mon Sep 17 00:00:00 2001 From: mgvalverde <32850810+mgvalverde@users.noreply.github.com> Date: Mon, 4 Sep 2023 01:01:43 +0200 Subject: [PATCH] Bugfix/jsonloader metadata (#9793) Hi, - Description: - Solves the issue #6478. - Includes some additional rework on the `JSONLoader` class: - Getting metadata is decoupled from `_get_text` - Validating metadata_func is now performed by `_validate_metadata_func`, instead of `_validate_content_key` - Issue: #6478 - Dependencies: NA - Tag maintainer: @hwchase17 --- .../langchain/document_loaders/json_loader.py | 34 +++++++++---- .../document_loaders/test_json_loader.py | 51 ++++++++++++++++++- 2 files changed, 74 insertions(+), 11 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/json_loader.py b/libs/langchain/langchain/document_loaders/json_loader.py index e13ae1bb89..8f8efa867b 100644 --- a/libs/langchain/langchain/document_loaders/json_loader.py +++ b/libs/langchain/langchain/document_loaders/json_loader.py @@ -76,24 +76,20 @@ class JSONLoader(BaseLoader): # and prevent the user from getting a cryptic error later on. 
if self._content_key is not None: self._validate_content_key(data) + if self._metadata_func is not None: + self._validate_metadata_func(data) for i, sample in enumerate(data, len(docs) + 1): - metadata = dict( - source=str(self.file_path), - seq_num=i, + text = self._get_text(sample=sample) + metadata = self._get_metadata( + sample=sample, source=str(self.file_path), seq_num=i ) - text = self._get_text(sample=sample, metadata=metadata) docs.append(Document(page_content=text, metadata=metadata)) - def _get_text(self, sample: Any, metadata: dict) -> str: + def _get_text(self, sample: Any) -> str: """Convert sample to string format""" if self._content_key is not None: content = sample.get(self._content_key) - if self._metadata_func is not None: - # We pass in the metadata dict to the metadata_func - # so that the user can customize the default metadata - # based on the content of the JSON object. - metadata = self._metadata_func(sample, metadata) else: content = sample @@ -112,6 +108,20 @@ class JSONLoader(BaseLoader): else: return str(content) if content is not None else "" + def _get_metadata( + self, sample: Dict[str, Any], **additional_fields: Any + ) -> Dict[str, Any]: + """ + Return a metadata dictionary base on the existence of metadata_func + :param sample: single data payload + :param additional_fields: key-word arguments to be added as metadata values + :return: + """ + if self._metadata_func is not None: + return self._metadata_func(sample, additional_fields) + else: + return additional_fields + def _validate_content_key(self, data: Any) -> None: """Check if a content key is valid""" sample = data.first() @@ -127,6 +137,10 @@ class JSONLoader(BaseLoader): with the key `{self._content_key}`" ) + def _validate_metadata_func(self, data: Any) -> None: + """Check if the metadata_func output is valid""" + + sample = data.first() if self._metadata_func is not None: sample_metadata = self._metadata_func(sample, {}) if not isinstance(sample_metadata, dict): diff 
--git a/libs/langchain/tests/unit_tests/document_loaders/test_json_loader.py b/libs/langchain/tests/unit_tests/document_loaders/test_json_loader.py index 19fb90e0c9..a96facc417 100644 --- a/libs/langchain/tests/unit_tests/document_loaders/test_json_loader.py +++ b/libs/langchain/tests/unit_tests/document_loaders/test_json_loader.py @@ -244,7 +244,7 @@ def test_load_empty_jsonlines(mocker: MockerFixture) -> None: ), ), ) -def test_json_meta( +def test_json_meta_01( patch_func: str, patch_func_value: Any, kwargs: Dict, mocker: MockerFixture ) -> None: mocker.patch("builtins.open", mocker.mock_open()) @@ -270,3 +270,52 @@ def test_json_meta( result = loader.load() assert result == expected_docs + + +@pytest.mark.parametrize( + "patch_func,patch_func_value,kwargs", + ( + # JSON content. + ( + "pathlib.Path.read_text", + '[{"text": "value1"}, {"text": "value2"}]', + {"jq_schema": ".[]", "content_key": "text"}, + ), + # JSON Lines content. + ( + "pathlib.Path.open", + io.StringIO( + """ + {"text": "value1"} + {"text": "value2"} + """ + ), + {"jq_schema": ".", "content_key": "text", "json_lines": True}, + ), + ), +) +def test_json_meta_02( + patch_func: str, patch_func_value: Any, kwargs: Dict, mocker: MockerFixture +) -> None: + mocker.patch("builtins.open", mocker.mock_open()) + mocker.patch(patch_func, return_value=patch_func_value) + + file_path = "/workspaces/langchain/test.json" + expected_docs = [ + Document( + page_content="value1", + metadata={"source": file_path, "seq_num": 1, "x": "value1-meta"}, + ), + Document( + page_content="value2", + metadata={"source": file_path, "seq_num": 2, "x": "value2-meta"}, + ), + ] + + def metadata_func(record: Dict, metadata: Dict) -> Dict: + return {**metadata, "x": f"{record['text']}-meta"} + + loader = JSONLoader(file_path=file_path, metadata_func=metadata_func, **kwargs) + result = loader.load() + + assert result == expected_docs