From 3d54784e6df13d45a798f5b97cde34a7f97efc3b Mon Sep 17 00:00:00 2001 From: bilk0h <43228593+bilkoh@users.noreply.github.com> Date: Tue, 18 Jun 2024 20:21:55 -0700 Subject: [PATCH] text-splitters: Fix/recursive json splitter data persistence issue (#21529) Thank you for contributing to LangChain! **Description:** I noticed that calling `RecursiveJsonSplitter().split_json()` multiple times gave weird results. The issue is with the `chunks` list in the `_json_split` method: if `chunks` is not provided to `_json_split` (which is the case when `split_json` calls `_json_split`), then the same list is used for subsequent calls to `_json_split`. You can see this in the test case I also added in this commit. Output should be: ``` [{'a': 1, 'b': 2}] [{'c': 3, 'd': 4}] ``` Instead you get: ``` [{'a': 1, 'b': 2}] [{'a': 1, 'b': 2, 'c': 3, 'd': 4}] ``` --------- Co-authored-by: Nuno Campos Co-authored-by: isaac hershenson Co-authored-by: Isaac Francisco <78627776+isahers1@users.noreply.github.com> --- .../langchain_text_splitters/json.py | 2 +- .../tests/unit_tests/test_text_splitters.py | 21 +++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/libs/text-splitters/langchain_text_splitters/json.py b/libs/text-splitters/langchain_text_splitters/json.py index 7f21d5b9f6..c83d8b2a42 100644 --- a/libs/text-splitters/langchain_text_splitters/json.py +++ b/libs/text-splitters/langchain_text_splitters/json.py @@ -55,7 +55,7 @@ class RecursiveJsonSplitter: Split json into maximum size dictionaries while preserving structure. 
""" current_path = current_path or [] - chunks = chunks or [{}] + chunks = chunks if chunks is not None else [{}] if isinstance(data, dict): for key, value in data.items(): new_path = current_path + [key] diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py index d1d113009d..2229480522 100644 --- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py +++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -1953,3 +1953,24 @@ def test_split_json_with_lists() -> None: texts_list = splitter.split_text(json_data=test_data_list, convert_lists=True) assert len(texts_list) >= len(texts) + + +def test_split_json_many_calls() -> None: + x = {"a": 1, "b": 2} + y = {"c": 3, "d": 4} + + splitter = RecursiveJsonSplitter() + chunk0 = splitter.split_json(x) + assert chunk0 == [{"a": 1, "b": 2}] + + chunk1 = splitter.split_json(y) + assert chunk1 == [{"c": 3, "d": 4}] + + # chunk0 is now altered by creating chunk1 + assert chunk0 == [{"a": 1, "b": 2}] + + chunk0_output = [{"a": 1, "b": 2}] + chunk1_output = [{"c": 3, "d": 4}] + + assert chunk0 == chunk0_output + assert chunk1 == chunk1_output