diff --git a/libs/text-splitters/langchain_text_splitters/json.py b/libs/text-splitters/langchain_text_splitters/json.py index 69b9c73252..7f21d5b9f6 100644 --- a/libs/text-splitters/langchain_text_splitters/json.py +++ b/libs/text-splitters/langchain_text_splitters/json.py @@ -96,26 +96,32 @@ class RecursiveJsonSplitter: return chunks def split_text( - self, json_data: Dict[str, Any], convert_lists: bool = False + self, + json_data: Dict[str, Any], + convert_lists: bool = False, + ensure_ascii: bool = True, ) -> List[str]: """Splits JSON into a list of JSON formatted strings""" chunks = self.split_json(json_data=json_data, convert_lists=convert_lists) # Convert to string - return [json.dumps(chunk) for chunk in chunks] + return [json.dumps(chunk, ensure_ascii=ensure_ascii) for chunk in chunks] def create_documents( self, texts: List[Dict], convert_lists: bool = False, + ensure_ascii: bool = True, metadatas: Optional[List[dict]] = None, ) -> List[Document]: """Create documents from a list of json objects (Dict).""" _metadatas = metadatas or [{}] * len(texts) documents = [] for i, text in enumerate(texts): - for chunk in self.split_text(json_data=text, convert_lists=convert_lists): + for chunk in self.split_text( + json_data=text, convert_lists=convert_lists, ensure_ascii=ensure_ascii + ): metadata = copy.deepcopy(_metadatas[i]) new_doc = Document(page_content=chunk, metadata=metadata) documents.append(new_doc) diff --git a/libs/text-splitters/tests/integration_tests/test_nlp_text_splitters.py b/libs/text-splitters/tests/integration_tests/test_nlp_text_splitters.py index 9886cbe80e..402d01655d 100644 --- a/libs/text-splitters/tests/integration_tests/test_nlp_text_splitters.py +++ b/libs/text-splitters/tests/integration_tests/test_nlp_text_splitters.py @@ -1,4 +1,5 @@ """Test text splitting functionality using NLTK and Spacy based sentence splitters.""" + import pytest from langchain_text_splitters.nltk import NLTKTextSplitter diff --git a/libs/text-splitters/tests/unit_tests/conftest.py b/libs/text-splitters/tests/unit_tests/conftest.py index f1746902fc..dd4080cfca 100644 --- a/libs/text-splitters/tests/unit_tests/conftest.py +++ b/libs/text-splitters/tests/unit_tests/conftest.py @@ -1,4 +1,5 @@ """Configuration for unit tests.""" + from importlib import util from typing import Dict, Sequence diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py index 057e5d8aa4..edfcd0c61a 100644 --- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py +++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -1,4 +1,5 @@ """Test text splitting functionality.""" + import random import re import string