community: Implement lazy_load() for JSONLoader (#18643)

Covered by `tests/unit_tests/document_loaders/test_json_loader.py`
This commit is contained in:
Christophe Bornet 2024-03-08 19:58:17 +01:00 committed by GitHub
parent a88f62ec3c
commit ead2a74806
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,6 +1,6 @@
import json
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union
from typing import Any, Callable, Dict, Iterator, Optional, Union
from langchain_core.documents import Document
@ -67,20 +67,23 @@ class JSONLoader(BaseLoader):
self._text_content = text_content
self._json_lines = json_lines
def load(self) -> List[Document]:
def lazy_load(self) -> Iterator[Document]:
"""Load and return documents from the JSON file."""
docs: List[Document] = []
index = 0
if self._json_lines:
with self.file_path.open(encoding="utf-8") as f:
for line in f:
line = line.strip()
if line:
self._parse(line, docs)
for doc in self._parse(line, index):
yield doc
index += 1
else:
self._parse(self.file_path.read_text(encoding="utf-8"), docs)
return docs
for doc in self._parse(self.file_path.read_text(encoding="utf-8"), index):
yield doc
index += 1
def _parse(self, content: str, docs: List[Document]) -> None:
def _parse(self, content: str, index: int) -> Iterator[Document]:
"""Convert given content to documents."""
data = self._jq_schema.input(json.loads(content))
@ -92,12 +95,12 @@ class JSONLoader(BaseLoader):
if self._metadata_func is not None:
self._validate_metadata_func(data)
for i, sample in enumerate(data, len(docs) + 1):
for i, sample in enumerate(data, index + 1):
text = self._get_text(sample=sample)
metadata = self._get_metadata(
sample=sample, source=str(self.file_path), seq_num=i
)
docs.append(Document(page_content=text, metadata=metadata))
yield Document(page_content=text, metadata=metadata)
def _get_text(self, sample: Any) -> str:
"""Convert sample to string format"""