forked from Archives/langchain
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
125 lines
4.8 KiB
Python
125 lines
4.8 KiB
Python
"""Loader that loads data from JSON."""
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Any, Callable, Dict, List, Optional, Union
|
|
|
|
from langchain.docstore.document import Document
|
|
from langchain.document_loaders.base import BaseLoader
|
|
|
|
|
|
class JSONLoader(BaseLoader):
|
|
"""Loads a JSON file and references a jq schema provided to load the text into
|
|
documents.
|
|
|
|
Example:
|
|
[{"text": ...}, {"text": ...}, {"text": ...}] -> schema = .[].text
|
|
{"key": [{"text": ...}, {"text": ...}, {"text": ...}]} -> schema = .key[].text
|
|
["", "", ""] -> schema = .[]
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
file_path: Union[str, Path],
|
|
jq_schema: str,
|
|
content_key: Optional[str] = None,
|
|
metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
|
|
text_content: bool = True,
|
|
):
|
|
"""Initialize the JSONLoader.
|
|
|
|
Args:
|
|
file_path (Union[str, Path]): The path to the JSON file.
|
|
jq_schema (str): The jq schema to use to extract the data or text from
|
|
the JSON.
|
|
content_key (str): The key to use to extract the content from the JSON if
|
|
the jq_schema results to a list of objects (dict).
|
|
metadata_func (Callable[Dict, Dict]): A function that takes in the JSON
|
|
object extracted by the jq_schema and the default metadata and returns
|
|
a dict of the updated metadata.
|
|
text_content (bool): Boolean flag to indicates whether the content is in
|
|
string format, default to True
|
|
"""
|
|
try:
|
|
import jq # noqa:F401
|
|
except ImportError:
|
|
raise ValueError(
|
|
"jq package not found, please install it with `pip install jq`"
|
|
)
|
|
|
|
self.file_path = Path(file_path).resolve()
|
|
self._jq_schema = jq.compile(jq_schema)
|
|
self._content_key = content_key
|
|
self._metadata_func = metadata_func
|
|
self._text_content = text_content
|
|
|
|
def load(self) -> List[Document]:
|
|
"""Load and return documents from the JSON file."""
|
|
data = self._jq_schema.input(json.loads(self.file_path.read_text()))
|
|
|
|
# Perform some validation
|
|
# This is not a perfect validation, but it should catch most cases
|
|
# and prevent the user from getting a cryptic error later on.
|
|
if self._content_key is not None:
|
|
self._validate_content_key(data)
|
|
|
|
docs = []
|
|
for i, sample in enumerate(data, 1):
|
|
metadata = dict(
|
|
source=str(self.file_path),
|
|
seq_num=i,
|
|
)
|
|
text = self._get_text(sample=sample, metadata=metadata)
|
|
docs.append(Document(page_content=text, metadata=metadata))
|
|
|
|
return docs
|
|
|
|
def _get_text(self, sample: Any, metadata: dict) -> str:
|
|
"""Convert sample to string format"""
|
|
if self._content_key is not None:
|
|
content = sample.get(self._content_key)
|
|
if self._metadata_func is not None:
|
|
# We pass in the metadata dict to the metadata_func
|
|
# so that the user can customize the default metadata
|
|
# based on the content of the JSON object.
|
|
metadata = self._metadata_func(sample, metadata)
|
|
else:
|
|
content = sample
|
|
|
|
if self._text_content and not isinstance(content, str):
|
|
raise ValueError(
|
|
f"Expected page_content is string, got {type(content)} instead. \
|
|
Set `text_content=False` if the desired input for \
|
|
`page_content` is not a string"
|
|
)
|
|
|
|
# In case the text is None, set it to an empty string
|
|
elif isinstance(content, str):
|
|
return content
|
|
elif isinstance(content, dict):
|
|
return json.dumps(content) if content else ""
|
|
else:
|
|
return str(content) if content is not None else ""
|
|
|
|
def _validate_content_key(self, data: Any) -> None:
|
|
"""Check if content key is valid"""
|
|
sample = data.first()
|
|
if not isinstance(sample, dict):
|
|
raise ValueError(
|
|
f"Expected the jq schema to result in a list of objects (dict), \
|
|
so sample must be a dict but got `{type(sample)}`"
|
|
)
|
|
|
|
if sample.get(self._content_key) is None:
|
|
raise ValueError(
|
|
f"Expected the jq schema to result in a list of objects (dict) \
|
|
with the key `{self._content_key}`"
|
|
)
|
|
|
|
if self._metadata_func is not None:
|
|
sample_metadata = self._metadata_func(sample, {})
|
|
if not isinstance(sample_metadata, dict):
|
|
raise ValueError(
|
|
f"Expected the metadata_func to return a dict but got \
|
|
`{type(sample_metadata)}`"
|
|
)
|