Harrison/json loader fix (#4686)

Co-authored-by: Triet Le <112841660+triet-lq-holistics@users.noreply.github.com>
Harrison Chase 2023-05-14 18:25:59 -07:00 committed by GitHub
parent ed8207b2fb
commit cdc20d1203
4 changed files with 188 additions and 45 deletions

langchain/document_loaders/json_loader.py

@@ -1,7 +1,7 @@
 """Loader that loads data from JSON."""
 import json
 from pathlib import Path
-from typing import Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union

 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
@@ -23,6 +23,7 @@ class JSONLoader(BaseLoader):
         jq_schema: str,
         content_key: Optional[str] = None,
         metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
+        text_content: bool = True,
     ):
         """Initialize the JSONLoader.
@@ -35,6 +36,8 @@ class JSONLoader(BaseLoader):
             metadata_func (Callable[Dict, Dict]): A function that takes in the JSON
                 object extracted by the jq_schema and the default metadata and returns
                 a dict of the updated metadata.
+            text_content (bool): Boolean flag to indicate whether the content is in
+                string format. Defaults to True.
         """
         try:
             import jq  # noqa:F401
@@ -47,58 +50,75 @@ class JSONLoader(BaseLoader):
         self._jq_schema = jq.compile(jq_schema)
         self._content_key = content_key
         self._metadata_func = metadata_func
+        self._text_content = text_content

     def load(self) -> List[Document]:
         """Load and return documents from the JSON file."""
         data = self._jq_schema.input(json.loads(self.file_path.read_text()))

         # Perform some validation
         # This is not a perfect validation, but it should catch most cases
         # and prevent the user from getting a cryptic error later on.
         if self._content_key is not None:
-            sample = data.first()
-            if not isinstance(sample, dict):
-                raise ValueError(
-                    f"Expected the jq schema to result in a list of objects (dict), \
-                        so sample must be a dict but got `{type(sample)}`"
-                )
-            if sample.get(self._content_key) is None:
-                raise ValueError(
-                    f"Expected the jq schema to result in a list of objects (dict) \
-                        with the key `{self._content_key}`"
-                )
-            if self._metadata_func is not None:
-                sample_metadata = self._metadata_func(sample, {})
-                if not isinstance(sample_metadata, dict):
-                    raise ValueError(
-                        f"Expected the metadata_func to return a dict but got \
-                            `{type(sample_metadata)}`"
-                    )
+            self._validate_content_key(data)

         docs = []
         for i, sample in enumerate(data, 1):
             metadata = dict(
                 source=str(self.file_path),
                 seq_num=i,
             )
-            if self._content_key is not None:
-                text = sample.get(self._content_key)
-                if self._metadata_func is not None:
-                    # We pass in the metadata dict to the metadata_func
-                    # so that the user can customize the default metadata
-                    # based on the content of the JSON object.
-                    metadata = self._metadata_func(sample, metadata)
-            else:
-                text = sample
-
-            # In case the text is None, set it to an empty string
-            text = text or ""
+            text = self._get_text(sample=sample, metadata=metadata)
             docs.append(Document(page_content=text, metadata=metadata))

         return docs

+    def _get_text(self, sample: Any, metadata: dict) -> str:
+        """Convert sample to string format"""
+        if self._content_key is not None:
+            content = sample.get(self._content_key)
+            if self._metadata_func is not None:
+                # We pass in the metadata dict to the metadata_func
+                # so that the user can customize the default metadata
+                # based on the content of the JSON object.
+                metadata = self._metadata_func(sample, metadata)
+        else:
+            content = sample
+
+        if self._text_content and not isinstance(content, str):
+            raise ValueError(
+                f"Expected page_content is string, got {type(content)} instead. \
+                    Set `text_content=False` if the desired input for \
+                    `page_content` is not a string"
+            )
+        # In case the text is None, set it to an empty string
+        elif isinstance(content, str):
+            return content
+        elif isinstance(content, dict):
+            return json.dumps(content) if content else ""
+        else:
+            return str(content) if content is not None else ""
+
+    def _validate_content_key(self, data: Any) -> None:
+        """Check if content key is valid"""
+        sample = data.first()
+        if not isinstance(sample, dict):
+            raise ValueError(
+                f"Expected the jq schema to result in a list of objects (dict), \
+                    so sample must be a dict but got `{type(sample)}`"
+            )
+
+        if sample.get(self._content_key) is None:
+            raise ValueError(
+                f"Expected the jq schema to result in a list of objects (dict) \
+                    with the key `{self._content_key}`"
+            )
+
+        if self._metadata_func is not None:
+            sample_metadata = self._metadata_func(sample, {})
+            if not isinstance(sample_metadata, dict):
+                raise ValueError(
+                    f"Expected the metadata_func to return a dict but got \
+                        `{type(sample_metadata)}`"
+                )

poetry.lock (generated)

@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry and should not be changed by hand.

 [[package]]
 name = "absl-py"
@@ -9994,18 +9994,18 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\""}
 cffi = ["cffi (>=1.11)"]

 [extras]
-all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "azure-cosmos", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "docarray", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "hnswlib", "html2text", "huggingface_hub", "jina", "jinja2", "jq", "lancedb", "lark", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "protobuf", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "pyvespa", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
-azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"]
+all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "protobuf", "hnswlib", "steamship", "pdfminer-six"]
+azure = ["azure-identity", "azure-cosmos", "openai", "azure-core"]
 cohere = ["cohere"]
 embeddings = ["sentence-transformers"]
-extended-testing = ["pdfminer-six", "pypdf", "tqdm"]
-hnswlib = ["docarray", "hnswlib", "protobuf"]
+extended-testing = ["pypdf", "pdfminer-six", "tqdm", "jq"]
+hnswlib = ["docarray", "protobuf", "hnswlib"]
 in-memory-store = ["docarray"]
-llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"]
+llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
 openai = ["openai", "tiktoken"]
 qdrant = ["qdrant-client"]

 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "6d5c4aa06539e6f7c7531c30d73cbf08fbdea75486bf4b81c106b9e678a13b45"
+content-hash = "42b518704c39bc25c6da05f81a9488a9a6fecfd7784b3c9915d30127ce384a63"

pyproject.toml

@@ -171,7 +171,7 @@ azure = ["azure-identity", "azure-cosmos", "openai", "azure-core"]
 all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "protobuf", "hnswlib", "steamship", "pdfminer-six"]

 # An extra used to be able to add extended testing.
 extended_testing = [
-    "pypdf", "pdfminer.six", "tqdm"
+    "pypdf", "pdfminer.six", "tqdm", "jq"
 ]

 [tool.ruff]
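With `jq` added to the `extended_testing` extra above, the new unit tests can presumably be enabled locally via `poetry install -E extended_testing`; the `@pytest.mark.requires("jq")` markers in the test file below skip the tests when the package is absent.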

tests/unit_tests/document_loaders/test_json_loader.py (new file)

@@ -0,0 +1,123 @@
import pytest
from pytest import raises
from pytest_mock import MockerFixture

from langchain.docstore.document import Document
from langchain.document_loaders.json_loader import JSONLoader


@pytest.mark.requires("jq")
def test_load_valid_string_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
        Document(
            page_content="value1",
            metadata={"source": file_path, "seq_num": 1},
        ),
        Document(
            page_content="value2",
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]

    mocker.patch("builtins.open", mocker.mock_open())
    mock_csv_reader = mocker.patch("pathlib.Path.read_text")
    mock_csv_reader.return_value = '[{"text": "value1"}, {"text": "value2"}]'

    loader = JSONLoader(file_path=file_path, jq_schema=".[].text", text_content=True)
    result = loader.load()

    assert result == expected_docs


@pytest.mark.requires("jq")
def test_load_valid_dict_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
        Document(
            page_content='{"text": "value1"}',
            metadata={"source": file_path, "seq_num": 1},
        ),
        Document(
            page_content='{"text": "value2"}',
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]

    mocker.patch("builtins.open", mocker.mock_open())
    mock_csv_reader = mocker.patch("pathlib.Path.read_text")
    mock_csv_reader.return_value = """
    [{"text": "value1"}, {"text": "value2"}]
    """

    loader = JSONLoader(file_path=file_path, jq_schema=".[]", text_content=False)
    result = loader.load()

    assert result == expected_docs


@pytest.mark.requires("jq")
def test_load_valid_bool_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
        Document(
            page_content="False",
            metadata={"source": file_path, "seq_num": 1},
        ),
        Document(
            page_content="True",
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]

    mocker.patch("builtins.open", mocker.mock_open())
    mock_csv_reader = mocker.patch("pathlib.Path.read_text")
    mock_csv_reader.return_value = """
    [
        {"flag": false}, {"flag": true}
    ]
    """

    loader = JSONLoader(file_path=file_path, jq_schema=".[].flag", text_content=False)
    result = loader.load()

    assert result == expected_docs


@pytest.mark.requires("jq")
def test_load_valid_numeric_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"
    expected_docs = [
        Document(
            page_content="99",
            metadata={"source": file_path, "seq_num": 1},
        ),
        Document(
            page_content="99.5",
            metadata={"source": file_path, "seq_num": 2},
        ),
    ]

    mocker.patch("builtins.open", mocker.mock_open())
    mock_csv_reader = mocker.patch("pathlib.Path.read_text")
    mock_csv_reader.return_value = """
    [
        {"num": 99}, {"num": 99.5}
    ]
    """

    loader = JSONLoader(file_path=file_path, jq_schema=".[].num", text_content=False)
    result = loader.load()

    assert result == expected_docs


@pytest.mark.requires("jq")
def test_load_invalid_test_content(mocker: MockerFixture) -> None:
    file_path = "/workspaces/langchain/test.json"

    mocker.patch("builtins.open", mocker.mock_open())
    mock_csv_reader = mocker.patch("pathlib.Path.read_text")
    mock_csv_reader.return_value = """
    [{"text": "value1"}, {"text": "value2"}]
    """

    loader = JSONLoader(file_path=file_path, jq_schema=".[]", text_content=True)

    with raises(ValueError):
        loader.load()
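The tests above mock `Path.read_text` rather than touching disk. Against a real file, the `content_key`/`metadata_func` path through the new `_get_text` can be exercised with a sketch like the following; the names and data are hypothetical, and note that `add_author` updates the default metadata dict in place before returning it:

import json
import tempfile
from pathlib import Path

from langchain.document_loaders.json_loader import JSONLoader


def add_author(record: dict, metadata: dict) -> dict:
    # Enrich the default metadata ({"source": ..., "seq_num": ...}) in place
    # with a field from the jq-extracted record, then return it.
    metadata["author"] = record["author"]
    return metadata


records = [
    {"text": "hello", "author": "a"},
    {"text": "world", "author": "b"},
]

with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp) / "sample.json"
    path.write_text(json.dumps(records))

    loader = JSONLoader(
        file_path=str(path),
        jq_schema=".[]",
        content_key="text",        # pulls page_content from each record
        metadata_func=add_author,  # validated once against a sample record
    )
    docs = loader.load()

    assert [d.page_content for d in docs] == ["hello", "world"]
    assert docs[0].metadata["author"] == "a"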