langchain/libs/community/langchain_community/document_loaders/hugging_face_dataset.py
Bagatur ed58eeb9c5
community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463)
Moved the following modules to new package langchain-community in a backwards compatible fashion:

```
mv langchain/langchain/adapters community/langchain_community
mv langchain/langchain/callbacks community/langchain_community/callbacks
mv langchain/langchain/chat_loaders community/langchain_community
mv langchain/langchain/chat_models community/langchain_community
mv langchain/langchain/document_loaders community/langchain_community
mv langchain/langchain/docstore community/langchain_community
mv langchain/langchain/document_transformers community/langchain_community
mv langchain/langchain/embeddings community/langchain_community
mv langchain/langchain/graphs community/langchain_community
mv langchain/langchain/llms community/langchain_community
mv langchain/langchain/memory/chat_message_histories community/langchain_community
mv langchain/langchain/retrievers community/langchain_community
mv langchain/langchain/storage community/langchain_community
mv langchain/langchain/tools community/langchain_community
mv langchain/langchain/utilities community/langchain_community
mv langchain/langchain/vectorstores community/langchain_community
mv langchain/langchain/agents/agent_toolkits community/langchain_community
mv langchain/langchain/cache.py community/langchain_community
mv langchain/langchain/adapters community/langchain_community
mv langchain/langchain/callbacks community/langchain_community/callbacks
mv langchain/langchain/chat_loaders community/langchain_community
mv langchain/langchain/chat_models community/langchain_community
mv langchain/langchain/document_loaders community/langchain_community
mv langchain/langchain/docstore community/langchain_community
mv langchain/langchain/document_transformers community/langchain_community
mv langchain/langchain/embeddings community/langchain_community
mv langchain/langchain/graphs community/langchain_community
mv langchain/langchain/llms community/langchain_community
mv langchain/langchain/memory/chat_message_histories community/langchain_community
mv langchain/langchain/retrievers community/langchain_community
mv langchain/langchain/storage community/langchain_community
mv langchain/langchain/tools community/langchain_community
mv langchain/langchain/utilities community/langchain_community
mv langchain/langchain/vectorstores community/langchain_community
mv langchain/langchain/agents/agent_toolkits community/langchain_community
mv langchain/langchain/cache.py community/langchain_community
```

Moved the following modules to langchain-core:
```
mv langchain/langchain/utils/json_schema.py core/langchain_core/utils
mv langchain/langchain/utils/html.py core/langchain_core/utils
mv langchain/langchain/utils/strings.py core/langchain_core/utils
cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py
rm langchain/langchain/utils/env.py
```

See .scripts/community_split/script_integrations.sh for all changes
2023-12-11 13:53:30 -08:00

95 lines
3.1 KiB
Python

import json
from typing import Iterator, List, Mapping, Optional, Sequence, Union
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
class HuggingFaceDatasetLoader(BaseLoader):
    """Load from `Hugging Face Hub` datasets.

    Every row of every key in the loaded dataset becomes one ``Document``:
    the value stored under ``page_content_column`` is used as the page
    content and the remaining columns of the row become the metadata.
    """

    def __init__(
        self,
        path: str,
        page_content_column: str = "text",
        name: Optional[str] = None,
        data_dir: Optional[str] = None,
        data_files: Optional[
            Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
        ] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: Optional[bool] = None,
        save_infos: bool = False,
        use_auth_token: Optional[Union[bool, str]] = None,
        num_proc: Optional[int] = None,
    ):
        """Initialize the HuggingFaceDatasetLoader.

        The arguments are only stored here; nothing is downloaded or read
        until ``lazy_load`` (or ``load``) is consumed.

        Args:
            path: Path or name of the dataset.
            page_content_column: Page content column name. Default is "text".
            name: Name of the dataset configuration.
            data_dir: Data directory of the dataset configuration.
            data_files: Path(s) to source data file(s).
            cache_dir: Directory to read/write data.
            keep_in_memory: Whether to copy the dataset in-memory.
            save_infos: Save the dataset information (checksums/size/splits/...).
                Default is False.
            use_auth_token: Bearer token for remote files on the Dataset Hub.
            num_proc: Number of processes.
        """
        self.path = path
        self.page_content_column = page_content_column
        self.name = name
        self.data_dir = data_dir
        self.data_files = data_files
        self.cache_dir = cache_dir
        self.keep_in_memory = keep_in_memory
        self.save_infos = save_infos
        self.use_auth_token = use_auth_token
        self.num_proc = num_proc

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Load documents lazily.

        Yields:
            One ``Document`` per dataset row, iterating over every key of the
            loaded dataset (presumably the split names) in order.

        Raises:
            ImportError: If the ``datasets`` package is not installed. Because
                this is a generator, the error surfaces on first iteration.
        """
        try:
            from datasets import load_dataset
        except ImportError as e:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "Could not import datasets python package. "
                "Please install it with `pip install datasets`."
            ) from e

        # NOTE(review): newer `datasets` releases reportedly deprecate
        # `use_auth_token` in favour of `token` — confirm against the
        # version this package pins before changing the keyword.
        dataset = load_dataset(
            path=self.path,
            name=self.name,
            data_dir=self.data_dir,
            data_files=self.data_files,
            cache_dir=self.cache_dir,
            keep_in_memory=self.keep_in_memory,
            save_infos=self.save_infos,
            use_auth_token=self.use_auth_token,
            num_proc=self.num_proc,
        )

        # `row.pop(...)` runs before `metadata=row` is evaluated, so the
        # content column is removed from the row and does not reappear in
        # the document metadata.
        yield from (
            Document(
                page_content=self.parse_obj(row.pop(self.page_content_column)),
                metadata=row,
            )
            for key in dataset.keys()
            for row in dataset[key]
        )

    def load(self) -> List[Document]:
        """Load documents.

        Returns:
            All documents produced by ``lazy_load``, materialized as a list.
        """
        return list(self.lazy_load())

    def parse_obj(self, page_content: Union[str, object]) -> str:
        """Convert a raw column value into page-content text.

        Args:
            page_content: The value read from the page content column.

        Returns:
            The value itself when it is already a string; otherwise its JSON
            serialization.
        """
        # Every Python value is an instance of `object`, so testing against
        # `object` would JSON-encode plain strings too (wrapping them in
        # quotes). Test for `str` so text passes through untouched.
        if isinstance(page_content, str):
            return page_content
        return json.dumps(page_content)