mirror of
https://github.com/hwchase17/langchain
synced 2024-11-18 09:25:54 +00:00
ed58eeb9c5
Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories 
community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes
95 lines
3.1 KiB
Python
95 lines
3.1 KiB
Python
import json
|
|
from typing import Iterator, List, Mapping, Optional, Sequence, Union
|
|
|
|
from langchain_core.documents import Document
|
|
|
|
from langchain_community.document_loaders.base import BaseLoader
|
|
|
|
|
|
class HuggingFaceDatasetLoader(BaseLoader):
    """Load from `Hugging Face Hub` datasets.

    Each row of every split returned by ``datasets.load_dataset`` becomes one
    ``Document``: the value of ``page_content_column`` is used as the page
    content and the remaining columns of the row are used as metadata.
    """

    def __init__(
        self,
        path: str,
        page_content_column: str = "text",
        name: Optional[str] = None,
        data_dir: Optional[str] = None,
        data_files: Optional[
            Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
        ] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: Optional[bool] = None,
        save_infos: bool = False,
        use_auth_token: Optional[Union[bool, str]] = None,
        num_proc: Optional[int] = None,
    ):
        """Initialize the HuggingFaceDatasetLoader.

        The arguments are only stored here; nothing is downloaded or read
        until ``lazy_load`` (or ``load``) is called.

        Args:
            path: Path or name of the dataset.
            page_content_column: Page content column name. Default is "text".
            name: Name of the dataset configuration.
            data_dir: Data directory of the dataset configuration.
            data_files: Path(s) to source data file(s).
            cache_dir: Directory to read/write data.
            keep_in_memory: Whether to copy the dataset in-memory.
            save_infos: Save the dataset information (checksums/size/splits/...).
                Default is False.
            use_auth_token: Bearer token for remote files on the Dataset Hub.
            num_proc: Number of processes.
        """
        self.path = path
        self.page_content_column = page_content_column
        self.name = name
        self.data_dir = data_dir
        self.data_files = data_files
        self.cache_dir = cache_dir
        self.keep_in_memory = keep_in_memory
        self.save_infos = save_infos
        self.use_auth_token = use_auth_token
        self.num_proc = num_proc

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Load documents lazily.

        Yields:
            One ``Document`` per dataset row, iterating over every split key
            of the loaded dataset in turn.

        Raises:
            ImportError: If the ``datasets`` package is not installed.
            KeyError: If a row has no ``page_content_column`` entry.
        """
        try:
            from datasets import load_dataset
        except ImportError as e:
            # Chain the original error so the root cause stays visible.
            raise ImportError(
                "Could not import datasets python package. "
                "Please install it with `pip install datasets`."
            ) from e

        dataset = load_dataset(
            path=self.path,
            name=self.name,
            data_dir=self.data_dir,
            data_files=self.data_files,
            cache_dir=self.cache_dir,
            keep_in_memory=self.keep_in_memory,
            save_infos=self.save_infos,
            use_auth_token=self.use_auth_token,
            num_proc=self.num_proc,
        )

        for key in dataset.keys():
            for row in dataset[key]:
                # Pop the content column first so that the metadata dict
                # holds only the remaining columns of the row.
                page_content = self.parse_obj(row.pop(self.page_content_column))
                yield Document(page_content=page_content, metadata=row)

    def load(self) -> List[Document]:
        """Load documents.

        Returns:
            All documents produced by ``lazy_load``, collected into a list.
        """
        return list(self.lazy_load())

    def parse_obj(self, page_content: Union[str, object]) -> str:
        """Convert a raw column value into page content text.

        Args:
            page_content: The value stored in the page content column.

        Returns:
            The value itself when it is already a string, otherwise its JSON
            serialization.
        """
        # Every Python value is an instance of ``object``, so testing for
        # ``object`` would JSON-encode plain strings too (adding surrounding
        # quotes and escapes). Check for ``str`` so strings pass through.
        if isinstance(page_content, str):
            return page_content
        return json.dumps(page_content)
|