mirror of
https://github.com/hwchase17/langchain
synced 2024-11-08 07:10:35 +00:00
ed58eeb9c5
Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories 
community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes
196 lines
6.5 KiB
Python
196 lines
6.5 KiB
Python
"""Schema for Blobs and Blob Loaders.
|
|
|
|
The goal is to facilitate decoupling of content loading from content parsing code.
|
|
|
|
In addition, content loading code should provide a lazy loading interface by default.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import contextlib
|
|
import mimetypes
|
|
from abc import ABC, abstractmethod
|
|
from io import BufferedReader, BytesIO
|
|
from pathlib import PurePath
|
|
from typing import Any, Dict, Generator, Iterable, Mapping, Optional, Union, cast
|
|
|
|
from langchain_core.pydantic_v1 import BaseModel, Field, root_validator
|
|
|
|
# Type alias for a filesystem location given either as a plain string or as a
# pathlib pure-path object.
PathLike = Union[str, PurePath]
|
|
|
|
|
|
class Blob(BaseModel):
    """Blob represents raw data by either reference or value.

    Provides an interface to materialize the blob in different representations, and
    help to decouple the development of data loaders from the downstream parsing of
    the raw data.

    A blob holds its content either by value (``data`` is set) or by reference
    (``data`` is None and ``path`` points at a file that is read on demand).

    Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob
    """

    data: Union[bytes, str, None]
    """Raw data associated with the blob."""
    mimetype: Optional[str] = None
    """MimeType not to be confused with a file extension."""
    encoding: str = "utf-8"
    """Encoding to use if decoding the bytes into a string.

    Use utf-8 as default encoding, if decoding to string.
    """
    path: Optional[PathLike] = None
    """Location where the original content was found."""

    metadata: Dict[str, Any] = Field(default_factory=dict)
    """Metadata about the blob (e.g., source)"""

    class Config:
        arbitrary_types_allowed = True
        frozen = True

    @property
    def source(self) -> Optional[str]:
        """The source location of the blob as string if known otherwise none.

        If a path is associated with the blob, it will default to the path location.

        Unless explicitly set via a metadata field called "source", in which
        case that value will be used instead.
        """
        if self.metadata and "source" in self.metadata:
            # ``cast`` only informs the type checker; the stored value is
            # returned unchanged.
            return cast(Optional[str], self.metadata["source"])
        return str(self.path) if self.path else None

    @root_validator(pre=True)
    def check_blob_is_valid(cls, values: Mapping[str, Any]) -> Mapping[str, Any]:
        """Verify that either data or path is provided.

        Only the presence of the keys is checked, not their values.

        Raises:
            ValueError: if neither "data" nor "path" was passed.
        """
        if "data" not in values and "path" not in values:
            raise ValueError("Either data or path must be provided")
        return values

    def as_string(self) -> str:
        """Read data as a string.

        Returns:
            The file content decoded with ``encoding`` when the blob is a path
            reference, the decoded bytes when ``data`` is bytes, or ``data``
            itself when it is already a string.

        Raises:
            ValueError: if the blob has neither data nor a path.
        """
        if self.data is None and self.path:
            with open(str(self.path), "r", encoding=self.encoding) as f:
                return f.read()
        elif isinstance(self.data, bytes):
            return self.data.decode(self.encoding)
        elif isinstance(self.data, str):
            return self.data
        else:
            raise ValueError(f"Unable to get string for blob {self}")

    def as_bytes(self) -> bytes:
        """Read data as bytes.

        Returns:
            ``data`` itself when it is bytes, ``data`` encoded with ``encoding``
            when it is a string, or the raw file content when the blob is a
            path reference.

        Raises:
            ValueError: if the blob has neither data nor a path.
        """
        if isinstance(self.data, bytes):
            return self.data
        elif isinstance(self.data, str):
            return self.data.encode(self.encoding)
        elif self.data is None and self.path:
            with open(str(self.path), "rb") as f:
                return f.read()
        else:
            raise ValueError(f"Unable to get bytes for blob {self}")

    @contextlib.contextmanager
    def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]:
        """Read data as a byte stream.

        Use as a context manager. For in-memory data an ``io.BytesIO`` is
        yielded; for a path reference the file is opened in binary mode and
        closed again when the ``with`` block exits.

        Raises:
            NotImplementedError: if the blob has neither data nor a path.
        """
        if isinstance(self.data, bytes):
            yield BytesIO(self.data)
        elif isinstance(self.data, str):
            # Mirror ``as_bytes``: text data is encoded with the blob's encoding
            # so that string-backed blobs can be streamed as well.
            yield BytesIO(self.data.encode(self.encoding))
        elif self.data is None and self.path:
            with open(str(self.path), "rb") as f:
                yield f
        else:
            raise NotImplementedError(f"Unable to convert blob {self}")

    @classmethod
    def from_path(
        cls,
        path: PathLike,
        *,
        encoding: str = "utf-8",
        mime_type: Optional[str] = None,
        guess_type: bool = True,
        metadata: Optional[dict] = None,
    ) -> Blob:
        """Load the blob from a path like object.

        Args:
            path: path like object to file to be read
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: if provided, will be set as the mime-type of the data
            guess_type: If True, the mimetype will be guessed from the file extension,
                        if a mime-type was not provided
            metadata: Metadata to associate with the blob

        Returns:
            Blob instance
        """
        if mime_type is None and guess_type:
            _mimetype = mimetypes.guess_type(path)[0]
        else:
            # Either an explicit mime-type was given, or guessing is disabled
            # (in which case this is None).
            _mimetype = mime_type
        # We do not load the data immediately, instead we treat the blob as a
        # reference to the underlying data.
        return cls(
            data=None,
            mimetype=_mimetype,
            encoding=encoding,
            path=path,
            metadata=metadata if metadata is not None else {},
        )

    @classmethod
    def from_data(
        cls,
        data: Union[str, bytes],
        *,
        encoding: str = "utf-8",
        mime_type: Optional[str] = None,
        path: Optional[PathLike] = None,
        metadata: Optional[dict] = None,
    ) -> Blob:
        """Initialize the blob from in-memory data.

        Args:
            data: the in-memory data associated with the blob
            encoding: Encoding to use if decoding the bytes into a string
            mime_type: if provided, will be set as the mime-type of the data
            path: if provided, will be set as the source from which the data came
            metadata: Metadata to associate with the blob

        Returns:
            Blob instance
        """
        return cls(
            data=data,
            mimetype=mime_type,
            encoding=encoding,
            path=path,
            metadata=metadata if metadata is not None else {},
        )

    def __repr__(self) -> str:
        """Define the blob representation.

        The representation is "Blob <id>" followed by the source, when one is
        known; the content itself is never included.
        """
        str_repr = f"Blob {id(self)}"
        if self.source:
            str_repr += f" {self.source}"
        return str_repr
|
|
|
|
|
|
class BlobLoader(ABC):
    """Abstract base class that every blob loader implements.

    A concrete loader fetches raw content from some storage system, selected
    by whatever criteria the implementation defines, and hands it back lazily
    as a stream of blobs.
    """

    @abstractmethod
    def yield_blobs(self) -> Iterable[Blob]:
        """Lazily produce the raw data as LangChain Blob objects.

        Subclasses must override this method.

        Returns:
            An iterable over blobs
        """
|