langchain/libs/community/langchain_community/document_loaders/gcs_file.py

import os
import tempfile
from typing import Callable, List, Optional

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from langchain_community.utilities.vertexai import get_client_info


class GCSFileLoader(BaseLoader):
    """Load from GCS file."""

    def __init__(
        self,
        project_name: str,
        bucket: str,
        blob: str,
        loader_func: Optional[Callable[[str], BaseLoader]] = None,
    ):
        """Initialize with bucket and key name.

        Args:
            project_name: The name of the project to load
            bucket: The name of the GCS bucket.
            blob: The name of the GCS blob to load.
            loader_func: A loader function that instantiates a loader based on a
                file_path argument. If nothing is provided, the
                UnstructuredFileLoader is used.

        Examples:
            To use an alternative PDF loader:
            >> from from langchain_community.document_loaders import PyPDFLoader
            >> loader = GCSFileLoader(..., loader_func=PyPDFLoader)

            To use UnstructuredFileLoader with additional arguments:
            >> loader = GCSFileLoader(...,
            >>      loader_func=lambda x: UnstructuredFileLoader(x, mode="elements"))

        """
        self.bucket = bucket
        self.blob = blob
        self.project_name = project_name

        def default_loader_func(file_path: str) -> BaseLoader:
            return UnstructuredFileLoader(file_path)

        self._loader_func = loader_func if loader_func else default_loader_func

    def load(self) -> List[Document]:
        """Load documents."""
        try:
            from google.cloud import storage
        except ImportError:
            raise ImportError(
                "Could not import google-cloud-storage python package. "
                "Please install it with `pip install google-cloud-storage`."
            )

        # initialize a client
        storage_client = storage.Client(
            self.project_name, client_info=get_client_info("google-cloud-storage")
        )
        # Create a bucket object for our bucket
        bucket = storage_client.get_bucket(self.bucket)
        # Create a blob object from the filepath
        blob = bucket.blob(self.blob)
        # retrieve custom metadata associated with the blob
        metadata = bucket.get_blob(self.blob).metadata
        with tempfile.TemporaryDirectory() as temp_dir:
            file_path = f"{temp_dir}/{self.blob}"
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            # Download the file to a destination
            blob.download_to_filename(file_path)
            loader = self._loader_func(file_path)
            docs = loader.load()
            for doc in docs:
                if "source" in doc.metadata:
                    doc.metadata["source"] = f"gs://{self.bucket}/{self.blob}"
                if metadata:
                    doc.metadata.update(metadata)
            return docs