import os
from typing import Any, Dict, Iterator, List, Optional, Union

from langchain_core.documents import Document
from langchain_core.utils import get_from_dict_or_env

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.base import BaseLoader


class RSpaceLoader(BaseLoader):
    """Load content from RSpace notebooks, folders, documents or PDF Gallery files.

    Map RSpace document <-> Langchain Document in 1-1. PDFs are imported using PyPDF.

    Requirements are rspace_client (`pip install rspace_client`) and PyPDF if
    importing PDF docs (`pip install pypdf`).
    """

    def __init__(
        self, global_id: str, api_key: Optional[str] = None, url: Optional[str] = None
    ):
        """Validate the supplied settings and store them on the instance.

        Args:
            global_id: The global ID of the resource to load, e.g. 'SD12344'
                (a single document); 'GL12345' (a PDF file in the gallery);
                'NB4567' (a notebook); 'FL12244' (a folder).
            api_key: RSpace API key - can also be supplied as environment
                variable 'RSPACE_API_KEY'.
            url: The URL of your RSpace instance - can also be supplied as
                environment variable 'RSPACE_URL'.

        Raises:
            ValueError: If ``global_id`` is None.
                NOTE(review): ``get_from_dict_or_env`` presumably also raises
                when neither the argument nor the environment variable is
                set - confirm against langchain_core.
        """
        supplied: Dict[str, Optional[str]] = {
            "api_key": api_key,
            "url": url,
            "global_id": global_id,
        }
        checked: Dict[str, str] = RSpaceLoader.validate_environment(supplied)
        self.api_key: str = checked["api_key"]
        self.url: str = checked["url"]
        self.global_id: str = checked["global_id"]

    @classmethod
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that API key and URL exist in environment.

        The "api_key" and "url" entries are resolved through
        ``get_from_dict_or_env`` (with 'RSPACE_API_KEY' / 'RSPACE_URL' as the
        environment variable names) and written back into ``values``.

        Args:
            values: Mapping with the keys "api_key", "url" and "global_id".
                It is updated in place.

        Returns:
            The same ``values`` dict, with "api_key" and "url" resolved.

        Raises:
            ValueError: If "global_id" is absent or None.
        """
        values["api_key"] = get_from_dict_or_env(values, "api_key", "RSPACE_API_KEY")
        values["url"] = get_from_dict_or_env(values, "url", "RSPACE_URL")
        # A missing key and an explicit None are rejected alike.
        if values.get("global_id") is None:
            raise ValueError(
                "No value supplied for global_id. Please supply an RSpace global ID"
            )
        return values

    def _create_rspace_client(self) -> Any:
        """Create a RSpace client.

        Returns:
            A 2-tuple ``(client, FieldContent)``: the ``ELNClient`` built from
            ``self.url`` / ``self.api_key``, and the ``FieldContent`` class
            from ``rspace_client.eln.field_content``.

        Raises:
            ImportError: If ``rspace_client`` cannot be imported.
            Exception: If the client cannot be constructed or its
                ``get_status()`` call fails. The underlying error is attached
                as ``__cause__``.
        """
        try:
            from rspace_client.eln import eln, field_content
        except ImportError as err:
            raise ImportError("You must run `pip install rspace_client`") from err

        try:
            # Use a separate name so the imported ``eln`` module is not shadowed.
            client = eln.ELNClient(self.url, self.api_key)
            # Call the server once here so a failure is reported by this method
            # rather than on first use of the client.
            client.get_status()
        except Exception as err:
            raise Exception(
                f"Unable to initialise client - is url {self.url} or "
                f"api key correct?"
            ) from err

        return client, field_content.FieldContent