import os
from typing import Any, Dict, Iterator, List, Optional, Union

from langchain_core.documents import Document
from langchain_core.utils import get_from_dict_or_env

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.base import BaseLoader


class RSpaceLoader(BaseLoader):
    """Load content from RSpace notebooks, folders, documents or PDF Gallery files.

    Each RSpace document maps to exactly one Langchain ``Document``.
    PDFs are imported using PyPDF.

    Requirements are rspace_client (`pip install rspace_client`) and PyPDF if
    importing PDF docs (`pip install pypdf`).
    """

    def __init__(
        self, global_id: str, api_key: Optional[str] = None, url: Optional[str] = None
    ):
        """Store the validated connection settings and target resource ID.

        Args:
            global_id: The global ID of the resource to load, e.g. 'SD12344'
                (a single document); 'GL12345' (a PDF file in the gallery);
                'NB4567' (a notebook); 'FL12244' (a folder).
            api_key: RSpace API key - can also be supplied as environment
                variable 'RSPACE_API_KEY'.
            url: The URL of your RSpace instance - can also be supplied as
                environment variable 'RSPACE_URL'.

        Raises:
            ValueError: If ``global_id`` is None.
        """
        args: Dict[str, Optional[str]] = {
            "api_key": api_key,
            "url": url,
            "global_id": global_id,
        }
        verified_args: Dict[str, str] = RSpaceLoader.validate_environment(args)
        self.api_key = verified_args["api_key"]
        self.url = verified_args["url"]
        self.global_id: str = verified_args["global_id"]

    @classmethod
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that API key and URL exist in environment.

        ``values`` is updated in place: "api_key" and "url" are resolved via
        ``get_from_dict_or_env`` (falling back to 'RSPACE_API_KEY' and
        'RSPACE_URL' respectively), and the same dict is returned.

        Raises:
            ValueError: If "global_id" is absent from ``values`` or is None.
        """
        values["api_key"] = get_from_dict_or_env(values, "api_key", "RSPACE_API_KEY")
        values["url"] = get_from_dict_or_env(values, "url", "RSPACE_URL")
        if values.get("global_id") is None:
            raise ValueError(
                "No value supplied for global_id. Please supply an RSpace global ID"
            )
        return values

    def _create_rspace_client(self) -> Any:
        """Create a RSpace client.

        Returns:
            A ``(client, FieldContent)`` pair: a connected ``ELNClient`` and
            the ``FieldContent`` class used to extract text from field content.

        Raises:
            ImportError: If ``rspace_client`` is not installed.
            Exception: If the client cannot be created or ``get_status()``
                fails; the underlying error is attached as ``__cause__``.
        """
        try:
            # Imported lazily so rspace_client is only needed when loading.
            from rspace_client.eln import eln, field_content
        except ImportError as e:
            raise ImportError("You must run " "`pip install rspace_client`") from e

        try:
            client = eln.ELNClient(self.url, self.api_key)
            # Round-trip to the server so bad credentials fail here, not later.
            client.get_status()
        except Exception as e:
            raise Exception(
                f"Unable to initialize client - is url {self.url} or "
                f"api key correct?"
            ) from e

        return client, field_content.FieldContent

    def _get_doc(self, cli: Any, field_content: Any, d_id: Union[str, int]) -> Document:
        """Fetch one RSpace document and flatten it into a Langchain Document.

        The page content is the document name set off by blank lines, followed
        by each field's name and its extracted text, one per line.

        Args:
            cli: Client exposing ``get_document``.
            field_content: Callable wrapping raw field content in an object
                that exposes ``get_text()``.
            d_id: Identifier passed straight through to ``cli.get_document``.
        """
        doc = cli.get_document(d_id)
        parts: List[str] = [f"\n\n{doc['name']}\n\n"]
        for field in doc["fields"]:
            parts.append(f"{field['name']}\n")
            parts.append(field_content(field["content"]).get_text())
            parts.append("\n")
        return Document(
            metadata={"source": f"rspace: {doc['name']}-{doc['globalId']}"},
            page_content="".join(parts),
        )

    def _load_structured_doc(self) -> Iterator[Document]:
        """Yield the single structured document identified by ``global_id``."""
        cli, field_content = self._create_rspace_client()
        yield self._get_doc(cli, field_content, self.global_id)

    def _load_folder_tree(self) -> Iterator[Document]:
        """Yield every document listed under the folder or notebook.

        Raises:
            ValueError: If ``global_id`` is empty.
        """
        # Fail before any network call; without this guard an empty ID would
        # surface as an unbound local variable further down.
        if not self.global_id:
            raise ValueError("No global ID set - cannot list folder contents")

        cli, field_content = self._create_rspace_client()
        docs_in_folder = cli.list_folder_tree(
            # Drop the two-letter type prefix ("FL"/"NB") from the global ID.
            folder_id=self.global_id[2:],
            typesToInclude=["document"],
        )
        doc_ids: List[int] = [d["id"] for d in docs_in_folder["records"]]
        for doc_id in doc_ids:
            yield self._get_doc(cli, field_content, doc_id)

    def _load_pdf(self) -> Iterator[Document]:
        """Download a Gallery file and yield its pages if it is a PDF.

        Files whose name does not end in ".pdf" (case-insensitive) yield
        nothing. Each yielded page gets ``metadata["rspace_src"]`` set to the
        global ID.
        """
        cli, field_content = self._create_rspace_client()
        file_info = cli.get_file_info(self.global_id)
        _, ext = os.path.splitext(file_info["name"])
        if ext.lower() != ".pdf":
            return

        # NOTE(review): the download lands in the current working directory
        # and is never removed. Kept as-is because the path is handed to
        # PyPDFLoader and callers may rely on the file staying in place.
        outfile = f"{self.global_id}.pdf"
        cli.download_file(self.global_id, outfile)
        pdf_loader = PyPDFLoader(outfile)
        for pdf in pdf_loader.lazy_load():
            pdf.metadata["rspace_src"] = self.global_id
            yield pdf

    def lazy_load(self) -> Iterator[Document]:
        """Lazily load documents, dispatching on the global ID's type prefix.

        'GL' loads a Gallery PDF, 'SD' a single structured document, and
        'FL' / 'NB' every document in a folder / notebook.

        Raises:
            ValueError: If the global ID does not start with a known prefix.
        """
        # Match on the leading two characters for every type so an ID that
        # merely contains "GL" or "SD" elsewhere is not misrouted.
        prefix = self.global_id[:2] if self.global_id else ""
        if prefix == "GL":
            yield from self._load_pdf()
        elif prefix == "SD":
            yield from self._load_structured_doc()
        elif prefix in ("FL", "NB"):
            yield from self._load_folder_tree()
        else:
            raise ValueError("Unknown global ID type")