2023-12-11 21:53:30 +00:00
|
|
|
import os
|
|
|
|
from typing import Any, Dict, Iterator, List, Optional, Union
|
|
|
|
|
|
|
|
from langchain_core.documents import Document
|
|
|
|
from langchain_core.utils import get_from_dict_or_env
|
|
|
|
|
|
|
|
from langchain_community.document_loaders import PyPDFLoader
|
|
|
|
from langchain_community.document_loaders.base import BaseLoader
|
|
|
|
|
|
|
|
|
|
|
|
class RSpaceLoader(BaseLoader):
    """Load content from RSpace notebooks, folders, documents or PDF Gallery files.

    Maps RSpace documents to Langchain Documents one-to-one. PDFs are imported
    using PyPDF.

    Requirements are rspace_client (`pip install rspace_client`) and PyPDF if
    importing PDF docs (`pip install pypdf`).
    """

    def __init__(
        self, global_id: str, api_key: Optional[str] = None, url: Optional[str] = None
    ):
        """Store the connection settings and the ID of the resource to load.

        Args:
            global_id: The global ID of the resource to load, e.g. 'SD12344'
                (a single document); 'GL12345' (a PDF file in the gallery);
                'NB4567' (a notebook); 'FL12244' (a folder).
            api_key: RSpace API key - can also be supplied as environment
                variable 'RSPACE_API_KEY'.
            url: The URL of your RSpace instance - can also be supplied as
                environment variable 'RSPACE_URL'.

        Raises:
            ValueError: If ``global_id`` is None.
        """
        args: Dict[str, Optional[str]] = {
            "api_key": api_key,
            "url": url,
            "global_id": global_id,
        }
        verified_args: Dict[str, str] = RSpaceLoader.validate_environment(args)
        self.api_key = verified_args["api_key"]
        self.url = verified_args["url"]
        self.global_id: str = verified_args["global_id"]

    @classmethod
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that API key and URL exist in environment.

        ``api_key`` and ``url`` are resolved through ``get_from_dict_or_env``
        (falling back to 'RSPACE_API_KEY' / 'RSPACE_URL'); presumably that
        helper raises when neither source provides a value - confirm against
        ``langchain_core.utils``.

        Args:
            values: Mapping with optional 'api_key', 'url' and 'global_id'
                entries. It is updated in place with the resolved values.

        Returns:
            The same ``values`` mapping, with 'api_key' and 'url' resolved.

        Raises:
            ValueError: If 'global_id' is missing or None.
        """
        values["api_key"] = get_from_dict_or_env(values, "api_key", "RSPACE_API_KEY")
        values["url"] = get_from_dict_or_env(values, "url", "RSPACE_URL")
        if values.get("global_id") is None:
            raise ValueError(
                "No value supplied for global_id. Please supply an RSpace global ID"
            )
        return values

    def _create_rspace_client(self) -> Any:
        """Create a RSpace client.

        Returns:
            A ``(client, FieldContent)`` tuple: the connected ``ELNClient`` and
            the ``FieldContent`` class used to extract text from field content.

        Raises:
            ImportError: If ``rspace_client`` is not installed.
            Exception: If the client cannot be created or the status call fails.
        """
        try:
            from rspace_client.eln import eln, field_content
        except ImportError as e:
            raise ImportError("You must run `pip install rspace_client`") from e

        try:
            # Use a distinct local name so the imported `eln` module is not
            # shadowed by the client instance.
            client = eln.ELNClient(self.url, self.api_key)
            # Called only for its side effect: any failure here is reported
            # as a bad URL / API key below.
            client.get_status()
        except Exception as e:
            raise Exception(
                f"Unable to initialize client - is url {self.url} or "
                f"api key correct?"
            ) from e

        return client, field_content.FieldContent

    def _get_doc(self, cli: Any, field_content: Any, d_id: Union[str, int]) -> Document:
        """Fetch one RSpace document and flatten it into a Langchain Document.

        The page content is the document name wrapped in an ``<h2>`` element,
        followed by each field's name and extracted text on separate lines.

        Args:
            cli: RSpace client as returned by ``_create_rspace_client``.
            field_content: Callable wrapping a field's raw content; the result
                must provide ``get_text()``.
            d_id: Identifier passed straight to ``cli.get_document``.

        Returns:
            A Document whose ``source`` metadata is
            ``"rspace: <name>-<globalId>"``.
        """
        doc = cli.get_document(d_id)
        # Closing tag was previously emitted as the malformed `<h2/>`.
        parts: List[str] = [f"<h2>{doc['name']}</h2>"]
        for field in doc["fields"]:
            parts.append(f"{field['name']}\n")
            parts.append(field_content(field["content"]).get_text())
            parts.append("\n")
        return Document(
            metadata={"source": f"rspace: {doc['name']}-{doc['globalId']}"},
            page_content="".join(parts),
        )

    def _load_structured_doc(self) -> Iterator[Document]:
        """Yield the single document identified by ``self.global_id``."""
        cli, field_content = self._create_rspace_client()
        yield self._get_doc(cli, field_content, self.global_id)

    def _load_folder_tree(self) -> Iterator[Document]:
        """Yield every document listed under the folder or notebook.

        The two-character type prefix is stripped from ``self.global_id``
        before it is passed as ``folder_id``. Only records of type
        'document' are requested.
        """
        cli, field_content = self._create_rspace_client()
        if not self.global_id:
            return
        docs_in_folder = cli.list_folder_tree(
            folder_id=self.global_id[2:], typesToInclude=["document"]
        )
        doc_ids: List[int] = [d["id"] for d in docs_in_folder["records"]]
        for doc_id in doc_ids:
            yield self._get_doc(cli, field_content, doc_id)

    def _load_pdf(self) -> Iterator[Document]:
        """Download a Gallery PDF and yield the Documents PyPDF produces.

        Files whose name does not end in '.pdf' (case-insensitive) yield
        nothing. Each yielded Document gets an ``rspace_src`` metadata entry
        holding ``self.global_id``.
        """
        cli, _ = self._create_rspace_client()
        file_info = cli.get_file_info(self.global_id)
        _, ext = os.path.splitext(file_info["name"])
        if ext.lower() != ".pdf":
            return
        # NOTE(review): the PDF is written as a relative path (so presumably
        # the current working directory) and is never removed. Kept as-is
        # because PyPDFLoader is handed this path and relocating the file
        # would presumably alter the metadata it produces.
        outfile = f"{self.global_id}.pdf"
        cli.download_file(self.global_id, outfile)
        pdf_loader = PyPDFLoader(outfile)
        for pdf in pdf_loader.lazy_load():
            pdf.metadata["rspace_src"] = self.global_id
            yield pdf

    def lazy_load(self) -> Iterator[Document]:
        """Lazily load Documents for the configured global ID.

        Dispatches on the two-character prefix of ``self.global_id``:
        'GL' (Gallery PDF), 'SD' (single document), 'FL' / 'NB' (folder or
        notebook).

        Raises:
            ValueError: If the prefix is not one of the supported types.
        """
        # All types are matched on the prefix so that an ID merely
        # containing "GL" or "SD" elsewhere is not misrouted.
        prefix = self.global_id[:2] if self.global_id else ""
        if prefix == "GL":
            yield from self._load_pdf()
        elif prefix == "SD":
            yield from self._load_structured_doc()
        elif prefix in ("FL", "NB"):
            yield from self._load_folder_tree()
        else:
            raise ValueError("Unknown global ID type")
|