from __future__ import annotations
import os
from typing import (
TYPE_CHECKING,
List,
Optional,
Union,
)
if TYPE_CHECKING:
import rdflib
class OntotextGraphDBGraph:
    """Ontotext GraphDB https://graphdb.ontotext.com/ wrapper for graph operations.

    The wrapper opens a read connection to a SPARQL endpoint, verifies it with
    an `ASK` query, and loads the ontology schema (either from a local RDF file
    or by running a `CONSTRUCT` query), which is kept serialized as Turtle.

    *Security note*: Make sure that the database connection uses credentials
        that are narrowly-scoped to only include necessary permissions.
        Failure to do so may result in data corruption or loss, since the calling
        code may attempt commands that would result in deletion, mutation
        of data if appropriately prompted or reading sensitive data if such
        data is present in the database.
        The best way to guard against such negative outcomes is to (as appropriate)
        limit the permissions granted to the credentials used with this tool.

        See https://python.langchain.com/docs/security for more information.
    """

    def __init__(
        self,
        query_endpoint: str,
        query_ontology: Optional[str] = None,
        local_file: Optional[str] = None,
        local_file_format: Optional[str] = None,
    ) -> None:
        """
        Set up the GraphDB wrapper

        :param query_endpoint: SPARQL endpoint for queries, read access

        If GraphDB is secured,
        set the environment variables 'GRAPHDB_USERNAME' and 'GRAPHDB_PASSWORD'.

        :param query_ontology: a `CONSTRUCT` query that is executed
        on the SPARQL endpoint and returns the KG schema statements
        Example:
        'CONSTRUCT {?s ?p ?o} FROM <https://example.com/ontology/> WHERE {?s ?p ?o}'
        Currently, DESCRIBE queries like
        'PREFIX onto: <https://example.com/ontology/>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        DESCRIBE ?term WHERE {
            ?term rdfs:isDefinedBy onto:
        }'
        are not supported, because DESCRIBE returns
        the Symmetric Concise Bounded Description (SCBD),
        i.e. also the incoming class links.
        In case of large graphs with millions of instances, this is not efficient.
        Check https://github.com/eclipse-rdf4j/rdf4j/issues/4857

        :param local_file: a local RDF ontology file.
        Supported RDF formats:
        Turtle, RDF/XML, JSON-LD, N-Triples, Notation-3, Trig, Trix, N-Quads.
        If the rdf format can't be determined from the file extension,
        pass explicitly the rdf format in `local_file_format` param.

        :param local_file_format: Used if the rdf format can't be determined
        from the local file extension.
        One of "json-ld", "xml", "n3", "turtle", "nt", "trig", "nquads", "trix"

        Either `query_ontology` or `local_file` should be passed.

        :raises ValueError: if both or neither of `query_ontology` and
        `local_file` are given, if rdflib cannot be imported, if the endpoint
        cannot be queried, or if the ontology source is invalid.
        """
        # Validate the arguments before touching the optional dependency or
        # the network, so misuse is reported immediately.
        if query_ontology and local_file:
            raise ValueError("Both file and query provided. Only one is allowed.")
        if not query_ontology and not local_file:
            raise ValueError("Neither file nor query provided. One is required.")

        try:
            import rdflib
            from rdflib.plugins.stores import sparqlstore
        except ImportError as e:
            # ValueError is kept for backward compatibility with existing
            # callers; the original ImportError is preserved as the cause.
            raise ValueError(
                "Could not import rdflib python package. "
                "Please install it with `pip install rdflib`."
            ) from e

        auth = self._get_auth()
        store = sparqlstore.SPARQLStore(auth=auth)
        store.open(query_endpoint)

        # Graph backed by the remote SPARQL store; all queries go through it.
        self.graph = rdflib.Graph(store, identifier=None, bind_namespaces="none")
        self._check_connectivity()

        if local_file:
            ontology_schema_graph = self._load_ontology_schema_from_file(
                local_file,
                local_file_format,
            )
        else:
            self._validate_user_query(query_ontology)  # type: ignore[arg-type]
            ontology_schema_graph = self._load_ontology_schema_with_query(
                query_ontology  # type: ignore[arg-type]
            )

        # Ontology schema serialized as Turtle, exposed through `get_schema`.
        self.schema = ontology_schema_graph.serialize(format="turtle")

    @staticmethod
    def _get_auth() -> Union[tuple, None]:
        """
        Returns the basic authentication configuration

        Reads 'GRAPHDB_USERNAME' and 'GRAPHDB_PASSWORD' from the environment.

        :return: a `(username, password)` tuple, or None if no username is set
        :raises ValueError: if the username is set but the password is not
        """
        username = os.environ.get("GRAPHDB_USERNAME", None)
        password = os.environ.get("GRAPHDB_PASSWORD", None)

        if not username:
            return None
        if not password:
            raise ValueError(
                "Environment variable 'GRAPHDB_USERNAME' is set, "
                "but 'GRAPHDB_PASSWORD' is not set."
            )
        return username, password

    def _check_connectivity(self) -> None:
        """
        Executes a simple `ASK` query to check connectivity

        :raises ValueError: if the query against `self.graph` raises ValueError;
        the original error is chained as the cause.
        """
        try:
            self.graph.query("ASK { ?s ?p ?o }")
        except ValueError as e:
            raise ValueError(
                "Could not query the provided endpoint. "
                "Please, check, if the value of the provided "
                "query_endpoint points to the right repository. "
                "If GraphDB is secured, please, "
                "make sure that the environment variables "
                "'GRAPHDB_USERNAME' and 'GRAPHDB_PASSWORD' are set."
            ) from e

    @staticmethod
    def _load_ontology_schema_from_file(
        local_file: str, local_file_format: Optional[str] = None
    ) -> rdflib.Graph:
        """
        Parse the ontology schema statements from the provided file

        :param local_file: path of the RDF file to parse
        :param local_file_format: explicit rdflib format name; when None it is
        passed through to rdflib as-is
        :return: the graph parsed from the file
        :raises FileNotFoundError: if `local_file` does not exist
        :raises PermissionError: if `local_file` is not readable
        :raises ValueError: if the file cannot be parsed
        """
        # Check the file before importing rdflib so that a bad path is
        # reported as such.
        if not os.path.exists(local_file):
            raise FileNotFoundError(f"File {local_file} does not exist.")
        if not os.access(local_file, os.R_OK):
            raise PermissionError(f"Read permission for {local_file} is restricted")

        import rdflib

        graph = rdflib.ConjunctiveGraph()
        try:
            graph.parse(local_file, format=local_file_format)
        except Exception as e:
            # Put the parser error in the message and chain it, instead of
            # passing it as a second exception argument.
            raise ValueError(f"Invalid file format for {local_file} : {e}") from e
        return graph

    @staticmethod
    def _validate_user_query(query_ontology: str) -> None:
        """
        Validate the query is a valid SPARQL CONSTRUCT query

        :param query_ontology: the SPARQL query text to validate
        :raises TypeError: if `query_ontology` is not a string
        :raises ValueError: if the query cannot be parsed or is not a
        CONSTRUCT query
        """
        # Type check first: it needs no third-party package.
        if not isinstance(query_ontology, str):
            raise TypeError("Ontology query must be provided as string.")

        from pyparsing import ParseException
        from rdflib.plugins.sparql import prepareQuery

        try:
            parsed_query = prepareQuery(query_ontology)
        except ParseException as e:
            raise ValueError(
                f"Ontology query is not a valid SPARQL query. {e}"
            ) from e

        if parsed_query.algebra.name != "ConstructQuery":
            raise ValueError(
                "Invalid query type. Only CONSTRUCT queries are supported."
            )

    def _load_ontology_schema_with_query(self, query: str):  # type: ignore[no-untyped-def]
        """
        Execute the query for collecting the ontology schema statements

        :param query: the query to run against `self.graph`
        :return: the `graph` attribute of the query result
        :raises ValueError: if rdflib raises ParserError for the query
        """
        from rdflib.exceptions import ParserError

        try:
            results = self.graph.query(query)
        except ParserError as e:
            raise ValueError(f"Generated SPARQL statement is invalid\n{e}") from e

        return results.graph

    @property
    def get_schema(self) -> str:
        """
        Returns the schema of the graph database in turtle format
        """
        return self.schema

    def query(
        self,
        query: str,
    ) -> List[rdflib.query.ResultRow]:
        """
        Query the graph.

        :param query: the SPARQL query to execute against `self.graph`
        :return: the result items that are `ResultRow` instances; any other
        result items are dropped
        """
        from rdflib.query import ResultRow

        res = self.graph.query(query)
        return [r for r in res if isinstance(r, ResultRow)]