2024-02-13 05:30:20 +00:00
|
|
|
import json
|
|
|
|
from types import SimpleNamespace
|
|
|
|
from typing import Any, Dict, Optional, Sequence
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
2024-03-26 14:36:51 +00:00
|
|
|
# SPARQL query selecting every distinct resource typed as owl:DatatypeProperty.
DTPROP_QUERY = """
SELECT DISTINCT ?elem
WHERE {
?elem a owl:DatatypeProperty .
}
"""

# SPARQL query selecting every distinct resource typed as owl:ObjectProperty.
OPROP_QUERY = """
SELECT DISTINCT ?elem
WHERE {
?elem a owl:ObjectProperty .
}
"""

# Schema element kinds, in the order they are processed, mapped to the SPARQL
# query that lists them. A value of None means no query is associated with
# that kind.
ELEM_TYPES: Dict[str, Optional[str]] = {
    "classes": None,
    "rels": None,
    "dtprops": DTPROP_QUERY,
    "oprops": OPROP_QUERY,
}
|
|
|
|
|
|
|
|
|
|
|
|
class NeptuneRdfGraph:
    """Neptune wrapper for RDF graph operations.

    Args:
        host: endpoint for the database instance
        port: port number for the database instance, default is 8182
        use_iam_auth: boolean indicating IAM auth is enabled in Neptune cluster
        use_https: whether to use secure connection, default is True
        client: optional boto3 Neptune client
        credentials_profile_name: optional AWS profile name
        region_name: optional AWS region, e.g., us-west-2
        service: optional service name, default is neptunedata
        sign: optional, whether to sign the request payload, default is True

    Example:
        .. code-block:: python

            graph = NeptuneRdfGraph(
                host='<SPARQL host>',
                port=<SPARQL port>
            )
            schema = graph.get_schema

            OR
            graph = NeptuneRdfGraph(
                host='<SPARQL host>',
                port=<SPARQL port>
            )
            schema_elem = graph.get_schema_elements
            #... change schema_elements ...
            graph.load_schema(schema_elem)

    *Security note*: Make sure that the database connection uses credentials
        that are narrowly-scoped to only include necessary permissions.
        Failure to do so may result in data corruption or loss, since the calling
        code may attempt commands that would result in deletion, mutation
        of data if appropriately prompted or reading sensitive data if such
        data is present in the database.
        The best way to guard against such negative outcomes is to (as appropriate)
        limit the permissions granted to the credentials used with this tool.

        See https://python.langchain.com/docs/security for more information.
    """

    def __init__(
        self,
        host: str,
        port: int = 8182,
        use_https: bool = True,
        use_iam_auth: bool = False,
        client: Any = None,
        credentials_profile_name: Optional[str] = None,
        region_name: Optional[str] = None,
        service: str = "neptunedata",
        sign: bool = True,
    ) -> None:
        """Create the Neptune client (unless one is supplied) and load the schema.

        Raises:
            ImportError: if boto3 cannot be imported, or if creating the client
                fails with an ``UnknownServiceError``.
            ValueError: if session or client creation fails for any other
                reason.
        """
        protocol = "https" if use_https else "http"

        self.use_iam_auth = use_iam_auth
        self.region_name = region_name
        # The SPARQL endpoint follows the same scheme as the boto3 endpoint_url.
        self.query_endpoint = f"{protocol}://{host}:{port}/sparql"
        # Stays None unless a boto3 session is required (see below).
        self.session: Any = None

        try:
            # A session is needed to build the client and also to sign SPARQL
            # requests in query() when IAM auth is on, even if the caller
            # supplied a ready-made client.
            if client is None or use_iam_auth:
                import boto3

                if credentials_profile_name is not None:
                    self.session = boto3.Session(profile_name=credentials_profile_name)
                else:
                    # use default credentials
                    self.session = boto3.Session()

            if client is not None:
                self.client = client
            else:
                client_params = {}
                if region_name:
                    client_params["region_name"] = region_name
                client_params["endpoint_url"] = f"{protocol}://{host}:{port}"

                if sign:
                    self.client = self.session.client(service, **client_params)
                else:
                    from botocore import UNSIGNED
                    from botocore.config import Config

                    self.client = self.session.client(
                        service,
                        **client_params,
                        config=Config(signature_version=UNSIGNED),
                    )

        except ImportError as e:
            raise ImportError(
                "Could not import boto3 python package. "
                "Please install it with `pip install boto3`."
            ) from e
        except Exception as e:
            # Matched by class name so botocore need not be importable here.
            if type(e).__name__ == "UnknownServiceError":
                raise ImportError(
                    "NeptuneGraph requires a boto3 version 1.28.38 or greater. "
                    "Please install it with `pip install -U boto3`."
                ) from e
            else:
                raise ValueError(
                    "Could not load credentials to authenticate with AWS client. "
                    "Please check that credentials in the specified "
                    "profile name are valid."
                ) from e

        # Set schema
        self.schema = ""
        self.schema_elements: Dict[str, Any] = {}
        self._refresh_schema()

    @property
    def get_schema(self) -> str:
        """
        Returns the schema of the graph database.
        """
        return self.schema

    @property
    def get_schema_elements(self) -> Dict[str, Any]:
        """
        Returns the schema elements dictionary the schema text is built from.
        """
        return self.schema_elements

    def get_summary(self) -> Dict[str, Any]:
        """
        Obtain Neptune statistical summary of classes and predicates in the graph.
        """
        return self.client.get_rdf_graph_summary(mode="detailed")

    def query(
        self,
        query: str,
    ) -> Dict[str, Any]:
        """
        Run Neptune query.

        POSTs the query as form data to ``self.query_endpoint`` and returns the
        JSON-decoded response body. When ``use_iam_auth`` is set, the request
        is signed with SigV4 using the session's credentials.

        Raises:
            ValueError: if IAM auth is enabled but the session has no
                credentials.
        """
        payload = {"query": query}
        headers: Any

        if self.use_iam_auth:
            from botocore.auth import SigV4Auth
            from botocore.awsrequest import AWSRequest

            credentials = self.session.get_credentials()
            if credentials is None:
                raise ValueError(
                    "Could not find AWS credentials to sign the Neptune request."
                )
            frozen = credentials.get_frozen_credentials()
            creds = SimpleNamespace(
                access_key=frozen.access_key,
                secret_key=frozen.secret_key,
                token=frozen.token,
                region=self.region_name,
            )

            request = AWSRequest(
                method="POST", url=self.query_endpoint, data=payload, params=None
            )
            SigV4Auth(creds, "neptune-db", self.region_name).add_auth(request)
            request.headers["Content-Type"] = "application/x-www-form-urlencoded"
            headers = request.headers
        else:
            headers = {"Content-Type": "application/x-www-form-urlencoded"}

        response = requests.request(
            method="POST", url=self.query_endpoint, headers=headers, data=payload
        )
        return json.loads(response.text)

    def load_schema(self, schema_elements: Dict[str, Any]) -> None:
        """
        Generates and sets schema from schema_elements. Helpful in
        cases where introspected schema needs pruning.

        Each key of ``ELEM_TYPES`` is looked up in ``schema_elements``; its
        value is a list of records with ``"uri"`` and ``"local"`` entries. A
        missing or empty entry produces an empty line in the schema text.
        """
        elem_str = {
            elem: ", ".join(
                f"<{rec['uri']}> ({rec['local']})"
                for rec in (schema_elements.get(elem) or [])
            )
            for elem in ELEM_TYPES
        }

        self.schema = (
            "In the following, each IRI is followed by the local name and "
            "optionally its description in parentheses. \n"
            "The graph supports the following node types:\n"
            f"{elem_str['classes']}\n"
            "The graph supports the following relationships:\n"
            f"{elem_str['rels']}\n"
            "The graph supports the following OWL object properties:\n"
            f"{elem_str['oprops']}\n"
            "The graph supports the following OWL data properties:\n"
            f"{elem_str['dtprops']}"
        )

    def _get_local_name(self, iri: str) -> Sequence[str]:
        """
        Split IRI into prefix and local

        The split happens at the last ``#`` if the IRI has one, otherwise at
        the last ``/``. The separator stays on the prefix, so prefix + local
        equals the input.

        Raises:
            ValueError: if the IRI contains neither ``#`` nor ``/``.
        """
        for separator in ("#", "/"):
            prefix, found, local = iri.rpartition(separator)
            if found:
                return [f"{prefix}{separator}", local]
        raise ValueError(f"Unexpected IRI '{iri}', contains neither '#' nor '/'.")

    def _register_elements(self, uris: Sequence[str]) -> Sequence[Dict[str, str]]:
        """
        Build uri/local records for the given IRIs and record their prefixes
        in ``self.schema_elements["distinct_prefixes"]``.
        """
        prefixes = self.schema_elements["distinct_prefixes"]
        records = []
        for uri in uris:
            prefix, local = self._get_local_name(uri)
            records.append({"uri": uri, "local": local})
            prefixes.setdefault(prefix, "y")
        return records

    def _refresh_schema(self) -> None:
        """
        Query Neptune to introspect schema.
        """
        self.schema_elements["distinct_prefixes"] = {}

        # get summary and build list of classes and rels
        graph_summary = self.get_summary()["payload"]["graphSummary"]
        self.schema_elements["classes"] = self._register_elements(
            graph_summary["classes"]
        )
        # Iterating each predicate entry yields the predicate IRIs.
        predicate_uris = [
            uri for predicate in graph_summary["predicates"] for uri in predicate
        ]
        self.schema_elements["rels"] = self._register_elements(predicate_uris)

        # get dtprops and oprops too
        for elem, sparql in ELEM_TYPES.items():
            if not sparql:
                continue
            items = self.query(sparql)
            uris = [row["elem"]["value"] for row in items["results"]["bindings"]]
            self.schema_elements[elem] = self._register_elements(uris)

        self.load_schema(self.schema_elements)
|