langchain/libs/community/langchain_community/document_loaders/cube_semantic.py

179 lines
6.5 KiB
Python
Raw Normal View History

community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463) Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv 
langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes
2023-12-11 21:53:30 +00:00
import json
import logging
import time
from typing import List
import requests
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
# Module-level logger, named after this module, used by the loader below to
# report progress, retries and failed requests.
logger = logging.getLogger(__name__)
class CubeSemanticLoader(BaseLoader):
    """Load `Cube semantic layer` metadata.

    Args:
        cube_api_url: REST API endpoint.
            Use the REST API of your Cube's deployment.
            Please find out more information here:
            https://cube.dev/docs/http-api/rest#configuration-base-path
        cube_api_token: Cube API token.
            Authentication tokens are generated based on your Cube's API secret.
            Please find out more information here:
            https://cube.dev/docs/security#generating-json-web-tokens-jwt
        load_dimension_values: Whether to load dimension values for every string
            dimension or not.
        dimension_values_limit: Maximum number of dimension values to load.
        dimension_values_max_retries: Maximum number of retries to load dimension
            values.
        dimension_values_retry_delay: Delay between retries to load dimension values.
    """

    def __init__(
        self,
        cube_api_url: str,
        cube_api_token: str,
        load_dimension_values: bool = True,
        dimension_values_limit: int = 10_000,
        dimension_values_max_retries: int = 10,
        dimension_values_retry_delay: int = 3,
    ) -> None:
        """Store the connection settings and dimension-value loading options."""
        self.cube_api_url = cube_api_url
        self.cube_api_token = cube_api_token
        self.load_dimension_values = load_dimension_values
        self.dimension_values_limit = dimension_values_limit
        self.dimension_values_max_retries = dimension_values_max_retries
        self.dimension_values_retry_delay = dimension_values_retry_delay

    def _get_dimension_values(self, dimension_name: str) -> List[str]:
        """Makes a call to Cube's REST API load endpoint to retrieve
        values for dimensions.

        These values can be used to achieve a more accurate filtering.

        The request is repeated, sleeping ``dimension_values_retry_delay``
        seconds between attempts, for as long as the API answers with status
        200 and the error ``"Continue wait"``, up to
        ``dimension_values_max_retries`` attempts.

        Args:
            dimension_name: Name of the dimension whose values are queried.

        Returns:
            The values found under ``dimension_name`` in each row of the
            response's ``"data"`` list, or an empty list when the request
            fails with a non-200 status or the retries are exhausted.
        """
        # Lazy %-formatting; the original literal lacked the f-prefix and
        # therefore never logged the actual dimension name.
        logger.info("Loading dimension values for: %s...", dimension_name)

        headers = {
            "Content-Type": "application/json",
            "Authorization": self.cube_api_token,
        }

        query = {
            "query": {
                "dimensions": [dimension_name],
                "limit": self.dimension_values_limit,
            }
        }

        retries = 0
        while retries < self.dimension_values_max_retries:
            response = requests.request(
                "POST",
                f"{self.cube_api_url}/load",
                headers=headers,
                data=json.dumps(query),
            )

            if response.status_code != 200:
                # A placeholder is required for the argument; without it the
                # logging module fails to format the record and the status
                # code is lost.
                logger.error(
                    "Request failed with status code: %s", response.status_code
                )
                break

            response_data = response.json()
            if (
                "error" in response_data
                and response_data["error"] == "Continue wait"
            ):
                # The result is not available yet: wait and ask again.
                logger.info("Retrying...")
                retries += 1
                time.sleep(self.dimension_values_retry_delay)
                continue

            return [item[dimension_name] for item in response_data["data"]]

        # Only true when the loop ran out of attempts (not after a `break`).
        if retries == self.dimension_values_max_retries:
            logger.info("Maximum retries reached.")
        return []

    def load(self) -> List[Document]:
        """Makes a call to Cube's REST API metadata endpoint.

        Cubes whose ``public`` flag is falsy are skipped. For string
        dimensions, the possible values are additionally fetched when
        ``load_dimension_values`` is enabled.

        Returns:
            A list of documents with attributes:
                - page_content=column_title + column_description
                - metadata
                    - table_name
                    - column_name
                    - column_data_type
                    - column_member_type
                    - column_title
                    - column_description
                    - column_values
                    - cube_data_obj_type

        Raises:
            ValueError: If the metadata response contains no cubes.
            Exception: Whatever ``response.raise_for_status()`` raises for an
                unsuccessful metadata response.
        """
        headers = {
            "Content-Type": "application/json",
            "Authorization": self.cube_api_token,
        }

        logger.info(f"Loading metadata from {self.cube_api_url}...")
        response = requests.get(f"{self.cube_api_url}/meta", headers=headers)
        response.raise_for_status()
        raw_meta_json = response.json()
        cube_data_objects = raw_meta_json.get("cubes", [])

        logger.info(f"Found {len(cube_data_objects)} cube data objects in metadata.")

        if not cube_data_objects:
            raise ValueError("No cubes found in metadata.")

        docs: List[Document] = []

        for cube_data_obj in cube_data_objects:
            cube_data_obj_name = cube_data_obj.get("name")
            cube_data_obj_type = cube_data_obj.get("type")
            cube_data_obj_is_public = cube_data_obj.get("public")
            measures = cube_data_obj.get("measures", [])
            dimensions = cube_data_obj.get("dimensions", [])

            logger.info(f"Processing {cube_data_obj_name}...")

            if not cube_data_obj_is_public:
                logger.info(f"Skipping {cube_data_obj_name} because it is not public.")
                continue

            # Tag every item with the list it came from. Deciding the member
            # type with `item in measures` compares dicts by value, so a
            # dimension equal to some measure would be mislabelled (and each
            # check is a linear scan).
            tagged_items = [("measure", measure) for measure in measures]
            tagged_items += [("dimension", dimension) for dimension in dimensions]

            for column_member_type, item in tagged_items:
                dimension_values: List[str] = []
                item_name = str(item.get("name"))
                item_type = str(item.get("type"))

                if (
                    self.load_dimension_values
                    and column_member_type == "dimension"
                    and item_type == "string"
                ):
                    dimension_values = self._get_dimension_values(item_name)

                metadata = dict(
                    table_name=str(cube_data_obj_name),
                    column_name=item_name,
                    column_data_type=item_type,
                    column_title=str(item.get("title")),
                    column_description=str(item.get("description")),
                    column_member_type=column_member_type,
                    column_values=dimension_values,
                    cube_data_obj_type=cube_data_obj_type,
                )

                page_content = f"{str(item.get('title'))}, "
                page_content += f"{str(item.get('description'))}"

                docs.append(Document(page_content=page_content, metadata=metadata))

        return docs