mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Extend Cube Semantic Loader functionality (#8186)
**PR Description:** This pull request introduces several enhancements and new features to the `CubeSemanticLoader`. The changes include the following:

1. Added imports for the `json`, `logging`, and `time` modules.
2. Added new constructor parameters: `load_dimension_values`, `dimension_values_limit`, `dimension_values_max_retries`, and `dimension_values_retry_delay`.
3. Updated the class documentation with descriptions for the new constructor parameters.
4. Added a new private method `_get_dimension_values()` to retrieve dimension values from Cube's REST API.
5. Modified the `load()` method to load dimension values for string dimensions if `load_dimension_values` is set to `True`.
6. Updated the API endpoint in the `load()` method from the base URL to the metadata endpoint.
7. Refactored the code to retrieve metadata from the response JSON.
8. Added the `column_member_type` field to the metadata dictionary to indicate if a column is a measure or a dimension.
9. Added the `column_values` field to the metadata dictionary to store the dimension values retrieved from Cube's API.
10. Modified the `page_content` construction to include the column title and description instead of the table name, column name, data type, title, and description.

These changes improve the functionality and flexibility of the `CubeSemanticLoader` class by allowing the loading of dimension values and providing more detailed metadata for each document.

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
82b8d8596c
commit
d983046f90
@ -53,11 +53,23 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
|
"**Input arguments (mandatory)**\n",
|
||||||
|
"\n",
|
||||||
"`Cube Semantic Loader` requires 2 arguments:\n",
|
"`Cube Semantic Loader` requires 2 arguments:\n",
|
||||||
"| Input Parameter | Description |\n",
|
"\n",
|
||||||
"| --- | --- |\n",
|
"- `cube_api_url`: The URL of your Cube's deployment REST API. Please refer to the [Cube documentation](https://cube.dev/docs/http-api/rest#configuration-base-path) for more information on configuring the base path.\n",
|
||||||
"| `cube_api_url` | The URL of your Cube's deployment REST API. Please refer to the [Cube documentation](https://cube.dev/docs/http-api/rest#configuration-base-path) for more information on configuring the base path. |\n",
|
"\n",
|
||||||
"| `cube_api_token` | The authentication token generated based on your Cube's API secret. Please refer to the [Cube documentation](https://cube.dev/docs/security#generating-json-web-tokens-jwt) for instructions on generating JSON Web Tokens (JWT). |\n"
|
"- `cube_api_token`: The authentication token generated based on your Cube's API secret. Please refer to the [Cube documentation](https://cube.dev/docs/security#generating-json-web-tokens-jwt) for instructions on generating JSON Web Tokens (JWT).\n",
|
||||||
|
"\n",
|
||||||
|
"**Input arguments (optional)**\n",
|
||||||
|
"\n",
|
||||||
|
"- `load_dimension_values`: Whether to load dimension values for every string dimension or not.\n",
|
||||||
|
"\n",
|
||||||
|
"- `dimension_values_limit`: Maximum number of dimension values to load.\n",
|
||||||
|
"\n",
|
||||||
|
"- `dimension_values_max_retries`: Maximum number of retries to load dimension values.\n",
|
||||||
|
"\n",
|
||||||
|
"- `dimension_values_retry_delay`: Delay, in seconds, between retries to load dimension values."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -85,9 +97,7 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"Returns:\n",
|
"Returns a list of documents with the following attributes:\n",
|
||||||
"\n",
|
|
||||||
"A list of documents with the following attributes:\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"- `page_content`\n",
|
"- `page_content`\n",
|
||||||
"- `metadata`\n",
|
"- `metadata`\n",
|
||||||
@ -95,7 +105,8 @@
|
|||||||
" - `column_name`\n",
|
" - `column_name`\n",
|
||||||
" - `column_data_type`\n",
|
" - `column_data_type`\n",
|
||||||
" - `column_title`\n",
|
" - `column_title`\n",
|
||||||
" - `column_description`"
|
" - `column_description`\n",
|
||||||
|
" - `column_member_type`\n",
|
||||||
|
" - `column_values`"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -103,7 +114,7 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"> page_content='table name: orders_view, column name: orders_view.total_amount, column data type: number, column title: Orders View Total Amount, column description: None' metadata={'table_name': 'orders_view', 'column_name': 'orders_view.total_amount', 'column_data_type': 'number', 'column_title': 'Orders View Total Amount', 'column_description': 'None'}"
|
"> page_content='Users View City, None' metadata={'table_name': 'users_view', 'column_name': 'users_view.city', 'column_data_type': 'string', 'column_title': 'Users View City', 'column_description': 'None', 'column_member_type': 'dimension', 'column_values': ['Austin', 'Chicago', 'Los Angeles', 'Mountain View', 'New York', 'Palo Alto', 'San Francisco', 'Seattle']}"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
@ -1,3 +1,6 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
@ -5,45 +8,118 @@ import requests
|
|||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
from langchain.document_loaders.base import BaseLoader
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class CubeSemanticLoader(BaseLoader):
|
class CubeSemanticLoader(BaseLoader):
|
||||||
"""Load Cube semantic layer metadata."""
|
"""Load Cube semantic layer metadata.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
cube_api_url: REST API endpoint.
|
||||||
|
Use the REST API of your Cube's deployment.
|
||||||
|
Please find out more information here:
|
||||||
|
https://cube.dev/docs/http-api/rest#configuration-base-path
|
||||||
|
cube_api_token: Cube API token.
|
||||||
|
Authentication tokens are generated based on your Cube's API secret.
|
||||||
|
Please find out more information here:
|
||||||
|
https://cube.dev/docs/security#generating-json-web-tokens-jwt
|
||||||
|
load_dimension_values: Whether to load dimension values for every string
|
||||||
|
dimension or not.
|
||||||
|
dimension_values_limit: Maximum number of dimension values to load.
|
||||||
|
dimension_values_max_retries: Maximum number of retries to load dimension
|
||||||
|
values.
|
||||||
|
dimension_values_retry_delay: Delay between retries to load dimension values.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
cube_api_url: str,
|
cube_api_url: str,
|
||||||
cube_api_token: str,
|
cube_api_token: str,
|
||||||
|
load_dimension_values: bool = True,
|
||||||
|
dimension_values_limit: int = 10_000,
|
||||||
|
dimension_values_max_retries: int = 10,
|
||||||
|
dimension_values_retry_delay: int = 3,
|
||||||
):
|
):
|
||||||
self.cube_api_url = cube_api_url
|
self.cube_api_url = cube_api_url
|
||||||
"""Use the REST API of your Cube's deployment.
|
|
||||||
Please find out more information here:
|
|
||||||
https://cube.dev/docs/http-api/rest#configuration-base-path
|
|
||||||
"""
|
|
||||||
self.cube_api_token = cube_api_token
|
self.cube_api_token = cube_api_token
|
||||||
"""Authentication tokens are generated based on your Cube's API secret.
|
self.load_dimension_values = load_dimension_values
|
||||||
Please find out more information here:
|
self.dimension_values_limit = dimension_values_limit
|
||||||
https://cube.dev/docs/security#generating-json-web-tokens-jwt
|
self.dimension_values_max_retries = dimension_values_max_retries
|
||||||
|
self.dimension_values_retry_delay = dimension_values_retry_delay
|
||||||
|
|
||||||
|
def _get_dimension_values(self, dimension_name: str) -> List[str]:
|
||||||
|
"""Makes a call to Cube's REST API load endpoint to retrieve
|
||||||
|
values for dimensions.
|
||||||
|
|
||||||
|
These values can be used to achieve a more accurate filtering.
|
||||||
"""
|
"""
|
||||||
|
logger.info("Loading dimension values for: {dimension_name}...")
|
||||||
|
|
||||||
|
headers = {
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
"Authorization": self.cube_api_token,
|
||||||
|
}
|
||||||
|
|
||||||
|
query = {
|
||||||
|
"query": {
|
||||||
|
"dimensions": [dimension_name],
|
||||||
|
"limit": self.dimension_values_limit,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
retries = 0
|
||||||
|
while retries < self.dimension_values_max_retries:
|
||||||
|
response = requests.request(
|
||||||
|
"POST",
|
||||||
|
f"{self.cube_api_url}/load",
|
||||||
|
headers=headers,
|
||||||
|
data=json.dumps(query),
|
||||||
|
)
|
||||||
|
|
||||||
|
if response.status_code == 200:
|
||||||
|
response_data = response.json()
|
||||||
|
if (
|
||||||
|
"error" in response_data
|
||||||
|
and response_data["error"] == "Continue wait"
|
||||||
|
):
|
||||||
|
logger.info("Retrying...")
|
||||||
|
retries += 1
|
||||||
|
time.sleep(self.dimension_values_retry_delay)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
dimension_values = [
|
||||||
|
item[dimension_name] for item in response_data["data"]
|
||||||
|
]
|
||||||
|
return dimension_values
|
||||||
|
else:
|
||||||
|
logger.error("Request failed with status code:", response.status_code)
|
||||||
|
break
|
||||||
|
|
||||||
|
if retries == self.dimension_values_max_retries:
|
||||||
|
logger.info("Maximum retries reached.")
|
||||||
|
return []
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Makes a call to Cube's REST API metadata endpoint.
|
"""Makes a call to Cube's REST API metadata endpoint.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of documents with attributes:
|
A list of documents with attributes:
|
||||||
- page_content=column_name
|
- page_content=column_title + column_description
|
||||||
- metadata
|
- metadata
|
||||||
- table_name
|
- table_name
|
||||||
- column_name
|
- column_name
|
||||||
- column_data_type
|
- column_data_type
|
||||||
|
- column_member_type
|
||||||
- column_title
|
- column_title
|
||||||
- column_description
|
- column_description
|
||||||
|
- column_values
|
||||||
"""
|
"""
|
||||||
headers = {
|
headers = {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
"Authorization": self.cube_api_token,
|
"Authorization": self.cube_api_token,
|
||||||
}
|
}
|
||||||
|
|
||||||
response = requests.get(self.cube_api_url, headers=headers)
|
response = requests.get(f"{self.cube_api_url}/meta", headers=headers)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
raw_meta_json = response.json()
|
raw_meta_json = response.json()
|
||||||
cubes = raw_meta_json.get("cubes", [])
|
cubes = raw_meta_json.get("cubes", [])
|
||||||
@ -59,19 +135,30 @@ class CubeSemanticLoader(BaseLoader):
|
|||||||
dimensions = cube.get("dimensions", [])
|
dimensions = cube.get("dimensions", [])
|
||||||
|
|
||||||
for item in measures + dimensions:
|
for item in measures + dimensions:
|
||||||
|
column_member_type = "measure" if item in measures else "dimension"
|
||||||
|
dimension_values = []
|
||||||
|
item_name = str(item.get("name"))
|
||||||
|
item_type = str(item.get("type"))
|
||||||
|
|
||||||
|
if (
|
||||||
|
self.load_dimension_values
|
||||||
|
and column_member_type == "dimension"
|
||||||
|
and item_type == "string"
|
||||||
|
):
|
||||||
|
dimension_values = self._get_dimension_values(item_name)
|
||||||
|
|
||||||
metadata = dict(
|
metadata = dict(
|
||||||
table_name=str(cube_name),
|
table_name=str(cube_name),
|
||||||
column_name=str(item.get("name")),
|
column_name=item_name,
|
||||||
column_data_type=str(item.get("type")),
|
column_data_type=item_type,
|
||||||
column_title=str(item.get("title")),
|
column_title=str(item.get("title")),
|
||||||
column_description=str(item.get("description")),
|
column_description=str(item.get("description")),
|
||||||
|
column_member_type=column_member_type,
|
||||||
|
column_values=dimension_values,
|
||||||
)
|
)
|
||||||
|
|
||||||
page_content = f"table name: {str(cube_name)}, "
|
page_content = f"{str(item.get('title'))}, "
|
||||||
page_content += f"column name: {str(item.get('name'))}, "
|
page_content += f"{str(item.get('description'))}"
|
||||||
page_content += f"column data type: {str(item.get('type'))}, "
|
|
||||||
page_content += f"column title: {str(item.get('title'))}, "
|
|
||||||
page_content += f"column description: {str(item.get('description'))}"
|
|
||||||
|
|
||||||
docs.append(Document(page_content=page_content, metadata=metadata))
|
docs.append(Document(page_content=page_content, metadata=metadata))
|
||||||
|
|
||||||
|
@ -1,86 +1,61 @@
|
|||||||
from typing import List
|
import unittest
|
||||||
from unittest import TestCase
|
from unittest.mock import MagicMock, Mock, patch
|
||||||
from unittest.mock import MagicMock, patch
|
|
||||||
|
|
||||||
import requests
|
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
|
||||||
from langchain.document_loaders import CubeSemanticLoader
|
from langchain.document_loaders import CubeSemanticLoader
|
||||||
|
|
||||||
|
MODULE_PATH = "langchain.document_loaders.cube_semantic.CubeSemanticLoader"
|
||||||
|
|
||||||
class TestCubeSemanticLoader(TestCase):
|
|
||||||
@patch.object(requests, "get")
|
class TestCubeSemanticLoader(unittest.TestCase):
|
||||||
def test_load_success(self, mock_get: MagicMock) -> None:
|
def setUp(self) -> None:
|
||||||
# Arrange
|
self.loader = CubeSemanticLoader(
|
||||||
cube_api_url: str = "https://example.com/cube_api"
|
cube_api_url="http://example.com", cube_api_token="test_token"
|
||||||
cube_api_token: str = "abc123"
|
)
|
||||||
mock_response: MagicMock = MagicMock()
|
|
||||||
|
@patch("requests.request")
|
||||||
|
def test_get_dimension_values(self, mock_request: MagicMock) -> None:
|
||||||
|
mock_response = Mock()
|
||||||
mock_response.status_code = 200
|
mock_response.status_code = 200
|
||||||
mock_response_json: dict = {
|
mock_response.json.return_value = {"data": [{"test_dimension": "value1"}]}
|
||||||
|
mock_request.return_value = mock_response
|
||||||
|
|
||||||
|
values = self.loader._get_dimension_values("test_dimension")
|
||||||
|
self.assertEqual(values, ["value1"])
|
||||||
|
|
||||||
|
@patch("requests.get")
|
||||||
|
@patch(f"{MODULE_PATH}._get_dimension_values")
|
||||||
|
def test_load(
|
||||||
|
self, mock_get_dimension_values: MagicMock, mock_get: MagicMock
|
||||||
|
) -> None:
|
||||||
|
# Mocking the response
|
||||||
|
mock_response = Mock()
|
||||||
|
mock_response.raise_for_status.return_value = None
|
||||||
|
mock_response.json.return_value = {
|
||||||
"cubes": [
|
"cubes": [
|
||||||
{
|
{
|
||||||
|
"name": "test_cube",
|
||||||
"type": "view",
|
"type": "view",
|
||||||
"name": "cube1",
|
"measures": [],
|
||||||
"measures": [{"type": "sum", "name": "sales", "title": "Sales"}],
|
|
||||||
"dimensions": [
|
"dimensions": [
|
||||||
{
|
{
|
||||||
|
"name": "test_dimension",
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"name": "product_name",
|
"title": "Test Title",
|
||||||
"title": "Product Name",
|
"description": "Test Description",
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
mock_response.json.return_value = mock_response_json
|
|
||||||
mock_get.return_value = mock_response
|
mock_get.return_value = mock_response
|
||||||
|
|
||||||
expected_docs: List[Document] = [
|
mock_get_dimension_values.return_value = ["value1", "value2"]
|
||||||
Document(
|
|
||||||
page_content=(
|
|
||||||
"table name: cube1, "
|
|
||||||
"column name: sales, "
|
|
||||||
"column data type: sum, "
|
|
||||||
"column title: Sales, "
|
|
||||||
"column description: None"
|
|
||||||
),
|
|
||||||
metadata={
|
|
||||||
"table_name": "cube1",
|
|
||||||
"column_name": "sales",
|
|
||||||
"column_data_type": "sum",
|
|
||||||
"column_title": "Sales",
|
|
||||||
"column_description": "None",
|
|
||||||
},
|
|
||||||
),
|
|
||||||
Document(
|
|
||||||
page_content=(
|
|
||||||
"table name: cube1, "
|
|
||||||
"column name: product_name, "
|
|
||||||
"column data type: string, "
|
|
||||||
"column title: Product Name, "
|
|
||||||
"column description: None"
|
|
||||||
),
|
|
||||||
metadata={
|
|
||||||
"table_name": "cube1",
|
|
||||||
"column_name": "product_name",
|
|
||||||
"column_data_type": "string",
|
|
||||||
"column_title": "Product Name",
|
|
||||||
"column_description": "None",
|
|
||||||
},
|
|
||||||
),
|
|
||||||
]
|
|
||||||
|
|
||||||
loader: CubeSemanticLoader = CubeSemanticLoader(cube_api_url, cube_api_token)
|
documents = self.loader.load()
|
||||||
|
self.assertEqual(len(documents), 1)
|
||||||
|
self.assertEqual(documents[0].page_content, "Test Title, Test Description")
|
||||||
|
self.assertEqual(documents[0].metadata["column_values"], ["value1", "value2"])
|
||||||
|
|
||||||
# Act
|
|
||||||
result: List[Document] = loader.load()
|
|
||||||
|
|
||||||
# Assert
|
if __name__ == "__main__":
|
||||||
self.assertEqual(result, expected_docs)
|
unittest.main()
|
||||||
mock_get.assert_called_once_with(
|
|
||||||
cube_api_url,
|
|
||||||
headers={
|
|
||||||
"Content-Type": "application/json",
|
|
||||||
"Authorization": cube_api_token,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
Loading…
Reference in New Issue
Block a user