diff --git a/docs/extras/integrations/document_loaders/cube_semantic.ipynb b/docs/extras/integrations/document_loaders/cube_semantic.ipynb index ac6b953046..5868d58c0f 100644 --- a/docs/extras/integrations/document_loaders/cube_semantic.ipynb +++ b/docs/extras/integrations/document_loaders/cube_semantic.ipynb @@ -53,11 +53,23 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "**Input arguments (mandatory)**\n", + "\n", "`Cube Semantic Loader` requires 2 arguments:\n", - "| Input Parameter | Description |\n", - "| --- | --- |\n", - "| `cube_api_url` | The URL of your Cube's deployment REST API. Please refer to the [Cube documentation](https://cube.dev/docs/http-api/rest#configuration-base-path) for more information on configuring the base path. |\n", - "| `cube_api_token` | The authentication token generated based on your Cube's API secret. Please refer to the [Cube documentation](https://cube.dev/docs/security#generating-json-web-tokens-jwt) for instructions on generating JSON Web Tokens (JWT). |\n" + "\n", + "- `cube_api_url`: The URL of your Cube's deployment REST API. Please refer to the [Cube documentation](https://cube.dev/docs/http-api/rest#configuration-base-path) for more information on configuring the base path.\n", + "\n", + "- `cube_api_token`: The authentication token generated based on your Cube's API secret. Please refer to the [Cube documentation](https://cube.dev/docs/security#generating-json-web-tokens-jwt) for instructions on generating JSON Web Tokens (JWT).\n", + "\n", + "**Input arguments (optional)**\n", + "\n", + "- `load_dimension_values`: Whether to load dimension values for every string dimension or not.\n", + "\n", + "- `dimension_values_limit`: Maximum number of dimension values to load.\n", + "\n", + "- `dimension_values_max_retries`: Maximum number of retries to load dimension values.\n", + "\n", + "- `dimension_values_retry_delay`: Delay between retries to load dimension values." 
] }, { @@ -85,9 +97,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Returns:\n", - "\n", - "A list of documents with the following attributes:\n", + "Returns a list of documents with the following attributes:\n", "\n", "- `page_content`\n", "- `metadata`\n", @@ -95,7 +105,8 @@ " - `column_name`\n", " - `column_data_type`\n", " - `column_title`\n", - " - `column_description`" + " - `column_description`\n", + " - `column_values`" ] }, { @@ -103,7 +114,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "> page_content='table name: orders_view, column name: orders_view.total_amount, column data type: number, column title: Orders View Total Amount, column description: None' metadata={'table_name': 'orders_view', 'column_name': 'orders_view.total_amount', 'column_data_type': 'number', 'column_title': 'Orders View Total Amount', 'column_description': 'None'}" + "> page_content='Users View City, None' metadata={'table_name': 'users_view', 'column_name': 'users_view.city', 'column_data_type': 'string', 'column_title': 'Users View City', 'column_description': 'None', 'column_member_type': 'dimension', 'column_values': ['Austin', 'Chicago', 'Los Angeles', 'Mountain View', 'New York', 'Palo Alto', 'San Francisco', 'Seattle']}" ] } ], diff --git a/libs/langchain/langchain/document_loaders/cube_semantic.py b/libs/langchain/langchain/document_loaders/cube_semantic.py index eb4341a59a..2d645e9e5c 100644 --- a/libs/langchain/langchain/document_loaders/cube_semantic.py +++ b/libs/langchain/langchain/document_loaders/cube_semantic.py @@ -1,3 +1,6 @@ +import json +import logging +import time from typing import List import requests @@ -5,45 +8,118 @@ import requests from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader +logger = logging.getLogger(__name__) + class CubeSemanticLoader(BaseLoader): - """Load Cube semantic layer metadata.""" + """Load Cube semantic layer metadata. 
+ + Args: + cube_api_url: REST API endpoint. + Use the REST API of your Cube's deployment. + Please find out more information here: + https://cube.dev/docs/http-api/rest#configuration-base-path + cube_api_token: Cube API token. + Authentication tokens are generated based on your Cube's API secret. + Please find out more information here: + https://cube.dev/docs/security#generating-json-web-tokens-jwt + load_dimension_values: Whether to load dimension values for every string + dimension or not. + dimension_values_limit: Maximum number of dimension values to load. + dimension_values_max_retries: Maximum number of retries to load dimension + values. + dimension_values_retry_delay: Delay between retries to load dimension values. + """ def __init__( self, cube_api_url: str, cube_api_token: str, + load_dimension_values: bool = True, + dimension_values_limit: int = 10_000, + dimension_values_max_retries: int = 10, + dimension_values_retry_delay: int = 3, ): self.cube_api_url = cube_api_url - """Use the REST API of your Cube's deployment. - Please find out more information here: - https://cube.dev/docs/http-api/rest#configuration-base-path - """ self.cube_api_token = cube_api_token - """Authentication tokens are generated based on your Cube's API secret. - Please find out more information here: - https://cube.dev/docs/security#generating-json-web-tokens-jwt + self.load_dimension_values = load_dimension_values + self.dimension_values_limit = dimension_values_limit + self.dimension_values_max_retries = dimension_values_max_retries + self.dimension_values_retry_delay = dimension_values_retry_delay + + def _get_dimension_values(self, dimension_name: str) -> List[str]: + """Makes a call to Cube's REST API load endpoint to retrieve + values for dimensions. + + These values can be used to achieve a more accurate filtering. 
""" + logger.info("Loading dimension values for: %s...", dimension_name) + + headers = { + "Content-Type": "application/json", + "Authorization": self.cube_api_token, + } + + query = { + "query": { + "dimensions": [dimension_name], + "limit": self.dimension_values_limit, + } + } + + retries = 0 + while retries < self.dimension_values_max_retries: + response = requests.request( + "POST", + f"{self.cube_api_url}/load", + headers=headers, + data=json.dumps(query), + ) + + if response.status_code == 200: + response_data = response.json() + if ( + "error" in response_data + and response_data["error"] == "Continue wait" + ): + logger.info("Retrying...") + retries += 1 + time.sleep(self.dimension_values_retry_delay) + continue + else: + dimension_values = [ + item[dimension_name] for item in response_data["data"] + ] + return dimension_values + else: + logger.error("Request failed with status code: %s", response.status_code) + break + + if retries == self.dimension_values_max_retries: + logger.info("Maximum retries reached.") + return [] def load(self) -> List[Document]: """Makes a call to Cube's REST API metadata endpoint. 
Returns: A list of documents with attributes: - - page_content=column_name + - page_content=column_title + column_description - metadata - table_name - column_name - column_data_type + - column_member_type - column_title - column_description + - column_values """ headers = { "Content-Type": "application/json", "Authorization": self.cube_api_token, } - response = requests.get(self.cube_api_url, headers=headers) + response = requests.get(f"{self.cube_api_url}/meta", headers=headers) response.raise_for_status() raw_meta_json = response.json() cubes = raw_meta_json.get("cubes", []) @@ -59,19 +135,30 @@ class CubeSemanticLoader(BaseLoader): dimensions = cube.get("dimensions", []) for item in measures + dimensions: + column_member_type = "measure" if item in measures else "dimension" + dimension_values = [] + item_name = str(item.get("name")) + item_type = str(item.get("type")) + + if ( + self.load_dimension_values + and column_member_type == "dimension" + and item_type == "string" + ): + dimension_values = self._get_dimension_values(item_name) + metadata = dict( table_name=str(cube_name), - column_name=str(item.get("name")), - column_data_type=str(item.get("type")), + column_name=item_name, + column_data_type=item_type, column_title=str(item.get("title")), column_description=str(item.get("description")), + column_member_type=column_member_type, + column_values=dimension_values, ) - page_content = f"table name: {str(cube_name)}, " - page_content += f"column name: {str(item.get('name'))}, " - page_content += f"column data type: {str(item.get('type'))}, " - page_content += f"column title: {str(item.get('title'))}, " - page_content += f"column description: {str(item.get('description'))}" + page_content = f"{str(item.get('title'))}, " + page_content += f"{str(item.get('description'))}" docs.append(Document(page_content=page_content, metadata=metadata)) diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_cube_semantic.py 
b/libs/langchain/tests/unit_tests/document_loaders/test_cube_semantic.py index cdd5fd711b..c309f17ffe 100644 --- a/libs/langchain/tests/unit_tests/document_loaders/test_cube_semantic.py +++ b/libs/langchain/tests/unit_tests/document_loaders/test_cube_semantic.py @@ -1,86 +1,61 @@ -from typing import List -from unittest import TestCase -from unittest.mock import MagicMock, patch +import unittest +from unittest.mock import MagicMock, Mock, patch -import requests - -from langchain.docstore.document import Document from langchain.document_loaders import CubeSemanticLoader +MODULE_PATH = "langchain.document_loaders.cube_semantic.CubeSemanticLoader" + -class TestCubeSemanticLoader(TestCase): - @patch.object(requests, "get") - def test_load_success(self, mock_get: MagicMock) -> None: - # Arrange - cube_api_url: str = "https://example.com/cube_api" - cube_api_token: str = "abc123" - mock_response: MagicMock = MagicMock() +class TestCubeSemanticLoader(unittest.TestCase): + def setUp(self) -> None: + self.loader = CubeSemanticLoader( + cube_api_url="http://example.com", cube_api_token="test_token" + ) + + @patch("requests.request") + def test_get_dimension_values(self, mock_request: MagicMock) -> None: + mock_response = Mock() mock_response.status_code = 200 - mock_response_json: dict = { + mock_response.json.return_value = {"data": [{"test_dimension": "value1"}]} + mock_request.return_value = mock_response + + values = self.loader._get_dimension_values("test_dimension") + self.assertEqual(values, ["value1"]) + + @patch("requests.get") + @patch(f"{MODULE_PATH}._get_dimension_values") + def test_load( + self, mock_get_dimension_values: MagicMock, mock_get: MagicMock + ) -> None: + # Mocking the response + mock_response = Mock() + mock_response.raise_for_status.return_value = None + mock_response.json.return_value = { "cubes": [ { + "name": "test_cube", "type": "view", - "name": "cube1", - "measures": [{"type": "sum", "name": "sales", "title": "Sales"}], + "measures": [], 
"dimensions": [ { + "name": "test_dimension", "type": "string", - "name": "product_name", - "title": "Product Name", + "title": "Test Title", + "description": "Test Description", } ], } ] } - mock_response.json.return_value = mock_response_json mock_get.return_value = mock_response - expected_docs: List[Document] = [ - Document( - page_content=( - "table name: cube1, " - "column name: sales, " - "column data type: sum, " - "column title: Sales, " - "column description: None" - ), - metadata={ - "table_name": "cube1", - "column_name": "sales", - "column_data_type": "sum", - "column_title": "Sales", - "column_description": "None", - }, - ), - Document( - page_content=( - "table name: cube1, " - "column name: product_name, " - "column data type: string, " - "column title: Product Name, " - "column description: None" - ), - metadata={ - "table_name": "cube1", - "column_name": "product_name", - "column_data_type": "string", - "column_title": "Product Name", - "column_description": "None", - }, - ), - ] + mock_get_dimension_values.return_value = ["value1", "value2"] - loader: CubeSemanticLoader = CubeSemanticLoader(cube_api_url, cube_api_token) + documents = self.loader.load() + self.assertEqual(len(documents), 1) + self.assertEqual(documents[0].page_content, "Test Title, Test Description") + self.assertEqual(documents[0].metadata["column_values"], ["value1", "value2"]) - # Act - result: List[Document] = loader.load() - # Assert - self.assertEqual(result, expected_docs) - mock_get.assert_called_once_with( - cube_api_url, - headers={ - "Content-Type": "application/json", - "Authorization": cube_api_token, - }, - ) +if __name__ == "__main__": + unittest.main()