diff --git a/.gitignore b/.gitignore index c24d6e3f56..aed12c91c6 100644 --- a/.gitignore +++ b/.gitignore @@ -116,6 +116,7 @@ celerybeat.pid .env .envrc .venv* +venv* env/ ENV/ env.bak/ diff --git a/libs/community/langchain_community/vectorstores/clickhouse.py b/libs/community/langchain_community/vectorstores/clickhouse.py index 816ccd2aac..bc54108454 100644 --- a/libs/community/langchain_community/vectorstores/clickhouse.py +++ b/libs/community/langchain_community/vectorstores/clickhouse.py @@ -211,12 +211,48 @@ CREATE TABLE IF NOT EXISTS {self.config.database}.{self.config.table}( @property def embeddings(self) -> Embeddings: + """Provides access to the embedding mechanism used by the Clickhouse instance. + + This property allows direct access to the embedding function or model being + used by the Clickhouse instance to convert text documents into embedding vectors + for vector similarity search. + + Returns: + The `Embeddings` instance associated with this Clickhouse instance. + """ return self.embedding_function def escape_str(self, value: str) -> str: + """Escape special characters in a string for Clickhouse SQL queries. + + This method is used internally to prepare strings for safe insertion + into SQL queries by escaping special characters that might otherwise + interfere with the query syntax. + + Args: + value: The string to be escaped. + + Returns: + The escaped string, safe for insertion into SQL queries. + """ return "".join(f"{self.BS}{c}" if c in self.must_escape else c for c in value) def _build_insert_sql(self, transac: Iterable, column_names: Iterable[str]) -> str: + """Construct an SQL query for inserting data into the Clickhouse database. + + This method formats and constructs an SQL `INSERT` query string using the + provided transaction data and column names. It is utilized internally during + the process of batch insertion of documents and their embeddings into the + database. 
+ + Args: + transac: iterable of tuples, each representing a row of data to be inserted. + column_names: iterable of strings representing the names of the columns + into which data will be inserted. + + Returns: + A string containing the constructed SQL `INSERT` query. + """ ks = ",".join(column_names) _data = [] for n in transac: @@ -231,6 +267,17 @@ CREATE TABLE IF NOT EXISTS {self.config.database}.{self.config.table}( return i_str def _insert(self, transac: Iterable, column_names: Iterable[str]) -> None: + """Execute an SQL query to insert data into the Clickhouse database. + + This method performs the actual insertion of data into the database by + executing the SQL query constructed by `_build_insert_sql`. It's a critical + step in adding new documents and their associated data into the vector store. + + Args: + transac: iterable of tuples, each representing a row of data to be inserted. + column_names: An iterable of strings representing the names of the columns + into which data will be inserted. + """ _insert_query = self._build_insert_sql(transac, column_names) self.client.command(_insert_query) @@ -345,6 +392,21 @@ CREATE TABLE IF NOT EXISTS {self.config.database}.{self.config.table}( def _build_query_sql( self, q_emb: List[float], topk: int, where_str: Optional[str] = None ) -> str: + """Construct an SQL query for performing a similarity search. + + This internal method generates an SQL query for finding the top-k most similar + vectors in the database to a given query vector. It allows for optional + filtering conditions to be applied via a PREWHERE clause. + + Args: + q_emb: The query vector as a list of floats. + topk: The number of top similar items to retrieve. + where_str: Optional string of additional filter conditions for the + query. Defaults to None. + + Returns: + A string containing the SQL query for the similarity search. + """ q_emb_str = ",".join(map(str, q_emb)) if where_str: where_str = f"PREWHERE {where_str}"