Community patch: ClickHouse: make it possible to not specify an index (#20460)

Vector indexes in ClickHouse are experimental at the moment and can
sometimes break/change behaviour. So this PR makes it possible to say
that you don't want to specify an index type.

When no index type is specified, any query against the embedding column
will be a brute-force (linear) scan, but that gives reasonable performance
for small-to-medium dataset sizes.

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
This commit is contained in:
Mark Needham 2024-04-22 18:46:37 +01:00 committed by GitHub
parent c010ec8b71
commit ce23f8293a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -72,7 +72,7 @@ class ClickhouseSettings(BaseSettings):
username: Optional[str] = None
password: Optional[str] = None
index_type: str = "annoy"
index_type: Optional[str] = "annoy"
# Annoy supports L2Distance and cosineDistance.
index_param: Optional[Union[List, Dict]] = ["'L2Distance'", 100]
index_query_params: Dict[str, str] = {}
@ -172,23 +172,15 @@ class Clickhouse(VectorStore):
else ""
)
if isinstance(self.config.index_param, Dict)
else ",".join([str(p) for p in self.config.index_param])
if isinstance(self.config.index_param, List)
else self.config.index_param
else (
",".join([str(p) for p in self.config.index_param])
if isinstance(self.config.index_param, List)
else self.config.index_param
)
)
self.schema = f"""\
CREATE TABLE IF NOT EXISTS {self.config.database}.{self.config.table}(
{self.config.column_map['id']} Nullable(String),
{self.config.column_map['document']} Nullable(String),
{self.config.column_map['embedding']} Array(Float32),
{self.config.column_map['metadata']} JSON,
{self.config.column_map['uuid']} UUID DEFAULT generateUUIDv4(),
CONSTRAINT cons_vec_len CHECK length({self.config.column_map['embedding']}) = {dim},
INDEX vec_idx {self.config.column_map['embedding']} TYPE \
{self.config.index_type}({index_params}) GRANULARITY 1000
) ENGINE = MergeTree ORDER BY uuid SETTINGS index_granularity = 8192\
"""
self.schema = self._schema(dim, index_params)
self.dim = dim
self.BS = "\\"
self.must_escape = ("\\", "'")
@ -205,10 +197,53 @@ CREATE TABLE IF NOT EXISTS {self.config.database}.{self.config.table}(
)
# Enable JSON type
self.client.command("SET allow_experimental_object_type=1")
# Enable index
self.client.command(f"SET allow_experimental_{self.config.index_type}_index=1")
if self.config.index_type:
# Enable index
self.client.command(
f"SET allow_experimental_{self.config.index_type}_index=1"
)
self.client.command(self.schema)
def _schema(self, dim: int, index_params: Optional[str] = "") -> str:
"""Create table schema
:param dim: dimension of embeddings
:param index_params: parameters used for index
This function returns a `CREATE TABLE` statement based on the value of
`self.config.index_type`.
If an index type is specified that index will be created, otherwise
no index will be created.
In the case of there being no index, a linear scan will be performed
when the embedding field is queried.
"""
if self.config.index_type:
return f"""\
CREATE TABLE IF NOT EXISTS {self.config.database}.{self.config.table}(
{self.config.column_map['id']} Nullable(String),
{self.config.column_map['document']} Nullable(String),
{self.config.column_map['embedding']} Array(Float32),
{self.config.column_map['metadata']} JSON,
{self.config.column_map['uuid']} UUID DEFAULT generateUUIDv4(),
CONSTRAINT cons_vec_len CHECK length(
{self.config.column_map['embedding']}) = {dim},
INDEX vec_idx {self.config.column_map['embedding']} TYPE \
{self.config.index_type}({index_params}) GRANULARITY 1000
) ENGINE = MergeTree ORDER BY uuid SETTINGS index_granularity = 8192\
"""
else:
return f"""\
CREATE TABLE IF NOT EXISTS {self.config.database}.{self.config.table}(
{self.config.column_map['id']} Nullable(String),
{self.config.column_map['document']} Nullable(String),
{self.config.column_map['embedding']} Array(Float32),
{self.config.column_map['metadata']} JSON,
{self.config.column_map['uuid']} UUID DEFAULT generateUUIDv4(),
CONSTRAINT cons_vec_len CHECK length({
self.config.column_map['embedding']}) = {dim}
) ENGINE = MergeTree ORDER BY uuid
"""
@property
def embeddings(self) -> Embeddings:
"""Provides access to the embedding mechanism used by the Clickhouse instance.