Add MyScaleWithoutJSON which allows user to wrap columns into Document's Metadata (#13164)

<!-- Thank you for contributing to LangChain!

Replace this entire comment with:
  - **Description:** a description of the change, 
  - **Issue:** the issue # it fixes (if applicable),
  - **Dependencies:** any dependencies required for this change,
- **Tag maintainer:** for a quicker response, tag the relevant
maintainer (see below),
- **Twitter handle:** we announce bigger features on Twitter. If your PR
gets announced, and you'd like a mention, we'll gladly shout you out!

Please make sure your PR is passing linting and testing before
submitting. Run `make format`, `make lint` and `make test` to check this
locally.

See contribution guidelines for more information on how to write/run
tests, lint, etc:

https://github.com/langchain-ai/langchain/blob/master/.github/CONTRIBUTING.md

If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in `docs/extras`
directory.

If no one reviews your PR within a few days, please @-mention one of
@baskaryan, @eyurtsev, @hwchase17.
 -->
Replace this entire comment with:
- **Description:** Add MyScaleWithoutJSON which allows user to wrap
columns into Document's Metadata
  - **Tag maintainer:** @baskaryan
pull/13301/head
刘 方瑞 11 months ago committed by GitHub
parent 2aa13f1e10
commit 46af56dc4f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -490,3 +490,125 @@ class MyScale(VectorStore):
@property
def metadata_column(self) -> str:
return self.config.column_map["metadata"]
class MyScaleWithoutJSON(MyScale):
"""MyScale vector store without metadata column
This is super handy if you are working to a SQL-native table
"""
def __init__(
self,
embedding: Embeddings,
config: Optional[MyScaleSettings] = None,
must_have_cols: List[str] = [],
**kwargs: Any,
) -> None:
"""Building a myscale vector store without metadata column
embedding (Embeddings): embedding model
config (MyScaleSettings): Configuration to MyScale Client
must_have_cols (List[str]): column names to be included in query
Other keyword arguments will pass into
[clickhouse-connect](https://docs.myscale.com/)
"""
super().__init__(embedding, config, **kwargs)
self.must_have_cols: List[str] = must_have_cols
def _build_qstr(
self, q_emb: List[float], topk: int, where_str: Optional[str] = None
) -> str:
q_emb_str = ",".join(map(str, q_emb))
if where_str:
where_str = f"PREWHERE {where_str}"
else:
where_str = ""
q_str = f"""
SELECT {self.config.column_map['text']}, dist,
{','.join(self.must_have_cols)}
FROM {self.config.database}.{self.config.table}
{where_str}
ORDER BY distance({self.config.column_map['vector']}, [{q_emb_str}])
AS dist {self.dist_order}
LIMIT {topk}
"""
return q_str
def similarity_search_by_vector(
self,
embedding: List[float],
k: int = 4,
where_str: Optional[str] = None,
**kwargs: Any,
) -> List[Document]:
"""Perform a similarity search with MyScale by vectors
Args:
query (str): query string
k (int, optional): Top K neighbors to retrieve. Defaults to 4.
where_str (Optional[str], optional): where condition string.
Defaults to None.
NOTE: Please do not let end-user to fill this and always be aware
of SQL injection. When dealing with metadatas, remember to
use `{self.metadata_column}.attribute` instead of `attribute`
alone. The default name for it is `metadata`.
Returns:
List[Document]: List of (Document, similarity)
"""
q_str = self._build_qstr(embedding, k, where_str)
try:
return [
Document(
page_content=r[self.config.column_map["text"]],
metadata={k: r[k] for k in self.must_have_cols},
)
for r in self.client.query(q_str).named_results()
]
except Exception as e:
logger.error(f"\033[91m\033[1m{type(e)}\033[0m \033[95m{str(e)}\033[0m")
return []
def similarity_search_with_relevance_scores(
self, query: str, k: int = 4, where_str: Optional[str] = None, **kwargs: Any
) -> List[Tuple[Document, float]]:
"""Perform a similarity search with MyScale
Args:
query (str): query string
k (int, optional): Top K neighbors to retrieve. Defaults to 4.
where_str (Optional[str], optional): where condition string.
Defaults to None.
NOTE: Please do not let end-user to fill this and always be aware
of SQL injection. When dealing with metadatas, remember to
use `{self.metadata_column}.attribute` instead of `attribute`
alone. The default name for it is `metadata`.
Returns:
List[Document]: List of documents most similar to the query text
and cosine distance in float for each.
Lower score represents more similarity.
"""
q_str = self._build_qstr(self._embeddings.embed_query(query), k, where_str)
try:
return [
(
Document(
page_content=r[self.config.column_map["text"]],
metadata={k: r[k] for k in self.must_have_cols},
),
r["dist"],
)
for r in self.client.query(q_str).named_results()
]
except Exception as e:
logger.error(f"\033[91m\033[1m{type(e)}\033[0m \033[95m{str(e)}\033[0m")
return []
@property
def metadata_column(self) -> str:
return ""

Loading…
Cancel
Save