Add sqlite-vss as a vector database (#10047)

This adds sqlite-vss as an option for a vector database. Contains the
code and a few tests. Tests are passing and the library sqlite-vss is
added as optional as explained in the contributing guidelines. I
adjusted the code for lint/black/ and mypy. It looks that everything is
currently passing.

Adding sqlite-vss was mentioned in this issue:
https://github.com/langchain-ai/langchain/issues/1019.
Also mentioned here in the sqlite-vss repo for the curious:
https://github.com/asg017/sqlite-vss/issues/66

Maintainer tag: @baskaryan

---------

Co-authored-by: Philippe Oger <philippe.oger@adevinta.com>
pull/10098/head
Pihplipe Oegr 1 year ago committed by GitHub
parent c7a5504789
commit 3dafbd852e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -63,6 +63,7 @@ from langchain.vectorstores.rocksetdb import Rockset
from langchain.vectorstores.scann import ScaNN
from langchain.vectorstores.singlestoredb import SingleStoreDB
from langchain.vectorstores.sklearn import SKLearnVectorStore
from langchain.vectorstores.sqlitevss import SQLiteVSS
from langchain.vectorstores.starrocks import StarRocks
from langchain.vectorstores.supabase import SupabaseVectorStore
from langchain.vectorstores.tair import Tair
@ -125,6 +126,7 @@ __all__ = [
"ScaNN",
"SingleStoreDB",
"SingleStoreDB",
"SQLiteVSS",
"StarRocks",
"SupabaseVectorStore",
"Tair",

@ -0,0 +1,222 @@
from __future__ import annotations
import json
import logging
import sqlite3
import warnings
from typing import (
Any,
Iterable,
List,
Optional,
Tuple,
Type,
)
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.vectorstores.base import VectorStore
logger = logging.getLogger(__name__)
class SQLiteVSS(VectorStore):
"""Wrapper around SQLite with vss extension as a vector database.
To use, you should have the ``sqlite-vss`` python package installed.
Example:
.. code-block:: python
from langchain.vectorstores import SQLiteVSS
from langchain.embeddings.openai import OpenAIEmbeddings
...
"""
def __init__(
self,
table: str,
connection: Optional[sqlite3.Connection],
embedding: Embeddings,
db_file: str = "vss.db",
):
"""Initialize with sqlite client with vss extension."""
try:
import sqlite_vss # noqa # pylint: disable=unused-import
except ImportError:
raise ImportError(
"Could not import sqlite-vss python package. "
"Please install it with `pip install sqlite-vss`."
)
if not connection:
connection = self.create_connection(db_file)
if not isinstance(embedding, Embeddings):
warnings.warn("embeddings input must be Embeddings object.")
self._connection = connection
self._table = table
self._embedding = embedding
self.create_table_if_not_exists()
def create_table_if_not_exists(self) -> None:
self._connection.execute(
f"""
CREATE TABLE IF NOT EXISTS {self._table}
(
rowid INTEGER PRIMARY KEY AUTOINCREMENT,
text TEXT,
metadata BLOB,
text_embedding BLOB
)
;
"""
)
self._connection.execute(
f"""
CREATE VIRTUAL TABLE IF NOT EXISTS vss_{self._table} USING vss0(
text_embedding({self.get_dimensionality()})
);
"""
)
self._connection.execute(
f"""
CREATE TRIGGER IF NOT EXISTS embed_text
AFTER INSERT ON {self._table}
BEGIN
INSERT INTO vss_{self._table}(rowid, text_embedding)
VALUES (new.rowid, new.text_embedding)
;
END;
"""
)
self._connection.commit()
def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
**kwargs: Any,
) -> List[str]:
"""Add more texts to the vectorstore index.
Args:
texts: Iterable of strings to add to the vectorstore.
metadatas: Optional list of metadatas associated with the texts.
kwargs: vectorstore specific parameters
"""
max_id = self._connection.execute(
f"SELECT max(rowid) as rowid FROM {self._table}"
).fetchone()["rowid"]
if max_id is None: # no text added yet
max_id = 0
embeds = self._embedding.embed_documents(list(texts))
if not metadatas:
metadatas = [{} for _ in texts]
data_input = [
(text, json.dumps(metadata), json.dumps(embed))
for text, metadata, embed in zip(texts, metadatas, embeds)
]
self._connection.executemany(
f"INSERT INTO {self._table}(text, metadata, text_embedding) "
f"VALUES (?,?,?)",
data_input,
)
self._connection.commit()
# pulling every ids we just inserted
results = self._connection.execute(
f"SELECT rowid FROM {self._table} WHERE rowid > {max_id}"
)
return [row["rowid"] for row in results]
def similarity_search_with_score_by_vector(
self, embedding: List[float], k: int = 4, **kwargs: Any
) -> List[Tuple[Document, float]]:
sql_query = f"""
SELECT
text,
metadata,
distance
FROM {self._table} e
INNER JOIN vss_{self._table} v on v.rowid = e.rowid
WHERE vss_search(
v.text_embedding,
vss_search_params('{json.dumps(embedding)}', {k})
)
"""
cursor = self._connection.cursor()
cursor.execute(sql_query)
results = cursor.fetchall()
documents = []
for row in results:
metadata = json.loads(row["metadata"]) or {}
doc = Document(page_content=row["text"], metadata=metadata)
documents.append((doc, row["distance"]))
return documents
def similarity_search(
self, query: str, k: int = 4, **kwargs: Any
) -> List[Document]:
"""Return docs most similar to query."""
embedding = self._embedding.embed_query(query)
documents = self.similarity_search_with_score_by_vector(
embedding=embedding, k=k
)
return [doc for doc, _ in documents]
def similarity_search_with_score(
self, query: str, k: int = 4, **kwargs: Any
) -> List[Tuple[Document, float]]:
"""Return docs most similar to query."""
embedding = self._embedding.embed_query(query)
documents = self.similarity_search_with_score_by_vector(
embedding=embedding, k=k
)
return documents
def similarity_search_by_vector(
self, embedding: List[float], k: int = 4, **kwargs: Any
) -> List[Document]:
documents = self.similarity_search_with_score_by_vector(
embedding=embedding, k=k
)
return [doc for doc, _ in documents]
@classmethod
def from_texts(
cls: Type[SQLiteVSS],
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
table: str = "langchain",
db_file: str = "vss.db",
**kwargs: Any,
) -> SQLiteVSS:
"""Return VectorStore initialized from texts and embeddings."""
connection = cls.create_connection(db_file)
vss = cls(
table=table, connection=connection, db_file=db_file, embedding=embedding
)
vss.add_texts(texts=texts, metadatas=metadatas)
return vss
@staticmethod
def create_connection(db_file: str) -> sqlite3.Connection:
import sqlite_vss
connection = sqlite3.connect(db_file)
connection.row_factory = sqlite3.Row
connection.enable_load_extension(True)
sqlite_vss.load(connection)
connection.enable_load_extension(False)
return connection
def get_dimensionality(self) -> int:
"""
Function that does a dummy embedding to figure out how many dimensions
this embedding function returns. Needed for the virtual table DDL.
"""
dummy_text = "This is a dummy text"
dummy_embedding = self._embedding.embed_query(dummy_text)
return len(dummy_embedding)

@ -3542,6 +3542,7 @@ optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
files = [
{file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"},
{file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"},
]
[[package]]
@ -8148,10 +8149,8 @@ description = "Fast and Safe Tensor serialization"
optional = true
python-versions = "*"
files = [
{file = "safetensors-0.3.2-cp310-cp310-macosx_10_11_x86_64.whl", hash = "sha256:4c7827b64b1da3f082301b5f5a34331b8313104c14f257099a12d32ac621c5cd"},
{file = "safetensors-0.3.2-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:b6a66989075c2891d743153e8ba9ca84ee7232c8539704488f454199b8b8f84d"},
{file = "safetensors-0.3.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:670d6bc3a3b377278ce2971fa7c36ebc0a35041c4ea23b9df750a39380800195"},
{file = "safetensors-0.3.2-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:67ef2cc747c88e3a8d8e4628d715874c0366a8ff1e66713a9d42285a429623ad"},
{file = "safetensors-0.3.2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:564f42838721925b5313ae864ba6caa6f4c80a9fbe63cf24310c3be98ab013cd"},
{file = "safetensors-0.3.2-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:7f80af7e4ab3188daaff12d43d078da3017a90d732d38d7af4eb08b6ca2198a5"},
{file = "safetensors-0.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec30d78f20f1235b252d59cbb9755beb35a1fde8c24c89b3c98e6a1804cfd432"},
@ -8160,9 +8159,7 @@ files = [
{file = "safetensors-0.3.2-cp310-cp310-win32.whl", hash = "sha256:2961c1243fd0da46aa6a1c835305cc4595486f8ac64632a604d0eb5f2de76175"},
{file = "safetensors-0.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:c813920482c337d1424d306e1b05824a38e3ef94303748a0a287dea7a8c4f805"},
{file = "safetensors-0.3.2-cp311-cp311-macosx_10_11_universal2.whl", hash = "sha256:707df34bd9b9047e97332136ad98e57028faeccdb9cfe1c3b52aba5964cc24bf"},
{file = "safetensors-0.3.2-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:23d1d9f74208c9dfdf852a9f986dac63e40092385f84bf0789d599efa8e6522f"},
{file = "safetensors-0.3.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:becc5bb85b2947eae20ed23b407ebfd5277d9a560f90381fe2c42e6c043677ba"},
{file = "safetensors-0.3.2-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:c1913c6c549b1805e924f307159f0ee97b73ae3ce150cd2401964da015e0fa0b"},
{file = "safetensors-0.3.2-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:30a75707be5cc9686490bde14b9a371cede4af53244ea72b340cfbabfffdf58a"},
{file = "safetensors-0.3.2-cp311-cp311-macosx_13_0_universal2.whl", hash = "sha256:54ad6af663e15e2b99e2ea3280981b7514485df72ba6d014dc22dae7ba6a5e6c"},
{file = "safetensors-0.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37764b3197656ef507a266c453e909a3477dabc795962b38e3ad28226f53153b"},
@ -8170,28 +8167,22 @@ files = [
{file = "safetensors-0.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ada0fac127ff8fb04834da5c6d85a8077e6a1c9180a11251d96f8068db922a17"},
{file = "safetensors-0.3.2-cp311-cp311-win32.whl", hash = "sha256:155b82dbe2b0ebff18cde3f76b42b6d9470296e92561ef1a282004d449fa2b4c"},
{file = "safetensors-0.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:a86428d196959619ce90197731be9391b5098b35100a7228ef4643957648f7f5"},
{file = "safetensors-0.3.2-cp37-cp37m-macosx_10_11_x86_64.whl", hash = "sha256:91e796b6e465d9ffaca4c411d749f236c211e257f3a8e9b25a5ffc1a42d3bfa7"},
{file = "safetensors-0.3.2-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:c1f8ab41ed735c5b581f451fd15d9602ff51aa88044bfa933c5fa4b1d0c644d1"},
{file = "safetensors-0.3.2-cp37-cp37m-macosx_12_0_x86_64.whl", hash = "sha256:e6a8ff5652493598c45cd27f5613c193d3f15e76e0f81613d399c487a7b8cc50"},
{file = "safetensors-0.3.2-cp37-cp37m-macosx_13_0_x86_64.whl", hash = "sha256:bc9cfb3c9ea2aec89685b4d656f9f2296f0f0d67ecf2bebf950870e3be89b3db"},
{file = "safetensors-0.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ace5d471e3d78e0d93f952707d808b5ab5eac77ddb034ceb702e602e9acf2be9"},
{file = "safetensors-0.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de3e20a388b444381bcda1a3193cce51825ddca277e4cf3ed1fe8d9b2d5722cd"},
{file = "safetensors-0.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d7d70d48585fe8df00725aa788f2e64fd24a4c9ae07cd6be34f6859d0f89a9c"},
{file = "safetensors-0.3.2-cp37-cp37m-win32.whl", hash = "sha256:6ff59bc90cdc857f68b1023be9085fda6202bbe7f2fd67d06af8f976d6adcc10"},
{file = "safetensors-0.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:8b05c93da15fa911763a89281906ca333ed800ab0ef1c7ce53317aa1a2322f19"},
{file = "safetensors-0.3.2-cp38-cp38-macosx_10_11_x86_64.whl", hash = "sha256:94857abc019b49a22a0065cc7741c48fb788aa7d8f3f4690c092c56090227abe"},
{file = "safetensors-0.3.2-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:8969cfd9e8d904e8d3c67c989e1bd9a95e3cc8980d4f95e4dcd43c299bb94253"},
{file = "safetensors-0.3.2-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:da482fa011dc88fe7376d8f8b42c0ccef2f260e0cbc847ceca29c708bf75a868"},
{file = "safetensors-0.3.2-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:f54148ac027556eb02187e9bc1556c4d916c99ca3cb34ca36a7d304d675035c1"},
{file = "safetensors-0.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:caec25fedbcf73f66c9261984f07885680f71417fc173f52279276c7f8a5edd3"},
{file = "safetensors-0.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:50224a1d99927ccf3b75e27c3d412f7043280431ab100b4f08aad470c37cf99a"},
{file = "safetensors-0.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa98f49e95f02eb750d32c4947e7d5aa43883149ebd0414920866446525b70f0"},
{file = "safetensors-0.3.2-cp38-cp38-win32.whl", hash = "sha256:33409df5e28a83dc5cc5547a3ac17c0f1b13a1847b1eb3bc4b3be0df9915171e"},
{file = "safetensors-0.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:e04a7cbbb3856159ab99e3adb14521544f65fcb8548cce773a1435a0f8d78d27"},
{file = "safetensors-0.3.2-cp39-cp39-macosx_10_11_x86_64.whl", hash = "sha256:f39f3d951543b594c6bc5082149d994c47ca487fd5d55b4ce065ab90441aa334"},
{file = "safetensors-0.3.2-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:7c864cf5dcbfb608c5378f83319c60cc9c97263343b57c02756b7613cd5ab4dd"},
{file = "safetensors-0.3.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:14e8c19d6dc51d4f70ee33c46aff04c8ba3f95812e74daf8036c24bc86e75cae"},
{file = "safetensors-0.3.2-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:41b10b0a6dfe8fdfbe4b911d64717d5647e87fbd7377b2eb3d03fb94b59810ea"},
{file = "safetensors-0.3.2-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:042a60f633c3c7009fdf6a7c182b165cb7283649d2a1e9c7a4a1c23454bd9a5b"},
{file = "safetensors-0.3.2-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:fafd95e5ef41e8f312e2a32b7031f7b9b2a621b255f867b221f94bb2e9f51ae8"},
{file = "safetensors-0.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8ed77cf358abce2307f03634694e0b2a29822e322a1623e0b1aa4b41e871bf8b"},
@ -8706,6 +8697,21 @@ postgresql-psycopgbinary = ["psycopg[binary] (>=3.0.7)"]
pymysql = ["pymysql"]
sqlcipher = ["sqlcipher3-binary"]
[[package]]
name = "sqlite-vss"
version = "0.1.2"
description = ""
optional = true
python-versions = ">=3.7"
files = [
{file = "sqlite_vss-0.1.2-py3-none-macosx_10_6_x86_64.whl", hash = "sha256:9eefa4207f8b522e32b2747fce44422c773e36710bf807613795218c7ba125f0"},
{file = "sqlite_vss-0.1.2-py3-none-macosx_11_0_arm64.whl", hash = "sha256:84994eaf7fe700218b258422358c4536a6aca39b96026c308b28630967f954c4"},
{file = "sqlite_vss-0.1.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux1_x86_64.whl", hash = "sha256:e44f03bc4cb214bb77b206519abfb623e3e4795967a569218e288927a7715806"},
]
[package.extras]
test = ["pytest"]
[[package]]
name = "sqlitedict"
version = "2.1.0"
@ -10437,7 +10443,7 @@ clarifai = ["clarifai"]
cohere = ["cohere"]
docarray = ["docarray"]
embeddings = ["sentence-transformers"]
extended-testing = ["amazon-textract-caller", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "markdownify", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "openapi-schema-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xata", "xmltodict"]
extended-testing = ["amazon-textract-caller", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "markdownify", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "openapi-schema-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "tqdm", "xata", "xmltodict"]
javascript = ["esprima"]
llms = ["clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"]
openai = ["openai", "tiktoken"]
@ -10447,4 +10453,4 @@ text-helpers = ["chardet"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "43a6bd42efc0baf917418087f788aaf3b1bc793cb4aa81de99c52ed6a7d54d26"
content-hash = "1fbea4c22b1df46fd1062b58657321ef49b4c503851859d3ad1109acf427df9d"

@ -127,6 +127,7 @@ xata = {version = "^1.0.0a7", optional = true}
xmltodict = {version = "^0.13.0", optional = true}
markdownify = {version = "^0.11.6", optional = true}
assemblyai = {version = "^0.17.0", optional = true}
sqlite-vss = {version = "^0.1.2", optional = true}
[tool.poetry.group.test.dependencies]
@ -341,6 +342,7 @@ extended_testing = [
"faiss-cpu",
"openapi-schema-pydantic",
"markdownify",
"sqlite-vss",
]
[tool.ruff]

@ -0,0 +1,58 @@
from typing import List, Optional
import pytest
from langchain.docstore.document import Document
from langchain.vectorstores import SQLiteVSS
from tests.integration_tests.vectorstores.fake_embeddings import (
FakeEmbeddings,
fake_texts,
)
def _sqlite_vss_from_texts(
metadatas: Optional[List[dict]] = None, drop: bool = True
) -> SQLiteVSS:
return SQLiteVSS.from_texts(
fake_texts,
FakeEmbeddings(),
metadatas=metadatas,
table="test",
db_file=":memory:",
)
@pytest.mark.requires("sqlite-vss")
def test_sqlitevss() -> None:
"""Test end to end construction and search."""
docsearch = _sqlite_vss_from_texts()
output = docsearch.similarity_search("foo", k=1)
assert output == [Document(page_content="foo", metadata={})]
@pytest.mark.requires("sqlite-vss")
def test_sqlitevss_with_score() -> None:
"""Test end to end construction and search with scores and IDs."""
texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = _sqlite_vss_from_texts(metadatas=metadatas)
output = docsearch.similarity_search_with_score("foo", k=3)
docs = [o[0] for o in output]
distances = [o[1] for o in output]
assert docs == [
Document(page_content="foo", metadata={"page": 0}),
Document(page_content="bar", metadata={"page": 1}),
Document(page_content="baz", metadata={"page": 2}),
]
assert distances[0] < distances[1] < distances[2]
@pytest.mark.requires("sqlite-vss")
def test_sqlitevss_add_extra() -> None:
"""Test end to end construction and MRR search."""
texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = _sqlite_vss_from_texts(metadatas=metadatas)
docsearch.add_texts(texts, metadatas)
output = docsearch.similarity_search("foo", k=10)
assert len(output) == 6
Loading…
Cancel
Save