langchain/libs/community/langchain_community/vectorstores/marqo.py

from __future__ import annotations
import json
import uuid
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
List,
Optional,
Tuple,
Type,
Union,
)
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore
if TYPE_CHECKING:
import marqo
class Marqo(VectorStore):
"""`Marqo` vector store.
Marqo indexes have their own models associated with them to generate your
embeddings. This means that you can select from a range of different models
and also use CLIP models to create multimodal indexes
with images and text together.
Marqo also supports more advanced queries with multiple weighted terms, see
https://docs.marqo.ai/latest/#searching-using-weights-in-queries.
This class can flexibly take strings or dictionaries for weighted queries
in its similarity search methods.
To use, you should have the `marqo` python package installed, you can do this with
`pip install marqo`.
Example:
.. code-block:: python
import os
import marqo
from langchain_community.vectorstores import Marqo
client = marqo.Client(url=os.environ["MARQO_URL"], ...)
vectorstore = Marqo(client, index_name)
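A minimal sketch of a weighted-term search (the query terms and weights
below are purely illustrative):
.. code-block:: python
docs = vectorstore.similarity_search(
    {"electric vehicles": 1.0, "charging stations": 0.5}, k=4
)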
"""
def __init__(
self,
client: marqo.Client,
index_name: str,
add_documents_settings: Optional[Dict[str, Any]] = None,
searchable_attributes: Optional[List[str]] = None,
page_content_builder: Optional[Callable[[Dict[str, Any]], str]] = None,
):
"""Initialize with Marqo client."""
try:
import marqo
except ImportError:
raise ImportError(
"Could not import marqo python package. "
"Please install it with `pip install marqo`."
)
if not isinstance(client, marqo.Client):
raise ValueError(
f"client should be an instance of marqo.Client, got {type(client)}"
)
self._client = client
self._index_name = index_name
self._add_documents_settings = (
{} if add_documents_settings is None else add_documents_settings
)
self._searchable_attributes = searchable_attributes
self.page_content_builder = page_content_builder
self.tensor_fields = ["text"]
self._document_batch_size = 1024
@property
def embeddings(self) -> Optional[Embeddings]:
return None
def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
**kwargs: Any,
) -> List[str]:
"""Upload texts with metadata (properties) to Marqo.
You can either have Marqo generate ids for each document or you can provide
your own by including an "_id" field in the metadata objects.
Args:
texts (Iterable[str]): an iterable of texts - assumed to preserve an
order that matches the metadatas.
metadatas (Optional[List[dict]], optional): a list of metadatas.
Raises:
ValueError: if metadatas is provided and the number of metadatas differs
from the number of texts.
Returns:
List[str]: The list of ids that were added.
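A minimal sketch of supplying your own ids (the texts and ids here are
illustrative; ``vectorstore`` is assumed to be an existing ``Marqo`` instance):
.. code-block:: python
ids = vectorstore.add_texts(
    ["first document", "second document"],
    metadatas=[{"_id": "doc-1"}, {"_id": "doc-2"}],
)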
"""
if self._client.index(self._index_name).get_settings()["index_defaults"][
"treat_urls_and_pointers_as_images"
]:
raise ValueError(
"Marqo.add_texts is disabled for multimodal indexes. To add documents "
"with a multimodal index use the Python client for Marqo directly."
)
documents: List[Dict[str, str]] = []
num_docs = 0
for i, text in enumerate(texts):
doc = {
"text": text,
"metadata": json.dumps(metadatas[i]) if metadatas else json.dumps({}),
}
documents.append(doc)
num_docs += 1
ids = []
for i in range(0, num_docs, self._document_batch_size):
response = self._client.index(self._index_name).add_documents(
documents[i : i + self._document_batch_size],
tensor_fields=self.tensor_fields,
**self._add_documents_settings,
)
if response["errors"]:
err_msg = (
f"Error in upload for documents in index range [{i},"
f"{i + self._document_batch_size}], "
f"check Marqo logs."
)
raise RuntimeError(err_msg)
ids += [item["_id"] for item in response["items"]]
return ids
def similarity_search(
self,
query: Union[str, Dict[str, float]],
k: int = 4,
**kwargs: Any,
) -> List[Document]:
"""Search the marqo index for the most similar documents.
Args:
query (Union[str, Dict[str, float]]): The query for the search, either
as a string or a weighted query.
k (int, optional): The number of documents to return. Defaults to 4.
Returns:
List[Document]: k documents ordered from best to worst match.
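A minimal usage sketch (the query string is illustrative; ``vectorstore`` is
assumed to be an existing ``Marqo`` instance):
.. code-block:: python
docs = vectorstore.similarity_search("how to change a tyre", k=2)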
"""
results = self.marqo_similarity_search(query=query, k=k)
documents = self._construct_documents_from_results_without_score(results)
return documents
def similarity_search_with_score(
self,
query: Union[str, Dict[str, float]],
k: int = 4,
) -> List[Tuple[Document, float]]:
"""Return documents from Marqo that are similar to the query as well
as their scores.
Args:
query (Union[str, Dict[str, float]]): The query to search with, either
a string or a weighted
query.
k (int, optional): The number of documents to return. Defaults to 4.
Returns:
List[Tuple[Document, float]]: The matching documents and their scores,
ordered by descending score.
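A minimal usage sketch (the query is illustrative; ``vectorstore`` is assumed
to be an existing ``Marqo`` instance):
.. code-block:: python
for doc, score in vectorstore.similarity_search_with_score("marathon training"):
    print(score, doc.page_content[:50])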
"""
results = self.marqo_similarity_search(query=query, k=k)
scored_documents = self._construct_documents_from_results_with_score(results)
return scored_documents
def bulk_similarity_search(
self,
queries: Iterable[Union[str, Dict[str, float]]],
k: int = 4,
**kwargs: Any,
) -> List[List[Document]]:
"""Search the marqo index for the most similar documents in bulk with multiple
queries.
Args:
queries (Iterable[Union[str, Dict[str, float]]]): An iterable of queries to
execute in bulk, queries in the list can be strings or dictionaries of
weighted queries.
k (int, optional): The number of documents to return for each query.
Defaults to 4.
Returns:
List[List[Document]]: A list of results for each query.
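A minimal usage sketch mixing a plain string and a weighted query (both
queries are illustrative; ``vectorstore`` is assumed to be an existing
``Marqo`` instance):
.. code-block:: python
results = vectorstore.bulk_similarity_search(
    ["road bikes", {"mountain bikes": 1.0, "downhill": 0.5}], k=3
)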
"""
bulk_results = self.marqo_bulk_similarity_search(queries=queries, k=k)
bulk_documents: List[List[Document]] = []
for results in bulk_results["result"]:
documents = self._construct_documents_from_results_without_score(results)
bulk_documents.append(documents)
return bulk_documents
def bulk_similarity_search_with_score(
self,
queries: Iterable[Union[str, Dict[str, float]]],
k: int = 4,
**kwargs: Any,
) -> List[List[Tuple[Document, float]]]:
"""Return documents from Marqo that are similar to the query as well as
their scores using a batch of queries.
Args:
queries (Iterable[Union[str, Dict[str, float]]]): An iterable of queries
to execute in bulk, queries in the list can be strings or dictionaries
of weighted queries.
k (int, optional): The number of documents to return for each query.
Defaults to 4.
Returns:
List[List[Tuple[Document, float]]]: A list of lists of the matching
documents and their scores for each query.
"""
bulk_results = self.marqo_bulk_similarity_search(queries=queries, k=k)
bulk_documents: List[List[Tuple[Document, float]]] = []
for results in bulk_results["result"]:
documents = self._construct_documents_from_results_with_score(results)
bulk_documents.append(documents)
return bulk_documents
def _construct_documents_from_results_with_score(
self, results: Dict[str, List[Dict[str, str]]]
) -> List[Tuple[Document, Any]]:
"""Helper to convert Marqo results into documents.
Args:
results (Dict[str, Any]): A Marqo results object with the 'hits'.
Returns:
List[Tuple[Document, Any]]: The documents paired with their scores.
"""
documents: List[Tuple[Document, Any]] = []
for res in results["hits"]:
if self.page_content_builder is None:
text = res["text"]
else:
text = self.page_content_builder(res)
metadata = json.loads(res.get("metadata", "{}"))
documents.append(
(Document(page_content=text, metadata=metadata), res["_score"])
)
return documents
def _construct_documents_from_results_without_score(
self, results: Dict[str, List[Dict[str, str]]]
) -> List[Document]:
"""Helper to convert Marqo results into documents.
Args:
results (Dict[str, Any]): A Marqo results object with the 'hits'.
Returns:
List[Document]: The documents.
"""
documents: List[Document] = []
for res in results["hits"]:
if self.page_content_builder is None:
text = res["text"]
else:
text = self.page_content_builder(res)
metadata = json.loads(res.get("metadata", "{}"))
documents.append(Document(page_content=text, metadata=metadata))
return documents
def marqo_similarity_search(
self,
query: Union[str, Dict[str, float]],
k: int = 4,
) -> Dict[str, List[Dict[str, str]]]:
"""Return documents from Marqo exposing Marqo's output directly
Args:
query (str): The query to search with.
k (int, optional): The number of documents to return. Defaults to 4.
Returns:
List[Dict[str, Any]]: This hits from marqo.
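A minimal sketch of inspecting the raw output (``vectorstore`` is assumed
to be an existing ``Marqo`` instance; the query is illustrative):
.. code-block:: python
results = vectorstore.marqo_similarity_search("surf gear", k=2)
for hit in results["hits"]:
    print(hit["_score"], hit["text"])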
"""
results = self._client.index(self._index_name).search(
q=query, searchable_attributes=self._searchable_attributes, limit=k
)
return results
def marqo_bulk_similarity_search(
self, queries: Iterable[Union[str, Dict[str, float]]], k: int = 4
) -> Dict[str, List[Dict[str, List[Dict[str, str]]]]]:
"""Return documents from Marqo using a bulk search, exposes Marqo's
output directly
Args:
queries (Iterable[Union[str, Dict[str, float]]]): A list of queries.
k (int, optional): The number of documents to return for each query.
Defaults to 4.
Returns:
Dict[str, List[Dict[str, Any]]]: A bulk search results object with one
results dict per query under the "result" key.
"""
bulk_results = {
"result": [
self._client.index(self._index_name).search(
q=query, searchable_attributes=self._searchable_attributes, limit=k
)
for query in queries
]
}
return bulk_results
@classmethod
def from_documents(
cls: Type[Marqo],
documents: List[Document],
embedding: Union[Embeddings, None] = None,
**kwargs: Any,
) -> Marqo:
"""Return VectorStore initialized from documents. Note that Marqo does not
need embeddings, we retain the parameter to adhere to the Liskov substitution
principle.
Args:
documents (List[Document]): Input documents
embedding (Any, optional): Embeddings (not required). Defaults to None.
Returns:
VectorStore: A Marqo vectorstore
"""
texts = [d.page_content for d in documents]
metadatas = [d.metadata for d in documents]
return cls.from_texts(texts, metadatas=metadatas, **kwargs)
@classmethod
def from_texts(
cls,
texts: List[str],
embedding: Any = None,
metadatas: Optional[List[dict]] = None,
index_name: str = "",
url: str = "http://localhost:8882",
api_key: str = "",
add_documents_settings: Optional[Dict[str, Any]] = None,
searchable_attributes: Optional[List[str]] = None,
page_content_builder: Optional[Callable[[Dict[str, str]], str]] = None,
index_settings: Optional[Dict[str, Any]] = None,
verbose: bool = True,
**kwargs: Any,
) -> Marqo:
"""Return Marqo initialized from texts. Note that Marqo does not need
embeddings, we retain the parameter to adhere to the Liskov
substitution principle.
This is a quick way to get started with marqo - simply provide your texts and
metadatas and this will create an instance of the data store and index the
provided data.
To know the ids of your documents with this approach you will need to include
them in under the key "_id" in your metadatas for each text
Example:
.. code-block:: python
from langchain_community.vectorstores import Marqo
datastore = Marqo.from_texts(['text'], index_name='my-first-index',
url='http://localhost:8882')
Args:
texts (List[str]): A list of texts to index into marqo upon creation.
embedding (Any, optional): Embeddings (not required). Defaults to None.
index_name (str, optional): The name of the index to use, if none is
provided then one will be created with a UUID. Defaults to "".
url (str, optional): The URL for Marqo. Defaults to "http://localhost:8882".
api_key (str, optional): The API key for Marqo. Defaults to "".
metadatas (Optional[List[dict]], optional): A list of metadatas, to
accompany the texts. Defaults to None.
add_documents_settings (Optional[Dict[str, Any]], optional): Settings
for adding documents, see
https://docs.marqo.ai/0.0.16/API-Reference/documents/#query-parameters.
Defaults to {}.
index_settings (Optional[Dict[str, Any]], optional): Index settings if
the index doesn't exist, see
https://docs.marqo.ai/0.0.16/API-Reference/indexes/#index-defaults-object.
Defaults to {}.
Returns:
Marqo: An instance of the Marqo vector store
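A rough sketch of creating an index with custom settings; the model name and
settings values below are illustrative assumptions, not recommendations:
.. code-block:: python
datastore = Marqo.from_texts(
    ["some text"],
    index_name="my-index",
    url="http://localhost:8882",
    index_settings={"index_defaults": {"model": "hf/all_datasets_v4_MiniLM-L6"}},
)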
"""
try:
import marqo
except ImportError:
raise ImportError(
"Could not import marqo python package. "
"Please install it with `pip install marqo`."
)
if not index_name:
index_name = str(uuid.uuid4())
client = marqo.Client(url=url, api_key=api_key)
try:
client.create_index(index_name, settings_dict=index_settings or {})
if verbose:
print(f"Created {index_name} successfully.")
except Exception:
if verbose:
print(f"Index {index_name} exists.")
instance: Marqo = cls(
client,
index_name,
searchable_attributes=searchable_attributes,
add_documents_settings=add_documents_settings or {},
page_content_builder=page_content_builder,
)
instance.add_texts(texts, metadatas)
return instance
def get_indexes(self) -> List[Dict[str, str]]:
"""Helper to see your available indexes in marqo, useful if the
from_texts method was used without an index name specified
Returns:
List[Dict[str, str]]: The list of indexes
"""
return self._client.get_indexes()["results"]
def get_number_of_documents(self) -> int:
"""Helper to see the number of documents in the index
Returns:
int: The number of documents
"""
return self._client.index(self._index_name).get_stats()["numberOfDocuments"]