Allow Milvus to store metadata as a JSON field (#14636)

Milvus doesn't support nullable fields, but document metadata is very
rich, so it makes more sense to store it as JSON.


https://github.com/milvus-io/pymilvus/issues/1705#issuecomment-1731112372

<!-- Thank you for contributing to LangChain!

Replace this entire comment with:
  - **Description:** a description of the change, 
  - **Issue:** the issue # it fixes (if applicable),
  - **Dependencies:** any dependencies required for this change,
- **Tag maintainer:** for a quicker response, tag the relevant
maintainer (see below),
- **Twitter handle:** we announce bigger features on Twitter. If your PR
gets announced, and you'd like a mention, we'll gladly shout you out!

Please make sure your PR is passing linting and testing before
submitting. Run `make format`, `make lint` and `make test` to check this
locally.

See contribution guidelines for more information on how to write/run
tests, lint, etc:

https://github.com/langchain-ai/langchain/blob/master/.github/CONTRIBUTING.md

If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in `docs/extras`
directory.

If no one reviews your PR within a few days, please @-mention one of
@baskaryan, @eyurtsev, @hwchase17.
 -->

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
pull/15438/head
YISH 8 months ago committed by GitHub
parent 620168e459
commit da0f750a0b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -53,6 +53,9 @@ class Milvus(VectorStore):
primary_field (str): Name of the primary key field. Defaults to "pk". primary_field (str): Name of the primary key field. Defaults to "pk".
text_field (str): Name of the text field. Defaults to "text". text_field (str): Name of the text field. Defaults to "text".
vector_field (str): Name of the vector field. Defaults to "vector". vector_field (str): Name of the vector field. Defaults to "vector".
metadata_field (str): Name of the metadata field. Defaults to None.
When metadata_field is specified,
the document's metadata will be stored as JSON.
The connection args used for this class comes in the form of a dict, The connection args used for this class comes in the form of a dict,
here are a few of the options: here are a few of the options:
@ -112,6 +115,7 @@ class Milvus(VectorStore):
primary_field: str = "pk", primary_field: str = "pk",
text_field: str = "text", text_field: str = "text",
vector_field: str = "vector", vector_field: str = "vector",
metadata_field: Optional[str] = None,
): ):
"""Initialize the Milvus vector store.""" """Initialize the Milvus vector store."""
try: try:
@ -148,6 +152,7 @@ class Milvus(VectorStore):
self._text_field = text_field self._text_field = text_field
# In order for compatibility, the vector field needs to be called "vector" # In order for compatibility, the vector field needs to be called "vector"
self._vector_field = vector_field self._vector_field = vector_field
self._metadata_field = metadata_field
self.fields: list[str] = [] self.fields: list[str] = []
# Create the connection to the server # Create the connection to the server
if connection_args is None: if connection_args is None:
@ -250,6 +255,9 @@ class Milvus(VectorStore):
# Determine embedding dim # Determine embedding dim
dim = len(embeddings[0]) dim = len(embeddings[0])
fields = [] fields = []
if self._metadata_field is not None:
fields.append(FieldSchema(self._metadata_field, DataType.JSON))
else:
# Determine metadata schema # Determine metadata schema
if metadatas: if metadatas:
# Create FieldSchema for each entry in metadata. # Create FieldSchema for each entry in metadata.
@ -259,13 +267,18 @@ class Milvus(VectorStore):
# Datatype isn't compatible # Datatype isn't compatible
if dtype == DataType.UNKNOWN or dtype == DataType.NONE: if dtype == DataType.UNKNOWN or dtype == DataType.NONE:
logger.error( logger.error(
"Failure to create collection, unrecognized dtype for key: %s", (
"Failure to create collection, "
"unrecognized dtype for key: %s"
),
key, key,
) )
raise ValueError(f"Unrecognized datatype for {key}.") raise ValueError(f"Unrecognized datatype for {key}.")
# Dataype is a string/varchar equivalent # Dataype is a string/varchar equivalent
elif dtype == DataType.VARCHAR: elif dtype == DataType.VARCHAR:
fields.append(FieldSchema(key, DataType.VARCHAR, max_length=65_535)) fields.append(
FieldSchema(key, DataType.VARCHAR, max_length=65_535)
)
else: else:
fields.append(FieldSchema(key, dtype)) fields.append(FieldSchema(key, dtype))
@ -442,6 +455,10 @@ class Milvus(VectorStore):
self._vector_field: embeddings, self._vector_field: embeddings,
} }
if self._metadata_field is not None:
for d in metadatas:
insert_dict.setdefault(self._metadata_field, []).append(d)
else:
# Collect the metadata into the insert dict. # Collect the metadata into the insert dict.
if metadatas is not None: if metadatas is not None:
for d in metadatas: for d in metadatas:
@ -630,8 +647,8 @@ class Milvus(VectorStore):
# Organize results. # Organize results.
ret = [] ret = []
for result in res[0]: for result in res[0]:
meta = {x: result.entity.get(x) for x in output_fields} data = {x: result.entity.get(x) for x in output_fields}
doc = Document(page_content=meta.pop(self._text_field), metadata=meta) doc = self._parse_document(data)
pair = (doc, result.score) pair = (doc, result.score)
ret.append(pair) ret.append(pair)
@ -746,8 +763,8 @@ class Milvus(VectorStore):
documents = [] documents = []
scores = [] scores = []
for result in res[0]: for result in res[0]:
meta = {x: result.entity.get(x) for x in output_fields} data = {x: result.entity.get(x) for x in output_fields}
doc = Document(page_content=meta.pop(self._text_field), metadata=meta) doc = self._parse_document(data)
documents.append(doc) documents.append(doc)
scores.append(result.score) scores.append(result.score)
ids.append(result.id) ids.append(result.id)
@ -826,3 +843,9 @@ class Milvus(VectorStore):
) )
vector_db.add_texts(texts=texts, metadatas=metadatas) vector_db.add_texts(texts=texts, metadatas=metadatas)
return vector_db return vector_db
def _parse_document(self, data: dict) -> Document:
    """Build a ``Document`` from the fields of a single search hit.

    Args:
        data: Mapping of output field name to value for one result.
            It is modified in place: the text field (and the metadata
            field, when one is configured) are popped from it.

    Returns:
        A ``Document`` whose ``page_content`` is the value of the text
        field. When a metadata field is configured, the metadata is the
        value stored under that field; otherwise every remaining entry
        of ``data`` is used as the metadata.

    Raises:
        KeyError: If the text field, or the configured metadata field,
            is missing from ``data``.
    """
    # Pop the text first so it never ends up inside the metadata dict.
    page_content = data.pop(self._text_field)
    # Use the same ``is not None`` test as the schema-creation and insert
    # paths so all three agree on when the JSON metadata field is active.
    if self._metadata_field is not None:
        metadata = data.pop(self._metadata_field)
    else:
        metadata = data
    return Document(page_content=page_content, metadata=metadata)

Loading…
Cancel
Save