Milvus: allow storing metadata as a JSON field (#14636)

Milvus doesn't support nullable fields, but document metadata is
very rich, so it makes more sense to store it as JSON.


https://github.com/milvus-io/pymilvus/issues/1705#issuecomment-1731112372

<!-- Thank you for contributing to LangChain!

Replace this entire comment with:
  - **Description:** a description of the change, 
  - **Issue:** the issue # it fixes (if applicable),
  - **Dependencies:** any dependencies required for this change,
- **Tag maintainer:** for a quicker response, tag the relevant
maintainer (see below),
- **Twitter handle:** we announce bigger features on Twitter. If your PR
gets announced, and you'd like a mention, we'll gladly shout you out!

Please make sure your PR is passing linting and testing before
submitting. Run `make format`, `make lint` and `make test` to check this
locally.

See contribution guidelines for more information on how to write/run
tests, lint, etc:

https://github.com/langchain-ai/langchain/blob/master/.github/CONTRIBUTING.md

If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in `docs/extras`
directory.

If no one reviews your PR within a few days, please @-mention one of
@baskaryan, @eyurtsev, @hwchase17.
 -->

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
pull/15438/head
YISH 8 months ago committed by GitHub
parent 620168e459
commit da0f750a0b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -53,6 +53,9 @@ class Milvus(VectorStore):
primary_field (str): Name of the primary key field. Defaults to "pk". primary_field (str): Name of the primary key field. Defaults to "pk".
text_field (str): Name of the text field. Defaults to "text". text_field (str): Name of the text field. Defaults to "text".
vector_field (str): Name of the vector field. Defaults to "vector". vector_field (str): Name of the vector field. Defaults to "vector".
metadata_field (str): Name of the metadata field. Defaults to None.
When metadata_field is specified,
the document's metadata will be stored as JSON.
The connection args used for this class comes in the form of a dict, The connection args used for this class comes in the form of a dict,
here are a few of the options: here are a few of the options:
@ -112,6 +115,7 @@ class Milvus(VectorStore):
primary_field: str = "pk", primary_field: str = "pk",
text_field: str = "text", text_field: str = "text",
vector_field: str = "vector", vector_field: str = "vector",
metadata_field: Optional[str] = None,
): ):
"""Initialize the Milvus vector store.""" """Initialize the Milvus vector store."""
try: try:
@ -148,6 +152,7 @@ class Milvus(VectorStore):
self._text_field = text_field self._text_field = text_field
# In order for compatibility, the vector field needs to be called "vector" # In order for compatibility, the vector field needs to be called "vector"
self._vector_field = vector_field self._vector_field = vector_field
self._metadata_field = metadata_field
self.fields: list[str] = [] self.fields: list[str] = []
# Create the connection to the server # Create the connection to the server
if connection_args is None: if connection_args is None:
@ -250,24 +255,32 @@ class Milvus(VectorStore):
# Determine embedding dim # Determine embedding dim
dim = len(embeddings[0]) dim = len(embeddings[0])
fields = [] fields = []
# Determine metadata schema if self._metadata_field is not None:
if metadatas: fields.append(FieldSchema(self._metadata_field, DataType.JSON))
# Create FieldSchema for each entry in metadata. else:
for key, value in metadatas[0].items(): # Determine metadata schema
# Infer the corresponding datatype of the metadata if metadatas:
dtype = infer_dtype_bydata(value) # Create FieldSchema for each entry in metadata.
# Datatype isn't compatible for key, value in metadatas[0].items():
if dtype == DataType.UNKNOWN or dtype == DataType.NONE: # Infer the corresponding datatype of the metadata
logger.error( dtype = infer_dtype_bydata(value)
"Failure to create collection, unrecognized dtype for key: %s", # Datatype isn't compatible
key, if dtype == DataType.UNKNOWN or dtype == DataType.NONE:
) logger.error(
raise ValueError(f"Unrecognized datatype for {key}.") (
# Dataype is a string/varchar equivalent "Failure to create collection, "
elif dtype == DataType.VARCHAR: "unrecognized dtype for key: %s"
fields.append(FieldSchema(key, DataType.VARCHAR, max_length=65_535)) ),
else: key,
fields.append(FieldSchema(key, dtype)) )
raise ValueError(f"Unrecognized datatype for {key}.")
# Dataype is a string/varchar equivalent
elif dtype == DataType.VARCHAR:
fields.append(
FieldSchema(key, DataType.VARCHAR, max_length=65_535)
)
else:
fields.append(FieldSchema(key, dtype))
# Create the text field # Create the text field
fields.append( fields.append(
@ -442,12 +455,16 @@ class Milvus(VectorStore):
self._vector_field: embeddings, self._vector_field: embeddings,
} }
# Collect the metadata into the insert dict. if self._metadata_field is not None:
if metadatas is not None:
for d in metadatas: for d in metadatas:
for key, value in d.items(): insert_dict.setdefault(self._metadata_field, []).append(d)
if key in self.fields: else:
insert_dict.setdefault(key, []).append(value) # Collect the metadata into the insert dict.
if metadatas is not None:
for d in metadatas:
for key, value in d.items():
if key in self.fields:
insert_dict.setdefault(key, []).append(value)
# Total insert count # Total insert count
vectors: list = insert_dict[self._vector_field] vectors: list = insert_dict[self._vector_field]
@ -630,8 +647,8 @@ class Milvus(VectorStore):
# Organize results. # Organize results.
ret = [] ret = []
for result in res[0]: for result in res[0]:
meta = {x: result.entity.get(x) for x in output_fields} data = {x: result.entity.get(x) for x in output_fields}
doc = Document(page_content=meta.pop(self._text_field), metadata=meta) doc = self._parse_document(data)
pair = (doc, result.score) pair = (doc, result.score)
ret.append(pair) ret.append(pair)
@ -746,8 +763,8 @@ class Milvus(VectorStore):
documents = [] documents = []
scores = [] scores = []
for result in res[0]: for result in res[0]:
meta = {x: result.entity.get(x) for x in output_fields} data = {x: result.entity.get(x) for x in output_fields}
doc = Document(page_content=meta.pop(self._text_field), metadata=meta) doc = self._parse_document(data)
documents.append(doc) documents.append(doc)
scores.append(result.score) scores.append(result.score)
ids.append(result.id) ids.append(result.id)
@ -826,3 +843,9 @@ class Milvus(VectorStore):
) )
vector_db.add_texts(texts=texts, metadatas=metadatas) vector_db.add_texts(texts=texts, metadatas=metadatas)
return vector_db return vector_db
def _parse_document(self, data: dict) -> Document:
    """Build a ``Document`` from the field mapping of a single search hit.

    Args:
        data: Mapping of output field name to value for one result.
            NOTE: this dict is mutated in place; the text field (and the
            metadata field, when one is configured) are popped from it.

    Returns:
        A ``Document`` whose ``page_content`` is the value stored under the
        text field. When a metadata field is configured, ``metadata`` is the
        value stored under that field; otherwise it is every remaining
        entry of ``data``.

    Raises:
        KeyError: If the text field, or the configured metadata field, is
            missing from ``data``.
    """
    page_content = data.pop(self._text_field)
    # Use the same ``is not None`` test as the schema-creation and insert
    # paths so reads and writes always agree on whether JSON mode is on.
    if self._metadata_field is not None:
        metadata = data.pop(self._metadata_field)
    else:
        metadata = data
    return Document(page_content=page_content, metadata=metadata)

Loading…
Cancel
Save