From ef99b0636271c0840f47f7b346a3b8e257b8fe02 Mon Sep 17 00:00:00 2001 From: Joaquin Menendez <43391630+joaquinmenendez@users.noreply.github.com> Date: Wed, 11 Oct 2023 20:05:13 -0300 Subject: [PATCH] =?UTF-8?q?feature:=20add=20metadata=20information=20into?= =?UTF-8?q?=20the=20embedding=20file=20before=20uplo=E2=80=A6=20(#11553)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace this entire comment with: - **Description:** In this modified version of the function, if the metadatas parameter is not None, the function includes the corresponding metadata in the JSON object for each text. This allows the metadata to be stored alongside the text's embedding in the vector store. - - **Issue:** #10924 - **Dependencies:** None - **Tag maintainer:** @hwchase17 @agola11 - **Twitter handle:** @MelliJoaco --------- Co-authored-by: Bagatur --- .../langchain/vectorstores/matching_engine.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/libs/langchain/langchain/vectorstores/matching_engine.py b/libs/langchain/langchain/vectorstores/matching_engine.py index 2c725969fe..cd0a26d9d6 100644 --- a/libs/langchain/langchain/vectorstores/matching_engine.py +++ b/libs/langchain/langchain/vectorstores/matching_engine.py @@ -116,15 +116,24 @@ class MatchingEngine(VectorStore): Returns: List of ids from adding the texts into the vectorstore. """ + texts = list(texts) + if metadatas is not None and len(texts) != len(metadatas): + raise ValueError( + "texts and metadatas do not have the same length. Received " + f"{len(texts)} texts and {len(metadatas)} metadatas." + ) logger.debug("Embedding documents.") - embeddings = self.embedding.embed_documents(list(texts)) + embeddings = self.embedding.embed_documents(texts) jsons = [] ids = [] # Could be improved with async. 
- for embedding, text in zip(embeddings, texts): + for idx, (embedding, text) in enumerate(zip(embeddings, texts)): id = str(uuid.uuid4()) ids.append(id) - jsons.append({"id": id, "embedding": embedding}) + json_: dict = {"id": id, "embedding": embedding} + if metadatas is not None: + json_["metadata"] = metadatas[idx] + jsons.append(json_) self._upload_to_gcs(text, f"documents/{id}") logger.debug(f"Uploaded {len(ids)} documents to GCS.")