Harrison/shallow metadata (#1599)

Co-authored-by: Jesse Zhang <jessetanzhang@gmail.com>
pull/1601/head
Harrison Chase 2 years ago committed by GitHub
parent c6bfa00178
commit f95d551f7a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,6 +1,7 @@
"""Functionality for splitting text."""
from __future__ import annotations
import copy
import logging
from abc import ABC, abstractmethod
from typing import (
@ -51,7 +52,10 @@ class TextSplitter(ABC):
documents = []
for i, text in enumerate(texts):
for chunk in self.split_text(text):
documents.append(Document(page_content=chunk, metadata=_metadatas[i]))
new_doc = Document(
page_content=chunk, metadata=copy.deepcopy(_metadatas[i])
)
documents.append(new_doc)
return documents
def split_documents(self, documents: List[Document]) -> List[Document]:

@ -94,6 +94,21 @@ def test_create_documents_with_metadata() -> None:
assert docs == expected_docs
def test_metadata_not_shallow() -> None:
    """Test that metadatas are not shallow.

    Every document built by ``create_documents`` from the same source text
    must own an independent copy of that text's metadata: mutating one
    document's metadata (at the top level or inside a nested container) must
    not leak into sibling documents or into the dict the caller passed in.
    """
    texts = ["foo bar"]
    splitter = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=0)
    docs = splitter.create_documents(texts, [{"source": "1"}])
    expected_docs = [
        Document(page_content="foo", metadata={"source": "1"}),
        Document(page_content="bar", metadata={"source": "1"}),
    ]
    assert docs == expected_docs

    # Top-level mutation: proves the two documents do not share one dict.
    docs[0].metadata["foo"] = 1
    assert docs[0].metadata == {"source": "1", "foo": 1}
    assert docs[1].metadata == {"source": "1"}

    # Nested mutation: a plain ``dict.copy()`` would pass the check above but
    # still share the inner list, so this is what distinguishes deep from
    # shallow copying.
    original_metadata = {"source": "1", "tags": ["a"]}
    nested_docs = splitter.create_documents(texts, [original_metadata])
    nested_docs[0].metadata["tags"].append("b")
    assert nested_docs[0].metadata == {"source": "1", "tags": ["a", "b"]}
    assert nested_docs[1].metadata == {"source": "1", "tags": ["a"]}
    # The caller's own metadata must be left untouched as well.
    assert original_metadata == {"source": "1", "tags": ["a"]}
def test_iterative_text_splitter() -> None:
"""Test iterative text splitter."""
text = """Hi.\n\nI'm Harrison.\n\nHow? Are? You?\nOkay then f f f f.

Loading…
Cancel
Save