"""Integration test for compression pipelines."""
|
|
from langchain.document_transformers import EmbeddingsRedundantFilter
|
|
from langchain.embeddings import OpenAIEmbeddings
|
|
from langchain.retrievers.document_compressors import (
|
|
DocumentCompressorPipeline,
|
|
EmbeddingsFilter,
|
|
)
|
|
from langchain.schema import Document
|
|
from langchain.text_splitter import CharacterTextSplitter
|
|
|
|
|
|
def test_document_compressor_pipeline() -> None:
    embeddings = OpenAIEmbeddings()
    splitter = CharacterTextSplitter(chunk_size=20, chunk_overlap=0, separator=". ")
    redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
    relevant_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.8)
    # Split into sentences, drop near-duplicate splits, then keep only splits
    # relevant to the query.
    pipeline_filter = DocumentCompressorPipeline(
        transformers=[splitter, redundant_filter, relevant_filter]
    )
    texts = [
        "This sentence is about cows",
        "This sentence was about cows",
        "foo bar baz",
    ]
    docs = [Document(page_content=". ".join(texts))]
    actual = pipeline_filter.compress_documents(docs, "Tell me about farm animals")
    # The two cow sentences are near-duplicates and "foo bar baz" is irrelevant
    # to the query, so exactly one of the cow sentences should survive.
    assert len(actual) == 1
    assert actual[0].page_content in texts[:2]
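

# A minimal usage sketch, not part of the original test: a compressor pipeline
# like the one above is typically wrapped in a ContextualCompressionRetriever
# so that retrieved documents are split and filtered before being returned.
# The FAISS vector store, the sample text, and the helper name below are
# illustrative assumptions, not fixtures used elsewhere in this file.
def _example_pipeline_with_retriever() -> None:
    from langchain.retrievers import ContextualCompressionRetriever
    from langchain.vectorstores import FAISS

    embeddings = OpenAIEmbeddings()
    splitter = CharacterTextSplitter(chunk_size=20, chunk_overlap=0, separator=". ")
    relevant_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.8)
    pipeline_filter = DocumentCompressorPipeline(
        transformers=[splitter, relevant_filter]
    )
    base_retriever = FAISS.from_texts(
        ["This sentence is about cows. foo bar baz"], embeddings
    ).as_retriever()
    retriever = ContextualCompressionRetriever(
        base_compressor=pipeline_filter, base_retriever=base_retriever
    )
    # Returned documents are the query-relevant splits of the retrieved text.
    print(retriever.get_relevant_documents("Tell me about farm animals"))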