diff --git a/docs/docs/integrations/document_transformers/volcengine_rerank.ipynb b/docs/docs/integrations/document_transformers/volcengine_rerank.ipynb new file mode 100644 index 0000000000..59c04de45a --- /dev/null +++ b/docs/docs/integrations/document_transformers/volcengine_rerank.ipynb @@ -0,0 +1,420 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Volcengine Reranker\n", + "\n", + "This notebook shows how to use Volcengine Reranker for document compression and retrieval. [Volcengine](https://www.volcengine.com/) is a cloud service platform developed by ByteDance, the parent company of TikTok.\n", + "\n", + "Volcengine's Rerank Service supports reranking up to 50 documents with a maximum of 4000 tokens. For more, please visit [here](https://www.volcengine.com/docs/84313/1254474) and [here](https://www.volcengine.com/docs/84313/1254605)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install --upgrade --quiet volcengine" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install --upgrade --quiet faiss\n", + "\n", + "# OR (depending on Python version)\n", + "\n", + "%pip install --upgrade --quiet faiss-cpu" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# To obtain ak/sk: https://www.volcengine.com/docs/84313/1254488\n", + "\n", + "import getpass\n", + "import os\n", + "\n", + "os.environ[\"VOLC_API_AK\"] = getpass.getpass(\"Volcengine API AK:\")\n", + "os.environ[\"VOLC_API_SK\"] = getpass.getpass(\"Volcengine API SK:\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Helper function for printing docs\n", + "def pretty_print_docs(docs):\n", + " print(\n", + " f\"\\n{'-' * 100}\\n\".join(\n", + " [f\"Document {i+1}:\\n\\n\" + d.page_content for i, d 
in enumerate(docs)]\n", + " )\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up the base vector store retriever\n", + "Let's start by initializing a simple vector store retriever and storing the 2022 State of the Union speech (in chunks). We can set up the retriever to retrieve a high number (20) of docs." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/terminator/Developer/langchain/.venv/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", + " from tqdm.autonotebook import tqdm, trange\n", + "/Users/terminator/Developer/langchain/.venv/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Document 1:\n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 2:\n", + "\n", + "We cannot let this happen. \n", + "\n", + "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. 
And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 3:\n", + "\n", + "As I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \n", + "\n", + "While it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 4:\n", + "\n", + "He will never extinguish their love of freedom. He will never weaken the resolve of the free world. \n", + "\n", + "We meet tonight in an America that has lived through two of the hardest years this nation has ever faced. \n", + "\n", + "The pandemic has been punishing. \n", + "\n", + "And so many families are living paycheck to paycheck, struggling to keep up with the rising cost of food, gas, housing, and so much more. \n", + "\n", + "I understand.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 5:\n", + "\n", + "As Ohio Senator Sherrod Brown says, “It’s time to bury the label “Rust Belt.” \n", + "\n", + "It’s time. \n", + "\n", + "But with all the bright spots in our economy, record job growth and higher wages, too many families are struggling to keep up with the bills. 
\n", + "\n", + "Inflation is robbing them of the gains they might otherwise feel. \n", + "\n", + "I get it. That’s why my top priority is getting prices under control.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 6:\n", + "\n", + "A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n", + "\n", + "And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 7:\n", + "\n", + "It’s not only the right thing to do—it’s the economically smart thing to do. \n", + "\n", + "That’s why immigration reform is supported by everyone from labor unions to religious leaders to the U.S. Chamber of Commerce. \n", + "\n", + "Let’s get it done once and for all. \n", + "\n", + "Advancing liberty and justice also requires protecting the rights of women. \n", + "\n", + "The constitutional right affirmed in Roe v. Wade—standing precedent for half a century—is under attack as never before.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 8:\n", + "\n", + "I understand. \n", + "\n", + "I remember when my Dad had to leave our home in Scranton, Pennsylvania to find work. I grew up in a family where if the price of food went up, you felt it. \n", + "\n", + "That’s why one of the first things I did as President was fight to pass the American Rescue Plan. \n", + "\n", + "Because people were hurting. We needed to act, and we did. 
\n", + "\n", + "Few pieces of legislation have done more in a critical moment in our history to lift us out of crisis.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 9:\n", + "\n", + "Third – we can end the shutdown of schools and businesses. We have the tools we need. \n", + "\n", + "It’s time for Americans to get back to work and fill our great downtowns again. People working from home can feel safe to begin to return to the office. \n", + "\n", + "We’re doing that here in the federal government. The vast majority of federal workers will once again work in person. \n", + "\n", + "Our schools are open. Let’s keep it that way. Our kids need to be in school.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 10:\n", + "\n", + "He met the Ukrainian people. \n", + "\n", + "From President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. \n", + "\n", + "Groups of citizens blocking tanks with their bodies. Everyone from students to retirees teachers turned soldiers defending their homeland. \n", + "\n", + "In this struggle as President Zelenskyy said in his speech to the European Parliament “Light will win over darkness.” The Ukrainian Ambassador to the United States is here tonight.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 11:\n", + "\n", + "The widow of Sergeant First Class Heath Robinson. \n", + "\n", + "He was born a soldier. Army National Guard. Combat medic in Kosovo and Iraq. \n", + "\n", + "Stationed near Baghdad, just yards from burn pits the size of football fields. \n", + "\n", + "Heath’s widow Danielle is here with us tonight. They loved going to Ohio State football games. He loved building Legos with their daughter. 
\n", + "\n", + "But cancer from prolonged exposure to burn pits ravaged Heath’s lungs and body. \n", + "\n", + "Danielle says Heath was a fighter to the very end.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 12:\n", + "\n", + "Danielle says Heath was a fighter to the very end. \n", + "\n", + "He didn’t know how to stop fighting, and neither did she. \n", + "\n", + "Through her pain she found purpose to demand we do better. \n", + "\n", + "Tonight, Danielle—we are. \n", + "\n", + "The VA is pioneering new ways of linking toxic exposures to diseases, already helping more veterans get benefits. \n", + "\n", + "And tonight, I’m announcing we’re expanding eligibility to veterans suffering from nine respiratory cancers.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 13:\n", + "\n", + "We can do all this while keeping lit the torch of liberty that has led generations of immigrants to this land—my forefathers and so many of yours. \n", + "\n", + "Provide a pathway to citizenship for Dreamers, those on temporary status, farm workers, and essential workers. \n", + "\n", + "Revise our laws so businesses have the workers they need and families don’t wait decades to reunite. \n", + "\n", + "It’s not only the right thing to do—it’s the economically smart thing to do.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 14:\n", + "\n", + "He rejected repeated efforts at diplomacy. \n", + "\n", + "He thought the West and NATO wouldn’t respond. And he thought he could divide us at home. Putin was wrong. We were ready. Here is what we did. \n", + "\n", + "We prepared extensively and carefully. 
\n", + "\n", + "We spent months building a coalition of other freedom-loving nations from Europe and the Americas to Asia and Africa to confront Putin.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 15:\n", + "\n", + "As I’ve told Xi Jinping, it is never a good bet to bet against the American people. \n", + "\n", + "We’ll create good jobs for millions of Americans, modernizing roads, airports, ports, and waterways all across America. \n", + "\n", + "And we’ll do it all to withstand the devastating effects of the climate crisis and promote environmental justice.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 16:\n", + "\n", + "Tonight I say to the Russian oligarchs and corrupt leaders who have bilked billions of dollars off this violent regime no more. \n", + "\n", + "The U.S. Department of Justice is assembling a dedicated task force to go after the crimes of Russian oligarchs. \n", + "\n", + "We are joining with our European allies to find and seize your yachts your luxury apartments your private jets. We are coming for your ill-begotten gains.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 17:\n", + "\n", + "Look at cars. \n", + "\n", + "Last year, there weren’t enough semiconductors to make all the cars that people wanted to buy. \n", + "\n", + "And guess what, prices of automobiles went up. \n", + "\n", + "So—we have a choice. \n", + "\n", + "One way to fight inflation is to drive down wages and make Americans poorer. \n", + "\n", + "I have a better plan to fight inflation. \n", + "\n", + "Lower your costs, not your wages. \n", + "\n", + "Make more cars and semiconductors in America. \n", + "\n", + "More infrastructure and innovation in America. 
\n", + "\n", + "More goods moving faster and cheaper in America.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 18:\n", + "\n", + "So that’s my plan. It will grow the economy and lower costs for families. \n", + "\n", + "So what are we waiting for? Let’s get this done. And while you’re at it, confirm my nominees to the Federal Reserve, which plays a critical role in fighting inflation. \n", + "\n", + "My plan will not only lower costs to give families a fair shot, it will lower the deficit.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 19:\n", + "\n", + "Let each of us here tonight in this Chamber send an unmistakable signal to Ukraine and to the world. \n", + "\n", + "Please rise if you are able and show that, Yes, we the United States of America stand with the Ukrainian people. \n", + "\n", + "Throughout our history we’ve learned this lesson when dictators do not pay a price for their aggression they cause more chaos. \n", + "\n", + "They keep moving. \n", + "\n", + "And the costs and the threats to America and the world keep rising.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 20:\n", + "\n", + "It’s based on DARPA—the Defense Department project that led to the Internet, GPS, and so much more. \n", + "\n", + "ARPA-H will have a singular purpose—to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more. \n", + "\n", + "A unity agenda for the nation. \n", + "\n", + "We can do this. \n", + "\n", + "My fellow Americans—tonight , we have gathered in a sacred space—the citadel of our democracy. 
\n", + "\n", + "In this Capitol, generation after generation, Americans have debated great questions amid great strife, and have done great things.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", + "To disable this warning, you can either:\n", + "\t- Avoid using `tokenizers` before the fork if possible\n", + "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" + ] + } + ], + "source": [ + "from langchain_community.document_loaders import TextLoader\n", + "from langchain_community.vectorstores.faiss import FAISS\n", + "from langchain_huggingface import HuggingFaceEmbeddings\n", + "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", + "\n", + "documents = TextLoader(\"../../how_to/state_of_the_union.txt\").load()\n", + "text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)\n", + "texts = text_splitter.split_documents(documents)\n", + "retriever = FAISS.from_documents(\n", + " texts, HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n", + ").as_retriever(search_kwargs={\"k\": 20})\n", + "\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = retriever.invoke(query)\n", + "pretty_print_docs(docs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reranking with VolcengineRerank\n", + "Now let's wrap our base retriever with a `ContextualCompressionRetriever`. We'll use the `VolcengineRerank` to rerank the returned results." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Document 1:\n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 
\n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 2:\n", + "\n", + "As I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \n", + "\n", + "While it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice.\n", + "----------------------------------------------------------------------------------------------------\n", + "Document 3:\n", + "\n", + "We cannot let this happen. \n", + "\n", + "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. 
Justice Breyer, thank you for your service.\n" + ] + } + ], + "source": [ + "from langchain.retrievers import ContextualCompressionRetriever\n", + "from langchain_community.document_compressors.volcengine_rerank import VolcengineRerank\n", + "\n", + "compressor = VolcengineRerank()\n", + "compression_retriever = ContextualCompressionRetriever(\n", + " base_compressor=compressor, base_retriever=retriever\n", + ")\n", + "\n", + "compressed_docs = compression_retriever.invoke(\n", + " \"What did the president say about Ketanji Jackson Brown\"\n", + ")\n", + "pretty_print_docs(compressed_docs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/libs/community/langchain_community/document_compressors/__init__.py b/libs/community/langchain_community/document_compressors/__init__.py index 8f27a8d445..1d7fbb62dc 100644 --- a/libs/community/langchain_community/document_compressors/__init__.py +++ b/libs/community/langchain_community/document_compressors/__init__.py @@ -20,7 +20,9 @@ if TYPE_CHECKING: from langchain_community.document_compressors.rankllm_rerank import ( RankLLMRerank, ) - + from langchain_community.document_compressors.volcengine_rerank import ( + VolcengineRerank, + ) _module_lookup = { "LLMLinguaCompressor": "langchain_community.document_compressors.llmlingua_filter", @@ -29,6 +31,7 @@ _module_lookup = { "RankLLMRerank": "langchain_community.document_compressors.rankllm_rerank", "FlashrankRerank": "langchain_community.document_compressors.flashrank_rerank", 
class VolcengineRerank(BaseDocumentCompressor):
    """Document compressor that uses `Volcengine Rerank API`.

    Documents are scored against a query by the VikingDB ``batch_rerank``
    endpoint and the highest-scoring ``top_n`` are returned, each annotated
    with its ``relevance_score`` in the metadata.
    """

    client: Any = None
    """Volcengine client to use for compressing documents.

    When left unset, a ``VikingDBService`` client is built during validation
    from ``ak``, ``sk``, ``host`` and ``region``.
    """

    ak: Optional[str] = None
    """Access Key ID (falls back to the ``VOLC_API_AK`` environment variable).
    https://www.volcengine.com/docs/84313/1254553"""

    sk: Optional[str] = None
    """Secret Access Key (falls back to the ``VOLC_API_SK`` environment variable).
    https://www.volcengine.com/docs/84313/1254553"""

    region: str = "cn-beijing"
    """Region passed to the VikingDB client.
    https://www.volcengine.com/docs/84313/1254488. """

    host: str = "api-vikingdb.volces.com"
    """API domain passed to the VikingDB client.
    https://www.volcengine.com/docs/84313/1254488. """

    top_n: Optional[int] = 3
    """Number of documents to return."""

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        arbitrary_types_allowed = True
        allow_population_by_field_name = True

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the credentials and python package exist in environment.

        If no ``client`` was supplied, resolve ``ak``/``sk`` from the passed
        values or the ``VOLC_API_AK``/``VOLC_API_SK`` environment variables and
        build a ``VikingDBService`` client from them.

        Raises:
            ImportError: If the ``volcengine`` package is not installed.
        """

        if not values.get("client"):
            try:
                from volcengine.viking_db import VikingDBService
            except ImportError as err:
                raise ImportError(
                    "Could not import volcengine python package. "
                    "Please install it with `pip install volcengine` "
                    "or `pip install --user volcengine`."
                ) from err

            values["ak"] = get_from_dict_or_env(values, "ak", "VOLC_API_AK")
            values["sk"] = get_from_dict_or_env(values, "sk", "VOLC_API_SK")

            # Honour the configured endpoint instead of hard-coding it, so that
            # `host=` / `region=` passed by the caller actually take effect.
            # The fallbacks cover the case where a field failed validation and
            # is therefore absent from `values`.
            values["client"] = VikingDBService(
                host=values.get("host", "api-vikingdb.volces.com"),
                region=values.get("region", "cn-beijing"),
                scheme="https",
                connection_timeout=30,
                socket_timeout=30,
                ak=values["ak"],
                sk=values["sk"],
            )

        return values

    def rerank(
        self,
        documents: Sequence[Union[str, Document, dict]],
        query: str,
        *,
        top_n: Optional[int] = -1,
    ) -> List[Dict[str, Any]]:
        """Return documents ordered by their relevance to the provided query.

        Args:
            documents: A sequence of documents to rerank. ``Document`` objects
                contribute their ``page_content``; any other item is sent as-is.
            query: The query to use for reranking.
            top_n: The number of results to return. ``None`` returns all
                results; any value that is not a positive integer (including
                the default ``-1``) falls back to ``self.top_n``.

        Returns:
            A list of ``{"index": int, "relevance_score": score}`` dicts sorted
            by descending score, where ``index`` is the position of the
            document in ``documents``.
        """

        if not documents:  # to avoid empty api call
            return []

        payload = [
            {
                "query": query,
                "content": doc.page_content if isinstance(doc, Document) else doc,
            }
            for doc in documents
        ]

        # No SDK import here: the client may have been injected by the caller,
        # in which case the volcengine package need not be installed at all.
        scores = self.client.batch_rerank(payload)

        # `sorted` is stable, so equally scored documents keep their input order.
        ranked = sorted(
            (
                {"index": index, "relevance_score": score}
                for index, score in enumerate(scores)
            ),
            key=lambda item: item["relevance_score"],
            reverse=True,
        )

        limit = top_n if (top_n is None or top_n > 0) else self.top_n
        return ranked[:limit]

    def compress_documents(
        self,
        documents: Sequence[Document],
        query: str,
        callbacks: Optional[Callbacks] = None,
    ) -> Sequence[Document]:
        """
        Compress documents using Volcengine's rerank API.

        Args:
            documents: A sequence of documents to compress.
            query: The query to use for compressing the documents.
            callbacks: Callbacks to run during the compression process.

        Returns:
            A sequence of compressed documents, most relevant first. Each is a
            copy of the input document with ``relevance_score`` added to its
            metadata; the input documents are not modified.
        """
        compressed = []
        for res in self.rerank(documents, query):
            doc = documents[res["index"]]
            # Deep-copy the metadata so the score never leaks into the caller's
            # original document.
            doc_copy = Document(doc.page_content, metadata=deepcopy(doc.metadata))
            doc_copy.metadata["relevance_score"] = res["relevance_score"]
            compressed.append(doc_copy)
        return compressed