diff --git a/docs/docs/integrations/retrievers/rememberizer.ipynb b/docs/docs/integrations/retrievers/rememberizer.ipynb new file mode 100644 index 0000000000..d3737e5e68 --- /dev/null +++ b/docs/docs/integrations/retrievers/rememberizer.ipynb @@ -0,0 +1,221 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Rememberizer\n", + "\n", + ">[Rememberizer](https://rememberizer.ai/) is a knowledge enhancement service for AI applications created by SkyDeck AI Inc.\n", + "\n", + "This notebook shows how to retrieve documents from `Rememberizer` into the Document format that is used downstream." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preparation\n", + "\n", + "You will need an API key: you can get one after creating a common knowledge at [https://rememberizer.ai](https://rememberizer.ai/). Once you have an API key, you must set it as an environment variable `REMEMBERIZER_API_KEY` or pass it as `rememberizer_api_key` when initializing `RememberizerRetriever`.\n", + "\n", + "`RememberizerRetriever` has these arguments:\n", + "- optional `top_k_results`: default=10. Use it to limit the number of returned documents. 
\n", + "- optional `rememberizer_api_key`: required if you don't set the environment variable `REMEMBERIZER_API_KEY`.\n", + "\n", + "`get_relevant_documents()` has one argument, `query`: free text which is used to find documents in the common knowledge of `Rememberizer.ai`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Examples\n", + "\n", + "## Basic usage" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Setup API key\n", + "from getpass import getpass\n", + "\n", + "REMEMBERIZER_API_KEY = getpass()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "from langchain_community.retrievers import RememberizerRetriever\n", + "\n", + "os.environ[\"REMEMBERIZER_API_KEY\"] = REMEMBERIZER_API_KEY\n", + "retriever = RememberizerRetriever(top_k_results=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "docs = retriever.get_relevant_documents(query=\"How does Large Language Models works?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 13646493,\n", + " 'document_id': '17s3LlMbpkTk0ikvGwV0iLMCj-MNubIaP',\n", + " 'name': 'What is a large language model (LLM)_ _ Cloudflare.pdf',\n", + " 'type': 'application/pdf',\n", + " 'path': '/langchain/What is a large language model (LLM)_ _ Cloudflare.pdf',\n", + " 'url': 'https://drive.google.com/file/d/17s3LlMbpkTk0ikvGwV0iLMCj-MNubIaP/view',\n", + " 'size': 337089,\n", + " 'created_time': '',\n", + " 'modified_time': '',\n", + " 'indexed_on': '2024-04-04T03:36:28.886170Z',\n", + " 'integration': {'id': 347, 'integration_type': 'google_drive'}}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0].metadata # meta-information of the Document" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "before, or contextualized in new ways. on some level they \" understand \" semantics in that they can associate words and concepts by their meaning, having seen them grouped together in that way millions or billions of times. how developers can quickly start building their own llms to build llm applications, developers need easy access to multiple data sets, and they need places for those data sets \n" + ] + } + ], + "source": [ + "print(docs[0].page_content[:400]) # a content of the Document" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Usage in a chain" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "OPENAI_API_KEY = getpass()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chains import ConversationalRetrievalChain\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "model = ChatOpenAI(model_name=\"gpt-3.5-turbo\")\n", + "qa = ConversationalRetrievalChain.from_llm(model, retriever=retriever)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-> **Question**: What is RAG? \n", + "\n", + "**Answer**: RAG stands for Retrieval-Augmented Generation. It is an AI framework that retrieves facts from an external knowledge base to enhance the responses generated by Large Language Models (LLMs) by providing up-to-date and accurate information. 
This framework helps users understand the generative process of LLMs and ensures that the model has access to reliable information sources. \n", + "\n", + "-> **Question**: How does Large Language Models works? \n", + "\n", + "**Answer**: Large Language Models (LLMs) work by analyzing massive data sets of language to comprehend and generate human language text. They are built on machine learning, specifically deep learning, which involves training a program to recognize features of data without human intervention. LLMs use neural networks, specifically transformer models, to understand context in human language, making them better at interpreting language even in vague or new contexts. Developers can quickly start building their own LLMs by accessing multiple data sets and using services like Cloudflare's Vectorize and Cloudflare Workers AI platform. \n", + "\n" + ] + } + ], + "source": [ + "questions = [\n", + " \"What is RAG?\",\n", + " \"How does Large Language Models works?\",\n", + "]\n", + "chat_history = []\n", + "\n", + "for question in questions:\n", + " result = qa.invoke({\"question\": question, \"chat_history\": chat_history})\n", + " chat_history.append((question, result[\"answer\"]))\n", + " print(f\"-> **Question**: {question} \\n\")\n", + " print(f\"**Answer**: {result['answer']} \\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "langchain", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/libs/community/langchain_community/retrievers/__init__.py b/libs/community/langchain_community/retrievers/__init__.py index 
b4904fac22..a4b43dd978 100644 --- a/libs/community/langchain_community/retrievers/__init__.py +++ b/libs/community/langchain_community/retrievers/__init__.py @@ -202,6 +202,7 @@ _module_lookup = { "PineconeHybridSearchRetriever": "langchain_community.retrievers.pinecone_hybrid_search", # noqa: E501 "PubMedRetriever": "langchain_community.retrievers.pubmed", "QdrantSparseVectorRetriever": "langchain_community.retrievers.qdrant_sparse_vector_retriever", # noqa: E501 + "RememberizerRetriever": "langchain_community.retrievers.rememberizer", "RemoteLangChainRetriever": "langchain_community.retrievers.remote_retriever", "SVMRetriever": "langchain_community.retrievers.svm", "TFIDFRetriever": "langchain_community.retrievers.tfidf", diff --git a/libs/community/langchain_community/retrievers/rememberizer.py b/libs/community/langchain_community/retrievers/rememberizer.py new file mode 100644 index 0000000000..c0aae8bd52 --- /dev/null +++ b/libs/community/langchain_community/retrievers/rememberizer.py @@ -0,0 +1,20 @@ +from typing import List + +from langchain_core.callbacks import CallbackManagerForRetrieverRun +from langchain_core.documents import Document +from langchain_core.retrievers import BaseRetriever + +from langchain_community.utilities.rememberizer import RememberizerAPIWrapper + + +class RememberizerRetriever(BaseRetriever, RememberizerAPIWrapper): + """`Rememberizer` retriever. + + It wraps load() to get_relevant_documents(). + It uses all RememberizerAPIWrapper arguments without any change. 
"""Wrapper for Rememberizer APIs."""
from typing import Dict, List, Optional

import requests
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, root_validator
from langchain_core.utils import get_from_dict_or_env

# Endpoint queried by ``RememberizerAPIWrapper.search``.
_SEARCH_URL = "https://api.rememberizer.ai/api/v1/documents/search"


class RememberizerAPIWrapper(BaseModel):
    """Wrapper for Rememberizer APIs.

    ``search`` returns the raw ``matched_chunks`` entries of the search
    endpoint's JSON response; ``load`` converts them into ``Document``
    objects.
    """

    # Sent as the ``n`` query parameter of every search request.
    top_k_results: int = 10
    # Sent in the ``x-api-key`` header. When not passed explicitly it is
    # looked up from the ``REMEMBERIZER_API_KEY`` environment variable.
    rememberizer_api_key: Optional[str] = None

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that api key in environment."""
        values["rememberizer_api_key"] = get_from_dict_or_env(
            values, "rememberizer_api_key", "REMEMBERIZER_API_KEY"
        )
        return values

    def search(self, query: str) -> List[dict]:
        """Search for a query in the Rememberizer API.

        Args:
            query: Free-text query, sent as the ``q`` query parameter.

        Returns:
            The ``matched_chunks`` value of the JSON response, or an empty
            list when the response has no such key.

        Raises:
            ValueError: If the response status is not 200, or the response
                body is not valid JSON.
        """
        # Let requests build the query string so that characters such as
        # "&", "#", "=" or "+" inside ``query`` are percent-encoded instead
        # of being interpreted as URL syntax.
        # NOTE(review): no timeout is passed, so a stalled connection blocks
        # indefinitely -- consider exposing a configurable timeout.
        response = requests.get(
            _SEARCH_URL,
            params={"q": query, "n": self.top_k_results},
            headers={"x-api-key": self.rememberizer_api_key},
        )

        try:
            data = response.json()
        except ValueError as exc:
            # Non-JSON body (e.g. an HTML error page from a gateway): report
            # the status and raw text rather than an opaque decode error.
            raise ValueError(
                f"API Error: HTTP {response.status_code}: {response.text}"
            ) from exc

        if response.status_code != 200:
            raise ValueError(f"API Error: {data}")

        return data.get("matched_chunks", [])

    def load(self, query: str) -> List[Document]:
        """Run ``search`` and wrap every matched chunk in a ``Document``.

        Args:
            query: Free-text query forwarded unchanged to ``search``.

        Returns:
            One ``Document`` per matched chunk: ``page_content`` is the
            chunk's ``"matched_content"`` value and ``metadata`` is its
            ``"document"`` value.
        """
        return [
            Document(
                page_content=chunk["matched_content"],
                metadata=chunk["document"],
            )
            for chunk in self.search(query)
        ]
import unittest
from typing import Any
from unittest.mock import patch

import responses

from langchain_community.utilities import RememberizerAPIWrapper

# URL the wrapper is expected to request for query "test" with 10 results.
SEARCH_URL = "https://api.rememberizer.ai/api/v1/documents/search?q=test&n=10"


class TestRememberizerAPIWrapper(unittest.TestCase):
    """Unit tests for ``RememberizerAPIWrapper.search`` and ``.load``.

    HTTP traffic is mocked with ``responses``; no network access is needed.
    """

    @staticmethod
    def _make_wrapper() -> RememberizerAPIWrapper:
        """Build a wrapper with a dummy key and an explicit result limit.

        ``top_k_results`` is the field the wrapper actually declares; the
        value 10 matches the ``n=10`` in the mocked URL.
        """
        return RememberizerAPIWrapper(
            rememberizer_api_key="dummy_key", top_k_results=10
        )

    @responses.activate
    def test_search_successful(self) -> None:
        """A 200 response yields the ``matched_chunks`` list unchanged."""
        matched_chunks = [
            {
                "chunk_id": "chunk",
                "matched_content": "content",
                "document": {"id": "id", "name": "name"},
            }
        ]
        responses.add(
            responses.GET,
            SEARCH_URL,
            json={"matched_chunks": matched_chunks},
        )
        result = self._make_wrapper().search("test")
        self.assertEqual(result, matched_chunks)

    @responses.activate
    def test_search_fail(self) -> None:
        """A non-200 response raises ``ValueError`` carrying the payload."""
        responses.add(
            responses.GET,
            SEARCH_URL,
            status=400,
            json={"detail": "Incorrect authentication credentials."},
        )
        wrapper = self._make_wrapper()
        with self.assertRaises(ValueError) as e:
            wrapper.search("test")
        # Checked after the ``with`` block: statements placed after the
        # raising call inside the block would never execute.
        self.assertEqual(
            str(e.exception),
            "API Error: {'detail': 'Incorrect authentication credentials.'}",
        )

    @patch("langchain_community.utilities.rememberizer.RememberizerAPIWrapper.search")
    def test_load(self, mock_search: Any) -> None:
        """``load`` maps each matched chunk onto one ``Document``."""
        mock_search.return_value = [
            {
                "chunk_id": "chunk1",
                "matched_content": "content1",
                "document": {"id": "id1", "name": "name1"},
            },
            {
                "chunk_id": "chunk2",
                "matched_content": "content2",
                "document": {"id": "id2", "name": "name2"},
            },
        ]
        result = self._make_wrapper().load("test")
        self.assertEqual(len(result), 2)
        self.assertEqual(result[0].page_content, "content1")
        self.assertEqual(result[0].metadata, {"id": "id1", "name": "name1"})
        self.assertEqual(result[1].page_content, "content2")
        self.assertEqual(result[1].metadata, {"id": "id2", "name": "name2"})