diff --git a/docs/extras/use_cases/more/data_generation.ipynb b/docs/extras/use_cases/more/data_generation.ipynb new file mode 100644 index 0000000000..dd7b617f5b --- /dev/null +++ b/docs/extras/use_cases/more/data_generation.ipynb @@ -0,0 +1,437 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "aa3571cc", + "metadata": {}, + "source": [ + "# Data generation\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/more/data_generation.ipynb)\n", + "\n", + "## Use case\n", + "\n", + "Creating synthetic language data can be beneficial for multiple reasons:\n", + "- providing data augmentation\n", + "- obtaining domain-specific examples\n", + "- increasing data diversity\n", + "- enabling quick iteration and experimentation\n", + "\n", + "## Quickstart\n", + "\n", + "Let's see a very straightforward example of how we can use OpenAI functions for creating synthetic data in LangChain."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ae36b66", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!pip install langchain openai \n", + "\n", + "# Set env var OPENAI_API_KEY or load from a .env file:\n", + "# import dotenv\n", + "# dotenv.load_dotenv()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9e715d94", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain_experimental.synthetic_data import create_data_generation_chain, DatasetGenerator" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "94fccedd", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# LLM\n", + "model = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0.7)\n", + "chain = create_data_generation_chain(model)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4314c3ea", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'fields': ['blue', 'yellow'],\n", + " 'preferences': {},\n", + " 'text': 'The vibrant blue sky contrasted beautifully with the bright yellow sun, creating a stunning display of colors that instantly lifted the spirits of all who gazed upon it.'}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain({\"fields\": [\"blue\", \"yellow\"], \"preferences\": {}})" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b116c487", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'fields': {'colors': ['blue', 'yellow']},\n", + " 'preferences': {'style': 'Make it in a style of a weather forecast.'},\n", + " 'text': \"Good morning! 
Today's weather forecast brings a beautiful combination of colors to the sky, with hues of blue and yellow gently blending together like a mesmerizing painting.\"}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain({\"fields\": {\"colors\": [\"blue\", \"yellow\"]}, \"preferences\": {\"style\": \"Make it in a style of a weather forecast.\"}})" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ff823394", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'fields': {'actor': 'Tom Hanks', 'movies': ['Forrest Gump', 'Green Mile']},\n", + " 'preferences': None,\n", + " 'text': 'Tom Hanks, the renowned actor known for his incredible versatility and charm, has graced the silver screen in unforgettable movies such as \"Forrest Gump\" and \"Green Mile\".'}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain({\"fields\": {\"actor\": \"Tom Hanks\", \"movies\": [\"Forrest Gump\", \"Green Mile\"]}, \"preferences\": None})" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "1ea1ad5b", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'fields': [{'actor': 'Tom Hanks', 'movies': ['Forrest Gump', 'Green Mile']},\n", + " {'actor': 'Mads Mikkelsen', 'movies': ['Hannibal', 'Another round']}],\n", + " 'preferences': {'minimum_length': 200, 'style': 'gossip'},\n", + " 'text': 'Did you know that Tom Hanks, the beloved Hollywood actor known for his roles in \"Forrest Gump\" and \"Green Mile\", has shared the screen with the talented Mads Mikkelsen, who gained international acclaim for his performances in \"Hannibal\" and \"Another round\"? These two incredible actors have brought their exceptional skills and captivating charisma to the big screen, delivering unforgettable performances that have enthralled audiences around the world. 
Whether it\\'s Hanks\\' endearing portrayal of Forrest Gump or Mikkelsen\\'s chilling depiction of Hannibal Lecter, these movies have solidified their places in cinematic history, leaving a lasting impact on viewers and cementing their status as true icons of the silver screen.'}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain(\n", + " {\n", + " \"fields\": [\n", + " {\"actor\": \"Tom Hanks\", \"movies\": [\"Forrest Gump\", \"Green Mile\"]},\n", + " {\"actor\": \"Mads Mikkelsen\", \"movies\": [\"Hannibal\", \"Another round\"]}\n", + " ],\n", + " \"preferences\": {\"minimum_length\": 200, \"style\": \"gossip\"}\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "93c7a4bb", + "metadata": {}, + "source": [ + "As we can see, the created examples are diverse and contain the information we wanted them to have. Also, their style reflects the given preferences quite well." + ] + }, + { + "cell_type": "markdown", + "id": "75f7f55a", + "metadata": {}, + "source": [ + "## Generating an example dataset for extraction benchmarking purposes" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "94e98bc4", + "metadata": {}, + "outputs": [], + "source": [ + "inp = [\n", + " {\n", + " 'Actor': 'Tom Hanks',\n", + " 'Film': [\n", + " 'Forrest Gump',\n", + " 'Saving Private Ryan',\n", + " 'The Green Mile',\n", + " 'Toy Story',\n", + " 'Catch Me If You Can']\n", + " },\n", + " {\n", + " 'Actor': 'Tom Hardy',\n", + " 'Film': [\n", + " 'Inception',\n", + " 'The Dark Knight Rises',\n", + " 'Mad Max: Fury Road',\n", + " 'The Revenant',\n", + " 'Dunkirk'\n", + " ]\n", + " }\n", + "]\n", + "\n", + "generator = DatasetGenerator(model, {\"style\": \"informal\", \"minimal length\": 500})\n", + "dataset = generator(inp)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "478eaca4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'fields': {'Actor': 'Tom 
Hanks',\n", + " 'Film': ['Forrest Gump',\n", + " 'Saving Private Ryan',\n", + " 'The Green Mile',\n", + " 'Toy Story',\n", + " 'Catch Me If You Can']},\n", + " 'preferences': {'style': 'informal', 'minimal length': 500},\n", + " 'text': 'Tom Hanks, the versatile and charismatic actor, has graced the silver screen in numerous iconic films including the heartwarming and inspirational \"Forrest Gump,\" the intense and gripping war drama \"Saving Private Ryan,\" the emotionally charged and thought-provoking \"The Green Mile,\" the beloved animated classic \"Toy Story,\" and the thrilling and captivating true story adaptation \"Catch Me If You Can.\" With his impressive range and genuine talent, Hanks continues to captivate audiences worldwide, leaving an indelible mark on the world of cinema.'},\n", + " {'fields': {'Actor': 'Tom Hardy',\n", + " 'Film': ['Inception',\n", + " 'The Dark Knight Rises',\n", + " 'Mad Max: Fury Road',\n", + " 'The Revenant',\n", + " 'Dunkirk']},\n", + " 'preferences': {'style': 'informal', 'minimal length': 500},\n", + " 'text': 'Tom Hardy, the versatile actor known for his intense performances, has graced the silver screen in numerous iconic films, including \"Inception,\" \"The Dark Knight Rises,\" \"Mad Max: Fury Road,\" \"The Revenant,\" and \"Dunkirk.\" Whether he\\'s delving into the depths of the subconscious mind, donning the mask of the infamous Bane, or navigating the treacherous wasteland as the enigmatic Max Rockatansky, Hardy\\'s commitment to his craft is always evident. From his breathtaking portrayal of the ruthless Eames in \"Inception\" to his captivating transformation into the ferocious Max in \"Mad Max: Fury Road,\" Hardy\\'s dynamic range and magnetic presence captivate audiences and leave an indelible mark on the world of cinema. 
In his most physically demanding role to date, he endured the harsh conditions of the freezing wilderness as he portrayed the rugged frontiersman John Fitzgerald in \"The Revenant,\" earning him critical acclaim and an Academy Award nomination. In Christopher Nolan\\'s war epic \"Dunkirk,\" Hardy\\'s stoic and heroic portrayal of Royal Air Force pilot Farrier showcases his ability to convey deep emotion through nuanced performances. With his chameleon-like ability to inhabit a wide range of characters and his unwavering commitment to his craft, Tom Hardy has undoubtedly solidified his place as one of the most talented and sought-after actors of his generation.'}]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset" + ] + }, + { + "cell_type": "markdown", + "id": "293a7d64", + "metadata": {}, + "source": [ + "## Extraction from generated examples\n", + "Okay, let's see if we can now extract output from this generated data and how it compares with our case!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "03c6a375", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.llms import OpenAI\n", + "from langchain.prompts import PromptTemplate\n", + "from langchain.output_parsers import PydanticOutputParser\n", + "from langchain.chains import create_extraction_chain_pydantic, SimpleSequentialChain\n", + "from pydantic import BaseModel, Field\n", + "from typing import List" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "9461d225", + "metadata": {}, + "outputs": [], + "source": [ + "class Actor(BaseModel):\n", + " Actor: str = Field(description=\"name of an actor\")\n", + " Film: List[str] = Field(description=\"list of names of films they starred in\")" + ] + }, + { + "cell_type": "markdown", + "id": "8390171d", + "metadata": {}, + "source": [ + "### Parsers" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "8a5528d2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Actor(Actor='Tom Hanks', Film=['Forrest Gump', 'Saving Private Ryan', 'The Green Mile', 'Toy Story', 'Catch Me If You Can'])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "llm = OpenAI()\n", + "parser = PydanticOutputParser(pydantic_object=Actor)\n", + "\n", + "prompt = PromptTemplate(\n", + " template=\"Extract fields from a given text.\\n{format_instructions}\\n{text}\\n\",\n", + " input_variables=[\"text\"],\n", + " partial_variables={\"format_instructions\": parser.get_format_instructions()},\n", + ")\n", + "\n", + "_input = prompt.format_prompt(text=dataset[0][\"text\"])\n", + "output = llm(_input.to_string())\n", + "\n", + "parsed = parser.parse(output)\n", + "parsed" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "926a7eed", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "(parsed.Actor == inp[0][\"Actor\"]) & (parsed.Film == inp[0][\"Film\"])" + ] + }, + { + "cell_type": "markdown", + "id": "b00f0b87", + "metadata": {}, + "source": [ + "### Extractors" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "523bb584", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Actor(Actor='Tom Hardy', Film=['Inception', 'The Dark Knight Rises', 'Mad Max: Fury Road', 'The Revenant', 'Dunkirk'])]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "extractor = create_extraction_chain_pydantic(pydantic_schema=Actor, llm=model)\n", + "extracted = extractor.run(dataset[1][\"text\"])\n", + "extracted" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "f8451c2b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(extracted[0].Actor == inp[1][\"Actor\"]) & (extracted[0].Film == inp[1][\"Film\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b03de4d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/experimental/langchain_experimental/synthetic_data/__init__.py b/libs/experimental/langchain_experimental/synthetic_data/__init__.py new file mode 100644 index 0000000000..ddad227b21 --- /dev/null +++ b/libs/experimental/langchain_experimental/synthetic_data/__init__.py @@ -0,0 +1,51 @@ +from 
typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+from langchain.chains.llm import LLMChain
+
+from langchain_experimental.synthetic_data.prompts import SENTENCE_PROMPT
+
+# Annotation-only imports: these names must be quoted where they are used.
+if TYPE_CHECKING:
+    from langchain.chains.base import Chain
+    from langchain.prompts import PromptTemplate
+    from langchain.schema.language_model import BaseLanguageModel
+
+
+def create_data_generation_chain(
+    llm: "BaseLanguageModel",
+    prompt: Optional["PromptTemplate"] = None,
+) -> "Chain":
+    """Create a chain that generates a synthetic sentence from given fields.
+
+    Args:
+        llm: The language model to use.
+        prompt: Prompt to feed the model with; defaults to SENTENCE_PROMPT.
+
+    Returns:
+        An LLMChain built from ``llm`` and the chosen prompt.
+    """
+    return LLMChain(llm=llm, prompt=prompt or SENTENCE_PROMPT)
+
+
+class DatasetGenerator:
+    """Generates a synthetic dataset with a given language model."""
+
+    def __init__(
+        self,
+        llm: "BaseLanguageModel",
+        sentence_preferences: Optional[Dict[str, Any]] = None,
+    ):
+        """Build the default generation chain; preferences default to {}."""
+        self.generator = create_data_generation_chain(llm)
+        self.sentence_preferences = sentence_preferences or {}
+
+    def __call__(self, fields_collection: List[Any]) -> List[Dict[str, Any]]:
+        """Run the chain once per entry and return the results in order.
+
+        Each entry (e.g. a dict or a list of fields) fills the prompt's
+        ``fields`` slot; the stored preferences fill ``preferences``.
+        """
+        return [
+            self.generator({"fields": f, "preferences": self.sentence_preferences})
+            for f in fields_collection
+        ]
diff --git a/libs/experimental/langchain_experimental/synthetic_data/prompts.py b/libs/experimental/langchain_experimental/synthetic_data/prompts.py
new file mode 100644
index 0000000000..2e0b600ec7
--- /dev/null
+++ b/libs/experimental/langchain_experimental/synthetic_data/prompts.py
@@ -0,0 +1,16 @@
+from langchain.prompts.prompt import PromptTemplate
+
+sentence_template = """Given the following fields, create a sentence about them.
+Make the sentence detailed and interesting. Use every given field.
+If any additional preferences are given, use them during sentence construction as well.
+Fields:
+{fields}
+Preferences:
+{preferences}
+Sentence:
+"""
+
+# Default sentence prompt; it expects the ``fields`` and ``preferences`` inputs.
+SENTENCE_PROMPT = PromptTemplate(
+    template=sentence_template, input_variables=["fields", "preferences"]
+)