From 488d2d5da95a2bacdca3d1623d862ac5ab28d59e Mon Sep 17 00:00:00 2001
From: Francisco Ingham <24279597+fpingham@users.noreply.github.com>
Date: Thu, 13 Jul 2023 03:16:05 -0300
Subject: [PATCH] Entity extraction improvements (#6342)

Added fix to avoid irrelevant attributes being returned plus an example
of extracting unrelated entities and an example of using an 'extra_info'
attribute to extract unstructured data for an entity.

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
---
 .../chains/additional/extraction.ipynb        | 309 +++++++++++++++++-
 .../chains/openai_functions/extraction.py     |   8 +-
 langchain/chains/openai_functions/tagging.py  |   2 +
 3 files changed, 310 insertions(+), 9 deletions(-)

diff --git a/docs/extras/modules/chains/additional/extraction.ipynb b/docs/extras/modules/chains/additional/extraction.ipynb
index 7e2a0258e9..8e0db268e2 100644
--- a/docs/extras/modules/chains/additional/extraction.ipynb
+++ b/docs/extras/modules/chains/additional/extraction.ipynb
@@ -14,7 +14,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "id": "34f04daf",
    "metadata": {},
    "outputs": [
@@ -35,7 +35,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "id": "a2648974",
    "metadata": {},
    "outputs": [],
@@ -56,15 +56,110 @@
    "id": "78ff9df9",
    "metadata": {},
    "source": [
-    "To extract entities, we need to create a schema like the following, were we specify all the properties we want to find and the type we expect them to have. We can also specify which of these properties are required and which are optional."
+    "To extract entities, we need to create a schema where we specify all the properties we want to find and the type we expect them to have. We can also specify which of these properties are required and which are optional."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "id": "4ac43eba",
    "metadata": {},
    "outputs": [],
+   "source": [
+    "schema = {\n",
+    "    \"properties\": {\n",
+    "        \"name\": {\"type\": \"string\"},\n",
+    "        \"height\": {\"type\": \"integer\"},\n",
+    "        \"hair_color\": {\"type\": \"string\"},\n",
+    "    },\n",
+    "    \"required\": [\"name\", \"height\"],\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "640bd005",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "inp = \"\"\"\n",
+    "Alex is 5 feet tall. Claudia is 1 feet taller Alex and jumps higher than him. Claudia is a brunette and Alex is blonde.\n",
+    "        \"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "64313214",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chain = create_extraction_chain(schema, llm)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "17c48adb",
+   "metadata": {},
+   "source": [
+    "As we can see, we extracted the required entities and their properties in the required format (it even calculated Claudia's height before returning!)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "cc5436ed",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[{'name': 'Alex', 'height': 5, 'hair_color': 'blonde'},\n",
+       " {'name': 'Claudia', 'height': 6, 'hair_color': 'brunette'}]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "chain.run(inp)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8d51fcdc",
+   "metadata": {},
+   "source": [
+    "## Several entity types"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5813affe",
+   "metadata": {},
+   "source": [
+    "Notice that we are using OpenAI functions under the hood and thus the model can only call one function per request (with one, unique schema)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "511b9838",
+   "metadata": {},
+   "source": [
+    "If we want to extract more than one entity type, we need to introduce a little hack - we will define our properties with an included entity type. \n",
+    "\n",
+    "Below is an example where we also want to extract dog attributes from the passage. Notice the 'person_' and 'dog_' prefixes we use for each property; this tells the model which entity type the property refers to. In this way, the model can return properties from several entity types in one single call."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "cf243a26",
+   "metadata": {},
+   "outputs": [],
    "source": [
     "schema = {\n",
     "    \"properties\": {\n",
@@ -103,10 +198,10 @@
   },
   {
    "cell_type": "markdown",
-   "id": "17c48adb",
+   "id": "eb074f7b",
    "metadata": {},
    "source": [
-    "As we can see, we extracted the required entities and their properties in the required format:"
+    "People attributes and dog attributes were correctly extracted from the text in the same call"
    ]
   },
   {
@@ -128,7 +223,207 @@
        "  'person_hair_color': 'brunette'}]"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "chain.run(inp)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0273e0e2",
+   "metadata": {},
+   "source": [
+    "## Unrelated entities"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c07b3480",
+   "metadata": {},
+   "source": [
+    "What if our entities are unrelated? In that case, the model will return the unrelated entities in different dictionaries, allowing us to successfully extract several unrelated entity types in the same call."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "01d98af0",
+   "metadata": {},
+   "source": [
+    "Notice that we use `required: []`: we need to allow the model to return **only** person attributes or **only** dog attributes for a single entity (person or dog)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "id": "e584c993",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "schema = {\n",
+    "    \"properties\": {\n",
+    "        \"person_name\": {\"type\": \"string\"},\n",
+    "        \"person_height\": {\"type\": \"integer\"},\n",
+    "        \"person_hair_color\": {\"type\": \"string\"},\n",
+    "        \"dog_name\": {\"type\": \"string\"},\n",
+    "        \"dog_breed\": {\"type\": \"string\"},\n",
+    "    },\n",
+    "    \"required\": [],\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "ad6b105f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "inp = \"\"\"\n",
+    "Alex is 5 feet tall. Claudia is 1 feet taller Alex and jumps higher than him. Claudia is a brunette and Alex is blonde.\n",
+    "\n",
+    "Willow is a German Shepherd that likes to play with other dogs and can always be found playing with Milo, a border collie that lives close by.\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "id": "6bfe5a33",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chain = create_extraction_chain(schema, llm)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "24fe09af",
+   "metadata": {},
+   "source": [
+    "We have each entity in its own separate dictionary, with only the appropriate attributes being returned"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "id": "f6e1fd89",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[{'person_name': 'Alex', 'person_height': 5, 'person_hair_color': 'blonde'},\n",
+       " {'person_name': 'Claudia',\n",
+       "  'person_height': 6,\n",
+       "  'person_hair_color': 'brunette'},\n",
+       " {'dog_name': 'Willow', 'dog_breed': 'German Shepherd'},\n",
+       " {'dog_name': 'Milo', 'dog_breed': 'border collie'}]"
+      ]
+     },
+     "execution_count": 51,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "chain.run(inp)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0ac466d1",
+   "metadata": {},
+   "source": [
+    "## Extra info for an entity"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d240ffc1",
+   "metadata": {},
+   "source": [
+    "What if... _we don't know what we want?_ More specifically, say we know a few properties we want to extract for a given entity but we also want to know if there's any extra information in the passage. Fortunately, we don't need to structure everything - we can have unstructured extraction as well. \n",
+    "\n",
+    "We can do this by introducing another hack, namely the *extra_info* attribute - let's see an example."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "id": "f19685f6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "schema = {\n",
+    "    \"properties\": {\n",
+    "        \"person_name\": {\"type\": \"string\"},\n",
+    "        \"person_height\": {\"type\": \"integer\"},\n",
+    "        \"person_hair_color\": {\"type\": \"string\"},\n",
+    "        \"dog_name\": {\"type\": \"string\"},\n",
+    "        \"dog_breed\": {\"type\": \"string\"},\n",
+    "        \"dog_extra_info\": {\"type\": \"string\"},\n",
+    "    },\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "id": "200c3477",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "inp = \"\"\"\n",
+    "Alex is 5 feet tall. Claudia is 1 feet taller Alex and jumps higher than him. Claudia is a brunette and Alex is blonde.\n",
+    "\n",
+    "Willow is a German Shepherd that likes to play with other dogs and can always be found playing with Milo, a border collie that lives close by.\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "id": "ddad7dc6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chain = create_extraction_chain(schema, llm)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e5c0dbbc",
+   "metadata": {},
+   "source": [
+    "It is nice to know more about Willow and Milo!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "id": "c22cfd30",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[{'person_name': 'Alex', 'person_height': 5, 'person_hair_color': 'blonde'},\n",
+       " {'person_name': 'Claudia',\n",
+       "  'person_height': 6,\n",
+       "  'person_hair_color': 'brunette'},\n",
+       " {'dog_name': 'Willow',\n",
+       "  'dog_breed': 'German Shepherd',\n",
+       "  'dog_extra_info': 'likes to play with other dogs'},\n",
+       " {'dog_name': 'Milo',\n",
+       "  'dog_breed': 'border collie',\n",
+       "  'dog_extra_info': 'lives close by'}]"
+      ]
+     },
+     "execution_count": 83,
      "metadata": {},
      "output_type": "execute_result"
     }
diff --git a/langchain/chains/openai_functions/extraction.py b/langchain/chains/openai_functions/extraction.py
index 609bee18d9..35d78357b3 100644
--- a/langchain/chains/openai_functions/extraction.py
+++ b/langchain/chains/openai_functions/extraction.py
@@ -32,11 +32,15 @@ def _get_extraction_function(entity_schema: dict) -> dict:
 
 
 _EXTRACTION_TEMPLATE = """Extract and save the relevant entities mentioned\
- in the following passage together with their properties.
+ in the following passage together with their properties.
+
+Only extract the properties mentioned in the 'information_extraction' function.
+
+If a property is not present and is not required in the function parameters, do not include it in the output.
 
 Passage:
 {input}
-"""
+"""  # noqa: E501
 
 
 def create_extraction_chain(schema: dict, llm: BaseLanguageModel) -> Chain:
diff --git a/langchain/chains/openai_functions/tagging.py b/langchain/chains/openai_functions/tagging.py
index d39ad36ca5..4688f42f11 100644
--- a/langchain/chains/openai_functions/tagging.py
+++ b/langchain/chains/openai_functions/tagging.py
@@ -21,6 +21,8 @@ def _get_tagging_function(schema: dict) -> dict:
 
 _TAGGING_TEMPLATE = """Extract the desired information from the following passage.
 
+Only extract the properties mentioned in the 'information_extraction' function.
+
 Passage:
 {input}
 """