Switching character chunking out for token-based chunking

This commit is contained in:
colin-openai 2023-03-01 04:01:28 -08:00
parent 9f2915b92c
commit 0c1912b8b8

View File

@ -38,18 +38,20 @@
"metadata": {},
"outputs": [],
"source": [
"!pip install textract"
"!pip install textract\n",
"!pip install tiktoken"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import textract\n",
"import os\n",
"import openai\n",
"import tiktoken\n",
"\n",
"# Extract the raw text from each PDF using textract\n",
"text = textract.process('data/fia_f1_power_unit_financial_regulations_issue_1_-_2022-08-16.pdf', method='pdfminer').decode('utf-8')\n",
@ -65,25 +67,25 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extract key pieces of information from this regulation document. \n",
"If a particular piece of information is not present, output \"Not specified\". \n",
"Extract key pieces of information from this regulation document.\n",
"If a particular piece of information is not present, output \"Not specified\".\n",
"When you extract a key piece of information, include the closest page number.\n",
"Use the following format:\n",
"0. Author\n",
"1. What is the amount of the power unit cost cap in USD, GBP and EUR\n",
"0. Who is the author\n",
"1. What is the amount of the \"Power Unit Cost Cap\" in USD, GBP and EUR\n",
"2. What is the value of External Manufacturing Costs in USD\n",
"3. What is the Capital Expenditure Limit in USD\n",
"\n",
"Document: \"\"\"<document>\"\"\"\n",
"\n",
"0. Author: Tom Anderson (Page 1)\n",
"0. Who is the author: Tom Anderson (Page 1)\n",
"1.\n"
]
}
@ -94,19 +96,35 @@
"template_prompt=f'''Extract key pieces of information from this regulation document.\n",
"If a particular piece of information is not present, output \\\"Not specified\\\".\n",
"When you extract a key piece of information, include the closest page number.\n",
"Use the following format:\\n0. Who is the author\\n1. What is the amount of the power unit cost cap in USD, GBP and EUR\\n2. What is the value of External Manufacturing Costs in USD\\n3. What is the Capital Expenditure Limit in USD\\n\\nDocument: \\\"\\\"\\\"{document}\\\"\\\"\\\"\\n\\n0. Who is the author: Tom Anderson (Page 1)\\n1.'''\n",
"Use the following format:\\n0. Who is the author\\n1. What is the amount of the \"Power Unit Cost Cap\" in USD, GBP and EUR\\n2. What is the value of External Manufacturing Costs in USD\\n3. What is the Capital Expenditure Limit in USD\\n\\nDocument: \\\"\\\"\\\"{document}\\\"\\\"\\\"\\n\\n0. Who is the author: Tom Anderson (Page 1)\\n1.'''\n",
"print(template_prompt)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import openai\n",
"\n",
"# Split a text into token chunks of roughly n tokens, preferably ending at the end of a sentence\n",
"def create_chunks(text, n, tokenizer):\n",
"    \"\"\"Yield successive chunks of roughly n tokens from text.\n",
"\n",
"    Each chunk is cut at the last position between 0.5 * n and 1.5 * n tokens\n",
"    whose decoded text ends with a full stop or a newline. If no such position\n",
"    exists, the chunk is n tokens long. A shorter remainder at the end of the\n",
"    text is yielded as-is.\n",
"\n",
"    Args:\n",
"        text: The string to split.\n",
"        n: Target chunk size in tokens; must be at least 1.\n",
"        tokenizer: Object exposing encode(text) and decode(tokens).\n",
"\n",
"    Yields:\n",
"        Consecutive, non-overlapping slices of the encoded token sequence.\n",
"        Pass each slice to tokenizer.decode to get the chunk text back.\n",
"\n",
"    Raises:\n",
"        ValueError: If n is less than 1 (raised when iteration starts).\n",
"    \"\"\"\n",
"    if n < 1:\n",
"        # With n < 1 the cursor never advances, so the loop below would never end\n",
"        raise ValueError(\"n must be at least 1\")\n",
"    tokens = tokenizer.encode(text)\n",
"    min_len = int(0.5 * n)\n",
"    max_len = int(1.5 * n)\n",
"    i = 0\n",
"    while i < len(tokens):\n",
"        # Start from the longest allowed chunk and shrink towards the shortest one\n",
"        j = min(i + max_len, len(tokens))\n",
"        while j > i + min_len:\n",
"            # Decode the candidate chunk and check for a full stop or newline at its end\n",
"            chunk = tokenizer.decode(tokens[i:j])\n",
"            if chunk.endswith((\".\", \"\\n\")):\n",
"                break\n",
"            j -= 1\n",
"        # If no end of sentence was found, fall back to a chunk of n tokens\n",
"        if j == i + min_len:\n",
"            j = min(i + n, len(tokens))\n",
"        yield tokens[i:j]\n",
"        i = j\n",
"\n",
"def extract_chunk(document,template_prompt):\n",
" \n",
@ -126,84 +144,27 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1. What is the amount of the power unit cost cap in USD, GBP and EUR: USD 95,000,000 (Page 3) GBP 76,459,000 (Page 4) EUR 90,210,000 (Page 4)\n",
"2. What is the value of External Manufacturing Costs in USD: Not specified\n",
"3. What is the Capital Expenditure Limit in USD: Not specified\n",
"1. What is the amount of the power unit cost cap in USD, GBP and EUR: Not specified (Page 1)\n",
"2. What is the value of External Manufacturing Costs in USD: Not specified (Page 1)\n",
"3. What is the Capital Expenditure Limit in USD: Not specified (Page 1)\n",
"1. What is the amount of the power unit cost cap in USD, GBP and EUR: Not specified (Page 8)\n",
"2. What is the value of External Manufacturing Costs in USD: US Dollars 20,000,000 in respect of each of the Full Year Reporting Periods ending on 31 December 2023, 31 December 2024 and 31 December 2025, adjusted for Indexation (Page 10)\n",
"3. What is the Capital Expenditure Limit in USD: Not specified (Page 10)\n",
"1. What is the amount of the power unit cost cap in USD, GBP and EUR: Not specified (Page 1)\n",
"2. What is the value of External Manufacturing Costs in USD: Not specified (Page 1)\n",
"3. What is the Capital Expenditure Limit in USD: Not specified (Page 1)\n",
"1. What is the amount of the power unit cost cap in USD, GBP and EUR: Not specified (Page 15)\n",
"2. What is the value of External Manufacturing Costs in USD: Not specified (Page 15)\n",
"3. What is the Capital Expenditure Limit in USD: Not specified (Page 15)\n",
"1. What is the amount of the power unit cost cap in USD, GBP and EUR: Not specified (Page 18)\n",
"2. What is the value of External Manufacturing Costs in USD: Not specified (Page 18)\n",
"3. What is the Capital Expenditure Limit in USD: Not specified (Page 18)\n",
"1. What is the amount of the power unit cost cap in USD, GBP and EUR: Not specified (Page 1)\n",
"2. What is the value of External Manufacturing Costs in USD: Not specified (Page 1)\n",
"3. What is the Capital Expenditure Limit in USD: Not specified (Page 1)\n",
"1. What is the amount of the power unit cost cap in USD, GBP and EUR: Not specified (Page 25)\n",
"2. What is the value of External Manufacturing Costs in USD: Not specified (Page 25)\n",
"3. What is the Capital Expenditure Limit in USD: Not specified (Page 25)\n",
"1. What is the amount of the power unit cost cap in USD, GBP and EUR: Not specified (Page 29)\n",
"2. What is the value of External Manufacturing Costs in USD: Not specified (Page 29)\n",
"3. What is the Capital Expenditure Limit in USD: US Dollars 30,000,000 (Page 32)\n",
"1. What is the amount of the power unit cost cap in USD, GBP and EUR: Not specified (Page 34)\n",
"2. What is the value of External Manufacturing Costs in USD: Not specified (Page 35)\n",
"3. What is the Capital Expenditure Limit in USD: Not specified (Page 35)\n",
"1. What is the amount of the power unit cost cap in USD, GBP and EUR: Not specified (Page 36)\n",
"2. What is the value of External Manufacturing Costs in USD: Not specified (Page 36)\n",
"3. What is the Capital Expenditure Limit in USD: Not specified (Page 36)\n",
"1. What is the amount of the power unit cost cap in USD, GBP and EUR: Not specified (Page 41)\n",
"2. What is the value of External Manufacturing Costs in USD: Not specified (Page 41)\n",
"3. What is the Capital Expenditure Limit in USD: Not specified (Page 41)\n",
"1. What is the amount of the power unit cost cap in USD, GBP and EUR: Not specified (Page 45)\n",
"2. What is the value of External Manufacturing Costs in USD: Not specified (Page 45)\n",
"3. What is the Capital Expenditure Limit in USD: Not specified (Page 45)\n",
"1. What is the amount of the power unit cost cap in USD, GBP and EUR: Not specified (Page 48)\n",
"2. What is the value of External Manufacturing Costs in USD: Not specified (Page 48)\n",
"3. What is the Capital Expenditure Limit in USD: Not specified (Page 49)\n",
"1. What is the amount of the power unit cost cap in USD, GBP and EUR: Not specified (Page 1)\n",
"2. What is the value of External Manufacturing Costs in USD: Not specified (Page 1)\n",
"3. What is the Capital Expenditure Limit in USD: Not specified (Page 56)\n",
"1. What is the amount of the power unit cost cap in USD, GBP and EUR: Not specified (Page 1)\n",
"2. What is the value of External Manufacturing Costs in USD: Not specified (Page 1)\n",
"3. What is the Capital Expenditure Limit in USD: Not specified (Page 1)\n",
"1. What is the amount of the power unit cost cap in USD, GBP and EUR: Not specified\n",
"2. What is the value of External Manufacturing Costs in USD: Not specified\n",
"3. What is the Capital Expenditure Limit in USD: Not specified\n"
]
}
],
"outputs": [],
"source": [
"chunks = []\n",
"# Initialise tokenizer\n",
"tokenizer = tiktoken.get_encoding(\"cl100k_base\")\n",
"\n",
"results = []\n",
" \n",
"chunks = create_chunks(clean_text,1000,tokenizer)\n",
"text_chunks = [tokenizer.decode(chunk) for chunk in chunks]\n",
"\n",
"# chunk up the contract into consecutive 10k character chunks\n",
"for i in range(0, len(clean_text), 10000):\n",
" #print(clean_text[i:i+10000])\n",
" chunks.append(clean_text[i:i+10000])\n",
"\n",
"for chunk in chunks:\n",
"for chunk in text_chunks:\n",
" results.append(extract_chunk(chunk,template_prompt))\n",
" #print(chunk)\n",
" print(results[-1])\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 31,
"metadata": {
"scrolled": true
},
@ -211,12 +172,12 @@
{
"data": {
"text/plain": [
"['1. What is the amount of the power unit cost cap in USD, GBP and EUR: USD 95,000,000 (Page 3) GBP 76,459,000 (Page 4) EUR 90,210,000 (Page 4)',\n",
"['1. What is the amount of the \"Power Unit Cost Cap\" in USD, GBP and EUR: USD 95,000,000 (Page 2); GBP 76,459,000 (Page 2); EUR 90,210,000 (Page 2)',\n",
" '2. What is the value of External Manufacturing Costs in USD: US Dollars 20,000,000 in respect of each of the Full Year Reporting Periods ending on 31 December 2023, 31 December 2024 and 31 December 2025, adjusted for Indexation (Page 10)',\n",
" '3. What is the Capital Expenditure Limit in USD: US Dollars 30,000,000 (Page 32)']"
]
},
"execution_count": 7,
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
@ -239,25 +200,25 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extract key pieces of information from this regulation document. \n",
"If a particular piece of information is not present, output \"Not specified\". \n",
"Extract key pieces of information from this regulation document.\n",
"If a particular piece of information is not present, output \"Not specified\".\n",
"When you extract a key piece of information, include the closest page number.\n",
"Use the following format:\n",
"0. Author\n",
"1. How is a minor overspend breach calculated\n",
"2. How is a major overspend breach calculated\n",
"3.Which years do these financial regulations apply to\n",
"0. Who is the author\n",
"1. How is a Minor Overspend Breach calculated\n",
"2. How is a Major Overspend Breach calculated\n",
"3. Which years do these financial regulations apply to\n",
"\n",
"Document: \"\"\"<document>\"\"\"\n",
"\n",
"0. Author: Tom Anderson (Page 1)\n",
"0. Who is the author: Tom Anderson (Page 1)\n",
"1.\n"
]
}
@ -267,48 +228,55 @@
"template_prompt=f'''Extract key pieces of information from this regulation document.\n",
"If a particular piece of information is not present, output \\\"Not specified\\\".\n",
"When you extract a key piece of information, include the closest page number.\n",
"Use the following format:\\n0. Who is the author\\n1. How is a minor overspend breach calculated\\n2. How is a major overspend breach calculated\\n3. Which years do these financial regulations apply to\\n\\nDocument: \\\"\\\"\\\"{document}\\\"\\\"\\\"\\n\\n0. Who is the author: Tom Anderson (Page 1)\\n1.'''\n",
"Use the following format:\\n0. Who is the author\\n1. How is a Minor Overspend Breach calculated\\n2. How is a Major Overspend Breach calculated\\n3. Which years do these financial regulations apply to\\n\\nDocument: \\\"\\\"\\\"{document}\\\"\\\"\\\"\\n\\n0. Who is the author: Tom Anderson (Page 1)\\n1.'''\n",
"print(template_prompt)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['1. How is a minor overspend breach calculated: Relevant Costs reported exceed the Power Unit Cost Cap by less than 5% (Page 24)',\n",
" '2. How is a major overspend breach calculated: Relevant Costs reported exceed the Power Unit Cost Cap by more than 5% (Page 24)',\n",
" '3. Which years do these financial regulations apply to: 2023-2026 (Page 2)',\n",
" '3. Which years do these financial regulations apply to: 2022 (Page 4)',\n",
" '3. Which years do these financial regulations apply to: 2022 (Page 8)',\n",
"['1. How is a Minor Overspend Breach calculated: A Minor Overspend Breach arises when a Power Unit Manufacturer submits its Full Year Reporting Documentation and Relevant Costs reported therein exceed the Power Unit Cost Cap by less than 5% (Page 24)',\n",
" '2. How is a Major Overspend Breach calculated: A Material Overspend Breach arises when a Power Unit Manufacturer submits its Full Year Reporting Documentation and Relevant Costs reported therein exceed the Power Unit Cost Cap by 5% or more (Page 25)',\n",
" '3. Which years do these financial regulations apply to: 2026 onwards (Page 1)',\n",
" '3. Which years do these financial regulations apply to: 2023, 2024, 2025, 2026 and subsequent Full Year Reporting Periods (Page 2)',\n",
" '3. Which years do these financial regulations apply to: 2022-2025 (Page 6)',\n",
" '3. Which years do these financial regulations apply to: 2023, 2024, 2025, 2026 and subsequent Full Year Reporting Periods (Page 10)',\n",
" '3. Which years do these financial regulations apply to: 2022 (Page 14)',\n",
" '3. Which years do these financial regulations apply to: 2022 (Page 16)',\n",
" '3. Which years do these financial regulations apply to: 2022 (Page 19)',\n",
" '3. Which years do these financial regulations apply to: 2022 (Page 21)',\n",
" '3. Which years do these financial regulations apply to: 2026 onwards (Page 26)',\n",
" '3. Which years do these financial regulations apply to: 2026 (Page 2)',\n",
" '3. Which years do these financial regulations apply to: 2022 (Page 30)',\n",
" '3. Which years do these financial regulations apply to: 2022 (Page 32)',\n",
" '3. Which years do these financial regulations apply to: 2023, 2024 and 2025 (Page 1)',\n",
" '3. Which years do these financial regulations apply to: 2022 (Page 37)',\n",
" '3. Which years do these financial regulations apply to: 2026 onwards (Page 40)',\n",
" '3. Which years do these financial regulations apply to: 2022 (Page 1)',\n",
" '3. Which years do these financial regulations apply to: 2026 to 2030 seasons (Page 46)',\n",
" '3. Which years do these financial regulations apply to: 2022 (Page 47)',\n",
" '3. Which years do these financial regulations apply to: 2022 (Page 1)',\n",
" '3. Which years do these financial regulations apply to: 2022 (Page 1)',\n",
" '3.Which years do these financial regulations apply to: 2022 (Page 1)',\n",
" '3. Which years do these financial regulations apply to: 2023, 2024, 2025, 2026 onwards (Page 34)',\n",
" '3.Which years do these financial regulations apply to: 2022 (Page 1)',\n",
" '3.Which years do these financial regulations apply to: 2026 onwards (Page 41)',\n",
" '3.Which years do these financial regulations apply to: 2026 to 2030 seasons (inclusive) (Page 46)',\n",
" '3. Which years do these financial regulations apply to: 2022 (Page 56)',\n",
" '3. Which years do these financial regulations apply to: 2022 (Page 1)',\n",
" '3. Which years do these financial regulations apply to: 2022 (Page 57)',\n",
" '3. Which years do these financial regulations apply to: 2022 (Page 61)']"
" '3. Which years do these financial regulations apply to: 2022 (Page 16)',\n",
" '3. Which years do these financial regulations apply to: 2022 (Page 16)']"
]
},
"execution_count": 9,
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chunks = []\n",
"results = []\n",
"\n",
"for i in range(0, len(clean_text), 10000):\n",
" chunks.append(clean_text[i:i+10000])\n",
"\n",
"for chunk in chunks:\n",
"for chunk in text_chunks:\n",
" results.append(extract_chunk(chunk,template_prompt))\n",
" \n",
"groups = [r.split('\\n') for r in results]\n",
@ -330,17 +298,10 @@
"To tune this further you can consider experimenting with:\n",
"- A more descriptive or specific prompt\n",
"- If you have sufficient training data, fine-tuning a model to extract a specific set of outputs reliably\n",
"- The way you chunk your data - we have gone for 10,000 characters with no overlap, but more intelligent chunking that breaks info into sections, cuts by tokens or similar may get better results\n",
"- The way you chunk your data - we have gone for chunks of roughly 1,000 tokens with no overlap, but more intelligent chunking that breaks info into sections, adds overlap between chunks or similar may get better results\n",
"\n",
"However, with minimal tuning we have now answered 6 questions of varying difficulty using the contents of a long document, and have a reusable approach that we can apply to any long document requiring entity extraction. We look forward to seeing what you can do with this!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {