From 472d14919f07388361db20f3f2a440ff57ccecb2 Mon Sep 17 00:00:00 2001 From: blob42 Date: Mon, 13 Feb 2023 23:50:56 +0100 Subject: [PATCH] initial --- .gitignore | 1 + chains.ipynb | 348 +++++++++++++++++++++++++++++++++++ chains.py | 193 ++++++++++++++++++++ document-loaders.ipynb | 157 ++++++++++++++++ document-loaders.py | 74 ++++++++ llm.json | 13 ++ llms.ipynb | 400 +++++++++++++++++++++++++++++++++++++++++ llms.py | 126 +++++++++++++ pdf-loader.ipynb | 104 +++++++++++ pdf-loader.py | 37 ++++ prompt_templates.ipynb | 347 +++++++++++++++++++++++++++++++++++ prompt_templates.py | 152 ++++++++++++++++ test.ipynb | 23 +++ test.py | 0 utils.ipynb | 78 ++++++++ utils.py | 32 ++++ 16 files changed, 2085 insertions(+) create mode 100644 .gitignore create mode 100644 chains.ipynb create mode 100644 chains.py create mode 100644 document-loaders.ipynb create mode 100644 document-loaders.py create mode 100644 llm.json create mode 100644 llms.ipynb create mode 100644 llms.py create mode 100644 pdf-loader.ipynb create mode 100644 pdf-loader.py create mode 100644 prompt_templates.ipynb create mode 100644 prompt_templates.py create mode 100644 test.ipynb create mode 100644 test.py create mode 100644 utils.ipynb create mode 100644 utils.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..763513e --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.ipynb_checkpoints diff --git a/chains.ipynb b/chains.ipynb new file mode 100644 index 0000000..503e89c --- /dev/null +++ b/chains.ipynb @@ -0,0 +1,348 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "2XVP2VXIL1" + }, + "source": [ + "# Chains\n", + "\n", + "Chaining LLMs with each other or with other experts.\n", + "\n", + "## Getting Started\n", + "\n", + "- Using the simple LLM chain\n", + "- Creating sequential chains\n", + "- Creating a custom chain\n", + "\n", + "### Why Use Chains?\n", + "\n", + "- combine multiple components together\n", + "- ex: take user input, format it with a PromptTemplate, pass the formatted text to an LLM.\n", + "\n", + "## Query an LLM with LLMChain" + ] + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "DPRWRo3fl7" + }, + "source": [ + "from langchain.prompts import PromptTemplate\n", + "from langchain.llms import OpenAI\n", + "import pprint as pp\n", + "\n", + "llm = OpenAI(temperature=0.9)\n", + "prompt = PromptTemplate(\n", + " input_variables=[\"product\"],\n", + " template=\"What is a good name for a company that makes {product}?\"\n", + " )" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "tOpTb9idHh" + }, + "source": [ + "We can now create a simple chain that takes the user input, formats it, and passes it to the LLM." + ] + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "QXu2N1dEEC" + }, + "source": [ + "from langchain.chains import LLMChain\n", + "chain = LLMChain(llm=llm, prompt=prompt, output_key='company_name')\n", + "\n", + "# run the chain only specifying input variables\n", + "print(chain.run(\"hand crafted handbags\"))\n", + "\n", + "# NOTE: we pass data to the run of the entry chain (see sequence below)" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "\n\nUrban Crafts Co.\n" + } + ], + "execution_count": 1 + }, + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "Kv6bj1l9I3" + }, + "source": [ + "## Combining chains with SequentialChain\n", + "\n", + "Chains that execute their links in predefined order.\n", + "\n", + "- 
SimpleSequentialChain: simplest form, each step has a single input/output. \n", + "The output of one step is the input to the next.\n", + "- SequentialChain: More advanced, multiple inputs/outputs.\n", + "\n", + "\n", + "The following tutorial uses SimpleSequentialChain and SequentialChain; each chain's output is the input to the next one.\n", + "This sequential chain will:\n", + " 1. Create a company name for a product. We just use an LLMChain for that.\n", + " 2. Create a catchphrase for the product. We will use a new LLMChain for the catchphrase, as shown below." + ] + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "BMZLsdY9VP" + }, + "source": [ + "second_prompt = PromptTemplate(\n", + " input_variables=[\"company_name\"],\n", + " template=\"Write a catchphrase for the following company: {company_name}\",\n", + " )\n", + "chain_two = LLMChain(llm=llm, prompt=second_prompt, output_key='catchphrase')" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "epQHxmeWCP" + }, + "source": [ + "We now combine the two chains to create the company name and catchphrase." + ] + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "SHwDHjVCxb" + }, + "source": [ + "from langchain.chains import SimpleSequentialChain, SequentialChain" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "lKgp9HR0VX" + }, + "source": [ + "full_chain = SimpleSequentialChain(\n", + " chains=[chain, chain_two], verbose=True,\n", + " )\n", + "\n", + "print(full_chain.run(\"hand crafted handbags\"))" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "RiYcYwJhdC" + }, + "source": [ + "---\n", + "\n", + "In the third prompt we create a small advertisement with the title and the product description." + ] + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "RhnqOumOtX" + }, + "source": [ + "ad_template = \"\"\"Create a small advertisement destined for reddit. 
\n", + "The advertisement is for a company with the following details:\n", + "\n", + "name: {company_name}\n", + "product: {product}\n", + "catchphrase: {catchphrase}\n", + "\n", + "advertisement:\n", + "\"\"\"\n", + "ad_prompt = PromptTemplate(\n", + " input_variables=[\"product\", \"company_name\", \"catchphrase\"],\n", + " template=ad_template,\n", + " )" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "MsQnieyxgL" + }, + "source": [ + "# Connect the three chains together\n", + "\n", + "ad_chain = LLMChain(llm=llm, prompt=ad_prompt, output_key='advertisement')" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "4PYfwOxTlq" + }, + "source": [ + "final_chain = SequentialChain(\n", + " chains=[chain, chain_two, ad_chain],\n", + " input_variables=['product'],\n", + " output_variables=['advertisement'],\n", + " verbose=True\n", + " )\n", + "\n", + "ad = final_chain.run('Professional Cat Cuddler')" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "\n\n\u001b[1m> Entering new SequentialChain chain...\u001b[0m\n\n\u001b[1m> Finished chain.\u001b[0m\n" + } + ], + "execution_count": 2 + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "2akm8eB1EV" + }, + "source": [ + "print(ad)" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Are you in need of a little indulgence? Then come to Purr-fect Pampering! Our professional cat cuddler will provide you with the ultimate relaxation experience. We guarantee that after a session with us, you'll be feeling more purr-fect than ever! Treat yourself to the luxurious indulgence of Purr-fect Pampering!\n" + } + ], + "execution_count": 3 + }, + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "1iT7gBMABZ" + }, + "source": [ + "## Creating a custom chain\n", + "\n", + "Example: create a custom chain that concatenates the outputs of two LLMChains.\n", + "\n", + "Steps:\n", + " 1. Subclass the Chain class\n", + " 2. Fill out `input_keys` and `output_keys`\n", + " 3. 
add the `_call` method that defines how the chain is executed" + ] + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "OUXv7kGtDH" + }, + "source": [ + "from langchain.chains import LLMChain\n", + "from langchain.chains.base import Chain\n", + "\n", + "from typing import Dict, List\n", + "\n", + "class ConcatenateChain(Chain):\n", + " chain_1: LLMChain\n", + " chain_2: LLMChain\n", + "\n", + " @property\n", + " def input_keys(self) -> List[str]:\n", + " # Union of the input keys of the two chains\n", + " all_inputs_vars = set(self.chain_1.input_keys).union(\n", + " set(self.chain_2.input_keys))\n", + " return list(all_inputs_vars)\n", + "\n", + " @property\n", + " def output_keys(self) -> List[str]:\n", + " return ['concat_output']\n", + "\n", + " def _call(self, inputs: Dict[str, str]) -> Dict[str,str]:\n", + " output_1 = self.chain_1.run(inputs)\n", + " output_2 = self.chain_2.run(inputs)\n", + " return {'concat_output': output_1 + output_2}" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "MUOMbKovF6" + }, + "source": [ + "Running the custom chain" + ] + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "kBfPU3rB6L" + }, + "source": [ + "prompt_1 = PromptTemplate(\n", + " input_variables=['product'],\n", + " template='what is a good name for a company that makes {product}?'\n", + " )\n", + "chain_1 = LLMChain(llm=llm, prompt=prompt_1)\n", + "\n", + "prompt_2 = PromptTemplate(\n", + " input_variables=['product'],\n", + " template='what is a good slogan for a company that makes {product}?'\n", + " )\n", + "chain_2 = LLMChain(llm=llm, prompt=prompt_2)\n", + "\n", + "concat_chain = ConcatenateChain(chain_1=chain_1, chain_2=chain_2)\n", + "\n", + "concat_output = concat_chain.run('leather handbags')\n", + "print(f'Concatenated output:\\n{concat_output}')" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Concatenated output:\n\n\nLeather Luxury Boutique.\n\n\"Handcrafted Leather: The Perfect Accent for Any Look.\"\n" + } + ], + "execution_count": 4 + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "9CdH3GtsmW" + }, + "source": [], + "outputs": [], + "execution_count": null + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "python", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/chains.py b/chains.py new file mode 100644 index 0000000..a2561a1 --- /dev/null +++ b/chains.py @@ -0,0 +1,193 @@ +r"""°°° +# Chains + +Chaining LLMs with each other or with other experts. + +## Getting Started + +- Using the simple LLM chain +- Creating sequential chains +- Creating a custom chain + +### Why Use Chains? + +- combine multiple components together +- ex: take user input, format it with a PromptTemplate, pass the formatted text to an LLM. 
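+ +As a quick illustration, a minimal sketch of that flow (not run here; it assumes an OpenAI API key is configured, and the same pieces are built step by step in the cells below): + +```python +from langchain.llms import OpenAI +from langchain.prompts import PromptTemplate +from langchain.chains import LLMChain + +llm = OpenAI(temperature=0.9) +prompt = PromptTemplate( + input_variables=["product"], + template="What is a good name for a company that makes {product}?", + ) +# the chain formats the prompt with the user input, then calls the LLM +chain = LLMChain(llm=llm, prompt=prompt) +print(chain.run("hand crafted handbags")) +```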
+ +## Query an LLM with LLMChain + +°°°""" +#|%%--%%| <2XVP2VXIL1|DPRWRo3fl7> + +from langchain.prompts import PromptTemplate +from langchain.llms import OpenAI +import pprint as pp + +llm = OpenAI(temperature=0.9) +prompt = PromptTemplate( + input_variables=["product"], + template="What is a good name for a company that makes {product}?" + ) + +#|%%--%%| +r"""°°° +We can now create a simple chain that takes the user input, formats it, and passes it to the LLM. +°°°""" +#|%%--%%| + +from langchain.chains import LLMChain +chain = LLMChain(llm=llm, prompt=prompt, output_key='company_name') + +# run the chain only specifying input variables +print(chain.run("hand crafted handbags")) + +# NOTE: we pass data to the run of the entry chain (see sequence below) + +#|%%--%%| +r"""°°° +## Combining chains with SequentialChain + +Chains that execute their links in predefined order. + +- SimpleSequentialChain: simplest form, each step has a single input/output. +The output of one step is the input to the next. +- SequentialChain: More advanced, multiple inputs/outputs. + + +The following tutorial uses SimpleSequentialChain and SequentialChain; each chain's output is the input to the next one. +This sequential chain will: + 1. Create a company name for a product. We just use an LLMChain for that. + 2. Create a catchphrase for the product. We will use a new LLMChain for the catchphrase, as shown below. +°°°""" +#|%%--%%| + +second_prompt = PromptTemplate( + input_variables=["company_name"], + template="Write a catchphrase for the following company: {company_name}", + ) +chain_two = LLMChain(llm=llm, prompt=second_prompt, output_key='catchphrase') + +#|%%--%%| +r"""°°° +We now combine the two chains to create the company name and catchphrase. +°°°""" +#|%%--%%| + +from langchain.chains import SimpleSequentialChain, SequentialChain + +#|%%--%%| + +full_chain = SimpleSequentialChain( + chains=[chain, chain_two], verbose=True, + ) + +print(full_chain.run("hand crafted handbags")) + +#|%%--%%| +r"""°°° +--- + +In the third prompt we create a small advertisement with the title and the product description. +°°°""" +#|%%--%%| + +ad_template = """Create a small advertisement destined for reddit. +The advertisement is for a company with the following details: + +name: {company_name} +product: {product} +catchphrase: {catchphrase} + +advertisement: +""" +ad_prompt = PromptTemplate( + input_variables=["product", "company_name", "catchphrase"], + template=ad_template, + ) + +#|%%--%%| + +# Connect the three chains together + +ad_chain = LLMChain(llm=llm, prompt=ad_prompt, output_key='advertisement') + +#|%%--%%| + +final_chain = SequentialChain( + chains=[chain, chain_two, ad_chain], + input_variables=['product'], + output_variables=['advertisement'], + verbose=True + ) + +ad = final_chain.run('Professional Cat Cuddler') +#|%%--%%| <4PYfwOxTlq|2akm8eB1EV> + +print(ad) + +#|%%--%%| <2akm8eB1EV|1iT7gBMABZ> +r"""°°° +## Creating a custom chain + +Example: create a custom chain that concatenates the outputs of two LLMChains. + +Steps: + 1. Subclass the Chain class + 2. Fill out `input_keys` and `output_keys` + 3. 
add the `_call` method that defines how the chain is executed +°°°""" +#|%%--%%| <1iT7gBMABZ|OUXv7kGtDH> + +from langchain.chains import LLMChain +from langchain.chains.base import Chain + +from typing import Dict, List + +class ConcatenateChain(Chain): + chain_1: LLMChain + chain_2: LLMChain + + @property + def input_keys(self) -> List[str]: + # Union of the input keys of the two chains + all_inputs_vars = set(self.chain_1.input_keys).union( + set(self.chain_2.input_keys)) + return list(all_inputs_vars) + + @property + def output_keys(self) -> List[str]: + return ['concat_output'] + + def _call(self, inputs: Dict[str, str]) -> Dict[str,str]: + output_1 = self.chain_1.run(inputs) + output_2 = self.chain_2.run(inputs) + return {'concat_output': output_1 + output_2} + +#|%%--%%| +r"""°°° +Running the custom chain +°°°""" +#|%%--%%| +prompt_1 = PromptTemplate( + input_variables=['product'], + template='what is a good name for a company that makes {product}?' + ) +chain_1 = LLMChain(llm=llm, prompt=prompt_1) + +prompt_2 = PromptTemplate( + input_variables=['product'], + template='what is a good slogan for a company that makes {product}?' + ) +chain_2 = LLMChain(llm=llm, prompt=prompt_2) + +concat_chain = ConcatenateChain(chain_1=chain_1, chain_2=chain_2) + +concat_output = concat_chain.run('leather handbags') +print(f'Concatenated output:\n{concat_output}') + + +#|%%--%%| + + + + diff --git a/document-loaders.ipynb b/document-loaders.ipynb new file mode 100644 index 0000000..6ea940d --- /dev/null +++ b/document-loaders.ipynb @@ -0,0 +1,157 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "4yTe29l2Ya" + }, + "source": [ + "# Document Loaders\n", + "\n", + "- loading text from local sources\n", + "- main driver is `Unstructured` python package\n", + "\n", + "## Key Concepts\n", + "\n", + "### Document\n", + "\n", + "container class for document information. contains:\n", + " - page_content\n", + " - metadata\n", + "\n", + "### Loader\n", + "\n", + "base class to load documents. exposes:\n", + " - load() -> Document\n", + "\n", + "\n", + "## Setup Unstructured\n", + "- host dependencies\n", + " - poppler: PDF rendering library\n", + "- Python deps:\n", + " - Pillow: imaging library" + ] + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "srwyN0cVES" + }, + "source": [ + "# %pip install pillow (already installed)\n", + "%pip install -q unstructured[local-inference]" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Note: you may need to restart the kernel to use updated packages.\n" + } + ], + "execution_count": 1 + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "cbFv0eSeXq" + }, + "source": [ + "docs_dir=\"unstructured-examples\"\n", + "!mkdir -p $docs_dir\n", + "!wget https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/example-docs/example-10k.html -P $docs_dir\n", + "!wget https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/example-docs/layout-parser-paper.pdf -P $docs_dir" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "U633RkWjYq" + }, + "source": [ + "[repo link](https://github.com/Unstructured-IO/unstructured#coffee-getting-started)\n", + "The easiest way to parse a document in unstructured is to use the partition brick. If you use the partition brick, unstructured will detect the file type and route it to the appropriate file-specific partitioning brick. 
If you are using the partition brick, ensure you first install libmagic using the instructions outlined here. `partition` will always apply the default arguments. If you need advanced features, use a document-specific brick. The partition brick currently works for .txt, .docx, .pptx, .jpg, .png, .eml, .html, and .pdf documents.\n", + "\n", + "Requires detectron2 inference (CUDA?)" + ] + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "FJaYuFeL0U" + }, + "source": [ + "docs_dir=\"unstructured-examples\"" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "9MKaXz7Bi4" + }, + "source": [ + "#NOTE: needs inference with facebook's detectron2\n", + "\n", + "# from unstructured.partition.auto import partition\n", + "\n", + "# elements = partition(docs_dir + \"/layout-parser-paper.pdf\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "X4mTVZAzcD" + }, + "source": [ + "## Unstructured Langchain FileLoader\n", + "\n", + "Requires detectron2" + ] + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "9k0eAtsfvh" + }, + "source": [ + "from langchain.document_loaders import UnstructuredFileLoader\n", + "\n", + "loader = UnstructuredFileLoader(\"./unstructured-examples/layout-parser-paper.pdf\")\n", + "\n", + "docs = loader.load()" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)\nCell \u001b[0;32mIn[10], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdocument_loaders\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m UnstructuredFileLoader\n\u001b[1;32m 3\u001b[0m loader \u001b[38;5;241m=\u001b[39m UnstructuredFileLoader(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m./unstructured-examples/layout-parser-paper.pdf\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 5\u001b[0m docs \u001b[38;5;241m=\u001b[39m \u001b[43mloader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\nFile \u001b[0;32m/data/source/langchain/langchain/document_loaders/unstructured.py:26\u001b[0m, in \u001b[0;36mUnstructuredFileLoader.load\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Load file.\"\"\"\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01munstructured\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpartition\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mauto\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m partition\n\u001b[0;32m---> 26\u001b[0m elements \u001b[38;5;241m=\u001b[39m \u001b[43mpartition\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilename\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfile_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 27\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin([\u001b[38;5;28mstr\u001b[39m(el) \u001b[38;5;28;01mfor\u001b[39;00m el \u001b[38;5;129;01min\u001b[39;00m elements])\n\u001b[1;32m 
28\u001b[0m metadata \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msource\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_path}\n\nFile \u001b[0;32m~/.pyenv/versions/3.9.11/envs/langchain/lib/python3.9/site-packages/unstructured/partition/auto.py:44\u001b[0m, in \u001b[0;36mpartition\u001b[0;34m(filename, file, include_page_breaks)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m partition_html(filename\u001b[38;5;241m=\u001b[39mfilename, file\u001b[38;5;241m=\u001b[39mfile, include_page_breaks\u001b[38;5;241m=\u001b[39minclude_page_breaks)\n\u001b[1;32m 43\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m filetype \u001b[38;5;241m==\u001b[39m FileType\u001b[38;5;241m.\u001b[39mPDF:\n\u001b[0;32m---> 44\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mpartition_pdf\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 45\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore\u001b[39;49;00m\n\u001b[1;32m 46\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore\u001b[39;49;00m\n\u001b[1;32m 47\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 48\u001b[0m \u001b[43m \u001b[49m\u001b[43minclude_page_breaks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minclude_page_breaks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 49\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m (filetype \u001b[38;5;241m==\u001b[39m FileType\u001b[38;5;241m.\u001b[39mPNG) \u001b[38;5;129;01mor\u001b[39;00m (filetype \u001b[38;5;241m==\u001b[39m FileType\u001b[38;5;241m.\u001b[39mJPG):\n\u001b[1;32m 51\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m partition_image(\n\u001b[1;32m 52\u001b[0m filename\u001b[38;5;241m=\u001b[39mfilename, \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 53\u001b[0m file\u001b[38;5;241m=\u001b[39mfile, \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 54\u001b[0m url\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 55\u001b[0m include_page_breaks\u001b[38;5;241m=\u001b[39minclude_page_breaks,\n\u001b[1;32m 56\u001b[0m )\n\nFile \u001b[0;32m~/.pyenv/versions/3.9.11/envs/langchain/lib/python3.9/site-packages/unstructured/partition/pdf.py:35\u001b[0m, in \u001b[0;36mpartition_pdf\u001b[0;34m(filename, file, url, template, token, include_page_breaks)\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m template \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 34\u001b[0m template \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlayout/pdf\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 35\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mpartition_pdf_or_image\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 36\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 37\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 
38\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 39\u001b[0m \u001b[43m \u001b[49m\u001b[43mtemplate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtemplate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 40\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 41\u001b[0m \u001b[43m \u001b[49m\u001b[43minclude_page_breaks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minclude_page_breaks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 42\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\nFile \u001b[0;32m~/.pyenv/versions/3.9.11/envs/langchain/lib/python3.9/site-packages/unstructured/partition/pdf.py:69\u001b[0m, in \u001b[0;36mpartition_pdf_or_image\u001b[0;34m(filename, file, url, template, token, is_image, include_page_breaks)\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m warnings\u001b[38;5;241m.\u001b[39mcatch_warnings():\n\u001b[1;32m 68\u001b[0m warnings\u001b[38;5;241m.\u001b[39msimplefilter(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 69\u001b[0m layout_elements \u001b[38;5;241m=\u001b[39m \u001b[43m_partition_pdf_or_image_local\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 70\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 71\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 72\u001b[0m \u001b[43m \u001b[49m\u001b[43mtemplate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mout_template\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 73\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_image\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_image\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 74\u001b[0m \u001b[43m \u001b[49m\u001b[43minclude_page_breaks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minclude_page_breaks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 75\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 76\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 77\u001b[0m \u001b[38;5;66;03m# NOTE(alan): Remove these lines after different models are handled by routing\u001b[39;00m\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m template \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcheckbox\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\nFile \u001b[0;32m~/.pyenv/versions/3.9.11/envs/langchain/lib/python3.9/site-packages/unstructured/partition/pdf.py:133\u001b[0m, in \u001b[0;36m_partition_pdf_or_image_local\u001b[0;34m(filename, file, template, is_image, include_page_breaks)\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 125\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m(\n\u001b[1;32m 126\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThere was a problem importing unstructured_inference module - it may not be installed \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 127\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcorrectly... 
try running pip install unstructured[local-inference] if you installed \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 128\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthe unstructured library as a package. If you cloned the unstructured repository, try \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 129\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrunning make install-local-inference from the root directory of the repository.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 130\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[1;32m 132\u001b[0m layout \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m--> 133\u001b[0m \u001b[43mprocess_file_with_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtemplate\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mis_image\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_image\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 134\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 135\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m process_data_with_model(file, template, is_image\u001b[38;5;241m=\u001b[39mis_image)\n\u001b[1;32m 136\u001b[0m )\n\u001b[1;32m 138\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m document_to_element_list(layout, include_page_breaks\u001b[38;5;241m=\u001b[39minclude_page_breaks)\n\nFile \u001b[0;32m~/.pyenv/versions/3.9.11/envs/langchain/lib/python3.9/site-packages/unstructured_inference/inference/layout.py:266\u001b[0m, in \u001b[0;36mprocess_file_with_model\u001b[0;34m(filename, model_name, is_image)\u001b[0m\n\u001b[1;32m 261\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mprocess_file_with_model\u001b[39m(\n\u001b[1;32m 262\u001b[0m filename: \u001b[38;5;28mstr\u001b[39m, model_name: Optional[\u001b[38;5;28mstr\u001b[39m], is_image: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 263\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DocumentLayout:\n\u001b[1;32m 264\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Processes pdf file with name filename into a DocumentLayout by using a model identified by\u001b[39;00m\n\u001b[1;32m 265\u001b[0m \u001b[38;5;124;03m model_name.\"\"\"\u001b[39;00m\n\u001b[0;32m--> 266\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mget_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 267\u001b[0m layout \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 268\u001b[0m DocumentLayout\u001b[38;5;241m.\u001b[39mfrom_image_file(filename, model\u001b[38;5;241m=\u001b[39mmodel)\n\u001b[1;32m 269\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_image\n\u001b[1;32m 270\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m DocumentLayout\u001b[38;5;241m.\u001b[39mfrom_file(filename, model\u001b[38;5;241m=\u001b[39mmodel)\n\u001b[1;32m 271\u001b[0m )\n\u001b[1;32m 272\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m layout\n\nFile \u001b[0;32m~/.pyenv/versions/3.9.11/envs/langchain/lib/python3.9/site-packages/unstructured_inference/models/base.py:14\u001b[0m, in \u001b[0;36mget_model\u001b[0;34m(model_name)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m model_name \u001b[38;5;129;01min\u001b[39;00m DETECTRON2_MODEL_TYPES:\n\u001b[1;32m 13\u001b[0m model \u001b[38;5;241m=\u001b[39m UnstructuredDetectronModel()\n\u001b[0;32m---> 
14\u001b[0m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minitialize\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mDETECTRON2_MODEL_TYPES\u001b[49m\u001b[43m[\u001b[49m\u001b[43mmodel_name\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m UnknownModelException(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnknown model type: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmodel_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\nFile \u001b[0;32m~/.pyenv/versions/3.9.11/envs/langchain/lib/python3.9/site-packages/unstructured_inference/models/detectron2.py:76\u001b[0m, in \u001b[0;36mUnstructuredDetectronModel.initialize\u001b[0;34m(self, config_path, model_path, label_map, extra_config, device)\u001b[0m\n\u001b[1;32m 73\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Loads the detectron2 model using the specified parameters\"\"\"\u001b[39;00m\n\u001b[1;32m 75\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_detectron2_available():\n\u001b[0;32m---> 76\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m(\n\u001b[1;32m 77\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFailed to load the Detectron2 model. Ensure that the Detectron2 \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 78\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodule is correctly installed.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 79\u001b[0m )\n\u001b[1;32m 81\u001b[0m config_path_str \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(config_path)\n\u001b[1;32m 82\u001b[0m model_path_str: Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01mif\u001b[39;00m model_path \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mstr\u001b[39m(model_path)\n\n\u001b[0;31mImportError\u001b[0m: Failed to load the Detectron2 model. Ensure that the Detectron2 module is correctly installed.\n" + } + ], + "execution_count": 2 + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "1lKP9jNDd4" + }, + "source": [], + "outputs": [], + "execution_count": null + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "python", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/document-loaders.py b/document-loaders.py new file mode 100644 index 0000000..91f7360 --- /dev/null +++ b/document-loaders.py @@ -0,0 +1,74 @@ +r"""°°° +# Document Loaders + +- loading text from local sources +- main driver is `Unstructured` python package + +## Key Concepts + +### Document + +container class for document information. contains: + - page_content + - metadata + +### Loader + +base class to load documents. 
exposes: + - load() -> Document + + +## Setup Unstructured +- host dependencies + - poppler: PDF rendering library +- Python deps: + - Pillow: imaging library +°°°""" +#|%%--%%| <4yTe29l2Ya|srwyN0cVES> + +# %pip install pillow (already installed) +%pip install -q unstructured[local-inference] + +#|%%--%%| + +docs_dir="unstructured-examples" +!mkdir -p $docs_dir +!wget https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/example-docs/example-10k.html -P $docs_dir +!wget https://raw.githubusercontent.com/Unstructured-IO/unstructured/main/example-docs/layout-parser-paper.pdf -P $docs_dir + + +#|%%--%%| +r"""°°° +[repo link](https://github.com/Unstructured-IO/unstructured#coffee-getting-started) +The easiest way to parse a document in unstructured is to use the partition brick. If you use the partition brick, unstructured will detect the file type and route it to the appropriate file-specific partitioning brick. If you are using the partition brick, ensure you first install libmagic using the instructions outlined here. `partition` will always apply the default arguments. If you need advanced features, use a document-specific brick. The partition brick currently works for .txt, .docx, .pptx, .jpg, .png, .eml, .html, and .pdf documents. + +Requires detectron2 inference (CUDA?) +°°°""" +#|%%--%%| + +docs_dir="unstructured-examples" +#|%%--%%| + +#NOTE: needs inference with facebook's detectron2 + +# from unstructured.partition.auto import partition + +# elements = partition(docs_dir + "/layout-parser-paper.pdf") + + +#|%%--%%| <9MKaXz7Bi4|X4mTVZAzcD> +r"""°°° +## Unstructured Langchain FileLoader + +Requires detectron2 + +°°°""" +#|%%--%%| + +from langchain.document_loaders import UnstructuredFileLoader + +loader = UnstructuredFileLoader("./unstructured-examples/layout-parser-paper.pdf") + +docs = loader.load() +#|%%--%%| <9k0eAtsfvh|1lKP9jNDd4> + diff --git a/llm.json b/llm.json new file mode 100644 index 0000000..3ad673f --- /dev/null +++ b/llm.json @@ -0,0 +1,13 @@ +{ + "model_name": "text-ada-001", + "temperature": 1.0, + "max_tokens": 256, + "top_p": 1.0, + "frequency_penalty": 0.0, + "presence_penalty": 0.0, + "n": 2, + "best_of": 2, + "request_timeout": null, + "logit_bias": {}, + "_type": "openai" +} \ No newline at end of file diff --git a/llms.ipynb b/llms.ipynb new file mode 100644 index 0000000..0f2cdb2 --- /dev/null +++ b/llms.ipynb @@ -0,0 +1,400 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "gOPXQazOh0" + }, + "source": [ + "# Getting Started" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "jukit_cell_id": "yMwLn4kyVM" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "'\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side!'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain.llms import OpenAI\n", + "\n", + "# n: how many completions to generate for each prompt\n", + "llm = OpenAI(model_name='text-ada-001', n=2, best_of=2, temperature=1)\n", + "llm(\"tell me a joke!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "5F2CisYISi" + }, + "source": [ + "You can call it with a list of inputs, getting back a more complete response than just the text. This complete response includes things like multiple top responses, as well as LLM provider specific information."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "jukit_cell_id": "4hSGpG9eHG" + }, + "outputs": [], + "source": [ + "llm_result = llm.generate([\"Tell me a joke\", \"Tell me a poem\"] * 15)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "jukit_cell_id": "kZMW1cS1Qk" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Generation(text='\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side!', generation_info={'finish_reason': 'stop', 'logprobs': None}), Generation(text='\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side.', generation_info={'finish_reason': 'stop', 'logprobs': None})]\n", + "\n", + "\n", + "\n", + "[Generation(text=\"\\n\\nThere's no need for words\\n\\nWhen we're by your side\\n\\nWe'll know all the way\\n\\nTo the way your driving us\\n\\nOur love will be real\\n\\nNot based on looks or looks only\\n\\nWe'll know what love is\\n\\nOnce we'll be together\\n\\nAnd nothing will help us die\\n\\nOur love is real\\n\\nAnd you don't have to be a part\\n\\nOf our love to be with us\\n\\nAnd it's not about the stars\\n\\nIt's about the way we're driving\\n\\nAnd the way our loves will fly\\n\\n survivin' in each other's sight\\n\\n- birds in the distance-\\n\\nAnd we'll be together\\n\\nWhat a wonderful surprise\\n\\nWe'll be together\\n\\nAnd the wonderful plan!\\n\\nWhy did the ambitious plan\\n\\nFail me every time\\n\\nAnd it's because they were only ambitious\\n\\nTo be in a relationship\\n\\nNow our love is real\\n\\nAnd we're not working for just anyone\\n\\nWe're working for us and our future\\n\\n togethe\", generation_info={'finish_reason': 'stop', 'logprobs': None}), Generation(text=\"\\n\\nWhen I was younger\\nI thought that love\\nI was something like a fairytale\\nI would find my prince\\nAnd we would be together\\nForever\\nI was naïve\\nAnd I was wrong\\nLove is not a fairytale\\nIt's something else entirely\\nSomething that should be cherished\\nAnd loved\\nAnd never taken for granted\\nLove is something that you cannot do without\\nIt's something that you must protect and serve\\nAnd it is this that makes love so wonderful\\n\\nThere is nothing like love\\nIt should beorgans and revolutions\\nIt is the light that leads you through the darkest parts of your heart\\nIt should be constantly Pulled me out of Bed Night\\nAnd keep me awake into the morning\\nIt should be something that is cherished\\nAnd loved\\nAnd never taken for granted\\nLove is something that you cannot do without\\nIt's something that you must protect and serve\", generation_info={'finish_reason': 'stop', 'logprobs': None})]\n" + ] + } + ], + "source": [ + "len(llm_result.generations)\n", + "\n", + "#llm_result.to_dict() # see result details\n", + "\n", + "print(llm_result.generations[0])\n", + "print(\"\\n\\n\")\n", + "print(llm_result.generations[-1])\n", + "llm_result.llm_output" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "1Kg9Ct1muS" + }, + "source": [ + "## estimate number of tokens in prompt" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "jukit_cell_id": "qz7lnBufdW" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "llm.get_num_tokens(\"what is a joke\")" + ] + }, + { + "cell_type": "markdown", + 
"metadata": { + "jukit_cell_id": "Q9fgYuBKEK" + }, + "source": [ + "# Key Concepts\n", + "\n", + "- Core method exposed by llms is `generate`: takes list of str returns LLMResult\n", + "- Can also be called directly with single string as input and returns a stirng\n", + "- Main result is `LLMResult`, input list of strings -> list of LLMResult\n", + " Each result is a list of generations (since you can request `n` generations per input str)\n", + "- `llm_output` contains provider specific ouptput" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "3k0hrhqP7F" + }, + "source": [ + "## LLM Serialization\n", + "\n", + "Wrinting and reading llms to disk" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "jukit_cell_id": "9V7QPvLLcT" + }, + "outputs": [], + "source": [ + "from langchain.llms.loading import load_llm\n", + "\n", + "llm.save(\"llm.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "jukit_cell_id": "rcdMSwUd3W" + }, + "outputs": [], + "source": [ + "llm = load_llm(\"llm.json\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "JfkBS05EUP" + }, + "source": [ + "## Token Usage Tracking" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "jukit_cell_id": "FUnKToXsu6" + }, + "outputs": [], + "source": [ + "from langchain.callbacks import get_openai_callback" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "jukit_cell_id": "9AoUdMfzg7" + }, + "outputs": [], + "source": [ + "llm = OpenAI(model_name=\"text-davinci-002\", n=2, best_of=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "jukit_cell_id": "aiTKa2iDUx", + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "42\n" + ] + } + ], + "source": [ + "with get_openai_callback() as cb:\n", + " result = llm(\"tell me a joke\")\n", + " print(cb.total_tokens)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "6G5rwEeItx" + }, + "source": [ + "Anything inside the context manager will get tracked.\n", + "\n", + "Example tracking multiple calls" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "jukit_cell_id": "uDJtapBCby" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "80\n" + ] + } + ], + "source": [ + "with get_openai_callback() as cb:\n", + " result = llm(\"Tell me a joke\")\n", + " result2 = llm(\"Tell me a funny joke\")\n", + " print(cb.total_tokens)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "sLuvD8dXnk" + }, + "source": [ + "If a chain or agent with multiple steps in it is used, it will track all those steps." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "jukit_cell_id": "uUQOyO8XIw" + }, + "outputs": [], + "source": [ + "from langchain.agents import load_tools\n", + "from langchain.agents import initialize_agent\n", + "from langchain.llms import OpenAI\n", + "\n", + "llm = OpenAI(temperature=0)\n", + "tools = load_tools([\"searx-search\", \"llm-math\"], llm=llm)\n", + "agent = initialize_agent(tools, llm, agent=\"zero-shot-react-description\", verbose=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "jukit_cell_id": "nv0tixnGfg", + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", + "\u001b[32;1m\u001b[1;3m I need to find the temperature in each city\n", + "Action: Search\n", + "Action Input: \"temperature in Paris Berlin and Granada\"\u001b[0m\n", + "Observation: \u001b[36;1m\u001b[1;3mHumidity: 40%; Visibility: --Visibility: Not available; Pressure: 1029mb, Falling. Observation Station: Granada/Armilla (Lat: 37.1333 | Long: -3.6333) ...\n", + "\n", + "Paris 14 Day Extended Forecast Time/General Weather Time Zone DST Changes Sun & Moon Weather Today Weather Hourly 14 Day Forecast Yesterday/Past Weather Climate (Averages) Currently: 41 °F. Passing clouds. (Weather station: Villacoublay, France). See more current weather Paris Extended Forecast with high and low temperatures Feb 6 – Feb 12 Lo:37\n", + "\n", + "Humidity: 40%; Visibility: --Visibility: Not available; Pressure: 1029mb, Falling. Observation Station: Granada/Armilla (Lat: 37.1333 | Long: -3.6333) ...\n", + "\n", + "10 Day Weather - Paris, France As of 9:14 am CET Today 50°/ 40° 10% Sat 11 | Day 50° 10% SE 3 mph Mostly cloudy skies this morning will become partly cloudy this afternoon. High near 50F. Winds...\n", + "\n", + "Humidity: 87%; Visibility: --Visibility: Not available; Pressure: 1025mb. Observation Station: Granada/Armilla (Lat: 37.1333 | Long: -3.6333) ...\n", + "\n", + "Paris' low season sees temperatures in the 40 and 30-degree range. It's only during November that travelers may get a glimpse of 50-degree days. Heavy – yet chic – coats are necessary this ...\n", + "\n", + "What is the weather like in Granada compared to Paris? The average temperature in Paris the average temperature is 33.34 degrees in summer, with precipitation ...\n", + "\n", + "June Weather in Paris Range of daily high temperatures during June: 62°F to 101°F (17°C to 38°C) Range of daily low temperatures during June: 38°F to 68° F (3°C to 20°C) Record June high temperature (1991-present): 101°F (38°C) in 2022 Record June low temperature (1991-present): 38°F (3°C) in 2006 Average June high temperature: 75°F (24°C)\n", + "\n", + "What is the weather like in Paris compared to Granada?\n", + "\n", + "Typically, the best time to visit Paris is from May through mid-September. Mid-May through June is especially pleasant, with long days and milder temperatures. Spring, while beautiful, can be quite chilly. Read on for more information about Paris's climate as well as a season-by-season breakdown of the weather. 
1.\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3m I need to compare the temperatures in Paris and Granada\n", + "Action: Calculator\n", + "Action Input: 33.34 (Paris average temperature) - 37.13 (Granada average temperature)\u001b[0m\n", + "Observation: \u001b[33;1m\u001b[1;3mAnswer: -3.79\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3m I now know the final answer\n", + "Final Answer: The average temperature in Granada is 3.79 degrees Celsius lower than the average temperature in Paris.\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + } + ], + "source": [ + "with get_openai_callback() as cb:\n", + " res = agent.run(\"What is the temperature in Paris, Berlin and Granada? \\\n", + " Print every city's temperature in Celsius and Fahrenheit. Think step by step\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "jukit_cell_id": "vpDGqKnagk" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2192\n" + ] + } + ], + "source": [ + " print(cb.total_tokens)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "jukit_cell_id": "xUoMvs6XZ0" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.11" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/llms.py b/llms.py new file mode 100644 index 0000000..e058f3a --- /dev/null +++ b/llms.py @@ -0,0 +1,126 @@ +r"""°°° +# Getting Started +°°°""" +# |%%--%%| + +from langchain.llms import OpenAI + +# n: how many completions to generate for each prompt +llm = OpenAI(model_name='text-ada-001', n=2, best_of=2, temperature=1) +llm("tell me a joke!") + + +#|%%--%%| +r"""°°° +You can call it with a list of inputs, getting back a more complete response than just the text. This complete response includes things like multiple top responses, as well as LLM provider specific information. 
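+ +For example, a minimal sketch of poking at the result object (field contents vary by provider and run): + +```python +result = llm.generate(["Tell me a joke", "Tell me a poem"]) +first = result.generations[0][0] # first generation for the first prompt +print(first.text) # the completion text +print(first.generation_info) # e.g. finish_reason, logprobs +print(result.llm_output) # provider-specific info, e.g. token usage +```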
+°°°""" +#|%%--%%| <5F2CisYISi|4hSGpG9eHG> + +llm_result = llm.generate(["Tell me a joke", "Tell me a poem"] * 15) + +#|%%--%%| <4hSGpG9eHG|kZMW1cS1Qk> + +len(llm_result.generations) + +#llm_result.to_dict() # see result details + +print(llm_result.generations[0]) +print("\n\n") +print(llm_result.generations[-1]) +llm_result.llm_output + + +#|%%--%%| +r"""°°° +## estimate number of tokens in prompt +°°°""" +#|%%--%%| <1Kg9Ct1muS|qz7lnBufdW> + +llm.get_num_tokens("what is a joke") + +#|%%--%%| +r"""°°° +# Key Concepts + +- Core method exposed by LLMs is `generate`: takes a list of str, returns an LLMResult +- Can also be called directly with a single string as input and returns a string +- Main result is `LLMResult`: an input list of strings -> a list of generation lists + Each result is a list of generations (since you can request `n` generations per input str) +- `llm_output` contains provider-specific output +°°°""" +#|%%--%%| +r"""°°° +## LLM Serialization + +Writing and reading LLMs to disk +°°°""" +#|%%--%%| <3k0hrhqP7F|9V7QPvLLcT> + +from langchain.llms.loading import load_llm + +llm.save("llm.json") + +#|%%--%%| <9V7QPvLLcT|rcdMSwUd3W> + +llm = load_llm("llm.json") + +#|%%--%%| +r"""°°° +## Token Usage Tracking + + +°°°""" +#|%%--%%| + +from langchain.callbacks import get_openai_callback + +#|%%--%%| + +llm = OpenAI(model_name="text-davinci-002", n=2, best_of=2) + +#|%%--%%| <9AoUdMfzg7|aiTKa2iDUx> + +with get_openai_callback() as cb: + result = llm("tell me a joke") + print(cb.total_tokens) + +#|%%--%%| +r"""°°° +Anything inside the context manager will get tracked. + +Example tracking multiple calls +°°°""" +#|%%--%%| <6G5rwEeItx|uDJtapBCby> + +with get_openai_callback() as cb: + result = llm("Tell me a joke") + result2 = llm("Tell me a funny joke") + print(cb.total_tokens) + +#|%%--%%| +r"""°°° +If a chain or agent with multiple steps in it is used, it will track all those steps. +°°°""" +#|%%--%%| + +from langchain.agents import load_tools +from langchain.agents import initialize_agent +from langchain.llms import OpenAI + +llm = OpenAI(temperature=0) +tools = load_tools(["searx-search", "llm-math"], llm=llm) +agent = initialize_agent(tools, llm, agent="zero-shot-react-description", verbose=True) + +#|%%--%%| + +with get_openai_callback() as cb: + res = agent.run("What is the temperature in Paris, Berlin and Granada? \ + Print every city's temperature in Celsius and Fahrenheit. 
Think step by step") + +#|%%--%%| + + print(cb.total_tokens) + +# |%%--%%| + + diff --git a/pdf-loader.ipynb b/pdf-loader.ipynb new file mode 100644 index 0000000..99691e7 --- /dev/null +++ b/pdf-loader.ipynb @@ -0,0 +1,104 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "ut22SE2PmJ" + }, + "source": [ + "## Loading PDF" + ] + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "EQL3ZDG6Dt" + }, + "source": [ + "from langchain.document_loaders import PagedPDFSplitter\n", + "\n", + "loader = PagedPDFSplitter(\"./documents/layout-parser-paper.pdf\")\n", + "pages = loader.load_and_split()" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "6LWg1c7vN6" + }, + "source": [ + "Documents can be retrieved with page numbers" + ] + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "0kFnbEI7yL" + }, + "source": [ + "from langchain.vectorstores import FAISS\n", + "from langchain.embeddings.openai import OpenAIEmbeddings" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "KkXwCS4JHN" + }, + "source": [ + "faiss_index = FAISS.from_documents(pages, OpenAIEmbeddings() )\n", + "\n", + "# Find docs (ie pages) most similar to query\n", + "# k: number of docs similar to query\n", + "docs = faiss_index.similarity_search(\"How will the community be engaged?\", k=2)" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "RDajVoEdqh" + }, + "source": [ + "# get page numbers + content, similar to query \n", + "for doc in docs:\n", + " print(\"\\n----\\n\")\n", + " print(\"page: \" + str(doc.metadata[\"page\"] + 1))\n", + " print(\"content:\")\n", + " print(str(doc.page_content))" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "\n----\n\npage: 10\ncontent:\n10 Z. Shen et al.\nFig. 4: Illustration of (a) the original historical Japanese document with layout\ndetection results and (b) a recreated version of the document image that achieves\nmuch better character recognition recall. The reorganization algorithm rearranges\nthe tokens based on the their detected bounding boxes given a maximum allowed\nheight.\n4LayoutParser Community Platform\nAnother focus of LayoutParser is promoting the reusability of layout detection\nmodels and full digitization pipelines. Similar to many existing deep learning\nlibraries, LayoutParser comes with a community model hub for distributing\nlayout models. End-users can upload their self-trained models to the model hub,\nand these models can be loaded into a similar interface as the currently available\nLayoutParser pre-trained models. For example, the model trained on the News\nNavigator dataset [17] has been incorporated in the model hub.\nBeyond DL models, LayoutParser also promotes the sharing of entire doc-\nument digitization pipelines. For example, sometimes the pipeline requires the\ncombination of multiple DL models to achieve better accuracy. Currently, pipelines\nare mainly described in academic papers and implementations are often not pub-\nlicly available. To this end, the LayoutParser community platform also enables\nthe sharing of layout pipelines to promote the discussion and reuse of techniques.\nFor each shared pipeline, it has a dedicated project page, with links to the source\ncode, documentation, and an outline of the approaches. 
A discussion panel is\nprovided for exchanging ideas. Combined with the core LayoutParser library,\nusers can easily build reusable components based on the shared pipelines and\napply them to solve their unique problems.\n5 Use Cases\nThe core objective of LayoutParser is to make it easier to create both large-scale\nand light-weight document digitization pipelines. Large-scale document processing\n\n----\n\npage: 4\ncontent:\n4 Z. Shen et al.\nEfficient Data AnnotationC u s t o m i z e d M o d e l T r a i n i n gModel Cust omizationDI A Model HubDI A Pipeline SharingCommunity PlatformLa y out Detection ModelsDocument Images \nT h e C o r e L a y o u t P a r s e r L i b r a r yOCR ModuleSt or age & VisualizationLa y out Data Structur e\nFig. 1: The overall architecture of LayoutParser . For an input document image,\nthe core LayoutParser library provides a set of o\u000b-the-shelf tools for layout\ndetection, OCR, visualization, and storage, backed by a carefully designed layout\ndata structure. LayoutParser also supports high level customization via e\u000ecient\nlayout annotation and model training functions. These improve model accuracy\non the target samples. The community platform enables the easy sharing of DIA\nmodels and whole digitization pipelines to promote reusability and reproducibility.\nA collection of detailed documentation, tutorials and exemplar projects make\nLayoutParser easy to learn and use.\nAllenNLP [ 8] and transformers [ 34] have provided the community with complete\nDL-based support for developing and deploying models for general computer\nvision and natural language processing problems. LayoutParser , on the other\nhand, specializes speci\fcally in DIA tasks. LayoutParser is also equipped with a\ncommunity platform inspired by established model hubs such as Torch Hub [23]\nandTensorFlow Hub [1]. It enables the sharing of pretrained models as well as\nfull document processing pipelines that are unique to DIA tasks.\nThere have been a variety of document data collections to facilitate the\ndevelopment of DL models. Some examples include PRImA [ 3](magazine layouts),\nPubLayNet [ 38](academic paper layouts), Table Bank [ 18](tables in academic\npapers), Newspaper Navigator Dataset [ 16,17](newspaper \fgure layouts) and\nHJDataset [31](historical Japanese document layouts). A spectrum of models\ntrained on these datasets are currently available in the LayoutParser model zoo\nto support di\u000berent use cases.\n3 The Core LayoutParser Library\nAt the core of LayoutParser is an o\u000b-the-shelf toolkit that streamlines DL-\nbased document image analysis. Five components support a simple interface\nwith comprehensive functionalities: 1) The layout detection models enable using\npre-trained or self-trained DL models for layout detection with just four lines\nof code. 
2) The detected layout information is stored in carefully engineered\n" + } + ], + "execution_count": 1 + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "cqoPocvVBS" + }, + "source": [], + "outputs": [], + "execution_count": null + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "python", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/pdf-loader.py b/pdf-loader.py new file mode 100644 index 0000000..47fa6f0 --- /dev/null +++ b/pdf-loader.py @@ -0,0 +1,37 @@ +r"""°°° +## Loading PDF +°°°""" +#|%%--%%| + +from langchain.document_loaders import PagedPDFSplitter + +loader = PagedPDFSplitter("./documents/layout-parser-paper.pdf") +pages = loader.load_and_split() + +#|%%--%%| +r"""°°° +Documents can be retrieved with their page numbers +°°°""" +#|%%--%%| <6LWg1c7vN6|0kFnbEI7yL> + +from langchain.vectorstores import FAISS +from langchain.embeddings.openai import OpenAIEmbeddings + +#|%%--%%| <0kFnbEI7yL|KkXwCS4JHN> + +faiss_index = FAISS.from_documents(pages, OpenAIEmbeddings()) + +# Find the docs (i.e. pages) most similar to the query +# k: number of similar docs to return
docs = faiss_index.similarity_search("How will the community be engaged?", k=2) + +#|%%--%%| +# print page number + content for each doc similar to the query +for doc in docs: + print("\n----\n") + print("page: " + str(doc.metadata["page"] + 1)) + print("content:") + print(str(doc.page_content)) + +#|%%--%%| + diff --git a/prompt_templates.ipynb b/prompt_templates.ipynb new file mode 100644 index 0000000..1391de0 --- /dev/null +++ b/prompt_templates.ipynb @@ -0,0 +1,347 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "IagIXdFJ76" + }, + "source": [ + "# Prompt templates\n", + "[see](https://langchain.readthedocs.io/en/latest/modules/prompts/getting_started.html)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "jukit_cell_id": "5rNpKSE97a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('\\n'\n", + " 'I want you to act as a naming consultant for new companies.\\n'\n", + " '\\n'\n", + " 'Here are some examples of good company names:\\n'\n", + " '\\n'\n", + " '- search engine, Google\\n'\n", + " '- social media, Facebook\\n'\n", + " '- video sharing, YouTube\\n'\n", + " '\\n'\n", + " 'The name should be short, catchy and easy to remember.\\n'\n", + " '\\n'\n", + " 'What is a good name for a company that makes cookies?\\n')\n" + ] + } + ], + "source": [ + "from langchain import PromptTemplate\n", + "import pprint as pp\n", + "\n", + "template = \"\"\"\n", + "I want you to act as a naming consultant for new companies.\n", + "\n", + "Here are some examples of good company names:\n", + "\n", + "- search engine, Google\n", + "- social media, Facebook\n", + "- video sharing, YouTube\n", + "\n", + "The name should be short, catchy and easy to remember.\n", + "\n", + "What is a good name for a company that makes {product}?\n", + "\"\"\"\n", + "\n", + "prompt = PromptTemplate(\n", + " input_variables=[\"product\"],\n", + " template=template,\n", + " )\n", + "\n", + "pp.pp(prompt.format(product='cookies'))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "jukit_cell_id": "UH7UDNwwOT" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "'tell me a funny joke about bats.'" + ] + }, + "execution_count": 2, + "metadata": 
{}, + "output_type": "execute_result" + } + ], + "source": [ + "# without inputs\n", + "no_input_prompt = PromptTemplate(input_variables=[],\n", + " template=\"tell me a joke.\")\n", + "no_input_prompt.format()\n", + "\n", + "# with inputs\n", + "multi_input_prompt = PromptTemplate(\n", + " input_variables=[\"adjective\", \"content\"],\n", + " template=\"tell me a {adjective} joke about {content}.\" \n", + " )\n", + "multi_input_prompt.format(adjective=\"funny\", content=\"bats\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "m4HfAhbN4T" + }, + "source": [ + "## Loading prompt templates from LangChainHub " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "jukit_cell_id": "6sAHvM0Vrt" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.\n", + "\n", + "Current conversation:\n", + "\n", + "Human: what is 1 + 1?\n", + "AI:\n" + ] + } + ], + "source": [ + "from langchain.prompts import load_prompt\n", + "\n", + "prompt=load_prompt(\"lc://prompts/conversation/prompt.json\")\n", + "#NOTE: is there a helper to quickly build a history ? \n", + "print(prompt.format(history=\"\", input=\"what is 1 + 1?\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "u2xOTHeA5E" + }, + "source": [ + "## Pass few shot examples to prompt template" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "jukit_cell_id": "KO5IXCuzjw" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Give the antonym of every input\n", + "\n", + "\n", + "Word: happy\n", + "Antonym: sad\n", + "\n", + "\n", + "\n", + "Word: tall\n", + "Antonym: short\n", + "\n", + "\n", + "Word: fast\n", + "Antonym:\n" + ] + } + ], + "source": [ + "from langchain import FewShotPromptTemplate\n", + "\n", + "# create a list of few shot examples\n", + "examples = [\n", + " {\"word\": \"happy\", \"antonym\": \"sad\"},\n", + " {\"word\": \"tall\", \"antonym\": \"short\"},\n", + " ]\n", + "\n", + "# next e specify a template for format the examples\n", + "# we use PromptTemplate class\n", + "example_formatter_template = \"\"\"\n", + "Word: {word}\n", + "Antonym: {antonym}\n", + "\"\"\"\n", + "example_pr = PromptTemplate(\n", + " input_variables=[\"word\", \"antonym\"],\n", + " template=example_formatter_template,\n", + " )\n", + "\n", + "# now we can use FewShotPromptTemplate\n", + "few_shot_prompt = FewShotPromptTemplate(\n", + " # examples we want to insert in prompt\n", + " examples=examples,\n", + " # how we want examples to be formatted in prompt\n", + " example_prompt=example_pr,\n", + " # The prefix is some text that goes before the examples in the prompt.\n", + " # Usually, this consists of intructions.\n", + " prefix=\"Give the antonym of every input\",\n", + " #The suffix is some text that goes after the examples in the prompt.\n", + " suffix=\"Word: {input}\\nAntonym:\",\n", + " # The input variables are the variables that the overall prompt expects.\n", + " input_variables=[\"input\"],\n", + " # The example_separator is the string we will use to join the prefix, examples, and suffix together with.\n", + " example_separator=\"\\n\\n\",\n", + " )\n", + "# generate few shot prompt using input\n", + "print(few_shot_prompt.format(input=\"fast\"))" + 
] + }, + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "fV1qxeDncc" + }, + "source": [ + "## Select examples from prompt template\n", + "\n", + "- for a large number of examples, use an ExampleSelector to select a subset of\n", + " the most informative ones for the language model.\n", + "- LengthBasedExampleSelector selects examples based on the length of the input:\n", + " practical for constructing prompts that do not overflow the context window\n", + " when inputs are long" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "jukit_cell_id": "uuIdYaJ4wD" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Give the antonym of every input\n", + "\n", + "\n", + "Word: happy\n", + "Antonym: sad\n", + "\n", + "\n", + "\n", + "Word: tall\n", + "Antonym: short\n", + "\n", + "\n", + "\n", + "Word: energetic\n", + "Antonym: lethargic\n", + "\n", + "\n", + "\n", + "Word: sunny\n", + "Antonym: gloomy\n", + "\n", + "\n", + "Word: big\n", + "Antonym:\n", + "----------\n", + "Give the antonym of every input\n", + "\n", + "\n", + "Word: happy\n", + "Antonym: sad\n", + "\n", + "\n", + "Word: big and huge and massive and large and gigantic and tall and much much much much much bigger than everything else\n", + "Antonym:\n" + ] + } + ], + "source": [ + "from langchain.prompts.example_selector import LengthBasedExampleSelector\n", + "\n", + "# These are a lot of examples for a pretend task of creating antonyms.\n", + "examples = [\n", + " {\"word\": \"happy\", \"antonym\": \"sad\"},\n", + " {\"word\": \"tall\", \"antonym\": \"short\"},\n", + " {\"word\": \"energetic\", \"antonym\": \"lethargic\"},\n", + " {\"word\": \"sunny\", \"antonym\": \"gloomy\"},\n", + " {\"word\": \"windy\", \"antonym\": \"calm\"},\n", + "]\n", + "\n", + "example_selector = LengthBasedExampleSelector(\n", + " examples=examples,\n", + " # This is the PromptTemplate being used to format the examples.\n", + " example_prompt=example_pr,\n", + " # This is the maximum length that the formatted examples should be.\n", + " # Length is measured by the selector's get_text_length function (roughly a word count by default).\n", + " max_length=30,\n", + " )\n", + "\n", + "# We can now use the `example_selector` to create a `FewShotPromptTemplate`.\n", + "dynamic_prompt = FewShotPromptTemplate(\n", + " # We provide an ExampleSelector instead of examples.\n", + " example_selector=example_selector,\n", + " example_prompt=example_pr,\n", + " prefix=\"Give the antonym of every input\",\n", + " suffix=\"Word: {input}\\nAntonym:\",\n", + " input_variables=[\"input\"],\n", + " example_separator=\"\\n\\n\",\n", + ")\n", + "\n", + "# We can now generate a prompt using the `format` method.\n", + "print(dynamic_prompt.format(input=\"big\"))\n", + "\n", + "print(\"----------\")\n", + "\n", + "# In contrast, if we provide a very long input, the LengthBasedExampleSelector\n", + "# will select fewer examples to include in the prompt.\n", + "long_string = \"big and huge and massive and large and gigantic and tall and much much much much much bigger than everything else\"\n", + "print(dynamic_prompt.format(input=long_string))" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.11" + } + }, + 
"nbformat": 4, + "nbformat_minor": 4 +} diff --git a/prompt_templates.py b/prompt_templates.py new file mode 100644 index 0000000..ad1ad55 --- /dev/null +++ b/prompt_templates.py @@ -0,0 +1,152 @@ +r"""°°° +# Prompt templates +[see](https://langchain.readthedocs.io/en/latest/modules/prompts/getting_started.html) +°°°""" +# |%%--%%| + +from langchain import PromptTemplate +import pprint as pp + +template = """ +I want you to act as a naming consultant for new companies. + +Here are some examples of good company names: + +- search engine, Google +- social media, Facebook +- video sharing, YouTube + +The name should be short, catchy and easy to remember. + +What is a good name for a company that makes {product}? +""" + +prompt = PromptTemplate( + input_variables=["product"], + template=template, + ) + +pp.pp(prompt.format(product='cookie')) + +# |%%--%%| <5rNpKSE97a|UH7UDNwwOT> + +# without inputs +no_input_prompt = PromptTemplate(input_variables=[], + template="tell me a joke.") +no_input_prompt.format() + +# with inputs +multi_input_prompt = PromptTemplate( + input_variables=["adjective", "content"], + template="tell me a {adjective} joke about {content}." + ) +multi_input_prompt.format(adjective="funny", content="bats") + +# |%%--%%| +r"""°°° +## Loading prompt templates from LangChainHub +°°°""" +# |%%--%%| + +from langchain.prompts import load_prompt + +prompt=load_prompt("lc://prompts/conversation/prompt.json") +#NOTE: is there a helper to quickly build a history ? +print(prompt.format(history="", input="what is 1 + 1?")) + +# |%%--%%| <6sAHvM0Vrt|u2xOTHeA5E> +r"""°°° +## Pass few shot examples to prompt template +°°°""" +# |%%--%%| + +from langchain import FewShotPromptTemplate + +# create a list of few shot examples +examples = [ + {"word": "happy", "antonym": "sad"}, + {"word": "tall", "antonym": "short"}, + ] + +# next e specify a template for format the examples +# we use PromptTemplate class +example_formatter_template = """ +Word: {word} +Antonym: {antonym} +""" +example_pr = PromptTemplate( + input_variables=["word", "antonym"], + template=example_formatter_template, + ) + +# now we can use FewShotPromptTemplate +few_shot_prompt = FewShotPromptTemplate( + # examples we want to insert in prompt + examples=examples, + # how we want examples to be formatted in prompt + example_prompt=example_pr, + # The prefix is some text that goes before the examples in the prompt. + # Usually, this consists of intructions. + prefix="Give the antonym of every input", + #The suffix is some text that goes after the examples in the prompt. + suffix="Word: {input}\nAntonym:", + # The input variables are the variables that the overall prompt expects. + input_variables=["input"], + # The example_separator is the string we will use to join the prefix, examples, and suffix together with. + example_separator="\n\n", + ) +# generate few shot prompt using input +print(few_shot_prompt.format(input="fast")) + +# |%%--%%| +r"""°°° +## Select examples from prompt template + +- for a large number of exaamples use ExampleSelector to select a subset of + most informative ones for language model. +- LengthBasedExampleSelector selects examples based on length of input. + practical to to construct prompt that do not extend over context window + based on input length +°°°""" +# |%%--%%| + +from langchain.prompts.example_selector import LengthBasedExampleSelector + +#These are a lot of examples of a pretend task of creating antonyms. 
+examples = [ + {"word": "happy", "antonym": "sad"}, + {"word": "tall", "antonym": "short"}, + {"word": "energetic", "antonym": "lethargic"}, + {"word": "sunny", "antonym": "gloomy"}, + {"word": "windy", "antonym": "calm"}, +] + +example_selector = LengthBasedExampleSelector( + examples=examples, + # This is the PromptTemplate being used to format the examples. + example_prompt=example_pr, + # This is the maximum length that the formatted examples should be. + # Length is measured by the selector's get_text_length function (roughly a word count by default). + max_length=30, + ) + +# We can now use the `example_selector` to create a `FewShotPromptTemplate`. +dynamic_prompt = FewShotPromptTemplate( + # We provide an ExampleSelector instead of examples. + example_selector=example_selector, + example_prompt=example_pr, + prefix="Give the antonym of every input", + suffix="Word: {input}\nAntonym:", + input_variables=["input"], + example_separator="\n\n", +) + +# We can now generate a prompt using the `format` method. +print(dynamic_prompt.format(input="big")) + +print("----------") + +# In contrast, if we provide a very long input, the LengthBasedExampleSelector +# will select fewer examples to include in the prompt. +long_string = "big and huge and massive and large and gigantic and tall and much much much much much bigger than everything else" +print(dynamic_prompt.format(input=long_string)) diff --git a/test.ipynb b/test.ipynb new file mode 100644 index 0000000..a33ebaf --- /dev/null +++ b/test.ipynb @@ -0,0 +1,23 @@ +{ + "cells": [ + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "NONE" + }, + "source": [], + "outputs": [], + "execution_count": null + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "python", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/test.py b/test.py new file mode 100644 index 0000000..e69de29 diff --git a/utils.ipynb b/utils.ipynb new file mode 100644 index 0000000..da7e48a --- /dev/null +++ b/utils.ipynb @@ -0,0 +1,78 @@ +{ + "cells": [ + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "f0GEGDeLRe" + }, + "source": [], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "metadata": { + "jukit_cell_id": "bNvqfQkX7p" + }, + "source": [ + "# Utils\n", + "\n", + "Connect LLMs to tools and other sources of knowledge\n", + "\n", + "- Text Splitter\n", + "- Embeddings:\n", + " - Return embeddings (lists of floats)\n", + "- Vectorstores\n", + " - Datastores to store embeddings of documents in vector form.\n", + " - Expose methods for passing in a string and retrieving similar documents\n", + "- Python Repl\n", + "- Bash\n", + "- Requests Wrapper (requests lib)\n", + "- Search\n", + "\n", + "## Generic Utilities\n", + "\n", + "### Bash" + ] + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "ph5kvpzBNj" + }, + "source": [ + "from langchain.utilities import BashProcess\n", + "\n", + "bash = BashProcess()\n", + "print(bash.run(\"ls\"))" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "document-loaders.ipynb\ndocument-loaders.py\ndocuments\nllm.json\nllms.ipynb\nllms.py\npdf-loader.ipynb\npdf-loader.py\nquickstart.ipynb\nquickstart.py\ntest.ipynb\ntest.py\nutils.py\n\n" + } + ], + "execution_count": 1 + }, + { + "cell_type": "code", + "metadata": { + "jukit_cell_id": "4qBrDqHb3v" + }, + "source": [], + "outputs": [], + "execution_count": null + } + ], + "metadata": { + "anaconda-cloud": {}, + 
"kernelspec": { + "display_name": "python", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..0a9bd2c --- /dev/null +++ b/utils.py @@ -0,0 +1,32 @@ + +#|%%--%%| +r"""°°° +# Utils + +Connect LLMs to tools and other sources of knowledge + +- Text Splitter +- Embeddigs: + - Return embeddigns (list of floats) +- Vectorstores + - Datastores to store embeddings of documents in vector form. + - Expose methods for passing in a string and retrieve similar document +- Python Repl +- Bash +- Requests Wrapper (requests lib) +- Search + +## Generic Utilities + +### Bash +°°°""" +#|%%--%%| + +from langchain.utilities import BashProcess + +bash = BashProcess() +print(bash.run("ls")) + +#|%%--%%| + +#TODO: rest of utils