Merge branch 'master' into harrison/agent_multi_inputs

1 year ago · c7c38dd3df
parent 81383474c4 50257fce59
commit c7c38dd3df
62 changed files with 2941 additions and 485 deletions
--- a/.coveragerc
+++ b/.coveragerc
@ -0,0 +1,2 @@
+[run]
+omit = tests/*
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@ -6,30 +6,31 @@ on:
  pull_request:

 env:
-  POETRY_VERSION: "1.2.0"
+  POETRY_VERSION: "1.3.1"

 jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        python-version: 
-         - "3.8"
-         - "3.9"
-         - "3.10"
+        python-version:
+          - "3.8"
+          - "3.9"
+          - "3.10"
+          - "3.11"
    steps:
-    - uses: actions/checkout@v3
-    - name: Install poetry
-      run: |
-        pipx install poetry==$POETRY_VERSION
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v4
-      with:
-        python-version: ${{ matrix.python-version }}
-        cache: poetry
-    - name: Install dependencies
-      run: |
-        poetry install
-    - name: Analysing the code with our lint
-      run: |
-        make lint
+      - uses: actions/checkout@v3
+      - name: Install poetry
+        run: |
+          pipx install poetry==$POETRY_VERSION
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: poetry
+      - name: Install dependencies
+        run: |
+          poetry install
+      - name: Analysing the code with our lint
+        run: |
+          make lint
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -6,7 +6,7 @@ on:
  pull_request:

 env:
-  POETRY_VERSION: "1.2.0"
+  POETRY_VERSION: "1.3.1"

 jobs:
  build:
@ -14,20 +14,21 @@ jobs:
    strategy:
      matrix:
        python-version:
-         - "3.8"
-         - "3.9"
-         - "3.10"
+          - "3.8"
+          - "3.9"
+          - "3.10"
+          - "3.11"
    steps:
-    - uses: actions/checkout@v3
-    - name: Install poetry
-      run: pipx install poetry==$POETRY_VERSION
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v4
-      with:
-        python-version: ${{ matrix.python-version }}
-        cache: 'poetry'
-    - name: Install dependencies
-      run: poetry install
-    - name: Run unit tests
-      run: |
-        make tests
+      - uses: actions/checkout@v3
+      - name: Install poetry
+        run: pipx install poetry==$POETRY_VERSION
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: "poetry"
+      - name: Install dependencies
+        run: poetry install
+      - name: Run unit tests
+        run: |
+          make tests
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,5 @@
 .vscode/
+.idea/
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -0,0 +1,154 @@
+# Contributing to LangChain
+
+Hi there! Thank you for even being interested in contributing to LangChain.
+As an open source project in a rapidly developing field, we are extremely open
+to contributions, whether it be in the form of a new feature, improved infra, or better documentation.
+
+To contribute to this project, please follow a ["fork and pull request"](https://docs.github.com/en/get-started/quickstart/contributing-to-projects) workflow.
+Please do not try to push directly to this repo unless you are a maintainer.
+
+## 🗺️Contributing Guidelines
+
+### 🚩GitHub Issues
+
+Our [issues](https://github.com/hwchase17/langchain/issues) page is kept up to date
+with bugs, improvements, and feature requests. There is a taxonomy of labels to help
+with sorting and discovery of issues of interest. These include:
+
+- prompts: related to prompt tooling/infra.
+- llms: related to LLM wrappers/tooling/infra.
+- chains
+- utilities: related to different types of utilities to integrate with (Python, SQL, etc.).
+- agents
+- memory
+- applications: related to example applications to build
+
+If you start working on an issue, please assign it to yourself.
+
+If you are adding an issue, please try to keep it focused on a single modular bug/improvement/feature.
+If two issues are related, or blocking, please link them rather than combining them into a single issue.
+
+We will try to keep these issues as up to date as possible, though
+with the rapid rate of development in this field some may get out of date.
+If you notice this happening, please just let us know.
+
+### 🙋Getting Help
+
+Although we try to have a developer setup to make it as easy as possible for others to contribute (see below),
+it is possible that some pain points may arise around environment setup, linting, documentation, or other areas.
+Should that occur, please contact a maintainer! Not only do we want to help get you unblocked,
+but we also want to make sure that the process is smooth for future contributors.
+
+In a similar vein, we do enforce certain linting, formatting, and documentation standards in the codebase.
+If you are finding these difficult (or even just annoying) to work with,
+feel free to contact a maintainer for help - we do not want these to get in the way of getting
+good code into the codebase.
+
+### 🏭Release process
+
+As of now, LangChain has an ad hoc release process: releases are cut with high frequency by
+a developer and published to [PyPI](https://pypi.org/project/langchain/).
+
+LangChain follows the [semver](https://semver.org/) versioning standard. However, as pre-1.0 software,
+even patch releases may contain [non-backwards-compatible changes](https://semver.org/#spec-item-4).
+
+If your contribution has made its way into a release, we will want to give you credit on Twitter (only if you want though)!
+If you have a Twitter account you would like us to mention, please let us know in the PR or in another manner.
+
+## 🤖Developer Setup
+
+### 🚀Quick Start
+
+This project uses [Poetry](https://python-poetry.org/) as a dependency manager. Check out Poetry's [documentation on how to install it](https://python-poetry.org/docs/#installation) on your system before proceeding.
+
+To install requirements:
+
+```bash
+poetry install -E all
+```
+
+This will install all requirements for running the package, examples, linting, formatting, tests, and coverage. Note the `-E all` flag will install all optional dependencies necessary for integration testing.
+
+Now, you should be able to run the common tasks in the following section.
+
+### ✅Common Tasks
+
+#### Code Formatting
+
+Formatting for this project is done via a combination of [Black](https://black.readthedocs.io/en/stable/) and [isort](https://pycqa.github.io/isort/).
+
+To run formatting for this project:
+
+```bash
+make format
+```
+
+#### Linting
+
+Linting for this project is done via a combination of [Black](https://black.readthedocs.io/en/stable/), [isort](https://pycqa.github.io/isort/), [flake8](https://flake8.pycqa.org/en/latest/), and [mypy](http://mypy-lang.org/).
+
+To run linting for this project:
+
+```bash
+make lint
+```
+
+We recognize linting can be annoying - if you do not want to do it, please contact a project maintainer, and they can help you with it. We do not want this to be a blocker for good code getting contributed.
+
+#### Coverage
+
+Code coverage (i.e. the amount of code that is covered by unit tests) helps identify areas of the code that are potentially more or less brittle.
+
+To get a report of current coverage, run the following:
+
+```bash
+make coverage
+```
+
+#### Testing
+
+Unit tests cover modular logic that does not require calls to outside APIs.
+
+To run unit tests:
+
+```bash
+make tests
+```
+
+If you add new logic, please add a unit test.
+
+Integration tests cover logic that requires making calls to outside APIs (often integration with other services).
+
+To run integration tests:
+
+```bash
+make integration_tests
+```
+
+If you add support for a new external API, please add a new integration test.
+
+#### Adding a Jupyter Notebook
+
+If you are adding a Jupyter notebook example, you'll want to install the optional `dev` dependencies.
+
+To install dev dependencies:
+
+```bash
+poetry install --with dev
+```
+
+Launch a notebook:
+
+```bash
+poetry run jupyter notebook
+```
+
+When you run `poetry install`, the `langchain` package is installed as editable in the virtualenv, so your new logic can be imported into the notebook.
+
+#### Contribute Documentation
+
+Docs are largely autogenerated by [sphinx](https://www.sphinx-doc.org/en/master/) from the code.
+
+For that reason, we ask that you add good documentation to all classes and methods.
+
+Similar to linting, we recognize documentation can be annoying. If you do not want to do it, please contact a project maintainer, and they can help you with it. We do not want this to be a blocker for good code getting contributed.
--- a/Makefile
+++ b/Makefile
@ -1,5 +1,11 @@
 .PHONY: format lint tests integration_tests

+coverage:
+	poetry run pytest --cov \
+		--cov-config=.coveragerc \
+		--cov-report xml \
+		--cov-report term-missing:skip-covered
+
 format:
 	poetry run black .
 	poetry run isort .
--- a/README.md
+++ b/README.md
@ -13,176 +13,45 @@
 Large language models (LLMs) are emerging as a transformative technology, enabling
 developers to build applications that they previously could not.
 But using these LLMs in isolation is often not enough to
-create a truly powerful app - the real power comes when you are able to
-combine them with other sources of computation or knowledge.
+create a truly powerful app - the real power comes when you can combine them with other sources of computation or knowledge.

 This library is aimed at assisting in the development of those types of applications.

 ## 📖 Documentation

 Please see [here](https://langchain.readthedocs.io/en/latest/?) for full documentation on:
- Getting started (installation, setting up environment, simple examples)
+
+- Getting started (installation, setting up the environment, simple examples)
 - How-To examples (demos, integrations, helper functions)
 - Reference (full API docs)
- Resources (high level explanation of core concepts)
+- Resources (high-level explanation of core concepts)

 ## 🚀 What can this help with?

 There are four main areas that LangChain is designed to help with.
 These are, in increasing order of complexity:
-1. LLM and Prompts
-2. Chains
-3. Agents
-4. Memory
-
-Let's go through these categories and for each one identify key concepts (to clarify terminology) as well as the problems in this area LangChain helps solve.
-
-### LLMs and Prompts
-Calling out to an LLM once is pretty easy, with most of them being behind well documented APIs.
-However, there are still some challenges going from that to an application running in production that LangChain attempts to address.
-
-**Key Concepts**
- LLM: A large language model, in particular a text-to-text model.
- Prompt: The input to a language model. Typically this is not simply a hardcoded string but rather a combination of a template, some examples, and user input.
- Prompt Template: An object responsible for constructing the final prompt to pass to a LLM.
- Examples: Datapoints that can be included in the prompt in order to give the model more context what to do.
- Few Shot Prompt Template: A subclass of the PromptTemplate class that uses examples.
- Example Selector: A class responsible to selecting examples to use dynamically (depending on user input) in a few shot prompt.
-
-**Problems Solved**
- Switching costs: by exposing a standard interface for all the top LLM providers, LangChain makes it easy to switch from one provider to another, whether it be for production use cases or just for testing stuff out.
- Prompt management: managing your prompts is easy when you only have one simple one, but can get tricky when you have a bunch or when they start to get more complex. LangChain provides a standard way for storing, constructing, and referencing prompts.
- Prompt optimization: despite the underlying models getting better and better, there is still currently a need for carefully constructing prompts. 
-
-### Chains
-Using an LLM in isolation is fine for some simple applications, but many more complex ones require chaining LLMs - either with eachother or with other experts.
-LangChain provides several parts to help with that.
-
-**Key Concepts**
- Tools: APIs designed for assisting with a particular use case (search, databases, Python REPL, etc). Prompt templates, LLMs, and chains can also be considered tools.
- Chains: A combination of multiple tools in a deterministic manner.
-
-**Problems Solved**
- Standard interface for working with Chains
- Easy way to construct chains of LLMs
- Lots of integrations with other tools that you may want to use in conjunction with LLMs 
- End-to-end chains for common workflows (database question/answer, recursive summarization, etc)
-
-### Agents
-Some applications will require not just a predetermined chain of calls to LLMs/other tools, but potentially an unknown chain that depends on the user input.
-In these types of chains, there is a “agent” which has access to a suite of tools.
-Depending on the user input, the agent can then decide which, if any, of these tools to call.
-
-**Key Concepts**
- Tools: same as above.
- Agent: An LLM-powered class responsible for determining which tools to use and in what order.
-
-
-**Problems Solved**
- Standard agent interfaces
- A selection of powerful agents to choose from
- Common chains that can be used as tools
-
-### Memory
-By default, Chains and Agents are stateless, meaning that they treat each incoming query independently.
-In some applications (chatbots being a GREAT example) it is highly important to remember previous interactions,
-both at a short term but also at a long term level. The concept of "Memory" exists to do exactly that.
-
-**Key Concepts**
- Memory: A class that can be added to an Agent or Chain to (1) pull in memory variables before calling that chain/agent, and (2) create new memories after the chain/agent finishes.
- Memory Variables: Variables returned from a Memory class, to be passed into the chain/agent along with the user input.
-
-**Problems Solved**
- Standard memory interfaces
- A collection of common memory implementations to choose from
- Common chains/agents that use memory (e.g. chatbots)
-
-## 🤖 Developer Guide
-
-To begin developing on this project, first clone the repo locally.
-
-### Quick Start
-
-This project uses [Poetry](https://python-poetry.org/) as a dependency manager. Check out Poetry's own [documentation on how to install it](https://python-poetry.org/docs/#installation) on your system before proceeding.
-
-To install requirements:
-
-```bash
-poetry install -E all
-```
-
-This will install all requirements for running the package, examples, linting, formatting, and tests. Note the `-E all` flag will install all optional dependencies necessary for integration testing.
-
-Now, you should be able to run the common tasks in the following section.
-
-### Common Tasks
-
-#### Code Formatting
-
-Formatting for this project is a combination of [Black](https://black.readthedocs.io/en/stable/) and [isort](https://pycqa.github.io/isort/).
-
-To run formatting for this project:
-
-```bash
-make format
-```
-
-#### Linting
-
-Linting for this project is a combination of [Black](https://black.readthedocs.io/en/stable/), [isort](https://pycqa.github.io/isort/), [flake8](https://flake8.pycqa.org/en/latest/), and [mypy](http://mypy-lang.org/).
-
-To run linting for this project:
-
-```bash
-make lint
-```
-
-We recognize linting can be annoying - if you do not want to do it, please contact a project maintainer and they can help you with it. We do not want this to be a blocker for good code getting contributed.
-
-#### Testing
-
-Unit tests cover modular logic that does not require calls to outside apis.
-
-To run unit tests:
-
-```bash
-make tests
-```
-
-If you add new logic, please add a unit test.
-
-Integration tests cover logic that requires making calls to outside APIs (often integration with other services).
-
-To run integration tests:
-
-```bash
-make integration_tests
-```

-If you add support for a new external API, please add a new integration test.
+**📃 LLMs and Prompts:**

-#### Adding a Jupyter Notebook
+This includes prompt management, prompt optimization, generic interface for all LLMs, and common utilities for working with LLMs.

-If you are adding a Jupyter notebook example, you'll want to install the optional `dev` dependencies.
+**🔗 Chains:**

-To install dev dependencies:
+Chains go beyond just a single LLM call, and are sequences of calls (whether to an LLM or a different utility). LangChain provides a standard interface for chains, lots of integrations with other tools, and end-to-end chains for common applications.

-```bash
-poetry install --with dev
-```
+**🤖 Agents:**

-Launch a notebook:
+Agents involve an LLM making decisions about which Actions to take, taking that Action, seeing an Observation, and repeating that until done. LangChain provides a standard interface for agents, a selection of agents to choose from, and examples of end-to-end agents.

-```bash
-poetry run jupyter notebook
-```
+**🧠 Memory:**

-When you run `poetry install`, the `langchain` package is installed as editable in the virtualenv, so your new logic can be imported into the notebook.
+Memory is the concept of persisting state between calls of a chain/agent. LangChain provides a standard interface for memory, a collection of memory implementations, and examples of chains/agents that use memory.

-#### Contribute Documentation
+For more information on these concepts, please see our [full documentation](https://langchain.readthedocs.io/en/latest/?).

-Docs are largely autogenerated by [sphinx](https://www.sphinx-doc.org/en/master/) from the code.
+## 💁 Contributing

-For that reason, we ask that you add good documentation to all classes and methods.
+As an open source project in a rapidly developing field, we are extremely open
+to contributions, whether it be in the form of a new feature, improved infra, or better documentation.

-Similar to linting, we recognize documentation can be annoying - if you do not want to do it, please contact a project maintainer and they can help you with it. We do not want this to be a blocker for good code getting contributed.
+For detailed information on how to contribute, see [here](CONTRIBUTING.md).
--- a/docs/examples/chains/qa_with_sources.ipynb
+++ b/docs/examples/chains/qa_with_sources.ipynb
@ -159,6 +159,14 @@
   "id": "e417926a",
   "metadata": {},
   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n",
+      "Token indices sequence length is longer than the specified maximum sequence length for this model (1546 > 1024). Running this sequence through the model will result in indexing errors\n"
+     ]
+    },
    {
     "data": {
      "text/plain": [
@ -204,7 +212,7 @@
    {
     "data": {
      "text/plain": [
-       "{'output_text': \"\\n\\nThe president did not mention Justice Breyer in his speech to the European Parliament. He discussed the situation in Ukraine, the NATO Alliance, and the United States' response to Putin's attack on Ukraine. He spoke about the extensive preparation and coalition building that was done in advance of the attack, and the unified response from the European Union, Canada, Japan, Korea, Australia, New Zealand, and many other countries. He also discussed the economic sanctions that have been imposed on Russia, and the effects they have had on Putin's war fund. Source: 1, 2\"}"
+       "{'output_text': \"\\n\\nThe president did not mention Justice Breyer in his speech to the European Parliament, which focused on building a coalition of freedom-loving nations to confront Putin, unifying European allies, countering Russia's lies with truth, and enforcing powerful economic sanctions. Source: 2\"}"
      ]
     },
     "execution_count": 12,
--- a/docs/examples/chains/summarize.ipynb
+++ b/docs/examples/chains/summarize.ipynb
@ -131,7 +131,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 6,
   "id": "ef28e1d4",
   "metadata": {},
   "outputs": [],
@ -148,7 +148,7 @@
    {
     "data": {
      "text/plain": [
-       "' In response to Russian aggression in Ukraine, the US and its allies have imposed economic sanctions, cut off access to technology, seized assets of Russian oligarchs, and closed American airspace to Russian flights. The US is also providing military, economic, and humanitarian assistance to Ukraine, mobilizing ground forces, air squadrons, and ship deployments, and releasing 30 million barrels of oil from its Strategic Petroleum Reserve. President Biden has also passed the American Rescue Plan, Bipartisan Infrastructure Law, and Bipartisan Innovation Act to provide economic relief and create jobs.'"
+       "\" In response to Vladimir Putin's aggression in Ukraine, the US and its allies have taken action to hold him accountable, including economic sanctions, cutting off access to technology, and seizing the assets of Russian oligarchs. They are also providing military, economic, and humanitarian assistance to the Ukrainians, and releasing 60 million barrels of oil from reserves around the world. President Biden has passed several laws to provide economic relief to Americans and create jobs, and is making sure taxpayer dollars support American jobs and businesses.\""
      ]
     },
     "execution_count": 9,
--- a/docs/examples/integrations/textsplitter.ipynb
+++ b/docs/examples/integrations/textsplitter.ipynb
@ -19,7 +19,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 1,
   "id": "e82c4685",
   "metadata": {},
   "outputs": [],
@ -42,7 +42,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 2,
   "id": "79ff6737",
   "metadata": {},
   "outputs": [],
@ -57,7 +57,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 3,
   "id": "38547666",
   "metadata": {},
   "outputs": [
@ -67,7 +67,7 @@
       "'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \\n\\nLast year COVID-19 kept us apart. This year we are finally together again. \\n\\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \\n\\nWith a duty to one another to the American people to the Constitution. \\n\\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \\n\\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \\n\\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \\n\\nHe met the Ukrainian people. \\n\\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. \\n\\nGroups of citizens blocking tanks with their bodies. Everyone from students to retirees teachers turned soldiers defending their homeland. '"
      ]
     },
-     "execution_count": 8,
+     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -88,7 +88,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 4,
   "id": "a8ce51d5",
   "metadata": {},
   "outputs": [
@ -108,7 +108,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 5,
   "id": "ca5e72c0",
   "metadata": {},
   "outputs": [],
@ -119,7 +119,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 6,
   "id": "37cdfbeb",
   "metadata": {},
   "outputs": [
@ -143,6 +143,52 @@
    "print(texts[0])"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "7683b36a",
+   "metadata": {},
+   "source": [
+    "## tiktoken (OpenAI) Length Function\n",
+    "You can also use tiktoken, an open source tokenizer package from OpenAI, to estimate tokens used. This will probably be more accurate for their models."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "825f7c0a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap=0)\n",
+    "texts = text_splitter.split_text(state_of_the_union)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "ae35d165",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n",
+      "\n",
+      "Last year COVID-19 kept us apart. This year we are finally together again. \n",
+      "\n",
+      "Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n",
+      "\n",
+      "With a duty to one another to the American people to the Constitution. \n",
+      "\n",
+      "And with an unwavering resolve that freedom will always triumph over tyranny. \n"
+     ]
+    }
+   ],
+   "source": [
+    "print(texts[0])"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "ea2973ac",
--- a/docs/examples/prompts.rst
+++ b/docs/examples/prompts.rst
@ -1,10 +1,35 @@
-Prompts
-=======
+LLMs & Prompts
+==============
+
+The examples here all highlight how to work with LLMs and prompts.
+
+**LLMs**
+
+`LLM Functionality <prompts/llm_functionality.ipynb>`_: A walkthrough of all the functionality the standard LLM interface exposes.
+
+`LLM Serialization <prompts/llm_serialization.ipynb>`_: A walkthrough of how to serialize LLMs to and from disk.
+
+`Custom LLM <prompts/custom_llm.ipynb>`_: How to create and use a custom LLM class, in case you have an LLM not from one of the standard providers (including one that you host yourself).
+
+
+**Prompts**
+
+`Prompt Management <prompts/prompt_management.ipynb>`_: A walkthrough of all the functionality LangChain supports for working with prompts.
+
+`Prompt Serialization <prompts/prompt_serialization.ipynb>`_: A walkthrough of how to serialize prompts to and from disk.
+
+`Few Shot Examples <prompts/few_shot_examples.ipynb>`_: How to include examples in the prompt.
+
+`Generate Examples <prompts/generate_examples.ipynb>`_: How to use existing examples to generate more examples.
+
+`Custom Example Selector <prompts/custom_example_selector.ipynb>`_: How to create and use a custom ExampleSelector (the class responsible for choosing which examples to use in a prompt).
+
+`Custom Prompt Template <prompts/custom_prompt_template.ipynb>`_: How to create and use a custom PromptTemplate, the logic that decides how input variables get formatted into a prompt.

-The examples here all highlight how to work with prompts.

 .. toctree::
   :maxdepth: 1
   :glob:
+   :hidden:

   prompts/*
--- a/docs/examples/prompts/custom_llm.ipynb
+++ b/docs/examples/prompts/custom_llm.ipynb
@ -11,7 +11,7 @@
    "\n",
    "There is only one required thing that a custom LLM needs to implement:\n",
    "\n",
-    "1. A `__call__` method that takes in a string, some optional stop words, and returns a string\n",
+    "1. A `_call` method that takes in a string, some optional stop words, and returns a string\n",
    "\n",
    "There is a second optional thing it can implement:\n",
    "\n",
@ -33,17 +33,20 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 7,
   "id": "d5ceff02",
   "metadata": {},
   "outputs": [],
   "source": [
    "class CustomLLM(LLM):\n",
    "    \n",
-    "    def __init__(self, n: int):\n",
-    "        self.n = n\n",
+    "    n: int\n",
+    "        \n",
+    "    @property\n",
+    "    def _llm_type(self) -> str:\n",
+    "        return \"custom\"\n",
    "    \n",
-    "    def __call__(self, prompt: str, stop: Optional[List[str]] = None) -> str:\n",
+    "    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:\n",
    "        if stop is not None:\n",
    "            raise ValueError(\"stop kwargs are not permitted.\")\n",
    "        return prompt[:self.n]\n",
@ -64,7 +67,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 8,
   "id": "10e5ece6",
   "metadata": {},
   "outputs": [],
@ -74,7 +77,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 9,
   "id": "8cd49199",
   "metadata": {},
   "outputs": [
@ -84,7 +87,7 @@
       "'This is a '"
      ]
     },
-     "execution_count": 4,
+     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -103,7 +106,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 10,
   "id": "9c33fa19",
   "metadata": {},
   "outputs": [
@ -145,7 +148,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.7.6"
+   "version": "3.10.8"
  }
 },
 "nbformat": 4,
--- a/docs/examples/prompts/few_shot_examples.ipynb
+++ b/docs/examples/prompts/few_shot_examples.ipynb
--- a/docs/examples/prompts/llm.json
+++ b/docs/examples/prompts/llm.json
@ -0,0 +1,11 @@
+{
+    "model_name": "text-davinci-003",
+    "temperature": 0.7,
+    "max_tokens": 256,
+    "top_p": 1.0,
+    "frequency_penalty": 0.0,
+    "presence_penalty": 0.0,
+    "n": 1,
+    "best_of": 1,
+    "_type": "openai"
+}
--- a/docs/examples/prompts/llm.yaml
+++ b/docs/examples/prompts/llm.yaml
@ -0,0 +1,9 @@
+_type: openai
+best_of: 1
+frequency_penalty: 0.0
+max_tokens: 256
+model_name: text-davinci-003
+n: 1
+presence_penalty: 0.0
+temperature: 0.7
+top_p: 1.0
--- a/docs/examples/prompts/llm_functionality.ipynb
+++ b/docs/examples/prompts/llm_functionality.ipynb
@ -0,0 +1,412 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "20ac6b98",
+   "metadata": {},
+   "source": [
+    "# LLM Functionality\n",
+    "\n",
+    "This notebook goes over all the different features of the LLM class in LangChain.\n",
+    "\n",
+    "We will work with an OpenAI LLM wrapper, although these functionalities should exist for all LLM types."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "df924055",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.llms import OpenAI"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "182b484c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm = OpenAI(model_name=\"text-ada-001\", n=2, best_of=2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9695ccfc",
+   "metadata": {},
+   "source": [
+    "**Generate Text:** The most basic functionality an LLM has is just the ability to call it, passing in a string and getting back a string."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "9d12ac26",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side!'"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "llm(\"Tell me a joke\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e7d4d42d",
+   "metadata": {},
+   "source": [
+    "**Generate:** More broadly, you can call it with a list of inputs, getting back a more complete response than just the text. This complete response includes things like multiple top responses, as well as LLM provider-specific information."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "f4dc241a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm_result = llm.generate([\"Tell me a joke\", \"Tell me a poem\"]*15)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "740392f6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "30"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(llm_result.generations)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "ab6cdcf1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Generation(text='\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side.'),\n",
+       " Generation(text='\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side!')]"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "llm_result.generations[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "4946a778",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Generation(text=\"\\n\\nA rose by the side of the road\\n\\nIs all I need to find my way\\n\\nTo the place I've been searching for\\n\\nAnd my heart is singing with joy\\n\\nWhen I look at this rose\\n\\nIt reminds me of the love I've found\\n\\nAnd I know that wherever I go\\n\\nI'll always find my rose by the side of the road.\"),\n",
+       " Generation(text=\"\\n\\nWhen I was younger\\nI thought that love\\nI was something like a fairytale\\nI would find my prince and they would be my people\\nI was naïve\\nI thought that\\n\\nLove was a something that happened\\nWhen I was younger\\nI was it for my fairytale prince\\nNow I realize\\nThat love is something that waits\\nFor when my prince comes\\nAnd when I am ready to be his wife\\nI'll tell you a poem\\n\\nWhen I was younger\\nI thought that love\\nI was something like a fairytale\\nI would find my prince and they would be my people\\nI was naïve\\nI thought that\\n\\nLove was a something that happened\\nAnd I would be happy\\nWhen my prince came\\nAnd I was ready to be his wife\")]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "llm_result.generations[-1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "242e4527",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'token_usage': {'completion_tokens': 3722,\n",
+       "  'prompt_tokens': 120,\n",
+       "  'total_tokens': 3842}}"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Provider specific info\n",
+    "llm_result.llm_output"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bde8e04f",
+   "metadata": {},
+   "source": [
+    "**Number of Tokens:** You can also estimate how many tokens a piece of text will be in that model. This is useful because models have a context length (and cost more for more tokens), which means you need to be aware of how long the text you are passing in is.\n",
+    "\n",
+    "Notice that by default the tokens are estimated using a HuggingFace tokenizer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "b623c774",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "3"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "llm.get_num_tokens(\"what a joke\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ee6fcf8d",
+   "metadata": {},
+   "source": [
+    "### Caching\n",
+    "With LangChain, you can also enable caching of LLM calls. Note that currently this only applies for individual LLM calls."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "2626ca48",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import langchain\n",
+    "from langchain.cache import InMemoryCache\n",
+    "langchain.llm_cache = InMemoryCache()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "97762272",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# To make the caching really obvious, let's use a slower model.\n",
+    "llm = OpenAI(model_name=\"text-davinci-002\", n=2, best_of=2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "e80c65e4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 31.2 ms, sys: 11.8 ms, total: 43.1 ms\n",
+      "Wall time: 1.75 s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side!'"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "# The first time, it is not yet in cache, so it should take longer\n",
+    "llm(\"Tell me a joke\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "678408ec",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 51 µs, sys: 1 µs, total: 52 µs\n",
+      "Wall time: 67.2 µs\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side!'"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "# The second time it is, so it goes faster\n",
+    "llm(\"Tell me a joke\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "3f0ac8d2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# We can do the same thing with a SQLite cache\n",
+    "from langchain.cache import SQLiteCache\n",
+    "langchain.llm_cache = SQLiteCache(database_path=\".langchain.db\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "0e1dcce3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 26.6 ms, sys: 11.2 ms, total: 37.7 ms\n",
+      "Wall time: 1.89 s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side.'"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "# The first time, it is not yet in cache, so it should take longer\n",
+    "llm(\"Tell me a joke\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "efadd750",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 2.69 ms, sys: 1.57 ms, total: 4.27 ms\n",
+      "Wall time: 2.73 ms\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'\\n\\nWhy did the chicken cross the road?\\n\\nTo get to the other side.'"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "# The second time it is, so it goes faster\n",
+    "llm(\"Tell me a joke\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6053408b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# You can use SQLAlchemyCache to cache with any SQL database supported by SQLAlchemy.\n",
+    "from langchain.cache import SQLAlchemyCache\n",
+    "from sqlalchemy import create_engine\n",
+    "\n",
+    "engine = create_engine(\"postgresql://postgres:postgres@localhost:5432/postgres\")\n",
+    "langchain.llm_cache = SQLAlchemyCache(engine)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12 (main, Jun  1 2022, 06:34:44) \n[Clang 12.0.0 ]"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "1235b9b19e8e9828b5c1fdb2cd89fe8d3de0fcde5ef5f3db36e4b671adb8660f"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/examples/prompts/llm_serialization.ipynb
+++ b/docs/examples/prompts/llm_serialization.ipynb
@ -0,0 +1,166 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "73f9bf40",
+   "metadata": {},
+   "source": [
+    "# LLM Serialization\n",
+    "\n",
+    "This notebook walks through how to write and read an LLM configuration to and from disk. This is useful if you want to save the configuration for a given LLM (e.g., the provider, the temperature, etc.)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "9c9fb6ff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.llms import OpenAI\n",
+    "from langchain.llms.loading import load_llm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "88ce018b",
+   "metadata": {},
+   "source": [
+    "### Loading\n",
+    "First, let's go over loading an LLM from disk. LLMs can be saved on disk in two formats: JSON or YAML. No matter the extension, they are loaded in the same way."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "f12b28f3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{\r\n",
+      "    \"model_name\": \"text-davinci-003\",\r\n",
+      "    \"temperature\": 0.7,\r\n",
+      "    \"max_tokens\": 256,\r\n",
+      "    \"top_p\": 1,\r\n",
+      "    \"frequency_penalty\": 0,\r\n",
+      "    \"presence_penalty\": 0,\r\n",
+      "    \"n\": 1,\r\n",
+      "    \"best_of\": 1,\r\n",
+      "    \"_type\": \"openai\"\r\n",
+      "}"
+     ]
+    }
+   ],
+   "source": [
+    "!cat llm.json"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "9ab709fc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm = load_llm(\"llm.json\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "095b1d56",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "_type: openai\r\n",
+      "best_of: 1\r\n",
+      "frequency_penalty: 0\r\n",
+      "max_tokens: 256\r\n",
+      "model_name: text-davinci-003\r\n",
+      "n: 1\r\n",
+      "presence_penalty: 0\r\n",
+      "temperature: 0.7\r\n",
+      "top_p: 1\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "!cat llm.yaml"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "8cafaafe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm = load_llm(\"llm.yaml\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ab3e4223",
+   "metadata": {},
+   "source": [
+    "### Saving\n",
+    "If you want to go from an LLM in memory to a serialized version of it, you can do so easily by calling the `.save` method. Again, this supports both JSON and YAML."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "b38f685d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm.save(\"llm.json\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "b7365503",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm.save(\"llm.yaml\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e494851",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/explanation/combine_docs.md
+++ b/docs/explanation/combine_docs.md
@ -113,7 +113,7 @@ asking the LLM to refine the output based on the new document.

 **Pros:** Can pull in more relevant context, and may be less lossy than `RefineDocumentsChain`.

-**Cons:** Requires many more calls to the LLM than `StuffDocumentsChain`. The calls are also NOT independent, meaning they cannot be paralleled like `RefineDocumentsChain`. There is also some potential dependencies on the ordering of the documents.
+**Cons:** Requires many more calls to the LLM than `StuffDocumentsChain`. The calls are also NOT independent, meaning they cannot be parallelized like `MapReduceDocumentsChain`. There are also some potential dependencies on the ordering of the documents.

 ## Use Cases
 LangChain supports the above three methods of augmenting LLMs with external data.
--- a/docs/explanation/cool_demos.md
+++ b/docs/explanation/cool_demos.md
@ -6,6 +6,9 @@ If you see any other demos that you think we should highlight, be sure to let us

 ## Open Source

+### [YouTube Transcription Question Answering with Sources](https://colab.research.google.com/drive/1sKSTjt9cPstl_WMZ86JsgEqFG-aSAwkn?usp=sharing)
+An end-to-end example of doing question answering on YouTube transcripts, returning the timestamps as sources to legitimize the answer.
+
 ### [ThoughtSource](https://github.com/OpenBioLink/ThoughtSource)
 A central, open resource and community around data and tools related to chain-of-thought reasoning in large language models.

--- a/docs/installation.md
+++ b/docs/installation.md
@ -21,4 +21,10 @@ To install all modules needed for all integrations, run:

 ```
 pip install langchain[all]
+```
+
+Note that if you are using `zsh`, you'll need to quote square brackets when passing them as an argument to a command, for example:
+
+```
+pip install 'langchain[all]'
 ```
--- a/langchain/init.py
+++ b/langchain/init.py
@ -1,6 +1,9 @@
 """Main entrypoint into package."""

+from typing import Optional
+
 from langchain.agents import MRKLChain, ReActChain, SelfAskWithSearchChain
+from langchain.cache import BaseCache
 from langchain.chains import (
    ConversationChain,
    LLMBashChain,
@ -15,6 +18,7 @@ from langchain.chains import (
 )
 from langchain.docstore import InMemoryDocstore, Wikipedia
 from langchain.llms import Cohere, HuggingFaceHub, OpenAI
+from langchain.llms.huggingface_pipeline import HuggingFacePipeline
 from langchain.logger import BaseLogger, StdOutLogger
 from langchain.prompts import (
    BasePromptTemplate,
@ -28,6 +32,7 @@ from langchain.vectorstores import FAISS, ElasticVectorSearch

 logger: BaseLogger = StdOutLogger()
 verbose: bool = False
+llm_cache: Optional[BaseCache] = None

 __all__ = [
    "LLMChain",
@ -46,6 +51,7 @@ __all__ = [
    "ReActChain",
    "Wikipedia",
    "HuggingFaceHub",
+    "HuggingFacePipeline",
    "SQLDatabase",
    "SQLDatabaseChain",
    "FAISS",
--- a/langchain/cache.py
+++ b/langchain/cache.py
@ -0,0 +1,118 @@
+"""Beta Feature: base interface for cache."""
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional, Tuple, Union
+
+from sqlalchemy import Column, Integer, String, create_engine, select
+from sqlalchemy.engine.base import Engine
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import Session
+
+from langchain.schema import Generation
+
+RETURN_VAL_TYPE = Union[List[Generation], str]
+
+
+class BaseCache(ABC):
+    """Base interface for cache; entries are keyed by (prompt, llm_string)."""
+
+    @abstractmethod
+    def lookup(self, prompt: str, llm_string: str) -> Optional[RETURN_VAL_TYPE]:
+        """Look up based on prompt and llm_string; may return None."""
+
+    @abstractmethod
+    def update(self, prompt: str, llm_string: str, return_val: RETURN_VAL_TYPE) -> None:
+        """Update cache based on prompt and llm_string, storing return_val."""
+
+
+class InMemoryCache(BaseCache):
+    """Cache that stores things in memory."""
+
+    def __init__(self) -> None:
+        """Initialize with empty cache."""
+        self._cache: Dict[Tuple[str, str], RETURN_VAL_TYPE] = {}
+
+    def lookup(self, prompt: str, llm_string: str) -> Optional[RETURN_VAL_TYPE]:
+        """Look up based on prompt and llm_string."""
+        return self._cache.get((prompt, llm_string), None)
+
+    def update(self, prompt: str, llm_string: str, return_val: RETURN_VAL_TYPE) -> None:
+        """Update cache based on prompt and llm_string."""
+        self._cache[(prompt, llm_string)] = return_val
+
+
+Base = declarative_base()
+
+
+class LLMCache(Base):  # type: ignore
+    """Table for the simple LLM cache (one string response per prompt/llm pair)."""
+
+    __tablename__ = "llm_cache"
+    prompt = Column(String, primary_key=True)  # composite primary key, with llm
+    llm = Column(String, primary_key=True)  # composite primary key, with prompt
+    response = Column(String)  # cached response text
+
+
+class FullLLMCache(Base):  # type: ignore
+    """Table for the full LLM cache (all generations, one row per generation)."""
+
+    __tablename__ = "full_llm_cache"
+    prompt = Column(String, primary_key=True)  # composite primary key, part 1
+    llm = Column(String, primary_key=True)  # composite primary key, part 2
+    idx = Column(Integer, primary_key=True)  # composite primary key, part 3
+    response = Column(String)  # cached generation text
+
+
+class SQLAlchemyCache(BaseCache):
+    """Cache that uses SQAlchemy as a backend."""
+
+    def __init__(self, engine: Engine):
+        """Initialize by creating all tables."""
+        self.engine = engine
+        Base.metadata.create_all(self.engine)
+
+    def lookup(self, prompt: str, llm_string: str) -> Optional[RETURN_VAL_TYPE]:
+        """Look up based on prompt and llm_string."""
+        stmt = (
+            select(FullLLMCache.response)
+            .where(FullLLMCache.prompt == prompt)
+            .where(FullLLMCache.llm == llm_string)
+            .order_by(FullLLMCache.idx)
+        )
+        with Session(self.engine) as session:
+            generations = []
+            for row in session.execute(stmt):
+                generations.append(Generation(text=row[0]))
+            if len(generations) > 0:
+                return generations
+        stmt = (
+            select(LLMCache.response)
+            .where(LLMCache.prompt == prompt)
+            .where(LLMCache.llm == llm_string)
+        )
+        with Session(self.engine) as session:
+            for row in session.execute(stmt):
+                return row[0]
+        return None
+
+    def update(self, prompt: str, llm_string: str, return_val: RETURN_VAL_TYPE) -> None:
+        """Look up based on prompt and llm_string."""
+        if isinstance(return_val, str):
+            item = LLMCache(prompt=prompt, llm=llm_string, response=return_val)
+            with Session(self.engine) as session, session.begin():
+                session.add(item)
+        else:
+            for i, generation in enumerate(return_val):
+                item = FullLLMCache(
+                    prompt=prompt, llm=llm_string, response=generation.text, idx=i
+                )
+                with Session(self.engine) as session, session.begin():
+                    session.add(item)
+
+
+class SQLiteCache(SQLAlchemyCache):
+    """Cache that uses SQLite as a backend."""
+
+    def __init__(self, database_path: str = ".langchain.db"):
+        """Initialize by creating the engine and all tables."""
+        engine = create_engine(f"sqlite:///{database_path}")
+        super().__init__(engine)
--- a/langchain/chains/base.py
+++ b/langchain/chains/base.py
@ -1,6 +1,6 @@
 """Base interface that all chains should implement."""
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union

 from pydantic import BaseModel, Extra, Field

@ -29,6 +29,10 @@ class Memory(BaseModel, ABC):
    def save_context(self, inputs: Dict[str, Any], outputs: Dict[str, str]) -> None:
        """Save the context of this model run to memory."""

+    @abstractmethod
+    def clear(self) -> None:
+        """Clear memory contents; concrete memory classes must implement this."""
+

 def _get_verbosity() -> bool:
    return langchain.verbose
@ -70,18 +74,28 @@ class Chain(BaseModel, ABC):
        """Run the logic of this chain and return the output."""

    def __call__(
-        self, inputs: Dict[str, Any], return_only_outputs: bool = False
+        self, inputs: Union[Dict[str, Any], Any], return_only_outputs: bool = False
    ) -> Dict[str, str]:
        """Run the logic of this chain and add to output if desired.

        Args:
-            inputs: Dictionary of inputs.
+            inputs: Dictionary of inputs, or single input if chain expects
+                only one param.
            return_only_outputs: boolean for whether to return only outputs in the
                response. If True, only new keys generated by this chain will be
                returned. If False, both input keys and new keys generated by this
                chain will be returned. Defaults to False.

        """
+        if not isinstance(inputs, dict):
+            if len(self.input_keys) != 1:
+                raise ValueError(
+                    f"A single string input was passed in, but this chain expects "
+                    f"multiple inputs ({self.input_keys}). When a chain expects "
+                    f"multiple inputs, please call it by passing in a dictionary, "
+                    "eg `chain({'foo': 1, 'bar': 2})`"
+                )
+            inputs = {self.input_keys[0]: inputs}
        if self.memory is not None:
            external_context = self.memory.load_memory_variables(inputs)
            inputs = dict(inputs, **external_context)
--- a/langchain/chains/combine_documents/base.py
+++ b/langchain/chains/combine_documents/base.py
@ -1,7 +1,7 @@
 """Base interface for chains combining documents."""

 from abc import ABC, abstractmethod
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional

 from pydantic import BaseModel

@ -31,6 +31,13 @@ class BaseCombineDocumentsChain(Chain, BaseModel, ABC):
        """
        return [self.output_key]

+    def prompt_length(self, docs: List[Document], **kwargs: Any) -> Optional[int]:
+        """Return the prompt length given the documents passed in.
+
+        This base implementation always returns None, which signals that the
+        method does not depend on the prompt length.
+        """
+        return None
+
    @abstractmethod
    def combine_docs(self, docs: List[Document], **kwargs: Any) -> str:
        """Combine documents into a single string."""
--- a/langchain/chains/combine_documents/map_reduce.py
+++ b/langchain/chains/combine_documents/map_reduce.py
@ -2,7 +2,7 @@

 from __future__ import annotations

-from typing import Any, Dict, List
+from typing import Any, Callable, Dict, List, Optional

 from pydantic import BaseModel, Extra, root_validator

@ -11,13 +11,57 @@ from langchain.chains.llm import LLMChain
 from langchain.docstore.document import Document


+def _split_list_of_docs(
+    docs: List[Document], length_func: Callable, token_max: int, **kwargs: Any
+) -> List[List[Document]]:
+    new_result_doc_list = []
+    _sub_result_docs = []
+    for doc in docs:
+        _sub_result_docs.append(doc)
+        _num_tokens = length_func(_sub_result_docs, **kwargs)
+        if _num_tokens > token_max:
+            if len(_sub_result_docs) == 1:
+                raise ValueError(
+                    "A single document was longer than the context length,"
+                    " we cannot handle this."
+                )
+            if len(_sub_result_docs) == 2:
+                raise ValueError(
+                    "A single document was so long it could not be combined "
+                    "with another document, we cannot handle this."
+                )
+            new_result_doc_list.append(_sub_result_docs[:-1])
+            _sub_result_docs = _sub_result_docs[-1:]
+    new_result_doc_list.append(_sub_result_docs)
+    return new_result_doc_list
+
+
+def _collapse_docs(
+    docs: List[Document],
+    combine_document_func: Callable,
+    **kwargs: Any,
+) -> Document:
+    result = combine_document_func(docs, **kwargs)
+    combined_metadata = {k: str(v) for k, v in docs[0].metadata.items()}
+    for doc in docs[1:]:
+        for k, v in doc.metadata.items():
+            if k in combined_metadata:
+                combined_metadata[k] += f", {v}"
+            else:
+                combined_metadata[k] = str(v)
+    return Document(page_content=result, metadata=combined_metadata)
+
+
 class MapReduceDocumentsChain(BaseCombineDocumentsChain, BaseModel):
    """Combining documents by mapping a chain over them, then combining results."""

    llm_chain: LLMChain
-    """Chain to apply to each document individually.."""
+    """Chain to apply to each document individually."""
    combine_document_chain: BaseCombineDocumentsChain
    """Chain to use to combine results of applying llm_chain to documents."""
+    collapse_document_chain: Optional[BaseCombineDocumentsChain] = None
+    """Chain to use to collapse intermediary results if needed.
+    If None, will use the combine_document_chain."""
    document_variable_name: str
    """The variable name in the llm_chain to put the documents in.
    If only one variable in the llm_chain, this need not be provided."""
@ -49,14 +93,45 @@ class MapReduceDocumentsChain(BaseCombineDocumentsChain, BaseModel):
                )
        return values

-    def combine_docs(self, docs: List[Document], **kwargs: Any) -> str:
-        """Combine by mapping first chain over all, then stuffing into final chain."""
+    @property
+    def _collapse_chain(self) -> BaseCombineDocumentsChain:
+        if self.collapse_document_chain is not None:
+            return self.collapse_document_chain
+        else:
+            return self.combine_document_chain
+
+    def combine_docs(
+        self, docs: List[Document], token_max: int = 3000, **kwargs: Any
+    ) -> str:
+        """Combine documents in a map reduce manner.
+
+        Combine by mapping first chain over all documents, then reducing the results.
+        This reducing can be done recursively if needed (if there are many documents).
+        """
        results = self.llm_chain.apply(
+            # FYI - this is parallelized and so it is fast.
            [{**{self.document_variable_name: d.page_content}, **kwargs} for d in docs]
        )
        question_result_key = self.llm_chain.output_key
        result_docs = [
            Document(page_content=r[question_result_key], metadata=docs[i].metadata)
+            # This uses metadata from the docs, and the textual results from `results`
            for i, r in enumerate(results)
        ]
-        return self.combine_document_chain.combine_docs(result_docs, **kwargs)
+        length_func = self.combine_document_chain.prompt_length
+        num_tokens = length_func(result_docs, **kwargs)
+        while num_tokens is not None and num_tokens > token_max:
+            new_result_doc_list = _split_list_of_docs(
+                result_docs, length_func, token_max, **kwargs
+            )
+            result_docs = []
+            for docs in new_result_doc_list:
+                new_doc = _collapse_docs(
+                    docs, self._collapse_chain.combine_docs, **kwargs
+                )
+                result_docs.append(new_doc)
+            num_tokens = self.combine_document_chain.prompt_length(
+                result_docs, **kwargs
+            )
+        output = self.combine_document_chain.combine_docs(result_docs, **kwargs)
+        return output
--- a/langchain/chains/combine_documents/stuff.py
+++ b/langchain/chains/combine_documents/stuff.py
@ -1,6 +1,6 @@
 """Chain that combines documents by stuffing into context."""

-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional

 from pydantic import BaseModel, Extra, Field, root_validator

@ -55,8 +55,7 @@ class StuffDocumentsChain(BaseCombineDocumentsChain, BaseModel):
                )
        return values

-    def combine_docs(self, docs: List[Document], **kwargs: Any) -> str:
-        """Stuff all documents into one prompt and pass to LLM."""
+    def _get_inputs(self, docs: List[Document], **kwargs: Any) -> dict:
        # Get relevant information from each document.
        doc_dicts = []
        for doc in docs:
@ -71,5 +70,16 @@ class StuffDocumentsChain(BaseCombineDocumentsChain, BaseModel):
        # Join the documents together to put them in the prompt.
        inputs = kwargs.copy()
        inputs[self.document_variable_name] = "\n\n".join(doc_strings)
+        return inputs
+
+    def prompt_length(self, docs: List[Document], **kwargs: Any) -> Optional[int]:
+        """Get the prompt length by formatting the prompt."""
+        inputs = self._get_inputs(docs, **kwargs)
+        prompt = self.llm_chain.prompt.format(**inputs)
+        return self.llm_chain.llm.get_num_tokens(prompt)
+
+    def combine_docs(self, docs: List[Document], **kwargs: Any) -> str:
+        """Stuff all documents into one prompt and pass to LLM."""
+        inputs = self._get_inputs(docs, **kwargs)
        # Call predict on the LLM.
        return self.llm_chain.predict(**inputs)
--- a/langchain/chains/conversation/memory.py
+++ b/langchain/chains/conversation/memory.py
@ -46,6 +46,10 @@ class ConversationBufferMemory(Memory, BaseModel):
        ai = "AI: " + outputs[list(outputs.keys())[0]]
        self.buffer += "\n" + "\n".join([human, ai])

+    def clear(self) -> None:
+        """Clear memory contents by resetting the buffer to an empty string."""
+        self.buffer = ""
+

 class ConversationalBufferWindowMemory(Memory, BaseModel):
    """Buffer for storing conversation memory."""
@ -75,6 +79,10 @@ class ConversationalBufferWindowMemory(Memory, BaseModel):
        ai = "AI: " + outputs[list(outputs.keys())[0]]
        self.buffer.append("\n".join([human, ai]))

+    def clear(self) -> None:
+        """Clear memory contents."""
+        self.buffer = []
+

 class ConversationSummaryMemory(Memory, BaseModel):
    """Conversation summarizer to memory."""
@ -118,3 +126,7 @@ class ConversationSummaryMemory(Memory, BaseModel):
        new_lines = "\n".join([human, ai])
        chain = LLMChain(llm=self.llm, prompt=self.prompt)
        self.buffer = chain.predict(summary=self.buffer, new_lines=new_lines)
+
+    def clear(self) -> None:
+        """Clear memory contents."""
+        self.buffer = ""
--- a/langchain/chains/llm.py
+++ b/langchain/chains/llm.py
@ -51,18 +51,34 @@ class LLMChain(Chain, BaseModel):
        """
        return [self.output_key]

+    def apply(self, input_list: List[Dict[str, Any]]) -> List[Dict[str, str]]:
+        """Run the chain on a batch of inputs using one LLM `generate` call.
+
+        Args:
+            input_list: One input dict per prompt. Each dict must contain every
+                variable named in `self.prompt.input_variables` and may carry a
+                `stop` entry.
+
+        Returns:
+            One `{self.output_key: text}` dict per generation returned by the
+            LLM. An empty `input_list` yields an empty list.
+
+        Raises:
+            ValueError: If an input carries a `stop` value that differs from
+                the one taken from the first input.
+        """
+        # Nothing to format or send; this also avoids indexing an empty list.
+        if not input_list:
+            return []
+        # The first input decides the stop sequence used for the whole batch.
+        stop = input_list[0].get("stop")
+        prompts = []
+        for inputs in input_list:
+            selected_inputs = {k: inputs[k] for k in self.prompt.input_variables}
+            prompt = self.prompt.format(**selected_inputs)
+            if self.verbose:
+                langchain.logger.log_llm_inputs(selected_inputs, prompt)
+            if "stop" in inputs and inputs["stop"] != stop:
+                raise ValueError(
+                    "If `stop` is present in any inputs, should be present in all."
+                )
+            prompts.append(prompt)
+        response = self.llm.generate(prompts, stop=stop)
+        outputs = []
+        for generation in response.generations:
+            # Only the top generation for each prompt is kept.
+            response_str = generation[0].text
+            if self.verbose:
+                langchain.logger.log_llm_response(response_str)
+            outputs.append({self.output_key: response_str})
+        return outputs
+
    def _call(self, inputs: Dict[str, Any]) -> Dict[str, str]:
-        selected_inputs = {k: inputs[k] for k in self.prompt.input_variables}
-        prompt = self.prompt.format(**selected_inputs)
-        if self.verbose:
-            langchain.logger.log_llm_inputs(selected_inputs, prompt)
-        kwargs = {}
-        if "stop" in inputs:
-            kwargs["stop"] = inputs["stop"]
-        response = self.llm(prompt, **kwargs)
-        if self.verbose:
-            langchain.logger.log_llm_response(response)
-        return {self.output_key: response}
+        return self.apply([inputs])[0]

    def predict(self, **kwargs: Any) -> str:
        """Format prompt with kwargs and pass to LLM.
--- a/langchain/chains/qa_with_sources/init.py
+++ b/langchain/chains/qa_with_sources/init.py
@ -1,5 +1,5 @@
 """Load question answering with sources chains."""
-from typing import Any, Mapping, Protocol
+from typing import Any, Mapping, Optional, Protocol

 from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
 from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain
@ -44,6 +44,7 @@ def _load_map_reduce_chain(
    document_prompt: BasePromptTemplate = map_reduce_prompt.EXAMPLE_PROMPT,
    combine_document_variable_name: str = "summaries",
    map_reduce_document_variable_name: str = "context",
+    collapse_prompt: Optional[BasePromptTemplate] = None,
    **kwargs: Any,
 ) -> MapReduceDocumentsChain:
    map_chain = LLMChain(llm=llm, prompt=question_prompt)
@ -53,10 +54,19 @@ def _load_map_reduce_chain(
        document_variable_name=combine_document_variable_name,
        document_prompt=document_prompt,
    )
+    if collapse_prompt is None:
+        collapse_chain = None
+    else:
+        collapse_chain = StuffDocumentsChain(
+            llm_chain=LLMChain(llm=llm, prompt=collapse_prompt),
+            document_variable_name=combine_document_variable_name,
+            document_prompt=document_prompt,
+        )
    return MapReduceDocumentsChain(
        llm_chain=map_chain,
        combine_document_chain=combine_document_chain,
        document_variable_name=map_reduce_document_variable_name,
+        collapse_document_chain=collapse_chain,
        **kwargs,
    )

--- a/langchain/chains/question_answering/init.py
+++ b/langchain/chains/question_answering/init.py
@ -1,5 +1,5 @@
 """Load question answering chains."""
-from typing import Any, Mapping, Protocol
+from typing import Any, Mapping, Optional, Protocol

 from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
 from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain
@ -41,6 +41,7 @@ def _load_map_reduce_chain(
    combine_prompt: BasePromptTemplate = map_reduce_prompt.COMBINE_PROMPT,
    combine_document_variable_name: str = "summaries",
    map_reduce_document_variable_name: str = "context",
+    collapse_prompt: Optional[BasePromptTemplate] = None,
    **kwargs: Any,
 ) -> MapReduceDocumentsChain:
    map_chain = LLMChain(llm=llm, prompt=question_prompt)
@ -49,10 +50,18 @@ def _load_map_reduce_chain(
    combine_document_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name=combine_document_variable_name
    )
+    if collapse_prompt is None:
+        collapse_chain = None
+    else:
+        collapse_chain = StuffDocumentsChain(
+            llm_chain=LLMChain(llm=llm, prompt=collapse_prompt),
+            document_variable_name=combine_document_variable_name,
+        )
    return MapReduceDocumentsChain(
        llm_chain=map_chain,
        combine_document_chain=combine_document_chain,
        document_variable_name=map_reduce_document_variable_name,
+        collapse_document_chain=collapse_chain,
        **kwargs,
    )

--- a/langchain/chains/summarize/init.py
+++ b/langchain/chains/summarize/init.py
@ -1,5 +1,5 @@
 """Load summarizing chains."""
-from typing import Any, Mapping, Protocol
+from typing import Any, Mapping, Optional, Protocol

 from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
 from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain
@ -37,6 +37,7 @@ def _load_map_reduce_chain(
    combine_prompt: BasePromptTemplate = map_reduce_prompt.PROMPT,
    combine_document_variable_name: str = "text",
    map_reduce_document_variable_name: str = "text",
+    collapse_prompt: Optional[BasePromptTemplate] = None,
    **kwargs: Any,
 ) -> MapReduceDocumentsChain:
    map_chain = LLMChain(llm=llm, prompt=map_prompt)
@ -45,10 +46,18 @@ def _load_map_reduce_chain(
    combine_document_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name=combine_document_variable_name
    )
+    if collapse_prompt is None:
+        collapse_chain = None
+    else:
+        collapse_chain = StuffDocumentsChain(
+            llm_chain=LLMChain(llm=llm, prompt=collapse_prompt),
+            document_variable_name=combine_document_variable_name,
+        )
    return MapReduceDocumentsChain(
        llm_chain=map_chain,
        combine_document_chain=combine_document_chain,
        document_variable_name=map_reduce_document_variable_name,
+        collapse_document_chain=collapse_chain,
        **kwargs,
    )

--- a/langchain/embeddings/openai.py
+++ b/langchain/embeddings/openai.py
@ -22,9 +22,8 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
    """

    client: Any  #: :meta private:
-    model_name: str = "babbage"
-    """Model name to use."""
-
+    document_model_name: str = "text-embedding-ada-002"
+    query_model_name: str = "text-embedding-ada-002"
    openai_api_key: Optional[str] = None

    class Config:
@ -32,6 +31,26 @@ class OpenAIEmbeddings(BaseModel, Embeddings):

        extra = Extra.forbid

+    # TODO: deprecate this
+    @root_validator(pre=True)
+    def get_model_names(cls, values: Dict) -> Dict:
+        """Get model names from just old model name."""
+        if "model_name" in values:
+            if "document_model_name" in values:
+                raise ValueError(
+                    "Both `model_name` and `document_model_name` were provided, "
+                    "but only one should be."
+                )
+            if "query_model_name" in values:
+                raise ValueError(
+                    "Both `model_name` and `query_model_name` were provided, "
+                    "but only one should be."
+                )
+            model_name = values.pop("model_name")
+            values["document_model_name"] = f"text-search-{model_name}-doc-001"
+            values["query_model_name"] = f"text-search-{model_name}-query-001"
+        return values
+
    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that api key and python package exists in environment."""
@ -66,7 +85,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
            List of embeddings, one for each text.
        """
        responses = [
-            self._embedding_func(text, engine=f"text-search-{self.model_name}-doc-001")
+            self._embedding_func(text, engine=self.document_model_name)
            for text in texts
        ]
        return responses
@ -80,7 +99,5 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
        Returns:
            Embeddings for the text.
        """
-        embedding = self._embedding_func(
-            text, engine=f"text-search-{self.model_name}-query-001"
-        )
+        embedding = self._embedding_func(text, engine=self.query_model_name)
        return embedding
--- a/langchain/llms/init.py
+++ b/langchain/llms/init.py
@ -1,7 +1,28 @@
 """Wrappers on top of large language models APIs."""
+from typing import Dict, Type
+
+from langchain.llms.ai21 import AI21
+from langchain.llms.base import LLM
 from langchain.llms.cohere import Cohere
 from langchain.llms.huggingface_hub import HuggingFaceHub
+from langchain.llms.huggingface_pipeline import HuggingFacePipeline
 from langchain.llms.nlpcloud import NLPCloud
 from langchain.llms.openai import OpenAI

-__all__ = ["Cohere", "NLPCloud", "OpenAI", "HuggingFaceHub"]
+__all__ = [
+    "Cohere",
+    "NLPCloud",
+    "OpenAI",
+    "HuggingFaceHub",
+    "HuggingFacePipeline",
+    "AI21",
+]
+
+type_to_cls_dict: Dict[str, Type[LLM]] = {
+    "ai21": AI21,
+    "cohere": Cohere,
+    "huggingface_hub": HuggingFaceHub,
+    "nlpcloud": NLPCloud,
+    "openai": OpenAI,
+    "huggingface_pipeline": HuggingFacePipeline,
+}
--- a/langchain/llms/ai21.py
+++ b/langchain/llms/ai21.py
@ -19,7 +19,7 @@ class AI21PenaltyData(BaseModel):
    applyToEmojis: bool = True


-class AI21(BaseModel, LLM):
+class AI21(LLM, BaseModel):
    """Wrapper around AI21 large language models.

    To use, you should have the environment variable ``AI21_API_KEY``
@ -96,7 +96,12 @@ class AI21(BaseModel, LLM):
        """Get the identifying parameters."""
        return {**{"model": self.model}, **self._default_params}

-    def __call__(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+    @property
+    def _llm_type(self) -> str:
+        """Return type of llm."""
+        return "ai21"
+
+    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Call out to AI21's complete endpoint.

        Args:
--- a/langchain/llms/base.py
+++ b/langchain/llms/base.py
@ -1,15 +1,116 @@
 """Base interface for large language models to expose."""
+import json
 from abc import ABC, abstractmethod
-from typing import Any, List, Mapping, Optional
+from pathlib import Path
+from typing import Any, Dict, List, Mapping, NamedTuple, Optional, Union

+import yaml
+from pydantic import BaseModel, Extra

-class LLM(ABC):
+import langchain
+from langchain.schema import Generation
+
+
+class LLMResult(NamedTuple):
+    """Class that contains all relevant information for an LLM Result."""
+
+    generations: List[List[Generation]]
+    """List of the things generated. This is List[List[]] because
+    each input could have multiple generations."""
+    llm_output: Optional[dict] = None
+    """For arbitrary LLM provider specific output."""
+
+
+class LLM(BaseModel, ABC):
    """LLM wrapper should take in a prompt and return a string."""

+    class Config:
+        """Configuration for this pydantic object."""
+
+        extra = Extra.forbid
+
+    def _generate(
+        self, prompts: List[str], stop: Optional[List[str]] = None
+    ) -> LLMResult:
+        """Run the LLM on the given prompt and input."""
+        # TODO: add caching here.
+        generations = []
+        for prompt in prompts:
+            text = self(prompt, stop=stop)
+            generations.append([Generation(text=text)])
+        return LLMResult(generations=generations)
+
+    def generate(
+        self, prompts: List[str], stop: Optional[List[str]] = None
+    ) -> LLMResult:
+        """Run the LLM on the given prompts, consulting the LLM cache if set.
+
+        Args:
+            prompts: The prompts to run.
+            stop: Optional list of stop words to use when generating.
+
+        Returns:
+            An `LLMResult` whose `generations` has one entry per prompt, in
+            prompt order. When a cache is configured, `llm_output` comes from
+            the call made for the prompts that were not cached.
+        """
+        if langchain.llm_cache is None:
+            return self._generate(prompts, stop=stop)
+        # The cache key combines the prompt with a string of the LLM's params.
+        params = self._llm_dict()
+        params["stop"] = stop
+        llm_string = str(sorted([(k, v) for k, v in params.items()]))
+        missing_prompts = []
+        missing_prompt_idxs = []
+        existing_prompts = {}
+        for i, prompt in enumerate(prompts):
+            cache_val = langchain.llm_cache.lookup(prompt, llm_string)
+            if isinstance(cache_val, list):
+                existing_prompts[i] = cache_val
+            else:
+                missing_prompts.append(prompt)
+                missing_prompt_idxs.append(i)
+        new_results = self._generate(missing_prompts, stop=stop)
+        # `new_results` is ordered like `missing_prompts`, so map each result
+        # back to the position its prompt had in the original `prompts` list.
+        for idx, result in zip(missing_prompt_idxs, new_results.generations):
+            existing_prompts[idx] = result
+            langchain.llm_cache.update(prompts[idx], llm_string, result)
+        generations = [existing_prompts[i] for i in range(len(prompts))]
+        return LLMResult(generations=generations, llm_output=new_results.llm_output)
+
+    def get_num_tokens(self, text: str) -> int:
+        """Get the number of tokens present in the text.
+
+        Tokenizes with the pretrained "gpt2" `GPT2TokenizerFast` from the
+        `transformers` package and counts the resulting tokens.
+
+        Args:
+            text: The text to tokenize.
+
+        Returns:
+            The number of tokens produced by the tokenizer.
+
+        Raises:
+            ValueError: If the `transformers` package cannot be imported.
+        """
+        # TODO: this method may not be exact.
+        # TODO: this method may differ based on model (eg codex).
+        try:
+            from transformers import GPT2TokenizerFast
+        except ImportError:
+            raise ValueError(
+                "Could not import transformers python package. "
+                "This is needed in order to calculate get_num_tokens. "
+                "Please it install it with `pip install transformers`."
+            )
+        # create a GPT-2 tokenizer instance
+        tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
+
+        # tokenize the text using the GPT-2 tokenizer
+        tokenized_text = tokenizer.tokenize(text)
+
+        # calculate the number of tokens in the tokenized text
+        return len(tokenized_text)
+
    @abstractmethod
-    def __call__(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Run the LLM on the given prompt and input."""

+    def __call__(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+        """Check the cache and run the LLM on the given prompt and input.
+
+        Args:
+            prompt: The prompt to run.
+            stop: Optional list of stop words to use when generating.
+
+        Returns:
+            The generated text, taken from the cache when an entry exists.
+        """
+        if langchain.llm_cache is None:
+            return self._call(prompt, stop=stop)
+        # From here on a cache is configured, so no further None checks are
+        # needed before using it.
+        params = self._llm_dict()
+        params["stop"] = stop
+        llm_string = str(sorted([(k, v) for k, v in params.items()]))
+        cache_val = langchain.llm_cache.lookup(prompt, llm_string)
+        if cache_val is not None:
+            # Entries written by this method are plain strings; anything else
+            # is treated as a list of generations and the first one is used.
+            if isinstance(cache_val, str):
+                return cache_val
+            return cache_val[0].text
+        return_val = self._call(prompt, stop=stop)
+        langchain.llm_cache.update(prompt, llm_string, return_val)
+        return return_val
+
    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
@ -19,3 +120,46 @@ class LLM(ABC):
        """Get a string representation of the object for printing."""
        cls_name = f"\033[1m{self.__class__.__name__}\033[0m"
        return f"{cls_name}\nParams: {self._identifying_params}"
+
+    @property
+    @abstractmethod
+    def _llm_type(self) -> str:
+        """Return type of llm."""
+
+    def _llm_dict(self) -> Dict:
+        """Return a dictionary describing this LLM.
+
+        The result is a copy of `_identifying_params` with an added `_type`
+        entry holding `_llm_type`.
+        """
+        starter_dict = dict(self._identifying_params)
+        starter_dict["_type"] = self._llm_type
+        return starter_dict
+
+    def save(self, file_path: Union[Path, str]) -> None:
+        """Save the LLM.
+
+        Args:
+            file_path: Path to file to save the LLM to.
+
+        Example:
+        .. code-block:: python
+
+            llm.save(file_path="path/llm.yaml")
+        """
+        # Convert file to Path object.
+        if isinstance(file_path, str):
+            save_path = Path(file_path)
+        else:
+            save_path = file_path
+
+        directory_path = save_path.parent
+        directory_path.mkdir(parents=True, exist_ok=True)
+
+        # Fetch dictionary to save
+        prompt_dict = self._llm_dict()
+
+        if save_path.suffix == ".json":
+            with open(file_path, "w") as f:
+                json.dump(prompt_dict, f, indent=4)
+        elif save_path.suffix == ".yaml":
+            with open(file_path, "w") as f:
+                yaml.dump(prompt_dict, f, default_flow_style=False)
+        else:
+            raise ValueError(f"{save_path} must be json or yaml")
--- a/langchain/llms/cohere.py
+++ b/langchain/llms/cohere.py
@ -85,7 +85,12 @@ class Cohere(LLM, BaseModel):
        """Get the identifying parameters."""
        return {**{"model": self.model}, **self._default_params}

-    def __call__(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+    @property
+    def _llm_type(self) -> str:
+        """Return type of llm."""
+        return "cohere"
+
+    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Call out to Cohere's generate endpoint.

        Args:
--- a/langchain/llms/huggingface_hub.py
+++ b/langchain/llms/huggingface_hub.py
@ -74,9 +74,17 @@ class HuggingFaceHub(LLM, BaseModel):
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        _model_kwargs = self.model_kwargs or {}
-        return {**{"repo_id": self.repo_id}, **_model_kwargs}
+        return {
+            **{"repo_id": self.repo_id, "task": self.task},
+            **{"model_kwargs": _model_kwargs},
+        }

-    def __call__(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+    @property
+    def _llm_type(self) -> str:
+        """Return type of llm."""
+        return "huggingface_hub"
+
+    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Call out to HuggingFace Hub's inference endpoint.

        Args:
--- a/langchain/llms/huggingface_pipeline.py
+++ b/langchain/llms/huggingface_pipeline.py
@ -0,0 +1,118 @@
+"""Wrapper around HuggingFace Pipeline APIs."""
+from typing import Any, List, Mapping, Optional
+
+from pydantic import BaseModel, Extra
+
+from langchain.llms.base import LLM
+from langchain.llms.utils import enforce_stop_tokens
+
+DEFAULT_MODEL_ID = "gpt2"
+DEFAULT_TASK = "text-generation"
+VALID_TASKS = ("text2text-generation", "text-generation")
+
+
+class HuggingFacePipeline(LLM, BaseModel):
+    """Wrapper around HuggingFace Pipeline API.
+
+    To use, you should have the ``transformers`` python package installed.
+
+    Only supports `text-generation` and `text2text-generation` for now.
+
+    Example using from_model_id:
+        .. code-block:: python
+
+            from langchain.llms.huggingface_pipeline import HuggingFacePipeline
+            hf = HuggingFacePipeline.from_model_id(
+                model_id="gpt2", task="text-generation"
+            )
+    Example passing pipeline in directly:
+        .. code-block:: python
+
+            from langchain.llms.huggingface_pipeline import HuggingFacePipeline
+            from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+
+            model_id = "gpt2"
+            tokenizer = AutoTokenizer.from_pretrained(model_id)
+            model = AutoModelForCausalLM.from_pretrained(model_id)
+            pipe = pipeline(
+                "text-generation", model=model, tokenizer=tokenizer, max_new_tokens=10
+            )
+            hf = HuggingFacePipeline(pipeline=pipe)
+    """
+
+    pipeline: Any  #: :meta private:
+    model_id: str = DEFAULT_MODEL_ID
+    """Model name to use."""
+    model_kwargs: Optional[dict] = None
+    """Key word arguments to pass to the model."""
+
+    class Config:
+        """Configuration for this pydantic object."""
+
+        extra = Extra.forbid
+
+    @classmethod
+    def from_model_id(
+        cls,
+        model_id: str,
+        task: str,
+        model_kwargs: Optional[dict] = None,
+        **kwargs: Any,
+    ) -> LLM:
+        """Construct the pipeline object from model_id and task.
+
+        Args:
+            model_id: Identifier passed to `from_pretrained` for both the
+                tokenizer and the model.
+            task: Pipeline task; must resolve to one of `VALID_TASKS`.
+            model_kwargs: Optional extra keyword arguments forwarded to the
+                `transformers` pipeline constructor.
+            **kwargs: Extra keyword arguments forwarded to the class constructor.
+
+        Returns:
+            A wrapper around the constructed pipeline.
+
+        Raises:
+            ValueError: If `transformers` cannot be imported, or if the
+                pipeline's task is not in `VALID_TASKS`.
+        """
+        # Keep the try body to the imports so that only a missing
+        # `transformers` package is reported as such.
+        try:
+            from transformers import AutoModelForCausalLM, AutoTokenizer
+            from transformers import pipeline as hf_pipeline
+        except ImportError:
+            raise ValueError(
+                "Could not import transformers python package. "
+                "Please it install it with `pip install transformers`."
+            )
+
+        # `model_kwargs` defaults to None, which cannot be unpacked with `**`.
+        _model_kwargs = model_kwargs or {}
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        model = AutoModelForCausalLM.from_pretrained(model_id)
+        pipeline = hf_pipeline(
+            task=task, model=model, tokenizer=tokenizer, **_model_kwargs
+        )
+        if pipeline.task not in VALID_TASKS:
+            raise ValueError(
+                f"Got invalid task {pipeline.task}, "
+                f"currently only {VALID_TASKS} are supported"
+            )
+
+        return cls(
+            pipeline=pipeline,
+            model_id=model_id,
+            model_kwargs=_model_kwargs,
+            **kwargs,
+        )
+
+    @property
+    def _identifying_params(self) -> Mapping[str, Any]:
+        """Get the identifying parameters."""
+        return {
+            **{"model_id": self.model_id},
+            **{"model_kwargs": self.model_kwargs},
+        }
+
+    @property
+    def _llm_type(self) -> str:
+        """Return type of llm."""
+        return "huggingface_pipeline"
+
+    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+        """Run the wrapped pipeline on the prompt and return the generated text.
+
+        Args:
+            prompt: The prompt to pass into the pipeline.
+            stop: Optional list of stop words applied to the generated text.
+
+        Returns:
+            The generated text, without the prompt for `text-generation`.
+
+        Raises:
+            ValueError: If the pipeline's task is not in `VALID_TASKS`.
+        """
+        response = self.pipeline(text_inputs=prompt)
+        if self.pipeline.task == "text-generation":
+            # Text generation return includes the starter text.
+            text = response[0]["generated_text"][len(prompt) :]
+        elif self.pipeline.task == "text2text-generation":
+            text = response[0]["generated_text"]
+        else:
+            raise ValueError(
+                f"Got invalid task {self.pipeline.task}, "
+                f"currently only {VALID_TASKS} are supported"
+            )
+        if stop is not None:
+            # This is a bit hacky, but I can't figure out a better way to enforce
+            # stop tokens when making calls to a huggingface pipeline.
+            text = enforce_stop_tokens(text, stop)
+        return text
--- a/langchain/llms/loading.py
+++ b/langchain/llms/loading.py
@ -0,0 +1,42 @@
+"""Base interface for loading large language models apis."""
+import json
+from pathlib import Path
+from typing import Union
+
+import yaml
+
+from langchain.llms import type_to_cls_dict
+from langchain.llms.base import LLM
+
+
+def load_llm_from_config(config: dict) -> LLM:
+    """Load LLM from Config Dict.
+
+    Args:
+        config: Keyword arguments for the LLM class plus a `_type` entry
+            naming a key of `type_to_cls_dict`. The dict is not modified.
+
+    Returns:
+        An instance of the LLM class registered for `_type`.
+
+    Raises:
+        ValueError: If `_type` is missing or is not a supported LLM type.
+    """
+    if "_type" not in config:
+        raise ValueError("Must specify an LLM Type in config")
+    # Work on a shallow copy so the caller's dict keeps its `_type` entry.
+    llm_kwargs = dict(config)
+    config_type = llm_kwargs.pop("_type")
+
+    if config_type not in type_to_cls_dict:
+        raise ValueError(f"Loading {config_type} LLM not supported")
+
+    llm_cls = type_to_cls_dict[config_type]
+    return llm_cls(**llm_kwargs)
+
+
+def load_llm(file: Union[str, Path]) -> LLM:
+    """Load LLM from file."""
+    # Convert file to Path object.
+    if isinstance(file, str):
+        file_path = Path(file)
+    else:
+        file_path = file
+    # Load from either json or yaml.
+    if file_path.suffix == ".json":
+        with open(file_path) as f:
+            config = json.load(f)
+    elif file_path.suffix == ".yaml":
+        with open(file_path, "r") as f:
+            config = yaml.safe_load(f)
+    else:
+        raise ValueError("File type must be json or yaml")
+    # Load the LLM from the config now.
+    return load_llm_from_config(config)
--- a/langchain/llms/manifest.py
+++ b/langchain/llms/manifest.py
@ -37,7 +37,12 @@ class ManifestWrapper(LLM, BaseModel):
        kwargs = self.llm_kwargs or {}
        return {**self.client.client.get_model_params(), **kwargs}

-    def __call__(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+    @property
+    def _llm_type(self) -> str:
+        """Return type of llm."""
+        return "manifest"
+
+    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Call out to LLM through Manifest."""
        if stop is not None and len(stop) != 1:
            raise NotImplementedError(
--- a/langchain/llms/nlpcloud.py
+++ b/langchain/llms/nlpcloud.py
@ -106,7 +106,12 @@ class NLPCloud(LLM, BaseModel):
        """Get the identifying parameters."""
        return {**{"model_name": self.model_name}, **self._default_params}

-    def __call__(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+    @property
+    def _llm_type(self) -> str:
+        """Return type of llm."""
+        return "nlpcloud"
+
+    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Call out to NLPCloud's create endpoint.

        Args:
--- a/langchain/llms/openai.py
+++ b/langchain/llms/openai.py
@ -1,9 +1,11 @@
 """Wrapper around OpenAI APIs."""
-from typing import Any, Dict, List, Mapping, Optional
+import sys
+from typing import Any, Dict, Generator, List, Mapping, Optional

 from pydantic import BaseModel, Extra, Field, root_validator

-from langchain.llms.base import LLM
+from langchain.llms.base import LLM, LLMResult
+from langchain.schema import Generation
 from langchain.utils import get_from_dict_or_env


@ -29,7 +31,9 @@ class OpenAI(LLM, BaseModel):
    temperature: float = 0.7
    """What sampling temperature to use."""
    max_tokens: int = 256
-    """The maximum number of tokens to generate in the completion."""
+    """The maximum number of tokens to generate in the completion.
+    -1 returns as many tokens as possible given the prompt and
+    the model's maximum context size."""
    top_p: float = 1
    """Total probability mass of tokens to consider at each step."""
    frequency_penalty: float = 0
@ -43,6 +47,8 @@ class OpenAI(LLM, BaseModel):
    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Holds any model parameters valid for `create` call not explicitly specified."""
    openai_api_key: Optional[str] = None
+    batch_size: int = 20
+    """Batch size to use when passing multiple documents to generate."""

    class Config:
        """Configuration for this pydantic object."""
@ -95,12 +101,100 @@ class OpenAI(LLM, BaseModel):
        }
        return {**normal_params, **self.model_kwargs}

+    def _generate(
+        self, prompts: List[str], stop: Optional[List[str]] = None
+    ) -> LLMResult:
+        """Call out to OpenAI's endpoint with k unique prompts.
+
+        Args:
+            prompts: The prompts to pass into the model.
+            stop: Optional list of stop words to use when generating.
+
+        Returns:
+            The full LLM output.
+
+        Example:
+            .. code-block:: python
+
+                response = openai.generate(["Tell me a joke."])
+        """
+        # TODO: write a unit test for this
+        params = self._default_params
+        if stop is not None:
+            if "stop" in params:
+                raise ValueError("`stop` found in both the input and default params.")
+            params["stop"] = stop
+
+        if params["max_tokens"] == -1:
+            if len(prompts) != 1:
+                raise ValueError(
+                    "max_tokens set to -1 not supported for multiple inputs."
+                )
+            params["max_tokens"] = self.max_tokens_for_prompt(prompts[0])
+        sub_prompts = [
+            prompts[i : i + self.batch_size]
+            for i in range(0, len(prompts), self.batch_size)
+        ]
+        choices = []
+        token_usage = {}
+        # Get the token usage from the response.
+        # Includes prompt, completion, and total tokens used.
+        _keys = ["completion_tokens", "prompt_tokens", "total_tokens"]
+        for _prompts in sub_prompts:
+            response = self.client.create(
+                model=self.model_name, prompt=_prompts, **params
+            )
+            choices.extend(response["choices"])
+            for _key in _keys:
+                if _key not in token_usage:
+                    token_usage[_key] = response["usage"][_key]
+                else:
+                    token_usage[_key] += response["usage"][_key]
+        generations = []
+        for i, prompt in enumerate(prompts):
+            sub_choices = choices[i * self.n : (i + 1) * self.n]
+            generations.append(
+                [Generation(text=choice["text"]) for choice in sub_choices]
+            )
+        return LLMResult(
+            generations=generations, llm_output={"token_usage": token_usage}
+        )
+
+    def stream(self, prompt: str) -> Generator:
+        """Call OpenAI with streaming flag and return the resulting generator.
+
+        Args:
+            prompt: The prompt to pass into the model.
+
+        Returns:
+            A generator representing the stream of tokens from OpenAI.
+
+        Raises:
+            ValueError: If `best_of` in the default params is not 1.
+
+        Example:
+            .. code-block:: python
+
+                generator = openai.stream("Tell me a joke.")
+                for token in generator:
+                    yield token
+        """
+        params = self._default_params
+        if params["best_of"] != 1:
+            raise ValueError("OpenAI only supports best_of == 1 for streaming")
+        params["stream"] = True
+        generator = self.client.create(model=self.model_name, prompt=prompt, **params)
+
+        return generator
+
    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
-        return {**{"model": self.model_name}, **self._default_params}
+        return {**{"model_name": self.model_name}, **self._default_params}
+
+    @property
+    def _llm_type(self) -> str:
+        """Return type of llm."""
+        return "openai"

-    def __call__(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Call out to OpenAI's create endpoint.

        Args:
@ -115,10 +209,82 @@ class OpenAI(LLM, BaseModel):

                response = openai("Tell me a joke.")
        """
-        params = self._default_params
-        if stop is not None:
-            if "stop" in params:
-                raise ValueError("`stop` found in both the input and default params.")
-            params["stop"] = stop
-        response = self.client.create(model=self.model_name, prompt=prompt, **params)
-        return response["choices"][0]["text"]
+        return self.generate([prompt], stop=stop).generations[0][0].text
+
+    def get_num_tokens(self, text: str) -> int:
+        """Calculate num tokens with tiktoken package."""
+        # tiktoken NOT supported for Python 3.8 or below
+        if sys.version_info[1] <= 8:
+            return super().get_num_tokens(text)
+        try:
+            import tiktoken
+        except ImportError:
+            raise ValueError(
+                "Could not import tiktoken python package. "
+                "This is needed in order to calculate get_num_tokens. "
+                "Please install it with `pip install tiktoken`."
+            )
+        # create a GPT-3 encoder instance
+        enc = tiktoken.get_encoding("gpt2")
+
+        # encode the text using the GPT-3 encoder
+        tokenized_text = enc.encode(text)
+
+        # calculate the number of tokens in the encoded text
+        return len(tokenized_text)
+
+    def modelname_to_contextsize(self, modelname: str) -> int:
+        """Calculate the maximum number of tokens possible to generate for a model.
+
+        text-davinci-003: 4,000 tokens
+        text-curie-001: 2,048 tokens
+        text-babbage-001: 2,048 tokens
+        text-ada-001: 2,048 tokens
+        code-davinci-002: 8,000 tokens
+        code-cushman-001: 2,048 tokens
+
+        Args:
+            modelname: The modelname we want to know the context size for.
+
+        Returns:
+            The maximum context size
+
+        Example:
+            .. code-block:: python
+
+                max_tokens = openai.modelname_to_contextsize("text-davinci-003")
+        """
+        if modelname == "text-davinci-003":
+            return 4000
+        elif modelname == "text-curie-001":
+            return 2048
+        elif modelname == "text-babbage-001":
+            return 2048
+        elif modelname == "text-ada-001":
+            return 2048
+        elif modelname == "code-davinci-002":
+            return 8000
+        elif modelname == "code-cushman-001":
+            return 2048
+        else:
+            return 4000
+
+    def max_tokens_for_prompt(self, prompt: str) -> int:
+        """Calculate the maximum number of tokens possible to generate for a prompt.
+
+        Args:
+            prompt: The prompt to pass into the model.
+
+        Returns:
+            The maximum number of tokens to generate for a prompt.
+
+        Example:
+            .. code-block:: python
+
+                max_tokens = openai.max_tokens_for_prompt("Tell me a joke.")
+        """
+        num_tokens = self.get_num_tokens(prompt)
+
+        # get max context size for model by name
+        max_size = self.modelname_to_contextsize(self.model_name)
+        return max_size - num_tokens
--- a/langchain/schema.py
+++ b/langchain/schema.py
@ -9,3 +9,11 @@ class AgentAction(NamedTuple):
    tool: str
    tool_input: str
    log: str
+
+
+class Generation(NamedTuple):
+    """Output of a single generation."""
+
+    text: str
+    """Generated text output."""
+    # TODO: add log probs
--- a/langchain/text_splitter.py
+++ b/langchain/text_splitter.py
@ -49,7 +49,7 @@ class TextSplitter(ABC):

    @classmethod
    def from_huggingface_tokenizer(cls, tokenizer: Any, **kwargs: Any) -> TextSplitter:
-        """Text splitter than uses HuggingFace tokenizer to count length."""
+        """Text splitter that uses HuggingFace tokenizer to count length."""
        try:
            from transformers import PreTrainedTokenizerBase

@ -68,6 +68,27 @@ class TextSplitter(ABC):
            )
        return cls(length_function=_huggingface_tokenizer_length, **kwargs)

+    @classmethod
+    def from_tiktoken_encoder(
+        cls, encoding_name: str = "gpt2", **kwargs: Any
+    ) -> TextSplitter:
+        """Text splitter that uses tiktoken encoder to count length."""
+        try:
+            import tiktoken
+        except ImportError:
+            raise ValueError(
+                "Could not import tiktoken python package. "
+                "This is needed in order to use from_tiktoken_encoder. "
+                "Please install it with `pip install tiktoken`."
+            )
+        # create an encoder instance for the requested encoding
+        enc = tiktoken.get_encoding(encoding_name)
+
+        def _tiktoken_encoder(text: str) -> int:
+            return len(enc.encode(text))
+
+        return cls(length_function=_tiktoken_encoder, **kwargs)
+

 class CharacterTextSplitter(TextSplitter):
    """Implementation of splitting text that looks at characters."""
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langchain"
-version = "0.0.34"
+version = "0.0.39"
 description = "Building applications with LLMs through composability"
 authors = []
 license = "MIT"
@ -22,9 +22,12 @@ spacy = {version = "^3", optional = true}
 nltk = {version = "^3", optional = true}
 transformers = {version = "^4", optional = true}
 beautifulsoup4 = {version = "^4", optional = true}
+torch = {version = "^1.13.1", optional = true}
+tiktoken = {version = "^0", optional = true, python="^3.9"}

 [tool.poetry.group.test.dependencies]
 pytest = "^7.2.0"
+pytest-cov = "^4.0.0"
 pytest-dotenv = "^0.5.2"

 [tool.poetry.group.lint.dependencies]
@ -47,8 +50,8 @@ jupyter = "^1.0.0"
 playwright = "^1.28.0"

 [tool.poetry.extras]
-llms = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml"]
-all = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence_transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4"]
+llms = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
+all = ["cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "elasticsearch", "google-search-results", "faiss-cpu", "sentence_transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch"]

 [tool.isort]
 profile = "black"
--- a/tests/integration_tests/llms/test_ai21.py
+++ b/tests/integration_tests/llms/test_ai21.py
@ -1,6 +1,9 @@
 """Test AI21 API wrapper."""

+from pathlib import Path
+
 from langchain.llms.ai21 import AI21
+from langchain.llms.loading import load_llm


 def test_ai21_call() -> None:
@ -8,3 +11,11 @@ def test_ai21_call() -> None:
    llm = AI21(maxTokens=10)
    output = llm("Say foo:")
    assert isinstance(output, str)
+
+
+def test_saving_loading_llm(tmp_path: Path) -> None:
+    """Test saving/loading an AI21 LLM."""
+    llm = AI21(maxTokens=10)
+    llm.save(file_path=tmp_path / "ai21.yaml")
+    loaded_llm = load_llm(tmp_path / "ai21.yaml")
+    assert llm == loaded_llm
--- a/tests/integration_tests/llms/test_cohere.py
+++ b/tests/integration_tests/llms/test_cohere.py
@ -1,6 +1,10 @@
 """Test Cohere API wrapper."""

+from pathlib import Path
+
 from langchain.llms.cohere import Cohere
+from langchain.llms.loading import load_llm
+from tests.integration_tests.llms.utils import assert_llm_equality


 def test_cohere_call() -> None:
@ -8,3 +12,11 @@ def test_cohere_call() -> None:
    llm = Cohere(max_tokens=10)
    output = llm("Say foo:")
    assert isinstance(output, str)
+
+
+def test_saving_loading_llm(tmp_path: Path) -> None:
+    """Test saving/loading a Cohere LLM."""
+    llm = Cohere(max_tokens=10)
+    llm.save(file_path=tmp_path / "cohere.yaml")
+    loaded_llm = load_llm(tmp_path / "cohere.yaml")
+    assert_llm_equality(llm, loaded_llm)
--- a/tests/integration_tests/llms/test_huggingface_hub.py
+++ b/tests/integration_tests/llms/test_huggingface_hub.py
@ -1,8 +1,12 @@
 """Test HuggingFace API wrapper."""

+from pathlib import Path
+
 import pytest

 from langchain.llms.huggingface_hub import HuggingFaceHub
+from langchain.llms.loading import load_llm
+from tests.integration_tests.llms.utils import assert_llm_equality


 def test_huggingface_text_generation() -> None:
@ -24,3 +28,11 @@ def test_huggingface_call_error() -> None:
    llm = HuggingFaceHub(model_kwargs={"max_new_tokens": -1})
    with pytest.raises(ValueError):
        llm("Say foo:")
+
+
+def test_saving_loading_llm(tmp_path: Path) -> None:
+    """Test saving/loading a HuggingFaceHub LLM."""
+    llm = HuggingFaceHub(repo_id="gpt2", model_kwargs={"max_new_tokens": 10})
+    llm.save(file_path=tmp_path / "hf.yaml")
+    loaded_llm = load_llm(tmp_path / "hf.yaml")
+    assert_llm_equality(llm, loaded_llm)
--- a/tests/integration_tests/llms/test_huggingface_pipeline.py
+++ b/tests/integration_tests/llms/test_huggingface_pipeline.py
@ -0,0 +1,41 @@
+"""Test HuggingFace Pipeline wrapper."""
+
+from pathlib import Path
+
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+
+from langchain.llms.huggingface_pipeline import HuggingFacePipeline
+from langchain.llms.loading import load_llm
+from tests.integration_tests.llms.utils import assert_llm_equality
+
+
+def test_huggingface_pipeline_text_generation() -> None:
+    """Test valid call to HuggingFace text generation model."""
+    llm = HuggingFacePipeline.from_model_id(
+        model_id="gpt2", task="text-generation", model_kwargs={"max_new_tokens": 10}
+    )
+    output = llm("Say foo:")
+    assert isinstance(output, str)
+
+
+def test_saving_loading_llm(tmp_path: Path) -> None:
+    """Test saving/loading a HuggingFacePipeline LLM."""
+    llm = HuggingFacePipeline.from_model_id(
+        model_id="gpt2", task="text-generation", model_kwargs={"max_new_tokens": 10}
+    )
+    llm.save(file_path=tmp_path / "hf.yaml")
+    loaded_llm = load_llm(tmp_path / "hf.yaml")
+    assert_llm_equality(llm, loaded_llm)
+
+
+def test_init_with_pipeline() -> None:
+    """Test initialization with a HF pipeline."""
+    model_id = "gpt2"
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    model = AutoModelForCausalLM.from_pretrained(model_id)
+    pipe = pipeline(
+        "text-generation", model=model, tokenizer=tokenizer, max_new_tokens=10
+    )
+    llm = HuggingFacePipeline(pipeline=pipe)
+    output = llm("Say foo:")
+    assert isinstance(output, str)
--- a/tests/integration_tests/llms/test_nlpcloud.py
+++ b/tests/integration_tests/llms/test_nlpcloud.py
@ -1,6 +1,10 @@
 """Test NLPCloud API wrapper."""

+from pathlib import Path
+
+from langchain.llms.loading import load_llm
 from langchain.llms.nlpcloud import NLPCloud
+from tests.integration_tests.llms.utils import assert_llm_equality


 def test_nlpcloud_call() -> None:
@ -8,3 +12,11 @@ def test_nlpcloud_call() -> None:
    llm = NLPCloud(max_length=10)
    output = llm("Say foo:")
    assert isinstance(output, str)
+
+
+def test_saving_loading_llm(tmp_path: Path) -> None:
+    """Test saving/loading an NLPCloud LLM."""
+    llm = NLPCloud(max_length=10)
+    llm.save(file_path=tmp_path / "nlpcloud.yaml")
+    loaded_llm = load_llm(tmp_path / "nlpcloud.yaml")
+    assert_llm_equality(llm, loaded_llm)
--- a/tests/integration_tests/llms/test_openai.py
+++ b/tests/integration_tests/llms/test_openai.py
@ -1,7 +1,11 @@
 """Test OpenAI API wrapper."""

+from pathlib import Path
+from typing import Generator
+
 import pytest

+from langchain.llms.loading import load_llm
 from langchain.llms.openai import OpenAI


@ -44,3 +48,29 @@ def test_openai_stop_error() -> None:
    llm = OpenAI(stop="3", temperature=0)
    with pytest.raises(ValueError):
        llm("write an ordered list of five items", stop=["\n"])
+
+
+def test_saving_loading_llm(tmp_path: Path) -> None:
+    """Test saving/loading an OpenAI LLM."""
+    llm = OpenAI(max_tokens=10)
+    llm.save(file_path=tmp_path / "openai.yaml")
+    loaded_llm = load_llm(tmp_path / "openai.yaml")
+    assert loaded_llm == llm
+
+
+def test_openai_streaming() -> None:
+    """Test streaming tokens from OpenAI."""
+    llm = OpenAI(max_tokens=10)
+    generator = llm.stream("I'm Pickle Rick")
+
+    assert isinstance(generator, Generator)
+
+    for token in generator:
+        assert isinstance(token["choices"][0]["text"], str)
+
+
+def test_openai_streaming_error() -> None:
+    """Test error handling in stream."""
+    llm = OpenAI(best_of=2)
+    with pytest.raises(ValueError):
+        llm.stream("I'm Pickle Rick")
--- a/tests/integration_tests/llms/utils.py
+++ b/tests/integration_tests/llms/utils.py
@ -0,0 +1,16 @@
+"""Utils for LLM Tests."""
+
+from langchain.llms.base import LLM
+
+
+def assert_llm_equality(llm: LLM, loaded_llm: LLM) -> None:
+    """Assert LLM Equality for tests."""
+    # Check that they are the same type.
+    assert type(llm) == type(loaded_llm)
+    # Client field can be session based, so hash is different despite
+    # all other values being the same, so just assess all other fields
+    for field in llm.__fields__.keys():
+        if field != "client" and field != "pipeline":
+            val = getattr(llm, field)
+            new_val = getattr(loaded_llm, field)
+            assert new_val == val
--- a/tests/unit_tests/agents/test_agent.py
+++ b/tests/unit_tests/agents/test_agent.py
@ -2,19 +2,19 @@

 from typing import Any, List, Mapping, Optional

+from pydantic import BaseModel
+
 from langchain.agents import Tool, initialize_agent
 from langchain.llms.base import LLM


-class FakeListLLM(LLM):
+class FakeListLLM(LLM, BaseModel):
    """Fake LLM for testing that outputs elements of a list."""

-    def __init__(self, responses: List[str]):
-        """Initialize with list of responses."""
-        self.responses = responses
-        self.i = -1
+    responses: List[str]
+    i: int = -1

-    def __call__(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Increment counter, and then return response in that index."""
        self.i += 1
        print(self.i)
@ -25,6 +25,11 @@ class FakeListLLM(LLM):
    def _identifying_params(self) -> Mapping[str, Any]:
        return {}

+    @property
+    def _llm_type(self) -> str:
+        """Return type of llm."""
+        return "fake_list"
+

 def test_agent_bad_action() -> None:
    """Test react chain when bad action given."""
@ -33,7 +38,7 @@ def test_agent_bad_action() -> None:
        f"I'm turning evil\nAction: {bad_action_name}\nAction Input: misalignment",
        "Oh well\nAction: Final Answer\nAction Input: curses foiled again",
    ]
-    fake_llm = FakeListLLM(responses)
+    fake_llm = FakeListLLM(responses=responses)
    tools = [
        Tool("Search", lambda x: x, "Useful for searching"),
        Tool("Lookup", lambda x: x, "Useful for looking up things in a table"),
--- a/tests/unit_tests/agents/test_react.py
+++ b/tests/unit_tests/agents/test_react.py
@ -2,6 +2,8 @@

 from typing import Any, List, Mapping, Optional, Union

+from pydantic import BaseModel
+
 from langchain.agents.react.base import ReActChain, ReActDocstoreAgent
 from langchain.agents.tools import Tool
 from langchain.docstore.base import Docstore
@ -20,15 +22,18 @@ Made in 2022."""
 _FAKE_PROMPT = PromptTemplate(input_variables=["input"], template="{input}")


-class FakeListLLM(LLM):
+class FakeListLLM(LLM, BaseModel):
    """Fake LLM for testing that outputs elements of a list."""

-    def __init__(self, responses: List[str]):
-        """Initialize with list of responses."""
-        self.responses = responses
-        self.i = -1
+    responses: List[str]
+    i: int = -1
+
+    @property
+    def _llm_type(self) -> str:
+        """Return type of llm."""
+        return "fake_list"

-    def __call__(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Increment counter, and then return response in that index."""
        self.i += 1
        return self.responses[self.i]
@ -50,7 +55,7 @@ class FakeDocstore(Docstore):
 def test_predict_until_observation_normal() -> None:
    """Test predict_until_observation when observation is made normally."""
    outputs = ["foo\nAction 1: Search[foo]"]
-    fake_llm = FakeListLLM(outputs)
+    fake_llm = FakeListLLM(responses=outputs)
    tools = [
        Tool("Search", lambda x: x),
        Tool("Lookup", lambda x: x),
@ -65,7 +70,7 @@ def test_predict_until_observation_normal() -> None:
 def test_predict_until_observation_repeat() -> None:
    """Test when no action is generated initially."""
    outputs = ["foo", " Search[foo]"]
-    fake_llm = FakeListLLM(outputs)
+    fake_llm = FakeListLLM(responses=outputs)
    tools = [
        Tool("Search", lambda x: x),
        Tool("Lookup", lambda x: x),
@ -84,7 +89,7 @@ def test_react_chain() -> None:
        "I should probably lookup\nAction 2: Lookup[made]",
        "Ah okay now I know the answer\nAction 3: Finish[2022]",
    ]
-    fake_llm = FakeListLLM(responses)
+    fake_llm = FakeListLLM(responses=responses)
    react_chain = ReActChain(llm=fake_llm, docstore=FakeDocstore())
    output = react_chain.run("when was langchain made")
    assert output == "2022"
@ -97,7 +102,7 @@ def test_react_chain_bad_action() -> None:
        f"I'm turning evil\nAction 1: {bad_action_name}[langchain]",
        "Oh well\nAction 2: Finish[curses foiled again]",
    ]
-    fake_llm = FakeListLLM(responses)
+    fake_llm = FakeListLLM(responses=responses)
    react_chain = ReActChain(llm=fake_llm, docstore=FakeDocstore())
    output = react_chain.run("when was langchain made")
    assert output == "curses foiled again"
--- a/tests/unit_tests/chains/test_base.py
+++ b/tests/unit_tests/chains/test_base.py
@ -11,11 +11,12 @@ class FakeChain(Chain, BaseModel):
    """Fake chain class for testing purposes."""

    be_correct: bool = True
+    the_input_keys: List[str] = ["foo"]

    @property
    def input_keys(self) -> List[str]:
-        """Input key of foo."""
-        return ["foo"]
+        """Input keys."""
+        return self.the_input_keys

    @property
    def output_keys(self) -> List[str]:
@ -48,3 +49,17 @@ def test_correct_call() -> None:
    chain = FakeChain()
    output = chain({"foo": "bar"})
    assert output == {"foo": "bar", "bar": "baz"}
+
+
+def test_single_input_correct() -> None:
+    """Test passing single input works."""
+    chain = FakeChain()
+    output = chain("bar")
+    assert output == {"foo": "bar", "bar": "baz"}
+
+
+def test_single_input_error() -> None:
+    """Test passing single input errors as expected."""
+    chain = FakeChain(the_input_keys=["foo", "bar"])
+    with pytest.raises(ValueError):
+        chain("bar")
--- a/tests/unit_tests/chains/test_combine_documents.py
+++ b/tests/unit_tests/chains/test_combine_documents.py
@ -0,0 +1,118 @@
+"""Test functionality related to combining documents."""
+
+from typing import List
+
+import pytest
+
+from langchain.chains.combine_documents.map_reduce import (
+    _collapse_docs,
+    _split_list_of_docs,
+)
+from langchain.docstore.document import Document
+
+
+def _fake_docs_len_func(docs: List[Document]) -> int:
+    return len(_fake_combine_docs_func(docs))
+
+
+def _fake_combine_docs_func(docs: List[Document]) -> str:
+    return "".join([d.page_content for d in docs])
+
+
+def test__split_list_long_single_doc() -> None:
+    """Test splitting of a long single doc."""
+    docs = [Document(page_content="foo" * 100)]
+    with pytest.raises(ValueError):
+        _split_list_of_docs(docs, _fake_docs_len_func, 100)
+
+
+def test__split_list_long_pair_doc() -> None:
+    """Test splitting of a list with two medium docs."""
+    docs = [Document(page_content="foo" * 30)] * 2
+    with pytest.raises(ValueError):
+        _split_list_of_docs(docs, _fake_docs_len_func, 100)
+
+
+def test__split_list_single_doc() -> None:
+    """Test splitting works with just a single doc."""
+    docs = [Document(page_content="foo")]
+    doc_list = _split_list_of_docs(docs, _fake_docs_len_func, 100)
+    assert doc_list == [docs]
+
+
+def test__split_list_double_doc() -> None:
+    """Test splitting works with just two docs."""
+    docs = [Document(page_content="foo"), Document(page_content="bar")]
+    doc_list = _split_list_of_docs(docs, _fake_docs_len_func, 100)
+    assert doc_list == [docs]
+
+
+def test__split_list_works_correctly() -> None:
+    """Test splitting works correctly."""
+    docs = [
+        Document(page_content="foo"),
+        Document(page_content="bar"),
+        Document(page_content="baz"),
+        Document(page_content="foo" * 2),
+        Document(page_content="bar"),
+        Document(page_content="baz"),
+    ]
+    doc_list = _split_list_of_docs(docs, _fake_docs_len_func, 10)
+    expected_result = [
+        # Test a group of three.
+        [
+            Document(page_content="foo"),
+            Document(page_content="bar"),
+            Document(page_content="baz"),
+        ],
+        # Test a group of two, where one is bigger.
+        [Document(page_content="foo" * 2), Document(page_content="bar")],
+        # Test no errors on last
+        [Document(page_content="baz")],
+    ]
+    assert doc_list == expected_result
+
+
+def test__collapse_docs_no_metadata() -> None:
+    """Test collapse documents functionality when no metadata."""
+    docs = [
+        Document(page_content="foo"),
+        Document(page_content="bar"),
+        Document(page_content="baz"),
+    ]
+    output = _collapse_docs(docs, _fake_combine_docs_func)
+    expected_output = Document(page_content="foobarbaz")
+    assert output == expected_output
+
+
+def test__collapse_docs_one_doc() -> None:
+    """Test collapse documents functionality when only one document present."""
+    # Test with no metadata.
+    docs = [Document(page_content="foo")]
+    output = _collapse_docs(docs, _fake_combine_docs_func)
+    assert output == docs[0]
+
+    # Test with metadata.
+    docs = [Document(page_content="foo", metadata={"source": "a"})]
+    output = _collapse_docs(docs, _fake_combine_docs_func)
+    assert output == docs[0]
+
+
+def test__collapse_docs_metadata() -> None:
+    """Test collapse documents functionality when metadata exists."""
+    metadata1 = {"source": "a", "foo": 2, "bar": "1", "extra1": "foo"}
+    metadata2 = {"source": "b", "foo": "3", "bar": 2, "extra2": "bar"}
+    docs = [
+        Document(page_content="foo", metadata=metadata1),
+        Document(page_content="bar", metadata=metadata2),
+    ]
+    output = _collapse_docs(docs, _fake_combine_docs_func)
+    expected_metadata = {
+        "source": "a, b",
+        "foo": "2, 3",
+        "bar": "1, 2",
+        "extra1": "foo",
+        "extra2": "bar",
+    }
+    expected_output = Document(page_content="foobar", metadata=expected_metadata)
+    assert output == expected_output
--- a/tests/unit_tests/chains/test_conversation.py
+++ b/tests/unit_tests/chains/test_conversation.py
@ -4,6 +4,7 @@ import pytest
 from langchain.chains.base import Memory
 from langchain.chains.conversation.base import ConversationChain
 from langchain.chains.conversation.memory import (
+    ConversationalBufferWindowMemory,
    ConversationBufferMemory,
    ConversationSummaryMemory,
 )
@ -66,3 +67,23 @@ def test_conversation_memory(memory: Memory) -> None:
    bad_outputs = {"foo": "bar", "foo1": "bar"}
    with pytest.raises(ValueError):
        memory.save_context(good_inputs, bad_outputs)
+
+
+@pytest.mark.parametrize(
+    "memory",
+    [
+        ConversationBufferMemory(memory_key="baz"),
+        ConversationSummaryMemory(llm=FakeLLM(), memory_key="baz"),
+        ConversationalBufferWindowMemory(memory_key="baz"),
+    ],
+)
+def test_clearing_conversation_memory(memory: Memory) -> None:
+    """Test clearing the conversation memory."""
+    # This is a good input because the input is not the same as baz.
+    good_inputs = {"foo": "bar", "baz": "foo"}
+    # This is a good output because there is one variable.
+    good_outputs = {"bar": "foo"}
+    memory.save_context(good_inputs, good_outputs)
+
+    memory.clear()
+    assert memory.load_memory_variables({}) == {"baz": ""}
--- a/tests/unit_tests/chains/test_natbot.py
+++ b/tests/unit_tests/chains/test_natbot.py
@ -2,20 +2,27 @@

 from typing import Any, List, Mapping, Optional

+from pydantic import BaseModel
+
 from langchain.chains.natbot.base import NatBotChain
 from langchain.llms.base import LLM


-class FakeLLM(LLM):
+class FakeLLM(LLM, BaseModel):
    """Fake LLM wrapper for testing purposes."""

-    def __call__(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Return `foo` if longer than 10000 words, else `bar`."""
        if len(prompt) > 10000:
            return "foo"
        else:
            return "bar"

+    @property
+    def _llm_type(self) -> str:
+        """Return type of llm."""
+        return "fake"
+
    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {}
--- a/tests/unit_tests/llms/fake_llm.py
+++ b/tests/unit_tests/llms/fake_llm.py
@ -1,20 +1,25 @@
 """Fake LLM wrapper for testing purposes."""
 from typing import Any, List, Mapping, Optional

+from pydantic import BaseModel
+
 from langchain.llms.base import LLM


-class FakeLLM(LLM):
+class FakeLLM(LLM, BaseModel):
    """Fake LLM wrapper for testing purposes."""

-    def __init__(self, queries: Optional[Mapping] = None):
-        """Initialize with optional lookup of queries."""
-        self._queries = queries
+    queries: Optional[Mapping] = None
+
+    @property
+    def _llm_type(self) -> str:
+        """Return type of llm."""
+        return "fake"

-    def __call__(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """First try to lookup in queries, else return 'foo' or 'bar'."""
-        if self._queries is not None:
-            return self._queries[prompt]
+        if self.queries is not None:
+            return self.queries[prompt]
        if stop is None:
            return "foo"
        else:
--- a/tests/unit_tests/llms/test_loading.py
+++ b/tests/unit_tests/llms/test_loading.py
@ -0,0 +1,15 @@
+"""Test LLM saving and loading functions."""
+from pathlib import Path
+from unittest.mock import patch
+
+from langchain.llms.loading import load_llm
+from tests.unit_tests.llms.fake_llm import FakeLLM
+
+
+@patch("langchain.llms.loading.type_to_cls_dict", {"fake": FakeLLM})
+def test_saving_loading_round_trip(tmp_path: Path) -> None:
+    """Test saving/loading a Fake LLM."""
+    fake_llm = FakeLLM()
+    fake_llm.save(file_path=tmp_path / "fake_llm.yaml")
+    loaded_llm = load_llm(tmp_path / "fake_llm.yaml")
+    assert loaded_llm == fake_llm
--- a/tests/unit_tests/llms/test_utils.py
+++ b/tests/unit_tests/llms/test_utils.py
@ -10,6 +10,9 @@ def test_enforce_stop_tokens() -> None:
    text = "foo bar baz"
    output = enforce_stop_tokens(text, ["moo", "baz", "bar"])
    assert output == "foo "
+    text = "foo bar baz"
+    output = enforce_stop_tokens(text, ["moo", "bar"])
+    assert output == "foo "


 def test_enforce_stop_tokens_none() -> None: