forked from Archives/langchain
Compare commits
1 Commits
main
...
searx-quer
Author | SHA1 | Date |
---|---|---|
blob42 | 03d42dad27 | 1 year ago |
@ -1,144 +0,0 @@
|
||||
.vscode/
|
||||
.idea/
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
pip-wheel-metadata/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
notebooks/
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
.python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
.venvs
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# macOS display setting files
|
||||
.DS_Store
|
||||
|
||||
|
||||
|
||||
# docker
|
||||
docker/
|
||||
!docker/assets/
|
||||
.dockerignore
|
||||
docker.build
|
@ -1,13 +0,0 @@
|
||||
# python env
|
||||
PYTHON_VERSION=3.10
|
||||
|
||||
# -E flag is required
|
||||
# comment the following line to only install dev dependencies
|
||||
POETRY_EXTRA_PACKAGES="-E all"
|
||||
|
||||
# at least one group needed
|
||||
POETRY_DEPENDENCIES="dev,test,lint,typing"
|
||||
|
||||
# langchain env. warning: these variables will be baked into the docker image !
|
||||
OPENAI_API_KEY=${OPENAI_API_KEY:-}
|
||||
SERPAPI_API_KEY=${SERPAPI_API_KEY:-}
|
@ -1,53 +0,0 @@
|
||||
# Using Docker
|
||||
|
||||
To quickly get started, run the command `make docker`.
|
||||
|
||||
If docker is installed, the Makefile will export extra targets in the format `docker.*` to build and run the docker image. Type `make` for a list of available tasks.
|
||||
|
||||
There is a basic `docker-compose.yml` in the docker directory.
|
||||
|
||||
## Building the development image
|
||||
|
||||
Using `make docker` will build the dev image if it does not exist, then drop
|
||||
you inside the container with the langchain environment available in the shell.
|
||||
|
||||
### Customizing the image and installed dependencies
|
||||
|
||||
The image is built with a default python version and all extras and dev
|
||||
dependencies. It can be customized by changing the variables in the [.env](/docker/.env)
|
||||
file.
|
||||
|
||||
If you don't need all the `extra` dependencies a slimmer image can be obtained by
|
||||
commenting out `POETRY_EXTRA_PACKAGES` in the [.env](docker/.env) file.
|
||||
|
||||
### Image caching
|
||||
|
||||
The Dockerfile is optimized to cache the poetry install step. A rebuild is triggered when there is a change to the source code.
|
||||
|
||||
## Example Usage
|
||||
|
||||
All commands from langchain's python environment are available by default in the container.
|
||||
|
||||
A few examples:
|
||||
```bash
|
||||
# run jupyter notebook
|
||||
docker run --rm -it IMG jupyter notebook
|
||||
|
||||
# run ipython
|
||||
docker run --rm -it IMG ipython
|
||||
|
||||
# start web server
|
||||
docker run --rm -p 8888:8888 IMG python -m http.server 8888
|
||||
```
|
||||
|
||||
## Testing / Linting
|
||||
|
||||
Tests and lints are run using your local source directory that is mounted on the volume /src.
|
||||
|
||||
Run unit tests in the container with `make docker.test`.
|
||||
|
||||
Run the linting and formatting checks with `make docker.lint`.
|
||||
|
||||
Note: this task can run in parallel using `make -j4 docker.lint`.
|
||||
|
||||
|
@ -1,104 +0,0 @@
|
||||
# vim: ft=dockerfile
|
||||
#
|
||||
# see also: https://github.com/python-poetry/poetry/discussions/1879
|
||||
# - with https://github.com/bneijt/poetry-lock-docker
|
||||
# see https://github.com/thehale/docker-python-poetry
|
||||
# see https://github.com/max-pfeiffer/uvicorn-poetry
|
||||
|
||||
# use by default the slim version of python
|
||||
ARG PYTHON_IMAGE_TAG=slim
|
||||
ARG PYTHON_VERSION=${PYTHON_VERSION:-3.11.2}
|
||||
|
||||
####################
|
||||
# Base Environment
|
||||
####################
|
||||
FROM python:$PYTHON_VERSION-$PYTHON_IMAGE_TAG AS lchain-base
|
||||
|
||||
ARG UID=1000
|
||||
ARG USERNAME=lchain
|
||||
|
||||
ENV USERNAME=$USERNAME
|
||||
|
||||
RUN groupadd -g ${UID} $USERNAME
|
||||
RUN useradd -l -m -u ${UID} -g ${UID} $USERNAME
|
||||
|
||||
# used for mounting source code
|
||||
RUN mkdir /src
|
||||
VOLUME /src
|
||||
|
||||
|
||||
#######################
|
||||
## Poetry Builder Image
|
||||
#######################
|
||||
FROM lchain-base AS lchain-base-builder
|
||||
|
||||
ARG POETRY_EXTRA_PACKAGES=$POETRY_EXTRA_PACKAGES
|
||||
ARG POETRY_DEPENDENCIES=$POETRY_DEPENDENCIES
|
||||
|
||||
ENV HOME=/root
|
||||
ENV POETRY_HOME=/root/.poetry
|
||||
ENV POETRY_VIRTUALENVS_IN_PROJECT=false
|
||||
ENV POETRY_NO_INTERACTION=1
|
||||
ENV CACHE_DIR=$HOME/.cache
|
||||
ENV POETRY_CACHE_DIR=$CACHE_DIR/pypoetry
|
||||
ENV PATH="$POETRY_HOME/bin:$PATH"
|
||||
|
||||
WORKDIR /root
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y \
|
||||
build-essential \
|
||||
git \
|
||||
curl
|
||||
|
||||
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
|
||||
|
||||
RUN mkdir -p $CACHE_DIR
|
||||
|
||||
## setup poetry
|
||||
RUN curl -sSL -o $CACHE_DIR/pypoetry-installer.py https://install.python-poetry.org/
|
||||
RUN python3 $CACHE_DIR/pypoetry-installer.py
|
||||
|
||||
|
||||
# # Copy poetry files
|
||||
COPY poetry.* pyproject.toml ./
|
||||
|
||||
RUN mkdir /pip-prefix
|
||||
|
||||
RUN poetry export $POETRY_EXTRA_PACKAGES --with $POETRY_DEPENDENCIES -f requirements.txt --output requirements.txt --without-hashes && \
|
||||
pip install --no-cache-dir --disable-pip-version-check --prefix /pip-prefix -r requirements.txt
|
||||
|
||||
|
||||
# add custom motd message
|
||||
COPY docker/assets/etc/motd /tmp/motd
|
||||
RUN cat /tmp/motd > /etc/motd
|
||||
|
||||
RUN printf "\n%s\n%s\n" "$(poetry version)" "$(python --version)" >> /etc/motd
|
||||
|
||||
###################
|
||||
## Runtime Image
|
||||
###################
|
||||
FROM lchain-base AS lchain
|
||||
|
||||
#jupyter port
|
||||
EXPOSE 8888
|
||||
|
||||
COPY docker/assets/entry.sh /entry
|
||||
RUN chmod +x /entry
|
||||
|
||||
COPY --from=lchain-base-builder /etc/motd /etc/motd
|
||||
COPY --from=lchain-base-builder /usr/bin/git /usr/bin/git
|
||||
|
||||
USER ${USERNAME:-lchain}
|
||||
ENV HOME /home/$USERNAME
|
||||
WORKDIR /home/$USERNAME
|
||||
|
||||
COPY --chown=lchain:lchain --from=lchain-base-builder /pip-prefix $HOME/.local/
|
||||
|
||||
COPY . .
|
||||
|
||||
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
|
||||
RUN pip install --no-deps --disable-pip-version-check --no-cache-dir -e .
|
||||
|
||||
|
||||
entrypoint ["/entry"]
|
@ -1,84 +0,0 @@
|
||||
# Do not call this makefile directly; it is included in the main Makefile.
|
||||
.PHONY: docker docker.jupyter docker.run docker.force_build docker.clean \
|
||||
docker.test docker.lint docker.lint.mypy docker.lint.black \
|
||||
docker.lint.isort docker.lint.flake
|
||||
|
||||
# read python version from .env file ignoring comments
|
||||
PYTHON_VERSION := $(shell grep PYTHON_VERSION docker/.env | cut -d '=' -f2)
|
||||
POETRY_EXTRA_PACKAGES := $(shell grep '^[^#]*POETRY_EXTRA_PACKAGES' docker/.env | cut -d '=' -f2)
|
||||
POETRY_DEPENDENCIES := $(shell grep 'POETRY_DEPENDENCIES' docker/.env | cut -d '=' -f2)
|
||||
|
||||
|
||||
DOCKER_SRC := $(shell find docker -type f)
|
||||
DOCKER_IMAGE_NAME = langchain/dev
|
||||
|
||||
# SRC is all files matched by the git ls-files command
|
||||
SRC := $(shell git ls-files -- '*' ':!:docker/*')
|
||||
|
||||
# set DOCKER_BUILD_PROGRESS=plain to see detailed build progress
|
||||
DOCKER_BUILD_PROGRESS ?= auto
|
||||
|
||||
# extra message to show when entering the docker container
|
||||
DOCKER_MOTD := docker/assets/etc/motd
|
||||
|
||||
ROOTDIR := $(shell git rev-parse --show-toplevel)
|
||||
|
||||
DOCKER_LINT_CMD = docker run --rm -i -u lchain -v $(ROOTDIR):/src $(DOCKER_IMAGE_NAME):$(GIT_HASH)
|
||||
|
||||
docker: docker.run
|
||||
|
||||
docker.run: docker.build
|
||||
@echo "Docker image: $(DOCKER_IMAGE_NAME):$(GIT_HASH)"
|
||||
docker run --rm -it -u lchain -v $(ROOTDIR):/src $(DOCKER_IMAGE_NAME):$(GIT_HASH)
|
||||
|
||||
docker.jupyter: docker.build
|
||||
docker run --rm -it -v $(ROOTDIR):/src $(DOCKER_IMAGE_NAME):$(GIT_HASH) jupyter notebook
|
||||
|
||||
docker.build: $(SRC) $(DOCKER_SRC) $(DOCKER_MOTD)
|
||||
ifdef $(DOCKER_BUILDKIT)
|
||||
docker buildx build --build-arg PYTHON_VERSION=$(PYTHON_VERSION) \
|
||||
--build-arg POETRY_EXTRA_PACKAGES=$(POETRY_EXTRA_PACKAGES) \
|
||||
--build-arg POETRY_DEPENDENCIES=$(POETRY_DEPENDENCIES) \
|
||||
--progress=$(DOCKER_BUILD_PROGRESS) \
|
||||
$(BUILD_FLAGS) -f docker/Dockerfile -t $(DOCKER_IMAGE_NAME):$(GIT_HASH) .
|
||||
else
|
||||
docker build --build-arg PYTHON_VERSION=$(PYTHON_VERSION) \
|
||||
--build-arg POETRY_EXTRA_PACKAGES=$(POETRY_EXTRA_PACKAGES) \
|
||||
--build-arg POETRY_DEPENDENCIES=$(POETRY_DEPENDENCIES) \
|
||||
$(BUILD_FLAGS) -f docker/Dockerfile -t $(DOCKER_IMAGE_NAME):$(GIT_HASH) .
|
||||
endif
|
||||
docker tag $(DOCKER_IMAGE_NAME):$(GIT_HASH) $(DOCKER_IMAGE_NAME):latest
|
||||
@touch $@ # this prevents docker from rebuilding dependencies that have not
|
||||
@ # changed. Remove the file `docker/docker.build` to force a rebuild.
|
||||
|
||||
docker.force_build: $(DOCKER_SRC)
|
||||
@rm -f docker.build
|
||||
@$(MAKE) docker.build BUILD_FLAGS=--no-cache
|
||||
|
||||
docker.clean:
|
||||
docker rmi $(DOCKER_IMAGE_NAME):$(GIT_HASH) $(DOCKER_IMAGE_NAME):latest
|
||||
|
||||
docker.test: docker.build
|
||||
docker run --rm -it -u lchain -v $(ROOTDIR):/src $(DOCKER_IMAGE_NAME):$(GIT_HASH) \
|
||||
pytest /src/tests/unit_tests
|
||||
|
||||
# this assumes that the docker image has been built
|
||||
docker.lint: docker.lint.mypy docker.lint.black docker.lint.isort \
|
||||
docker.lint.flake
|
||||
|
||||
# these can run in parallel with -j[njobs]
|
||||
docker.lint.mypy:
|
||||
@$(DOCKER_LINT_CMD) mypy /src
|
||||
@printf "\t%s\n" "mypy ... "
|
||||
|
||||
docker.lint.black:
|
||||
@$(DOCKER_LINT_CMD) black /src --check
|
||||
@printf "\t%s\n" "black ... "
|
||||
|
||||
docker.lint.isort:
|
||||
@$(DOCKER_LINT_CMD) isort /src --check
|
||||
@printf "\t%s\n" "isort ... "
|
||||
|
||||
docker.lint.flake:
|
||||
@$(DOCKER_LINT_CMD) flake8 /src
|
||||
@printf "\t%s\n" "flake8 ... "
|
@ -1,10 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
export PATH=$HOME/.local/bin:$PATH
|
||||
|
||||
if [ -z "$1" ]; then
|
||||
cat /etc/motd
|
||||
exec /bin/bash
|
||||
fi
|
||||
|
||||
exec "$@"
|
@ -1,8 +0,0 @@
|
||||
All dependencies have been installed in the current shell. There is no
|
||||
virtualenv or a need for `poetry` inside the container.
|
||||
|
||||
Running the command `make docker.run` at the root directory of the project will
|
||||
build the container the first time. On the next runs it will use the cached
|
||||
image. A rebuild will happen when changes are made to the source code.
|
||||
|
||||
Your local source directory has been mounted to the /src directory.
|
@ -1,17 +0,0 @@
|
||||
version: "3.7"
|
||||
|
||||
services:
|
||||
langchain:
|
||||
hostname: langchain
|
||||
image: langchain/dev:latest
|
||||
build:
|
||||
context: ../
|
||||
dockerfile: docker/Dockerfile
|
||||
args:
|
||||
PYTHON_VERSION: ${PYTHON_VERSION}
|
||||
POETRY_EXTRA_PACKAGES: ${POETRY_EXTRA_PACKAGES}
|
||||
POETRY_DEPENDENCIES: ${POETRY_DEPENDENCIES}
|
||||
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- 127.0.0.1:8888:8888
|
@ -1,25 +0,0 @@
|
||||
# AtlasDB
|
||||
|
||||
This page covers how to use Nomic's Atlas ecosystem within LangChain.
|
||||
It is broken into two parts: installation and setup, and then references to specific Atlas wrappers.
|
||||
|
||||
## Installation and Setup
|
||||
- Install the Python package with `pip install nomic`
|
||||
- Nomic is also included in LangChain's poetry extras `poetry install -E all`
|
||||
-
|
||||
## Wrappers
|
||||
|
||||
### VectorStore
|
||||
|
||||
There exists a wrapper around the Atlas neural database, allowing you to use it as a vectorstore.
|
||||
This vectorstore also gives you full access to the underlying AtlasProject object, which will allow you to use the full range of Atlas map interactions, such as bulk tagging and automatic topic modeling.
|
||||
Please see [the Nomic docs](https://docs.nomic.ai/atlas_api.html) for more detailed information.
|
||||
|
||||
|
||||
|
||||
To import this vectorstore:
|
||||
```python
|
||||
from langchain.vectorstores import AtlasDB
|
||||
```
|
||||
|
||||
For a more detailed walkthrough of the AtlasDB wrapper, see [this notebook](../modules/indexes/examples/vectorstores.ipynb)
|
@ -1,79 +0,0 @@
|
||||
# Banana
|
||||
|
||||
This page covers how to use the Banana ecosystem within LangChain.
|
||||
It is broken into two parts: installation and setup, and then references to specific Banana wrappers.
|
||||
|
||||
## Installation and Setup
|
||||
|
||||
- Install with `pip3 install banana-dev`
|
||||
- Get a Banana API key and set it as an environment variable (`BANANA_API_KEY`)
|
||||
|
||||
## Define your Banana Template
|
||||
|
||||
If you want to use an available language model template you can find one [here](https://app.banana.dev/templates/conceptofmind/serverless-template-palmyra-base).
|
||||
This template uses the Palmyra-Base model by [Writer](https://writer.com/product/api/).
|
||||
You can check out an example Banana repository [here](https://github.com/conceptofmind/serverless-template-palmyra-base).
|
||||
|
||||
## Build the Banana app
|
||||
|
||||
Banana Apps must include the "output" key in the return json.
|
||||
There is a rigid response structure.
|
||||
|
||||
```python
|
||||
# Return the results as a dictionary
|
||||
result = {'output': result}
|
||||
```
|
||||
|
||||
An example inference function would be:
|
||||
|
||||
```python
|
||||
def inference(model_inputs:dict) -> dict:
|
||||
global model
|
||||
global tokenizer
|
||||
|
||||
# Parse out your arguments
|
||||
prompt = model_inputs.get('prompt', None)
|
||||
if prompt == None:
|
||||
return {'message': "No prompt provided"}
|
||||
|
||||
# Run the model
|
||||
input_ids = tokenizer.encode(prompt, return_tensors='pt').cuda()
|
||||
output = model.generate(
|
||||
input_ids,
|
||||
max_length=100,
|
||||
do_sample=True,
|
||||
top_k=50,
|
||||
top_p=0.95,
|
||||
num_return_sequences=1,
|
||||
temperature=0.9,
|
||||
early_stopping=True,
|
||||
no_repeat_ngram_size=3,
|
||||
num_beams=5,
|
||||
length_penalty=1.5,
|
||||
repetition_penalty=1.5,
|
||||
bad_words_ids=[[tokenizer.encode(' ', add_prefix_space=True)[0]]]
|
||||
)
|
||||
|
||||
result = tokenizer.decode(output[0], skip_special_tokens=True)
|
||||
# Return the results as a dictionary
|
||||
result = {'output': result}
|
||||
return result
|
||||
```
|
||||
|
||||
You can find a full example of a Banana app [here](https://github.com/conceptofmind/serverless-template-palmyra-base/blob/main/app.py).
|
||||
|
||||
## Wrappers
|
||||
|
||||
### LLM
|
||||
|
||||
There exists a Banana LLM wrapper, which you can access with
|
||||
|
||||
```python
|
||||
from langchain.llms import Banana
|
||||
```
|
||||
|
||||
You need to provide a model key located in the dashboard:
|
||||
|
||||
```python
|
||||
llm = Banana(model_key="YOUR_MODEL_KEY")
|
||||
```
|
@ -1,66 +0,0 @@
|
||||
# Modal
|
||||
|
||||
This page covers how to use the Modal ecosystem within LangChain.
|
||||
It is broken into two parts: installation and setup, and then references to specific Modal wrappers.
|
||||
|
||||
## Installation and Setup
|
||||
- Install with `pip install modal-client`
|
||||
- Run `modal token new`
|
||||
|
||||
## Define your Modal Functions and Webhooks
|
||||
|
||||
You must include a prompt. There is a rigid response structure.
|
||||
|
||||
```python
|
||||
class Item(BaseModel):
|
||||
prompt: str
|
||||
|
||||
@stub.webhook(method="POST")
|
||||
def my_webhook(item: Item):
|
||||
return {"prompt": my_function.call(item.prompt)}
|
||||
```
|
||||
|
||||
An example with GPT2:
|
||||
|
||||
```python
|
||||
from pydantic import BaseModel
|
||||
|
||||
import modal
|
||||
|
||||
stub = modal.Stub("example-get-started")
|
||||
|
||||
volume = modal.SharedVolume().persist("gpt2_model_vol")
|
||||
CACHE_PATH = "/root/model_cache"
|
||||
|
||||
@stub.function(
|
||||
gpu="any",
|
||||
image=modal.Image.debian_slim().pip_install(
|
||||
"tokenizers", "transformers", "torch", "accelerate"
|
||||
),
|
||||
shared_volumes={CACHE_PATH: volume},
|
||||
retries=3,
|
||||
)
|
||||
def run_gpt2(text: str):
|
||||
from transformers import GPT2Tokenizer, GPT2LMHeadModel
|
||||
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
||||
model = GPT2LMHeadModel.from_pretrained('gpt2')
|
||||
encoded_input = tokenizer(text, return_tensors='pt').input_ids
|
||||
output = model.generate(encoded_input, max_length=50, do_sample=True)
|
||||
return tokenizer.decode(output[0], skip_special_tokens=True)
|
||||
|
||||
class Item(BaseModel):
|
||||
prompt: str
|
||||
|
||||
@stub.webhook(method="POST")
|
||||
def get_text(item: Item):
|
||||
return {"prompt": run_gpt2.call(item.prompt)}
|
||||
```
|
||||
|
||||
## Wrappers
|
||||
|
||||
### LLM
|
||||
|
||||
There exists a Modal LLM wrapper, which you can access with
|
||||
```python
|
||||
from langchain.llms import Modal
|
||||
```
|
@ -1,17 +0,0 @@
|
||||
# StochasticAI
|
||||
|
||||
This page covers how to use the StochasticAI ecosystem within LangChain.
|
||||
It is broken into two parts: installation and setup, and then references to specific StochasticAI wrappers.
|
||||
|
||||
## Installation and Setup
|
||||
- Install with `pip install stochasticx`
|
||||
- Get a StochasticAI API key and set it as an environment variable (`STOCHASTICAI_API_KEY`)
|
||||
|
||||
## Wrappers
|
||||
|
||||
### LLM
|
||||
|
||||
There exists a StochasticAI LLM wrapper, which you can access with
|
||||
```python
|
||||
from langchain.llms import StochasticAI
|
||||
```
|
@ -1,16 +0,0 @@
|
||||
# Writer
|
||||
|
||||
This page covers how to use the Writer ecosystem within LangChain.
|
||||
It is broken into two parts: installation and setup, and then references to specific Writer wrappers.
|
||||
|
||||
## Installation and Setup
|
||||
- Get a Writer API key and set it as an environment variable (`WRITER_API_KEY`)
|
||||
|
||||
## Wrappers
|
||||
|
||||
### LLM
|
||||
|
||||
There exists a Writer LLM wrapper, which you can access with
|
||||
```python
|
||||
from langchain.llms import Writer
|
||||
```
|
@ -1,494 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "68b24990",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Agents and Vectorstores\n",
|
||||
"\n",
|
||||
"This notebook covers how to combine agents and vectorstores. The use case for this is that you've ingested your data into a vectorstore and want to interact with it in an agentic manner.\n",
|
||||
"\n",
|
||||
"The recommended method for doing so is to create a VectorDBQAChain and then use that as a tool in the overall agent. Let's take a look at doing this below. You can do this with multiple different vectordbs, and use the agent as a way to route between them. There are two different ways of doing this - you can either let the agent use the vectorstores as normal tools, or you can set `return_direct=True` to really just use the agent as a router."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9b22020a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create the Vectorstore"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "2e87c10a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
|
||||
"from langchain.vectorstores import Chroma\n",
|
||||
"from langchain.text_splitter import CharacterTextSplitter\n",
|
||||
"from langchain import OpenAI, VectorDBQA\n",
|
||||
"llm = OpenAI(temperature=0)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 37,
|
||||
"id": "f2675861",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Running Chroma using direct local API.\n",
|
||||
"Using DuckDB in-memory for database. Data will be transient.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.document_loaders import TextLoader\n",
|
||||
"loader = TextLoader('../../state_of_the_union.txt')\n",
|
||||
"documents = loader.load()\n",
|
||||
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
|
||||
"texts = text_splitter.split_documents(documents)\n",
|
||||
"\n",
|
||||
"embeddings = OpenAIEmbeddings()\n",
|
||||
"docsearch = Chroma.from_documents(texts, embeddings, collection_name=\"state-of-union\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 38,
|
||||
"id": "bc5403d4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"state_of_union = VectorDBQA.from_chain_type(llm=llm, chain_type=\"stuff\", vectorstore=docsearch)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 39,
|
||||
"id": "1431cded",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import WebBaseLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 40,
|
||||
"id": "915d3ff3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = WebBaseLoader(\"https://beta.ruff.rs/docs/faq/\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 41,
|
||||
"id": "96a2edf8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Running Chroma using direct local API.\n",
|
||||
"Using DuckDB in-memory for database. Data will be transient.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"ruff_texts = text_splitter.split_documents(docs)\n",
|
||||
"ruff_db = Chroma.from_documents(ruff_texts, embeddings, collection_name=\"ruff\")\n",
|
||||
"ruff = VectorDBQA.from_chain_type(llm=llm, chain_type=\"stuff\", vectorstore=ruff_db)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "71ecef90",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c0a6c031",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create the Agent"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"id": "eb142786",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Import things that are needed generically\n",
|
||||
"from langchain.agents import initialize_agent, Tool\n",
|
||||
"from langchain.tools import BaseTool\n",
|
||||
"from langchain.llms import OpenAI\n",
|
||||
"from langchain import LLMMathChain, SerpAPIWrapper"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 44,
|
||||
"id": "850bc4e9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tools = [\n",
|
||||
" Tool(\n",
|
||||
" name = \"State of Union QA System\",\n",
|
||||
" func=state_of_union.run,\n",
|
||||
" description=\"useful for when you need to answer questions about the most recent state of the union address. Input should be a fully formed question.\"\n",
|
||||
" ),\n",
|
||||
" Tool(\n",
|
||||
" name = \"Ruff QA System\",\n",
|
||||
" func=ruff.run,\n",
|
||||
" description=\"useful for when you need to answer questions about ruff (a python linter). Input should be a fully formed question.\"\n",
|
||||
" ),\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"id": "fc47f230",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Construct the agent. We will use the default agent type here.\n",
|
||||
"# See documentation for a full list of options.\n",
|
||||
"agent = initialize_agent(tools, llm, agent=\"zero-shot-react-description\", verbose=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 46,
|
||||
"id": "10ca2db8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to find out what Biden said about Ketanji Brown Jackson in the State of the Union address.\n",
|
||||
"Action: State of Union QA System\n",
|
||||
"Action Input: What did Biden say about Ketanji Brown Jackson in the State of the Union address?\u001b[0m\n",
|
||||
"Observation: \u001b[36;1m\u001b[1;3m Biden said that Jackson is one of the nation's top legal minds and that she will continue Justice Breyer's legacy of excellence.\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I now know the final answer\n",
|
||||
"Final Answer: Biden said that Jackson is one of the nation's top legal minds and that she will continue Justice Breyer's legacy of excellence.\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"\"Biden said that Jackson is one of the nation's top legal minds and that she will continue Justice Breyer's legacy of excellence.\""
|
||||
]
|
||||
},
|
||||
"execution_count": 46,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"agent.run(\"What did biden say about ketanji brown jackson is the state of the union address?\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 47,
|
||||
"id": "4e91b811",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to find out the advantages of using ruff over flake8\n",
|
||||
"Action: Ruff QA System\n",
|
||||
"Action Input: What are the advantages of using ruff over flake8?\u001b[0m\n",
|
||||
"Observation: \u001b[33;1m\u001b[1;3m Ruff can be used as a drop-in replacement for Flake8 when used (1) without or with a small number of plugins, (2) alongside Black, and (3) on Python 3 code. It also re-implements some of the most popular Flake8 plugins and related code quality tools natively, including isort, yesqa, eradicate, and most of the rules implemented in pyupgrade. Ruff also supports automatically fixing its own lint violations, which Flake8 does not.\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I now know the final answer\n",
|
||||
"Final Answer: Ruff can be used as a drop-in replacement for Flake8 when used (1) without or with a small number of plugins, (2) alongside Black, and (3) on Python 3 code. It also re-implements some of the most popular Flake8 plugins and related code quality tools natively, including isort, yesqa, eradicate, and most of the rules implemented in pyupgrade. Ruff also supports automatically fixing its own lint violations, which Flake8 does not.\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Ruff can be used as a drop-in replacement for Flake8 when used (1) without or with a small number of plugins, (2) alongside Black, and (3) on Python 3 code. It also re-implements some of the most popular Flake8 plugins and related code quality tools natively, including isort, yesqa, eradicate, and most of the rules implemented in pyupgrade. Ruff also supports automatically fixing its own lint violations, which Flake8 does not.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 47,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"agent.run(\"Why use ruff over flake8?\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "787a9b5e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Use the Agent solely as a router"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9161ba91",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can also set `return_direct=True` if you intend to use the agent as a router and just want to directly return the result of the VectorDBQAChain.\n",
|
||||
"\n",
|
||||
"Notice that in the above examples the agent did some extra work after querying the VectorDBQAChain. You can avoid that and just return the result directly."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"id": "f59b377e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tools = [\n",
|
||||
" Tool(\n",
|
||||
" name = \"State of Union QA System\",\n",
|
||||
" func=state_of_union.run,\n",
|
||||
" description=\"useful for when you need to answer questions about the most recent state of the union address. Input should be a fully formed question.\",\n",
|
||||
" return_direct=True\n",
|
||||
" ),\n",
|
||||
" Tool(\n",
|
||||
" name = \"Ruff QA System\",\n",
|
||||
" func=ruff.run,\n",
|
||||
" description=\"useful for when you need to answer questions about ruff (a python linter). Input should be a fully formed question.\",\n",
|
||||
" return_direct=True\n",
|
||||
" ),\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 49,
|
||||
"id": "8615707a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"agent = initialize_agent(tools, llm, agent=\"zero-shot-react-description\", verbose=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 50,
|
||||
"id": "36e718a9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to find out what Biden said about Ketanji Brown Jackson in the State of the Union address.\n",
|
||||
"Action: State of Union QA System\n",
|
||||
"Action Input: What did Biden say about Ketanji Brown Jackson in the State of the Union address?\u001b[0m\n",
|
||||
"Observation: \u001b[36;1m\u001b[1;3m Biden said that Jackson is one of the nation's top legal minds and that she will continue Justice Breyer's legacy of excellence.\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"\" Biden said that Jackson is one of the nation's top legal minds and that she will continue Justice Breyer's legacy of excellence.\""
|
||||
]
|
||||
},
|
||||
"execution_count": 50,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"agent.run(\"What did biden say about ketanji brown jackson in the state of the union address?\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 51,
|
||||
"id": "edfd0a1a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to find out the advantages of using ruff over flake8\n",
|
||||
"Action: Ruff QA System\n",
|
||||
"Action Input: What are the advantages of using ruff over flake8?\u001b[0m\n",
|
||||
"Observation: \u001b[33;1m\u001b[1;3m Ruff can be used as a drop-in replacement for Flake8 when used (1) without or with a small number of plugins, (2) alongside Black, and (3) on Python 3 code. It also re-implements some of the most popular Flake8 plugins and related code quality tools natively, including isort, yesqa, eradicate, and most of the rules implemented in pyupgrade. Ruff also supports automatically fixing its own lint violations, which Flake8 does not.\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"' Ruff can be used as a drop-in replacement for Flake8 when used (1) without or with a small number of plugins, (2) alongside Black, and (3) on Python 3 code. It also re-implements some of the most popular Flake8 plugins and related code quality tools natively, including isort, yesqa, eradicate, and most of the rules implemented in pyupgrade. Ruff also supports automatically fixing its own lint violations, which Flake8 does not.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 51,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"agent.run(\"Why use ruff over flake8?\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "49a0cbbe",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Multi-Hop vectorstore reasoning\n",
|
||||
"\n",
|
||||
"Because vectorstores are easily usable as tools in agents, it is easy to answer multi-hop questions that depend on vectorstores using the existing agent framework."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 57,
|
||||
"id": "d397a233",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tools = [\n",
|
||||
" Tool(\n",
|
||||
" name = \"State of Union QA System\",\n",
|
||||
" func=state_of_union.run,\n",
|
||||
" description=\"useful for when you need to answer questions about the most recent state of the union address. Input should be a fully formed question, not referencing any obscure pronouns from the conversation before.\"\n",
|
||||
" ),\n",
|
||||
" Tool(\n",
|
||||
" name = \"Ruff QA System\",\n",
|
||||
" func=ruff.run,\n",
|
||||
" description=\"useful for when you need to answer questions about ruff (a python linter). Input should be a fully formed question, not referencing any obscure pronouns from the conversation before.\"\n",
|
||||
" ),\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 58,
|
||||
"id": "06157240",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Construct the agent. We will use the default agent type here.\n",
|
||||
"# See documentation for a full list of options.\n",
|
||||
"agent = initialize_agent(tools, llm, agent=\"zero-shot-react-description\", verbose=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 59,
|
||||
"id": "b492b520",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"\n",
|
||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||
"\u001b[32;1m\u001b[1;3m I need to find out what tool ruff uses to run over Jupyter Notebooks, and if the president mentioned it in the state of the union.\n",
|
||||
"Action: Ruff QA System\n",
|
||||
"Action Input: What tool does ruff use to run over Jupyter Notebooks?\u001b[0m\n",
|
||||
"Observation: \u001b[33;1m\u001b[1;3m Ruff is integrated into nbQA, a tool for running linters and code formatters over Jupyter Notebooks. After installing ruff and nbqa, you can run Ruff over a notebook like so: > nbqa ruff Untitled.ipynb\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I now need to find out if the president mentioned this tool in the state of the union.\n",
|
||||
"Action: State of Union QA System\n",
|
||||
"Action Input: Did the president mention nbQA in the state of the union?\u001b[0m\n",
|
||||
"Observation: \u001b[36;1m\u001b[1;3m No, the president did not mention nbQA in the state of the union.\u001b[0m\n",
|
||||
"Thought:\u001b[32;1m\u001b[1;3m I now know the final answer.\n",
|
||||
"Final Answer: No, the president did not mention nbQA in the state of the union.\u001b[0m\n",
|
||||
"\n",
|
||||
"\u001b[1m> Finished chain.\u001b[0m\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'No, the president did not mention nbQA in the state of the union.'"
|
||||
]
|
||||
},
|
||||
"execution_count": 59,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"agent.run(\"What tool does ruff use to run over Jupyter Notebooks? Did the president mention that tool in the state of the union?\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b3b857d6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -1,116 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9f98a15e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# CoNLL-U\n",
|
||||
"This is an example of how to load a file in [CoNLL-U](https://universaldependencies.org/format.html) format. The whole file is treated as one document. The example data (`conllu.conllu`) is based on one of the standard UD/CoNLL-U examples."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d9b2e33e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import CoNLLULoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5b5eec48",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = CoNLLULoader(\"example_data/conllu.conllu\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "10f3f725",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"document = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "acbb3579",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"document"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.8"
|
||||
},
|
||||
"toc": {
|
||||
"base_numbering": 1,
|
||||
"nav_menu": {},
|
||||
"number_sections": true,
|
||||
"sideBar": true,
|
||||
"skip_h1_title": false,
|
||||
"title_cell": "Table of Contents",
|
||||
"title_sidebar": "Contents",
|
||||
"toc_cell": false,
|
||||
"toc_position": {},
|
||||
"toc_section_display": true,
|
||||
"toc_window_display": false
|
||||
},
|
||||
"varInspector": {
|
||||
"cols": {
|
||||
"lenName": 16,
|
||||
"lenType": 16,
|
||||
"lenVar": 40
|
||||
},
|
||||
"kernels_config": {
|
||||
"python": {
|
||||
"delete_cmd_postfix": "",
|
||||
"delete_cmd_prefix": "del ",
|
||||
"library": "var_list.py",
|
||||
"varRefreshCmd": "print(var_dic_list())"
|
||||
},
|
||||
"r": {
|
||||
"delete_cmd_postfix": ") ",
|
||||
"delete_cmd_prefix": "rm(",
|
||||
"library": "var_list.r",
|
||||
"varRefreshCmd": "cat(var_dic_list()) "
|
||||
}
|
||||
},
|
||||
"types_to_exclude": [
|
||||
"module",
|
||||
"function",
|
||||
"builtin_function_or_method",
|
||||
"instance",
|
||||
"_Feature"
|
||||
],
|
||||
"window_display": false
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -1,102 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d9826810",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Copy Paste\n",
|
||||
"\n",
|
||||
"This notebook covers how to load a document object from something you just want to copy and paste. In this case, you don't even need to use a DocumentLoader, but rather can just construct the Document directly."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "fd9e71a2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.docstore.document import Document"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "f40d3f30",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text = \"..... put the text you copy pasted here......\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "d409bdba",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"doc = Document(page_content=text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cc0eff72",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Metadata\n",
|
||||
"If you want to add metadata about where you got this piece of text, you can easily do so with the metadata key."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "fe3aa5aa",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"metadata = {\"source\": \"internet\", \"date\": \"Friday\"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "827d4e91",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"doc = Document(page_content=text, metadata=metadata)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c986a43d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -1,8 +0,0 @@
|
||||
# sent_id = 1
|
||||
# text = They buy and sell books.
|
||||
1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj _
|
||||
2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 root 0:root _
|
||||
3 and and CONJ CC _ 4 cc 4:cc _
|
||||
4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 0:root|2:conj _
|
||||
5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj SpaceAfter=No
|
||||
6 . . PUNCT . _ 2 punct 2:punct _
|
@ -1,64 +0,0 @@
|
||||
{
|
||||
"participants": [{"name": "User 1"}, {"name": "User 2"}],
|
||||
"messages": [
|
||||
{"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"},
|
||||
{
|
||||
"sender_name": "User 1",
|
||||
"timestamp_ms": 1675597435669,
|
||||
"content": "Oh no worries! Bye",
|
||||
},
|
||||
{
|
||||
"sender_name": "User 2",
|
||||
"timestamp_ms": 1675596277579,
|
||||
"content": "No Im sorry it was my mistake, the blue one is not for sale",
|
||||
},
|
||||
{
|
||||
"sender_name": "User 1",
|
||||
"timestamp_ms": 1675595140251,
|
||||
"content": "I thought you were selling the blue one!",
|
||||
},
|
||||
{
|
||||
"sender_name": "User 1",
|
||||
"timestamp_ms": 1675595109305,
|
||||
"content": "Im not interested in this bag. Im interested in the blue one!",
|
||||
},
|
||||
{
|
||||
"sender_name": "User 2",
|
||||
"timestamp_ms": 1675595068468,
|
||||
"content": "Here is $129",
|
||||
},
|
||||
{
|
||||
"sender_name": "User 2",
|
||||
"timestamp_ms": 1675595060730,
|
||||
"photos": [
|
||||
{"uri": "url_of_some_picture.jpg", "creation_timestamp": 1675595059}
|
||||
],
|
||||
},
|
||||
{
|
||||
"sender_name": "User 2",
|
||||
"timestamp_ms": 1675595045152,
|
||||
"content": "Online is at least $100",
|
||||
},
|
||||
{
|
||||
"sender_name": "User 1",
|
||||
"timestamp_ms": 1675594799696,
|
||||
"content": "How much do you want?",
|
||||
},
|
||||
{
|
||||
"sender_name": "User 2",
|
||||
"timestamp_ms": 1675577876645,
|
||||
"content": "Goodmorning! $50 is too low.",
|
||||
},
|
||||
{
|
||||
"sender_name": "User 1",
|
||||
"timestamp_ms": 1675549022673,
|
||||
"content": "Hi! Im interested in your bag. Im offering $50. Let me know if you are interested. Thanks!",
|
||||
},
|
||||
],
|
||||
"title": "User 1 and User 2 chat",
|
||||
"is_still_participant": true,
|
||||
"thread_path": "inbox/User 1 and User 2 chat",
|
||||
"magic_words": [],
|
||||
"image": {"uri": "image_of_the_chat.jpg", "creation_timestamp": 1675549016},
|
||||
"joinable_mode": {"mode": 1, "link": ""},
|
||||
}
|
@ -1,83 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Notebook\n",
|
||||
"\n",
|
||||
"This notebook covers how to load data from an .ipynb notebook into a format suitable by LangChain."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import NotebookLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = NotebookLoader(\"example_data/notebook.ipynb\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"`NotebookLoader.load()` loads the `.ipynb` notebook file into a `Document` object.\n",
|
||||
"\n",
|
||||
"**Parameters**:\n",
|
||||
"\n",
|
||||
"* `include_outputs` (bool): whether to include cell outputs in the resulting document (default is False).\n",
|
||||
"* `max_output_length` (int): the maximum number of characters to include from each cell output (default is 10).\n",
|
||||
"* `remove_newline` (bool): whether to remove newline characters from the cell sources and outputs (default is False).\n",
|
||||
"* `traceback` (bool): whether to include full traceback (default is False)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader.load(include_outputs=True, max_output_length=20, remove_newline=True)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.1"
|
||||
},
|
||||
"orig_nbformat": 4,
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "981b6680a42bdb5eb22187741e1607b3aae2cf73db800d1af1f268d1de6a1f70"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -1,77 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Facebook Chat\n",
|
||||
"\n",
|
||||
"This notebook covers how to load data from Facebook Chats into a format that can be ingested into LangChain."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import FacebookChatLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = FacebookChatLoader(\"example_data/facebook_chat.json\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='User 2 on 2023-02-05 12:46:11: Bye!\\n\\nUser 1 on 2023-02-05 12:43:55: Oh no worries! Bye\\n\\nUser 2 on 2023-02-05 12:24:37: No Im sorry it was my mistake, the blue one is not for sale\\n\\nUser 1 on 2023-02-05 12:05:40: I thought you were selling the blue one!\\n\\nUser 1 on 2023-02-05 12:05:09: Im not interested in this bag. Im interested in the blue one!\\n\\nUser 2 on 2023-02-05 12:04:28: Here is $129\\n\\nUser 2 on 2023-02-05 12:04:05: Online is at least $100\\n\\nUser 1 on 2023-02-05 11:59:59: How much do you want?\\n\\nUser 2 on 2023-02-05 07:17:56: Goodmorning! $50 is too low.\\n\\nUser 1 on 2023-02-04 23:17:02: Hi! Im interested in your bag. Im offering $50. Let me know if you are interested. Thanks!\\n\\n', lookup_str='', metadata={'source': 'docs/modules/document_loaders/examples/example_data/facebook_chat.json'}, lookup_index=0)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"loader.load()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.1"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "384707f4965e853a82006e90614c2e1a578ea1f6eb0ee07a1dd78a657d37dd67"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
File diff suppressed because one or more lines are too long
@ -1,98 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Notebook\n",
|
||||
"\n",
|
||||
"This notebook covers how to load data from an .ipynb notebook into a format suitable for LangChain."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import NotebookLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = NotebookLoader(\"example_data/notebook.ipynb\", include_outputs=True, max_output_length=20, remove_newline=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"`NotebookLoader.load()` loads the `.ipynb` notebook file into a `Document` object.\n",
|
||||
"\n",
|
||||
"**Parameters**:\n",
|
||||
"\n",
|
||||
"* `include_outputs` (bool): whether to include cell outputs in the resulting document (default is False).\n",
|
||||
"* `max_output_length` (int): the maximum number of characters to include from each cell output (default is 10).\n",
|
||||
"* `remove_newline` (bool): whether to remove newline characters from the cell sources and outputs (default is False).\n",
|
||||
"* `traceback` (bool): whether to include full traceback (default is False)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='\\'markdown\\' cell: \\'[\\'# Notebook\\', \\'\\', \\'This notebook covers how to load data from an .ipynb notebook into a format suitable by LangChain.\\']\\'\\n\\n \\'code\\' cell: \\'[\\'from langchain.document_loaders import NotebookLoader\\']\\'\\n\\n \\'code\\' cell: \\'[\\'loader = NotebookLoader(\"example_data/notebook.ipynb\")\\']\\'\\n\\n \\'markdown\\' cell: \\'[\\'`NotebookLoader.load()` loads the `.ipynb` notebook file into a `Document` object.\\', \\'\\', \\'**Parameters**:\\', \\'\\', \\'* `include_outputs` (bool): whether to include cell outputs in the resulting document (default is False).\\', \\'* `max_output_length` (int): the maximum number of characters to include from each cell output (default is 10).\\', \\'* `remove_newline` (bool): whether to remove newline characters from the cell sources and outputs (default is False).\\', \\'* `traceback` (bool): whether to include full traceback (default is False).\\']\\'\\n\\n \\'code\\' cell: \\'[\\'loader.load(include_outputs=True, max_output_length=20, remove_newline=True)\\']\\'\\n\\n', lookup_str='', metadata={'source': 'example_data/notebook.ipynb'}, lookup_index=0)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "981b6680a42bdb5eb22187741e1607b3aae2cf73db800d1af1f268d1de6a1f70"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -1,137 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "39af9ecd",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Word Documents\n",
|
||||
"\n",
|
||||
"This covers how to load Word documents into a document format that we can use downstream."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "721c48aa",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import UnstructuredWordDocumentLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "9d3d0e35",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = UnstructuredWordDocumentLoader(\"fake.docx\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "06073f91",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "c9adc5cb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'fake.docx'}, lookup_index=0)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "525d6b67",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Retain Elements\n",
|
||||
"\n",
|
||||
"Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "064f9162",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = UnstructuredWordDocumentLoader(\"fake.docx\", mode=\"elements\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "abefbbdb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "a547c534",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'fake.docx', 'filename': 'fake.docx', 'category': 'Title'}, lookup_index=0)"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data[0]"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -1,266 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%% md\n"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"# AtlasDB\n",
|
||||
"\n",
|
||||
"This notebook shows you how to use functionality related to AtlasDB."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import time\n",
|
||||
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
|
||||
"from langchain.text_splitter import SpacyTextSplitter\n",
|
||||
"from langchain.vectorstores import AtlasDB\n",
|
||||
"from langchain.document_loaders import TextLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Collecting en-core-web-sm==3.5.0\n",
|
||||
" Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)\n",
|
||||
"\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m12.8/12.8 MB\u001B[0m \u001B[31m90.8 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m00:01\u001B[0m\n",
|
||||
"\u001B[?25hRequirement already satisfied: spacy<3.6.0,>=3.5.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from en-core-web-sm==3.5.0) (3.5.0)\n",
|
||||
"Requirement already satisfied: packaging>=20.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (23.0)\n",
|
||||
"Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.1.1)\n",
|
||||
"Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.3.0)\n",
|
||||
"Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.4.5)\n",
|
||||
"Requirement already satisfied: pathy>=0.10.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.10.1)\n",
|
||||
"Requirement already satisfied: setuptools in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (67.4.0)\n",
|
||||
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (4.64.1)\n",
|
||||
"Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.0.4)\n",
|
||||
"Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (6.3.0)\n",
|
||||
"Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (8.1.7)\n",
|
||||
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.0.7)\n",
|
||||
"Requirement already satisfied: typer<0.8.0,>=0.3.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.7.0)\n",
|
||||
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.28.2)\n",
|
||||
"Requirement already satisfied: jinja2 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.1.2)\n",
|
||||
"Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.10.5)\n",
|
||||
"Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.0.8)\n",
|
||||
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.0.12)\n",
|
||||
"Requirement already satisfied: numpy>=1.15.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.24.2)\n",
|
||||
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.0.9)\n",
|
||||
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.0.8)\n",
|
||||
"Requirement already satisfied: typing-extensions>=4.2.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (4.5.0)\n",
|
||||
"Requirement already satisfied: charset-normalizer<4,>=2 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.0.1)\n",
|
||||
"Requirement already satisfied: idna<4,>=2.5 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (3.4)\n",
|
||||
"Requirement already satisfied: certifi>=2017.4.17 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2022.12.7)\n",
|
||||
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (1.26.14)\n",
|
||||
"Requirement already satisfied: blis<0.8.0,>=0.7.8 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from thinc<8.2.0,>=8.1.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.7.9)\n",
|
||||
"Requirement already satisfied: confection<1.0.0,>=0.0.1 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from thinc<8.2.0,>=8.1.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (0.0.4)\n",
|
||||
"Requirement already satisfied: click<9.0.0,>=7.1.1 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from typer<0.8.0,>=0.3.0->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (8.1.3)\n",
|
||||
"Requirement already satisfied: MarkupSafe>=2.0 in /home/ubuntu/langchain/.venv/lib/python3.9/site-packages (from jinja2->spacy<3.6.0,>=3.5.0->en-core-web-sm==3.5.0) (2.1.2)\n",
|
||||
"\n",
|
||||
"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m23.0\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m23.0.1\u001B[0m\n",
|
||||
"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\n",
|
||||
"\u001B[38;5;2m✔ Download and installation successful\u001B[0m\n",
|
||||
"You can now load the package via spacy.load('en_core_web_sm')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!python -m spacy download en_core_web_sm"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ATLAS_TEST_API_KEY = '7xDPkYXSYDc1_ErdTPIcoAR9RNd8YDlkS3nVNXcVoIMZ6'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = TextLoader('../../state_of_the_union.txt')\n",
|
||||
"documents = loader.load()\n",
|
||||
"text_splitter = SpacyTextSplitter(separator='|')\n",
|
||||
"texts = []\n",
|
||||
"for doc in text_splitter.split_documents(documents):\n",
|
||||
" texts.extend(doc.page_content.split('|'))\n",
|
||||
" \n",
|
||||
"texts = [e.strip() for e in texts]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2023-02-24 16:13:49.696 | INFO | nomic.project:_create_project:884 - Creating project `test_index_1677255228.136989` in organization `Atlas Demo`\n",
|
||||
"2023-02-24 16:13:51.087 | INFO | nomic.project:wait_for_project_lock:993 - test_index_1677255228.136989: Project lock is released.\n",
|
||||
"2023-02-24 16:13:51.225 | INFO | nomic.project:wait_for_project_lock:993 - test_index_1677255228.136989: Project lock is released.\n",
|
||||
"2023-02-24 16:13:51.481 | INFO | nomic.project:add_text:1351 - Uploading text to Atlas.\n",
|
||||
"1it [00:00, 1.20it/s]\n",
|
||||
"2023-02-24 16:13:52.318 | INFO | nomic.project:add_text:1422 - Text upload succeeded.\n",
|
||||
"2023-02-24 16:13:52.628 | INFO | nomic.project:wait_for_project_lock:993 - test_index_1677255228.136989: Project lock is released.\n",
|
||||
"2023-02-24 16:13:53.380 | INFO | nomic.project:create_index:1192 - Created map `test_index_1677255228.136989_index` in project `test_index_1677255228.136989`: https://atlas.nomic.ai/map/ee2354a3-7f9a-4c6b-af43-b0cda09d7198/db996d77-8981-48a0-897a-ff2c22bbf541\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"db = AtlasDB.from_texts(texts=texts,\n",
|
||||
" name='test_index_'+str(time.time()),\n",
|
||||
" description='test_index',\n",
|
||||
" api_key=ATLAS_TEST_API_KEY,\n",
|
||||
" index_kwargs={'build_topic_model': True})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2023-02-24 16:14:09.106 | INFO | nomic.project:wait_for_project_lock:993 - test_index_1677255228.136989: Project lock is released.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"with db.project.wait_for_project_lock():\n",
|
||||
" time.sleep(1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"\n",
|
||||
" <strong><a href=\"https://atlas.nomic.ai/dashboard/project/ee2354a3-7f9a-4c6b-af43-b0cda09d7198\">test_index_1677255228.136989</strong></a>\n",
|
||||
" <br>\n",
|
||||
" A description for your project 508 datums inserted.\n",
|
||||
" <br>\n",
|
||||
" 1 index built.\n",
|
||||
" <br><strong>Projections</strong>\n",
|
||||
"<ul>\n",
|
||||
"<li>test_index_1677255228.136989_index. Status Completed. <a target=\"_blank\" href=\"https://atlas.nomic.ai/map/ee2354a3-7f9a-4c6b-af43-b0cda09d7198/db996d77-8981-48a0-897a-ff2c22bbf541\">view online</a></li></ul><hr><script>\n",
|
||||
" destroy = function() {\n",
|
||||
" document.getElementById(\"iframedb996d77-8981-48a0-897a-ff2c22bbf541\").remove()\n",
|
||||
" }\n",
|
||||
" </script>\n",
|
||||
"\n",
|
||||
" <h4>Projection ID: db996d77-8981-48a0-897a-ff2c22bbf541</h4>\n",
|
||||
" <div class=\"actions\">\n",
|
||||
" <div id=\"hide\" class=\"action\" onclick=\"destroy()\">Hide embedded project</div>\n",
|
||||
" <div class=\"action\" id=\"out\">\n",
|
||||
" <a href=\"https://atlas.nomic.ai/map/ee2354a3-7f9a-4c6b-af43-b0cda09d7198/db996d77-8981-48a0-897a-ff2c22bbf541\" target=\"_blank\">Explore on atlas.nomic.ai</a>\n",
|
||||
" </div>\n",
|
||||
" </div>\n",
|
||||
" \n",
|
||||
" <iframe class=\"iframe\" id=\"iframedb996d77-8981-48a0-897a-ff2c22bbf541\" allow=\"clipboard-read; clipboard-write\" src=\"https://atlas.nomic.ai/map/ee2354a3-7f9a-4c6b-af43-b0cda09d7198/db996d77-8981-48a0-897a-ff2c22bbf541\">\n",
|
||||
" </iframe>\n",
|
||||
"\n",
|
||||
" <style>\n",
|
||||
" .iframe {\n",
|
||||
" /* vh can be **very** large in vscode ipynb. */\n",
|
||||
" height: min(75vh, 66vw);\n",
|
||||
" width: 100%;\n",
|
||||
" }\n",
|
||||
" </style>\n",
|
||||
" \n",
|
||||
" <style>\n",
|
||||
" .actions {\n",
|
||||
" display: block;\n",
|
||||
" }\n",
|
||||
" .action {\n",
|
||||
" min-height: 18px;\n",
|
||||
" margin: 5px;\n",
|
||||
" transition: all 500ms ease-in-out;\n",
|
||||
" }\n",
|
||||
" .action:hover {\n",
|
||||
" cursor: pointer;\n",
|
||||
" }\n",
|
||||
" #hide:hover::after {\n",
|
||||
" content: \" X\";\n",
|
||||
" }\n",
|
||||
" #out:hover::after {\n",
|
||||
" content: \"\";\n",
|
||||
" }\n",
|
||||
" </style>\n",
|
||||
" "
|
||||
],
|
||||
"text/plain": [
|
||||
"AtlasProject: <{'id': 'ee2354a3-7f9a-4c6b-af43-b0cda09d7198', 'owner': '9c29afbb-a002-4d49-958e-ecf5ae1351ac', 'project_name': 'test_index_1677255228.136989', 'creator': 'auth0|63efc4b5462246f4d9a6ecf2', 'description': 'A description for your project', 'opensearch_index_id': 'f61fb8dd-0abf-4f31-9130-41870e443902', 'is_public': True, 'project_fields': ['atlas_id', 'text'], 'unique_id_field': 'atlas_id', 'modality': 'text', 'total_datums_in_project': 508, 'created_timestamp': '2023-02-24T16:13:50.313363+00:00', 'atlas_indices': [{'id': 'b1b01833-0964-4597-a4bc-a2d60700949d', 'project_id': 'ee2354a3-7f9a-4c6b-af43-b0cda09d7198', 'index_name': 'test_index_1677255228.136989_index', 'indexed_field': 'text', 'created_timestamp': '2023-02-24T16:13:52.957101+00:00', 'updated_timestamp': '2023-02-24T16:14:03.469621+00:00', 'atoms': ['charchunk', 'document'], 'colorable_fields': [], 'embedders': [{'id': '7ec0868a-4eed-4414-a482-25cce9803e1b', 'atlas_index_id': 'b1b01833-0964-4597-a4bc-a2d60700949d', 'ready': True, 'model_name': 'NomicEmbed', 'hyperparameters': {'norm': 'both', 'batch_size': 20, 'polymerize_by': 'charchunk', 'dataset_buffer_size': 1000}}], 'nearest_neighbor_indices': [{'id': '86f8e3ff-e07c-4678-a4d7-144db4b0301d', 'index_name': 'NomicOrganize', 'ready': True, 'hyperparameters': {'dim': 384, 'space': 'l2'}, 'atom_strategies': ['document']}], 'projections': [{'id': 'db996d77-8981-48a0-897a-ff2c22bbf541', 'projection_name': 'NomicProject', 'ready': True, 'hyperparameters': {'spread': 1.0, 'n_epochs': 50, 'n_neighbors': 15}, 'atom_strategies': ['document'], 'created_timestamp': '2023-02-24T16:13:52.979561+00:00', 'updated_timestamp': '2023-02-24T16:14:03.466309+00:00'}]}], 'insert_update_delete_lock': False}>"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"db.project"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 1
|
||||
}
|
@ -1,85 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Banana\n",
|
||||
"This example goes over how to use LangChain to interact with Banana models"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from langchain.llms import Banana\n",
|
||||
"from langchain import PromptTemplate, LLMChain\n",
|
||||
"os.environ[\"BANANA_API_KEY\"] = \"YOUR_API_KEY\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"template = \"\"\"Question: {question}\n",
|
||||
"\n",
|
||||
"Answer: Let's think step by step.\"\"\"\n",
|
||||
"\n",
|
||||
"prompt = PromptTemplate(template=template, input_variables=[\"question\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm = Banana(model_key=\"YOUR_MODEL_KEY\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm_chain = LLMChain(prompt=prompt, llm=llm)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"question = \"What NFL team won the Super Bowl in the year Justin Beiber was born?\"\n",
|
||||
"\n",
|
||||
"llm_chain.run(question)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.9.12 ('palm')",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python",
|
||||
"version": "3.9.12"
|
||||
},
|
||||
"orig_nbformat": 4,
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "a0a0263b650d907a3bfe41c0f8d6a63a071b884df3cfdc1579f00cdc1aed6b03"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -1,83 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Modal\n",
|
||||
"This example goes over how to use LangChain to interact with Modal models"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.llms import Modal\n",
|
||||
"from langchain import PromptTemplate, LLMChain"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"template = \"\"\"Question: {question}\n",
|
||||
"\n",
|
||||
"Answer: Let's think step by step.\"\"\"\n",
|
||||
"\n",
|
||||
"prompt = PromptTemplate(template=template, input_variables=[\"question\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm = Modal(endpoint_url=\"YOUR_ENDPOINT_URL\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm_chain = LLMChain(prompt=prompt, llm=llm)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"question = \"What NFL team won the Super Bowl in the year Justin Beiber was born?\"\n",
|
||||
"\n",
|
||||
"llm_chain.run(question)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.9.12 ('palm')",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python",
|
||||
"version": "3.9.12"
|
||||
},
|
||||
"orig_nbformat": 4,
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "a0a0263b650d907a3bfe41c0f8d6a63a071b884df3cfdc1579f00cdc1aed6b03"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -1,83 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# StochasticAI\n",
|
||||
"This example goes over how to use LangChain to interact with StochasticAI models"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.llms import StochasticAI\n",
|
||||
"from langchain import PromptTemplate, LLMChain"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"template = \"\"\"Question: {question}\n",
|
||||
"\n",
|
||||
"Answer: Let's think step by step.\"\"\"\n",
|
||||
"\n",
|
||||
"prompt = PromptTemplate(template=template, input_variables=[\"question\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm = StochasticAI(api_url=\"YOUR_API_URL\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm_chain = LLMChain(prompt=prompt, llm=llm)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"question = \"What NFL team won the Super Bowl in the year Justin Beiber was born?\"\n",
|
||||
"\n",
|
||||
"llm_chain.run(question)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.9.12 ('palm')",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python",
|
||||
"version": "3.9.12"
|
||||
},
|
||||
"orig_nbformat": 4,
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "a0a0263b650d907a3bfe41c0f8d6a63a071b884df3cfdc1579f00cdc1aed6b03"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -1,83 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Writer\n",
|
||||
"This example goes over how to use LangChain to interact with Writer models"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.llms import Writer\n",
|
||||
"from langchain import PromptTemplate, LLMChain"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"template = \"\"\"Question: {question}\n",
|
||||
"\n",
|
||||
"Answer: Let's think step by step.\"\"\"\n",
|
||||
"\n",
|
||||
"prompt = PromptTemplate(template=template, input_variables=[\"question\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm = Writer()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm_chain = LLMChain(prompt=prompt, llm=llm)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"question = \"What NFL team won the Super Bowl in the year Justin Beiber was born?\"\n",
|
||||
"\n",
|
||||
"llm_chain.run(question)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.9.12 ('palm')",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python",
|
||||
"version": "3.9.12"
|
||||
},
|
||||
"orig_nbformat": 4,
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "a0a0263b650d907a3bfe41c0f8d6a63a071b884df3cfdc1579f00cdc1aed6b03"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -1,184 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9355a547",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Partial Prompt Templates\n",
|
||||
"\n",
|
||||
"A prompt template is a class with a `.format` method which takes in a key-value map and returns a string (a prompt) to pass to the language model. Like other methods, it can make sense to \"partial\" a prompt template - e.g. pass in a subset of the required values, so as to create a new prompt template which expects only the remaining subset of values.\n",
|
||||
"\n",
|
||||
"LangChain supports this in two ways: we allow for partially formatted prompts (1) with string values, (2) with functions that return string values. These two different ways support different use cases. In the documentation below we go over the motivations for both use cases as well as how to do it in LangChain.\n",
|
||||
"\n",
|
||||
"## Partial With Strings\n",
|
||||
"\n",
|
||||
"One common use case for wanting to partial a prompt template is if you get some of the variables before others. For example, suppose you have a prompt template that requires two variables, `foo` and `bar`. If you get the `foo` value early on in the chain, but the `bar` value later, it can be annoying to wait until you have both variables in the same place to pass them to the prompt template. Instead, you can partial the prompt template with the `foo` value, and then pass the partialed prompt template along and just use that. Below is an example of doing this:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "643af5da",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.prompts import PromptTemplate"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "4080d8d7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"foobaz\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"prompt = PromptTemplate(template=\"{foo}{bar}\", input_variables=[\"foo\", \"bar\"])\n",
|
||||
"partial_prompt = prompt.partial(foo=\"foo\");\n",
|
||||
"print(partial_prompt.format(bar=\"baz\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9986766e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can also just initialize the prompt with the partialed variables."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "e2ce95b3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"foobaz\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"prompt = PromptTemplate(template=\"{foo}{bar}\", input_variables=[\"bar\"], partial_variables={\"foo\": \"foo\"})\n",
|
||||
"print(prompt.format(bar=\"baz\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a9c66f83",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Partial With Functions\n",
|
||||
"\n",
|
||||
"The other common use is to partial with a function. The use case for this is when you have a variable you know that you always want to fetch in a common way. A prime example of this is with date or time. Imagine you have a prompt which you always want to have the current date. You can't hard code it in the prompt, and passing it along with the other input variables is a bit annoying. In this case, it's very handy to be able to partial the prompt with a function that always returns the current date."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "d0712d8a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from datetime import datetime\n",
|
||||
"\n",
|
||||
"def _get_datetime():\n",
|
||||
" now = datetime.now()\n",
|
||||
" return now.strftime(\"%m/%d/%Y, %H:%M:%S\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "4cbcb666",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Tell me a funny joke about the day 02/27/2023, 22:15:16\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"prompt = PromptTemplate(\n",
|
||||
" template=\"Tell me a {adjective} joke about the day {date}\", \n",
|
||||
" input_variables=[\"adjective\", \"date\"]\n",
|
||||
");\n",
|
||||
"partial_prompt = prompt.partial(date=_get_datetime)\n",
|
||||
"print(partial_prompt.format(adjective=\"funny\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ffed6811",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can also just initialize the prompt with the partialed variables, which often makes more sense in this workflow."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "96285b25",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Tell me a funny joke about the day 02/27/2023, 22:15:16\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"prompt = PromptTemplate(\n",
|
||||
" template=\"Tell me a {adjective} joke about the day {date}\", \n",
|
||||
" input_variables=[\"adjective\"],\n",
|
||||
" partial_variables={\"date\": _get_datetime}\n",
|
||||
");\n",
|
||||
"print(prompt.format(adjective=\"funny\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4bff16f7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -1,85 +1,85 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8f210ec3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Bash\n",
|
||||
"It can often be useful to have an LLM generate bash commands, and then run them. A common use case for this is letting the LLM interact with your local file system. We provide an easy util to execute bash commands."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "f7b3767b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.utilities import BashProcess"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "cf1c92f0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"bash = BashProcess()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "2fa952fc",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"bash.ipynb\n",
|
||||
"google_search.ipynb\n",
|
||||
"python.ipynb\n",
|
||||
"requests.ipynb\n",
|
||||
"serpapi.ipynb\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(bash.run(\"ls\"))"
|
||||
]
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8f210ec3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Bash\n",
|
||||
"It can often be useful to have an LLM generate bash commands, and then run them. A common use case for this is letting it interact with your local file system. We provide an easy util to execute bash commands."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "f7b3767b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.utilities import BashProcess"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "cf1c92f0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"bash = BashProcess()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "2fa952fc",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "851fee9f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"bash.ipynb\n",
|
||||
"google_search.ipynb\n",
|
||||
"python.ipynb\n",
|
||||
"requests.ipynb\n",
|
||||
"serpapi.ipynb\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(bash.run(\"ls\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "851fee9f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
|
@ -1,180 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"jukit_cell_id": "O4HPx3boF0"
|
||||
},
|
||||
"source": [],
|
||||
"outputs": [],
|
||||
"execution_count": null
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"jukit_cell_id": "hqQkbPEwTJ"
|
||||
},
|
||||
"source": [
|
||||
"# Using the DockerWrapper utility"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"jukit_cell_id": "vCepuypaFH"
|
||||
},
|
||||
"source": [
|
||||
"from langchain.utilities.docker import DockerWrapper"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"jukit_cell_id": "BtYVqy2YtO"
|
||||
},
|
||||
"source": [
|
||||
"d = DockerWrapper(image='shell')"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"jukit_cell_id": "ELWWm03ptQ"
|
||||
},
|
||||
"source": [
|
||||
"query = \"\"\"\n",
|
||||
"for i in $(seq 1 10)\n",
|
||||
"do\n",
|
||||
" echo $i\n",
|
||||
"done\n",
|
||||
"\"\"\"\n",
|
||||
"print(d.exec_run(query))"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n"
|
||||
}
|
||||
],
|
||||
"execution_count": 1
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"jukit_cell_id": "lGMqLz5sDo"
|
||||
},
|
||||
"source": [
|
||||
"p = DockerWrapper(image='python')\n",
|
||||
"\n",
|
||||
"py_payload = \"\"\"\n",
|
||||
"def hello_world():\n",
|
||||
" return 'hello world'\n",
|
||||
"\n",
|
||||
"hello_world()\n",
|
||||
"\"\"\""
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": null
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"jukit_cell_id": "X04Wd6zbrk"
|
||||
},
|
||||
"source": [
|
||||
"print(p.exec_run(py_payload))"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "'hello world'\n"
|
||||
}
|
||||
],
|
||||
"execution_count": 2
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"jukit_cell_id": "lKOfuDoJGk"
|
||||
},
|
||||
"source": [],
|
||||
"outputs": [],
|
||||
"execution_count": null
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"jukit_cell_id": "eSzXtDrpqU"
|
||||
},
|
||||
"source": [
|
||||
"## Passing custom parameters\n",
|
||||
"\n",
|
||||
"By default containers are run with a safe set of parameters. You can pass any parameters\n",
|
||||
"that are accepted by the docker python sdk to the run and exec commands.\n",
|
||||
"\n",
|
||||
"### Using networking"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"jukit_cell_id": "eWFGCxD9pv"
|
||||
},
|
||||
"source": [
|
||||
"# by default containers don't have access to the network\n",
|
||||
"print(d.run('ping -c 1 google.com'))"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "STDERR: Command '/bin/sh -c 'ping -c 1 google.com'' in image 'alpine:latest' returned non-zero exit status 1: b\"ping: bad address 'google.com'\\n\"\n"
|
||||
}
|
||||
],
|
||||
"execution_count": 3
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"jukit_cell_id": "Z0YkpuXVyL"
|
||||
},
|
||||
"source": [
|
||||
"# using the network parameter\n",
|
||||
"print(d.run('ping -c 1 google.com', network='bridge'))"
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "PING google.com (142.250.200.110): 56 data bytes\n64 bytes from 142.250.200.110: seq=0 ttl=42 time=13.695 ms\n\n--- google.com ping statistics ---\n1 packets transmitted, 1 packets received, 0% packet loss\nround-trip min/avg/max = 13.695/13.695/13.695 ms\n"
|
||||
}
|
||||
],
|
||||
"execution_count": 4
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"jukit_cell_id": "3rMWzzuLHq"
|
||||
},
|
||||
"source": [],
|
||||
"outputs": [],
|
||||
"execution_count": null
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"anaconda-cloud": {},
|
||||
"kernelspec": {
|
||||
"display_name": "python",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
@ -1,35 +1,29 @@
|
||||
# Key Concepts
|
||||
|
||||
## Python REPL
|
||||
|
||||
Sometimes, for complex calculations, rather than have an LLM generate the answer directly,
|
||||
it can be better to have the LLM generate code to calculate the answer, and then run that code to get the answer.
|
||||
Sometimes, for complex calculations, rather than have an LLM generate the answer directly,
|
||||
it can be better to have the LLM generate code to calculate the answer, and then run that code to get the answer.
|
||||
In order to easily do that, we provide a simple Python REPL to execute commands in.
|
||||
This interface will only return things that are printed -
|
||||
This interface will only return things that are printed -
|
||||
therefore, if you want to use it to calculate an answer, make sure to have it print out the answer.
|
||||
|
||||
## Bash
|
||||
|
||||
It can often be useful to have an LLM generate bash commands, and then run them.
|
||||
A common use case for this is letting the LLM interact with your local file system.
|
||||
It can often be useful to have an LLM generate bash commands, and then run them.
|
||||
A common use case for this is letting it interact with your local file system.
|
||||
We provide an easy component to execute bash commands.
|
||||
|
||||
## Requests Wrapper
|
||||
|
||||
The web contains a lot of information that LLMs do not have access to.
|
||||
In order to easily let LLMs interact with that information,
|
||||
The web contains a lot of information that LLMs do not have access to.
|
||||
In order to easily let LLMs interact with that information,
|
||||
we provide a wrapper around the Python Requests module that takes in a URL and fetches data from that URL.
|
||||
|
||||
## Google Search
|
||||
|
||||
This uses the official Google Search API to look up information on the web.
|
||||
|
||||
## SerpAPI
|
||||
|
||||
This uses SerpAPI, a third party search API engine, to interact with Google Search.
|
||||
|
||||
## Searx Search
|
||||
|
||||
This uses the Searx (SearxNG fork) meta search engine API to look up information
|
||||
on the web. It supports 139 search engines and is easy to self-host
|
||||
on the web. It supports 139 search engines and is easy to self-host
|
||||
which makes it a good choice for privacy-conscious users.
|
||||
|
@ -1,33 +0,0 @@
|
||||
"""Load CoNLL-U files."""
|
||||
import csv
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class CoNLLULoader(BaseLoader):
    """Load a CoNLL-U file and rebuild its surface text as one document."""

    def __init__(self, file_path: str):
        """Initialize with file path."""
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Load from file path.

        Returns:
            A one-element list. The document's ``page_content`` is the second
            column of every multi-column row, joined with single spaces
            except after rows whose tenth column contains ``SpaceAfter=No``
            and after the final row. ``metadata["source"]`` is the file path.
        """
        # QUOTE_NONE: the format has no quoting convention, so a token that is
        # a literal double quote must be read as data, not as a CSV quote.
        # newline="" lets the csv module do its own line-ending handling.
        with open(self.file_path, encoding="utf8", newline="") as f:
            rows = list(csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE))

        # Rows that parse to a single field (comments, blank separators)
        # are not token lines.
        token_rows = [row for row in rows if len(row) > 1]

        pieces = []
        last_index = len(token_rows) - 1
        for i, row in enumerate(token_rows):
            pieces.append(row[1])
            # The tenth column may hold several "|"-separated attributes;
            # rows that are too short are treated as having none.
            misc = row[9].split("|") if len(row) > 9 else []
            # Do not add a space after a punctuation mark or at the very end.
            if "SpaceAfter=No" not in misc and i != last_index:
                pieces.append(" ")
        text = "".join(pieces)

        metadata = {"source": self.file_path}
        return [Document(page_content=text, metadata=metadata)]
|
@ -1,57 +0,0 @@
|
||||
"""Loader that loads Facebook chat json dump."""
|
||||
import datetime
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
def concatenate_rows(row: dict) -> str:
    """Render one chat message as ``"<sender> on <date>: <text>"``.

    ``row["timestamp_ms"]`` is divided by 1000 and formatted as
    ``%Y-%m-%d %H:%M:%S`` via ``datetime.fromtimestamp``. The returned
    string ends with a blank line so messages can be concatenated directly.
    """
    who = row["sender_name"]
    body = row["content"]
    seconds = row["timestamp_ms"] / 1000
    when = datetime.datetime.fromtimestamp(seconds)
    stamp = when.strftime("%Y-%m-%d %H:%M:%S")
    return "{} on {}: {}\n\n".format(who, stamp, body)
|
||||
|
||||
|
||||
class FacebookChatLoader(BaseLoader):
    """Loader that loads Facebook messages json directory dump."""

    def __init__(self, path: str):
        """Initialize with path."""
        self.file_path = path

    def load(self) -> List[Document]:
        """Load documents.

        Reads the JSON file at ``self.file_path``, keeps the entries of its
        ``"messages"`` list whose ``content`` is a string, and concatenates
        them (formatted by ``concatenate_rows``) into a single document.

        Returns:
            A one-element list; the page content is empty when the dump
            holds no plain-text message.

        Raises:
            ValueError: If pandas is not installed.
        """
        try:
            import pandas as pd
        except ImportError as exc:
            raise ValueError(
                "pandas is needed for Facebook chat loader, "
                "please install with `pip install pandas`"
            ) from exc
        p = Path(self.file_path)

        with open(p, encoding="utf8") as f:
            d = json.load(f)

        # json_normalize already returns a DataFrame.
        messages = pd.json_normalize(d["messages"])

        if "content" in messages.columns:
            # Only keep plain text messages
            # (no services, nor links, hashtags, code, bold ...)
            is_text = (
                messages["content"].apply(lambda x: isinstance(x, str)).astype(bool)
            )
            text_rows = messages.loc[
                is_text, ["timestamp_ms", "content", "sender_name"]
            ]
            # Joining row by row also works when nothing passed the filter.
            text = "".join(concatenate_rows(row) for _, row in text_rows.iterrows())
        else:
            # No message carries a "content" field at all.
            text = ""

        metadata = {"source": str(p)}

        return [Document(page_content=text, metadata=metadata)]
|
@ -1,202 +0,0 @@
|
||||
"""Loader that loads iFixit data."""
|
||||
from typing import List, Optional
|
||||
|
||||
import requests
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.document_loaders.web_base import WebBaseLoader
|
||||
|
||||
IFIXIT_BASE_URL = "https://www.ifixit.com/api/2.0"
|
||||
|
||||
|
||||
class IFixitLoader(BaseLoader):
    """Load iFixit repair guides, device wikis and answers.

    iFixit is the largest, open repair community on the web. The site contains nearly
    100k repair manuals, 200k Questions & Answers on 42k devices, and all the data is
    licensed under CC-BY.

    This loader will allow you to download the text of a repair guide, text of Q&A's
    and wikis from devices on iFixit using their open APIs and web scraping.
    """

    def __init__(self, web_path: str):
        """Initialize with web path.

        Raises:
            ValueError: If ``web_path`` is not an iFixit URL, does not start
                with one of the supported sections, or lacks the path
                segment used as the page id.
        """
        if not web_path.startswith("https://www.ifixit.com"):
            raise ValueError("web path must start with 'https://www.ifixit.com'")

        path = web_path.replace("https://www.ifixit.com", "")

        allowed_paths = ["/Device", "/Guide", "/Answers", "/Teardown"]

        # TODO: Add /Wiki
        if not any(path.startswith(allowed_path) for allowed_path in allowed_paths):
            raise ValueError(
                "web path must start with /Device, /Guide, /Teardown or /Answers"
            )

        pieces = [x for x in path.split("/") if x]

        # Teardowns are just guides by a different name
        self.page_type = pieces[0] if pieces[0] != "Teardown" else "Guide"

        # Guides and answers take their id from the third path segment,
        # every other page type from the second.
        id_index = 2 if self.page_type in ("Guide", "Answers") else 1
        if len(pieces) <= id_index:
            # Raise ValueError (not IndexError) so callers that skip invalid
            # URLs, such as load_suggestions, keep working.
            raise ValueError("web path is missing the page id: " + web_path)
        self.id = pieces[id_index]

        self.web_path = web_path

    def load(self) -> List[Document]:
        """Load the page, dispatching on ``self.page_type``.

        Raises:
            ValueError: If the page type is not Device, Guide/Teardown or
                Answers.
        """
        if self.page_type == "Device":
            return self.load_device()
        elif self.page_type == "Guide" or self.page_type == "Teardown":
            return self.load_guide()
        elif self.page_type == "Answers":
            return self.load_questions_and_answers()
        else:
            raise ValueError("Unknown page type: " + self.page_type)

    @staticmethod
    def load_suggestions(query: str = "", doc_type: str = "all") -> List[Document]:
        """Load the documents behind the suggest endpoint's results for ``query``.

        Results whose URL or content cannot be loaded (``ValueError``) are
        skipped. Device results are loaded without their linked guides.

        Raises:
            ValueError: If the suggest request does not return HTTP 200.
        """
        res = requests.get(
            IFIXIT_BASE_URL + "/suggest/" + query + "?doctypes=" + doc_type
        )

        if res.status_code != 200:
            # Use the raw body: res.json() is not a str and cannot be
            # concatenated into the message.
            raise ValueError(
                'Could not load suggestions for "' + query + '"\n' + res.text
            )

        data = res.json()

        results = data["results"]
        output = []

        for result in results:
            try:
                loader = IFixitLoader(result["url"])
                if loader.page_type == "Device":
                    output += loader.load_device(include_guides=False)
                else:
                    output += loader.load()
            except ValueError:
                continue

        return output

    def load_questions_and_answers(
        self, url_override: Optional[str] = None
    ) -> List[Document]:
        """Scrape a question page and render the question and its answers.

        Args:
            url_override: Page to scrape instead of ``self.web_path``.

        Returns:
            A one-element list with the rendered text; metadata holds the
            source path and the post title.
        """
        loader = WebBaseLoader(self.web_path if url_override is None else url_override)
        soup = loader.scrape()

        output = []

        title = soup.find("h1", "post-title").text

        output.append("# " + title)
        output.append(soup.select_one(".post-content .post-text").text.strip())

        output.append("\n## " + soup.find("div", "post-answers-header").text.strip())
        for answer in soup.select(".js-answers-list .post.post-answer"):
            if answer.has_attr("itemprop") and "acceptedAnswer" in answer["itemprop"]:
                output.append("\n### Accepted Answer")
            elif "post-helpful" in answer["class"]:
                output.append("\n### Most Helpful Answer")
            else:
                output.append("\n### Other Answer")

            output += [
                a.text.strip() for a in answer.select(".post-content .post-text")
            ]
            output.append("\n")

        text = "\n".join(output).strip()

        metadata = {"source": self.web_path, "title": title}

        return [Document(page_content=text, metadata=metadata)]

    def load_device(
        self, url_override: Optional[str] = None, include_guides: bool = True
    ) -> List[Document]:
        """Load a device wiki and, optionally, every guide it links to.

        Args:
            url_override: API URL to request instead of the one derived
                from ``self.id``.
            include_guides: When true, also load each URL listed under the
                response's ``"guides"`` key.

        Raises:
            ValueError: If the request does not return HTTP 200.
        """
        documents = []
        if url_override is None:
            url = IFIXIT_BASE_URL + "/wikis/CATEGORY/" + self.id
        else:
            url = url_override

        res = requests.get(url)

        # Same failure contract as load_guide, instead of a KeyError on the
        # error payload further down.
        if res.status_code != 200:
            raise ValueError(
                "Could not load device: " + self.web_path + "\n" + res.text
            )

        data = res.json()
        text = "\n".join(
            [
                data[key]
                for key in ["title", "description", "contents_raw"]
                if key in data
            ]
        ).strip()

        metadata = {"source": self.web_path, "title": data["title"]}
        documents.append(Document(page_content=text, metadata=metadata))

        if include_guides:
            # Load and return documents for each guide linked to from the device
            guide_urls = [guide["url"] for guide in data["guides"]]
            for guide_url in guide_urls:
                documents.append(IFixitLoader(guide_url).load()[0])

        return documents

    def load_guide(self, url_override: Optional[str] = None) -> List[Document]:
        """Load a guide from the API and render it as markdown-like text.

        Args:
            url_override: API URL to request instead of the one derived
                from ``self.id``.

        Raises:
            ValueError: If the request does not return HTTP 200.
        """
        if url_override is None:
            url = IFIXIT_BASE_URL + "/guides/" + self.id
        else:
            url = url_override

        res = requests.get(url)

        if res.status_code != 200:
            # Use the raw body: res.json() is not a str and cannot be
            # concatenated into the message.
            raise ValueError(
                "Could not load guide: " + self.web_path + "\n" + res.text
            )

        data = res.json()

        doc_parts = ["# " + data["title"], data["introduction_raw"]]

        doc_parts.append("\n\n###Tools Required:")
        if len(data["tools"]) == 0:
            doc_parts.append("\n - None")
        else:
            for tool in data["tools"]:
                doc_parts.append("\n - " + tool["text"])

        doc_parts.append("\n\n###Parts Required:")
        if len(data["parts"]) == 0:
            doc_parts.append("\n - None")
        else:
            for part in data["parts"]:
                doc_parts.append("\n - " + part["text"])

        for row in data["steps"]:
            # Untitled steps fall back to "Step <orderby>".
            doc_parts.append(
                "\n\n## "
                + (
                    row["title"]
                    if row["title"] != ""
                    else "Step {}".format(row["orderby"])
                )
            )

            for line in row["lines"]:
                doc_parts.append(line["text_raw"])

        doc_parts.append(data["conclusion_raw"])

        text = "\n".join(doc_parts)

        metadata = {"source": self.web_path, "title": data["title"]}

        return [Document(page_content=text, metadata=metadata)]
|
@ -1,13 +0,0 @@
|
||||
"""Loader that loads image files."""
|
||||
from typing import List
|
||||
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
|
||||
|
||||
class UnstructuredImageLoader(UnstructuredFileLoader):
    """Loader that uses unstructured to load image files, such as PNGs and JPGs."""

    def _get_elements(self) -> List:
        """Return the result of ``partition_image`` for ``self.file_path``."""
        # Imported lazily so the module can be imported without unstructured.
        from unstructured.partition.image import partition_image

        elements = partition_image(filename=self.file_path)
        return elements
|
@ -1,109 +0,0 @@
|
||||
"""Loader that loads .ipynb notebook files."""
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any, List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
def concatenate_cells(
    cell: dict, include_outputs: bool, max_output_length: int, traceback: bool
) -> str:
    """Combine cells information in a readable format ready to be used.

    Args:
        cell: Mapping with ``"cell_type"`` and ``"source"``; ``"outputs"``
            is read when present.
        include_outputs: Whether to describe the first output of code cells.
        max_output_length: Stream output is cut to this many items.
        traceback: Whether an error description includes the traceback.

    Returns:
        A description of the cell ending in a blank line. Only the first
        output is inspected: an error output (one with ``"ename"``) and a
        ``"stream"`` output are described; for any other output type, and
        whenever outputs are not requested, only the type and source are
        returned.
    """
    cell_type = cell["cell_type"]
    source = cell["source"]
    outputs = cell.get("outputs")

    if include_outputs and cell_type == "code" and outputs:
        first_output = outputs[0]
        if "ename" in first_output:
            error_name = first_output["ename"]
            error_value = first_output["evalue"]
            if traceback:
                # Separate local: do not rebind the boolean parameter.
                error_traceback = first_output["traceback"]
                return (
                    f"'{cell_type}' cell: '{source}'\n, gives error '{error_name}',"
                    f" with description '{error_value}'\n"
                    f"and traceback '{error_traceback}'\n\n"
                )
            return (
                f"'{cell_type}' cell: '{source}'\n, gives error '{error_name}',"
                f" with description '{error_value}'\n\n"
            )
        if first_output["output_type"] == "stream":
            stream_text = first_output["text"]
            return (
                f"'{cell_type}' cell: '{source}'\n with "
                f"output: '{stream_text[:max_output_length]}'\n\n"
            )

    # Reached for markdown cells, for code cells without requested outputs,
    # and for other output types: keep the source rather than dropping it.
    return f"'{cell_type}' cell: '{source}'\n\n"
|
||||
|
||||
|
||||
def remove_newlines(x: Any) -> Any:
    """Remove recursively newlines, no matter the data structure they are stored in.

    Strings have every ``"\\n"`` removed. Lists and dict values are
    processed element by element (dict keys are left untouched). DataFrames
    are processed cell by cell. Any other value is returned unchanged.
    """
    import pandas as pd

    if isinstance(x, str):
        return x.replace("\n", "")
    if isinstance(x, list):
        return [remove_newlines(elem) for elem in x]
    if isinstance(x, dict):
        # Notebook outputs are lists of dicts, so recurse into the values.
        return {key: remove_newlines(value) for key, value in x.items()}
    if isinstance(x, pd.DataFrame):
        # DataFrame.map supersedes the deprecated applymap; check the class
        # so a column that happens to be named "map" cannot be picked up.
        if hasattr(pd.DataFrame, "map"):
            return x.map(remove_newlines)
        return x.applymap(remove_newlines)
    return x
|
||||
|
||||
|
||||
class NotebookLoader(BaseLoader):
    """Loader that loads .ipynb notebook files."""

    def __init__(
        self,
        path: str,
        include_outputs: bool = False,
        max_output_length: int = 10,
        remove_newline: bool = False,
        traceback: bool = False,
    ):
        """Initialize with path.

        Args:
            path: Location of the notebook file.
            include_outputs: Forwarded to ``concatenate_cells``.
            max_output_length: Forwarded to ``concatenate_cells``.
            remove_newline: Whether to run ``remove_newlines`` over every
                cell field before formatting.
            traceback: Forwarded to ``concatenate_cells``.
        """
        self.file_path = path
        self.include_outputs = include_outputs
        self.max_output_length = max_output_length
        self.remove_newline = remove_newline
        self.traceback = traceback

    def load(
        self,
    ) -> List[Document]:
        """Load documents.

        Returns:
            A one-element list whose page content is the space-joined
            ``concatenate_cells`` rendering of every cell (empty for a
            notebook without cells).

        Raises:
            ValueError: If pandas is not installed.
        """
        try:
            import pandas as pd
        except ImportError as exc:
            raise ValueError(
                "pandas is needed for Notebook Loader, "
                "please install with `pip install pandas`"
            ) from exc
        p = Path(self.file_path)

        with open(p, encoding="utf8") as f:
            d = json.load(f)

        data = pd.json_normalize(d["cells"])
        # reindex rather than data[[...]]: a notebook without code cells has
        # no "outputs" column, and an empty notebook has no columns at all.
        filtered_data = data.reindex(columns=["cell_type", "source", "outputs"])
        if self.remove_newline:
            # DataFrame.map supersedes the deprecated applymap.
            if hasattr(pd.DataFrame, "map"):
                filtered_data = filtered_data.map(remove_newlines)
            else:
                filtered_data = filtered_data.applymap(remove_newlines)

        # Joining row by row also works when there are no rows.
        text = " ".join(
            concatenate_cells(
                row, self.include_outputs, self.max_output_length, self.traceback
            )
            for _, row in filtered_data.iterrows()
        )

        metadata = {"source": str(p)}

        return [Document(page_content=text, metadata=metadata)]
|
@ -1,43 +0,0 @@
|
||||
"""Loader that loads word documents."""
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
|
||||
|
||||
class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
    """Loader that uses unstructured to load word documents."""

    def _get_elements(self) -> List:
        """Partition ``self.file_path`` as a .doc or .docx document.

        Raises:
            ValueError: If the file is a .doc and the installed unstructured
                version is older than 0.4.11.
        """
        import re

        from unstructured.__version__ import __version__ as __unstructured_version__
        from unstructured.file_utils.filetype import FileType, detect_filetype

        # Keep only the leading digits of each dotted component and stop at
        # the first non-numeric one, so pre-release strings such as
        # "0.5.0-dev1" or "0.4.11.dev0" do not break the comparison below.
        version_parts = []
        for component in __unstructured_version__.split("."):
            leading_digits = re.match(r"\d+", component)
            if leading_digits is None:
                break
            version_parts.append(int(leading_digits.group()))
        unstructured_version = tuple(version_parts)

        # NOTE(MthwRobinson) - magic will raise an import error if the libmagic
        # system dependency isn't installed. If it's not installed, we'll just
        # check the file extension
        try:
            import magic  # noqa: F401

            is_doc = detect_filetype(self.file_path) == FileType.DOC
        except ImportError:
            _, extension = os.path.splitext(self.file_path)
            is_doc = extension == ".doc"

        if is_doc and unstructured_version < (0, 4, 11):
            raise ValueError(
                f"You are on unstructured version {__unstructured_version__}. "
                "Partitioning .doc files is only supported in unstructured>=0.4.11. "
                "Please upgrade the unstructured package and try again."
            )

        if is_doc:
            from unstructured.partition.doc import partition_doc

            return partition_doc(filename=self.file_path)
        else:
            from unstructured.partition.docx import partition_docx

            return partition_docx(filename=self.file_path)
|
@ -1,5 +1,4 @@
|
||||
"""All index utils."""
|
||||
from langchain.indexes.graph import GraphIndexCreator
|
||||
from langchain.indexes.vectorstore import VectorstoreIndexCreator
|
||||
|
||||
__all__ = ["GraphIndexCreator", "VectorstoreIndexCreator"]
|
||||
__all__ = ["GraphIndexCreator"]
|
||||
|
@ -1,69 +0,0 @@
|
||||
from typing import Any, List, Optional, Type
|
||||
|
||||
from pydantic import BaseModel, Extra, Field
|
||||
|
||||
from langchain.chains.qa_with_sources.vector_db import VectorDBQAWithSourcesChain
|
||||
from langchain.chains.vector_db_qa.base import VectorDBQA
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.embeddings.base import Embeddings
|
||||
from langchain.embeddings.openai import OpenAIEmbeddings
|
||||
from langchain.llms.base import BaseLLM
|
||||
from langchain.llms.openai import OpenAI
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
|
||||
from langchain.vectorstores.base import VectorStore
|
||||
from langchain.vectorstores.chroma import Chroma
|
||||
|
||||
|
||||
def _get_default_text_splitter() -> TextSplitter:
    """Build the splitter used by default: 1000-character chunks, no overlap."""
    default_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
    )
    return default_splitter
|
||||
|
||||
|
||||
class VectorStoreIndexWrapper(BaseModel):
    """Thin convenience layer for asking questions against a vectorstore."""

    vectorstore: VectorStore

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        arbitrary_types_allowed = True

    def query(self, question: str, llm: Optional[BaseLLM] = None, **kwargs: Any) -> str:
        """Run a ``VectorDBQA`` chain over the vectorstore and return its answer.

        When no ``llm`` is given, ``OpenAI(temperature=0)`` is used. Extra
        keyword arguments are forwarded to ``VectorDBQA.from_chain_type``.
        """
        if not llm:
            llm = OpenAI(temperature=0)
        qa_chain = VectorDBQA.from_chain_type(
            llm, vectorstore=self.vectorstore, **kwargs
        )
        answer = qa_chain.run(question)
        return answer

    def query_with_sources(
        self, question: str, llm: Optional[BaseLLM] = None, **kwargs: Any
    ) -> dict:
        """Run a ``VectorDBQAWithSourcesChain`` and return its output mapping.

        When no ``llm`` is given, ``OpenAI(temperature=0)`` is used. Extra
        keyword arguments are forwarded to the chain's ``from_chain_type``.
        """
        if not llm:
            llm = OpenAI(temperature=0)
        sources_chain = VectorDBQAWithSourcesChain.from_chain_type(
            llm, vectorstore=self.vectorstore, **kwargs
        )
        chain_inputs = {sources_chain.question_key: question}
        return sources_chain(chain_inputs)
|
||||
|
||||
|
||||
class VectorstoreIndexCreator(BaseModel):
    """Build a vectorstore-backed index out of document loaders."""

    vectorstore_cls: Type[VectorStore] = Chroma
    embedding: Embeddings = Field(default_factory=OpenAIEmbeddings)
    text_splitter: TextSplitter = Field(default_factory=_get_default_text_splitter)

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        arbitrary_types_allowed = True

    def from_loaders(self, loaders: List[BaseLoader]) -> VectorStoreIndexWrapper:
        """Load every loader, split the documents, embed them and wrap the store.

        The documents of all loaders are gathered in loader order, split with
        ``self.text_splitter`` and passed with ``self.embedding`` to
        ``self.vectorstore_cls.from_documents``.
        """
        loaded_docs = [doc for loader in loaders for doc in loader.load()]
        chunks = self.text_splitter.split_documents(loaded_docs)
        store = self.vectorstore_cls.from_documents(chunks, self.embedding)
        return VectorStoreIndexWrapper(vectorstore=store)
|
@ -1,117 +0,0 @@
|
||||
"""Wrapper around Banana API."""
|
||||
import logging
|
||||
from typing import Any, Dict, List, Mapping, Optional
|
||||
|
||||
from pydantic import BaseModel, Extra, Field, root_validator
|
||||
|
||||
from langchain.llms.base import LLM
|
||||
from langchain.llms.utils import enforce_stop_tokens
|
||||
from langchain.utils import get_from_dict_or_env
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Banana(LLM, BaseModel):
|
||||
"""Wrapper around Banana large language models.
|
||||
|
||||
To use, you should have the ``banana-dev`` python package installed,
|
||||
and the environment variable ``BANANA_API_KEY`` set with your API key.
|
||||
|
||||
Any parameters that are valid to be passed to the call can be passed
|
||||
in, even if not explicitly saved on this class.
|
||||
|
||||
Example:
|
||||
.. code-block:: python
|
||||
from langchain.llms import Banana
|
||||
banana = Banana(model_key="")
|
||||
"""
|
||||
|
||||
model_key: str = ""
|
||||
"""model endpoint to use"""
|
||||
|
||||
model_kwargs: Dict[str, Any] = Field(default_factory=dict)
|
||||
"""Holds any model parameters valid for `create` call not
|
||||
explicitly specified."""
|
||||
|
||||
banana_api_key: Optional[str] = None
|
||||
|
||||
class Config:
|
||||
"""Configuration for this pydantic config."""
|
||||
|
||||
extra = Extra.forbid
|
||||
|
||||
@root_validator(pre=True)
|
||||
def build_extra(cls, values: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Build extra kwargs from additional params that were passed in."""
|
||||
all_required_field_names = {field.alias for field in cls.__fields__.values()}
|
||||
|
||||
extra = values.get("model_kwargs", {})
|
||||
for field_name in list(values):
|
||||
if field_name not in all_required_field_names:
|
||||
if field_name in extra:
|
||||
raise ValueError(f"Found {field_name} supplied twice.")
|
||||
logger.warning(
|
||||
f"""{field_name} was transfered to model_kwargs.
|
||||
Please confirm that {field_name} is what you intended."""
|
||||
)
|
||||
extra[field_name] = values.pop(field_name)
|
||||
values["model_kwargs"] = extra
|
||||
return values
|
||||
|
||||
@root_validator()
|
||||
def validate_environment(cls, values: Dict) -> Dict:
|
||||
"""Validate that api key and python package exists in environment."""
|
||||
banana_api_key = get_from_dict_or_env(
|
||||
values, "banana_api_key", "BANANA_API_KEY"
|
||||
)
|
||||
values["banana_api_key"] = banana_api_key
|
||||
return values
|
||||
|
||||
@property
|
||||
def _identifying_params(self) -> Mapping[str, Any]:
|
||||
"""Get the identifying parameters."""
|
||||
return {
|
||||
**{"model_key": self.model_key},
|
||||
**{"model_kwargs": self.model_kwargs},
|
||||
}
|
||||
|
||||
@property
|
||||
def _llm_type(self) -> str:
|
||||
"""Return type of llm."""
|
||||
return "banana"
|
||||
|
||||
def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
    """Call to Banana endpoint.

    Args:
        prompt: The prompt to send to the model.
        stop: Optional stop sequences applied to the returned text.

    Returns:
        The ``output`` entry of the first element of ``modelOutputs``.

    Raises:
        ValueError: If ``banana_dev`` cannot be imported, or if the response
            does not follow the ``{'output': 'text'}`` schema.
    """
    try:
        import banana_dev as banana
    except ImportError:
        raise ValueError(
            "Could not import banana-dev python package. "
            "Please install it with `pip install banana-dev`."
        )
    # model_kwargs are merged after "prompt", so they may override it.
    model_inputs = {
        # a json specific to your model.
        "prompt": prompt,
        **(self.model_kwargs or {}),
    }
    response = banana.run(self.banana_api_key, self.model_key, model_inputs)
    try:
        text = response["modelOutputs"][0]["output"]
    except (KeyError, TypeError, IndexError):
        # Building the error message must not raise again: fall back to the
        # whole response when even ``modelOutputs[0]`` is unavailable.
        try:
            returned = response["modelOutputs"][0]
        except (KeyError, TypeError, IndexError):
            returned = response
        raise ValueError(
            "Response should be of schema: {'output': 'text'}."
            f"\nResponse was: {returned}"
            "\nTo fix this:"
            "\n- fork the source repo of the Banana model"
            "\n- modify app.py to return the above schema"
            "\n- deploy that as a custom repo"
        )
    if stop is not None:
        # I believe this is required since the stop tokens
        # are not enforced by the model parameters
        text = enforce_stop_tokens(text, stop)
    return text
|
@ -1,92 +0,0 @@
|
||||
"""Wrapper around Modal API."""
|
||||
import logging
|
||||
from typing import Any, Dict, List, Mapping, Optional
|
||||
|
||||
import requests
|
||||
from pydantic import BaseModel, Extra, Field, root_validator
|
||||
|
||||
from langchain.llms.base import LLM
|
||||
from langchain.llms.utils import enforce_stop_tokens
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Modal(LLM, BaseModel):
    """Wrapper around Modal large language models.

    The model is reached by POSTing JSON to ``endpoint_url`` with
    ``requests``; the endpoint must answer with a JSON object containing a
    ``prompt`` key.

    Any parameters that are valid to be passed to the call can be passed
    in, even if not explicitly saved on this class.

    Example:
        .. code-block:: python
            from langchain.llms import Modal
            modal = Modal(endpoint_url="")

    """

    endpoint_url: str = ""
    """model endpoint to use"""

    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Holds any model parameters valid for `create` call not
    explicitly specified."""

    class Config:
        """Configuration for this pydantic config."""

        extra = Extra.forbid

    @root_validator(pre=True)
    def build_extra(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Build extra kwargs from additional params that were passed in.

        Raises:
            ValueError: If an undeclared key is also present in ``model_kwargs``.
        """
        all_required_field_names = {field.alias for field in cls.__fields__.values()}

        extra = values.get("model_kwargs", {})
        for field_name in list(values):
            if field_name not in all_required_field_names:
                if field_name in extra:
                    raise ValueError(f"Found {field_name} supplied twice.")
                logger.warning(
                    f"""{field_name} was transfered to model_kwargs.
                    Please confirm that {field_name} is what you intended."""
                )
                extra[field_name] = values.pop(field_name)
        values["model_kwargs"] = extra
        return values

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {
            "endpoint_url": self.endpoint_url,
            "model_kwargs": self.model_kwargs,
        }

    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "modal"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Call to Modal endpoint.

        Args:
            prompt: The prompt to send; ``model_kwargs`` are merged into the
                same JSON body and may override it.
            stop: Optional stop sequences applied to the returned text.

        Returns:
            The ``prompt`` entry of the JSON response.

        Raises:
            ValueError: If the response JSON has no ``prompt`` key.
        """
        params = self.model_kwargs or {}
        response = requests.post(
            url=self.endpoint_url,
            headers={
                "Content-Type": "application/json",
            },
            json={"prompt": prompt, **params},
        )
        # Parse once and always bind the result: the previous code only
        # assigned it when the prompt was echoed back, and otherwise crashed
        # with UnboundLocalError.
        try:
            text = response.json()["prompt"]
        except KeyError:
            raise ValueError("LangChain requires 'prompt' key in response.")
        if stop is not None:
            # I believe this is required since the stop tokens
            # are not enforced by the model parameters
            text = enforce_stop_tokens(text, stop)
        return text
|
@ -1,130 +0,0 @@
|
||||
"""Wrapper around StochasticAI APIs."""
|
||||
import logging
|
||||
import time
|
||||
from typing import Any, Dict, List, Mapping, Optional
|
||||
|
||||
import requests
|
||||
from pydantic import BaseModel, Extra, Field, root_validator
|
||||
|
||||
from langchain.llms.base import LLM
|
||||
from langchain.llms.utils import enforce_stop_tokens
|
||||
from langchain.utils import get_from_dict_or_env
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class StochasticAI(LLM, BaseModel):
    """Wrapper around StochasticAI large language models.

    To use, you should have the environment variable ``STOCHASTICAI_API_KEY``
    set with your API key.

    Example:
        .. code-block:: python

            from langchain.llms import StochasticAI
            stochasticai = StochasticAI(api_url="")
    """

    api_url: str = ""
    """URL the completion request is POSTed to."""

    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Holds any model parameters valid for `create` call not
    explicitly specified."""

    stochasticai_api_key: Optional[str] = None

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    @root_validator(pre=True)
    def build_extra(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Build extra kwargs from additional params that were passed in.

        Raises:
            ValueError: If an undeclared key is also present in ``model_kwargs``.
        """
        all_required_field_names = {field.alias for field in cls.__fields__.values()}

        extra = values.get("model_kwargs", {})
        for field_name in list(values):
            if field_name not in all_required_field_names:
                if field_name in extra:
                    raise ValueError(f"Found {field_name} supplied twice.")
                logger.warning(
                    f"""{field_name} was transfered to model_kwargs.
                    Please confirm that {field_name} is what you intended."""
                )
                extra[field_name] = values.pop(field_name)
        values["model_kwargs"] = extra
        return values

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that api key exists in environment."""
        stochasticai_api_key = get_from_dict_or_env(
            values, "stochasticai_api_key", "STOCHASTICAI_API_KEY"
        )
        values["stochasticai_api_key"] = stochasticai_api_key
        return values

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        # NOTE(review): the key is "endpoint_url" although the field is named
        # ``api_url``; kept as-is because consumers may rely on the key.
        return {
            "endpoint_url": self.api_url,
            "model_kwargs": self.model_kwargs,
        }

    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "stochasticai"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Call out to StochasticAI's complete endpoint.

        The prompt is POSTed to ``api_url``; the ``responseUrl`` from the
        reply is then polled until its ``completion`` entry is present.

        Args:
            prompt: The prompt to pass into the model.
            stop: Optional list of stop words to use when generating.

        Returns:
            The string generated by the model.

        Raises:
            requests.HTTPError: If the POST or any poll returns an error status.

        Example:
            .. code-block:: python

                response = stochasticai("Tell me a joke.")
        """
        params = self.model_kwargs or {}
        headers = {
            "apiKey": f"{self.stochasticai_api_key}",
            "Accept": "application/json",
            "Content-Type": "application/json",
        }
        response_post = requests.post(
            url=self.api_url,
            json={"prompt": prompt, "params": params},
            headers=headers,
        )
        response_post.raise_for_status()
        response_post_json = response_post.json()
        # NOTE(review): there is no upper bound on polling; the loop only ends
        # once the service reports a completion or a request fails.
        while True:
            response_get = requests.get(
                url=response_post_json["data"]["responseUrl"],
                headers=headers,
            )
            response_get.raise_for_status()
            text = response_get.json()["data"].get("completion")
            if text is not None:
                break
            # Only wait between polls, not after the result has arrived.
            time.sleep(0.5)
        text = text[0]
        if stop is not None:
            # I believe this is required since the stop tokens
            # are not enforced by the model parameters
            text = enforce_stop_tokens(text, stop)
        return text
|
@ -1,155 +0,0 @@
|
||||
"""Wrapper around Writer APIs."""
|
||||
from typing import Any, Dict, List, Mapping, Optional
|
||||
|
||||
import requests
|
||||
from pydantic import BaseModel, Extra, root_validator
|
||||
|
||||
from langchain.llms.base import LLM
|
||||
from langchain.llms.utils import enforce_stop_tokens
|
||||
from langchain.utils import get_from_dict_or_env
|
||||
|
||||
|
||||
class Writer(LLM, BaseModel):
    """Wrapper around Writer large language models.

    To use, you should have the environment variable ``WRITER_API_KEY``
    set with your API key.

    Example:
        .. code-block:: python

            from langchain import Writer
            writer = Writer(model_id="palmyra-base")
    """

    model_id: str = "palmyra-base"
    """Model name to use."""

    tokens_to_generate: int = 24
    """Max number of tokens to generate."""

    logprobs: bool = False
    """Whether to return log probabilities."""

    temperature: float = 1.0
    """What sampling temperature to use."""

    length: int = 256
    """The maximum number of tokens to generate in the completion."""

    top_p: float = 1.0
    """Total probability mass of tokens to consider at each step."""

    top_k: int = 1
    """The number of highest probability vocabulary tokens to
    keep for top-k-filtering."""

    repetition_penalty: float = 1.0
    """Penalizes repeated tokens according to frequency."""

    random_seed: int = 0
    """The model generates random results.
    Changing the random seed alone will produce a different response
    with similar characteristics. It is possible to reproduce results
    by fixing the random seed (assuming all other hyperparameters
    are also fixed)"""

    beam_search_diversity_rate: float = 1.0
    """Only applies to beam search, i.e. when the beam width is >1.
    A higher value encourages beam search to return a more diverse
    set of candidates"""

    beam_width: Optional[int] = None
    """The number of concurrent candidates to keep track of during
    beam search"""

    # NOTE(review): "pentaly" looks like a typo for "penalty"; the name is part
    # of the public interface and of the request payload, so it is left as-is.
    length_pentaly: float = 1.0
    """Only applies to beam search, i.e. when the beam width is >1.
    Larger values penalize long candidates more heavily, thus preferring
    shorter candidates"""

    writer_api_key: Optional[str] = None

    stop: Optional[List[str]] = None
    """Sequences when completion generation will stop"""

    base_url: Optional[str] = None
    """Base url to use, if None decides based on model name."""

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that api key exists in environment."""
        writer_api_key = get_from_dict_or_env(
            values, "writer_api_key", "WRITER_API_KEY"
        )
        values["writer_api_key"] = writer_api_key
        return values

    @property
    def _default_params(self) -> Mapping[str, Any]:
        """Get the default parameters for calling Writer API.

        ``length`` is not part of this mapping and is therefore not sent.
        """
        return {
            "tokens_to_generate": self.tokens_to_generate,
            "stop": self.stop,
            "logprobs": self.logprobs,
            "temperature": self.temperature,
            "top_p": self.top_p,
            "top_k": self.top_k,
            "repetition_penalty": self.repetition_penalty,
            "random_seed": self.random_seed,
            "beam_search_diversity_rate": self.beam_search_diversity_rate,
            "beam_width": self.beam_width,
            "length_pentaly": self.length_pentaly,
        }

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {"model_id": self.model_id, **self._default_params}

    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "writer"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Call out to Writer's complete endpoint.

        Args:
            prompt: The prompt to pass into the model.
            stop: Optional list of stop words to use when generating.

        Returns:
            The raw response body text, truncated at ``stop`` when given.

        Example:
            .. code-block:: python

                response = writer("Tell me a joke.")
        """
        if self.base_url is not None:
            base_url = self.base_url
        else:
            # Must be an f-string: the model id is part of the URL path.
            base_url = (
                f"https://api.llm.writer.com/v1/models/{self.model_id}/completions"
            )
        response = requests.post(
            url=base_url,
            headers={
                "Authorization": f"Bearer {self.writer_api_key}",
                "Content-Type": "application/json",
                "Accept": "application/json",
            },
            json={"prompt": prompt, **self._default_params},
        )
        text = response.text
        if stop is not None:
            # I believe this is required since the stop tokens
            # are not enforced by the model parameters
            text = enforce_stop_tokens(text, stop)
        return text
|
@ -1,42 +0,0 @@
|
||||
"""Wrapper for untrusted code execution on docker."""
|
||||
# TODO: pass payload to container via filesystem
|
||||
# TEST: more tests for attach to running container
|
||||
# TODO: embed file payloads in the call to run (in LLMChain)?
|
||||
# TODO: [doc] image selection helper
|
||||
# TODO: LLMChain decorator ?
|
||||
|
||||
|
||||
import docker
|
||||
from typing import Any
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
GVISOR_WARNING = """Warning: gVisor runtime not available for {docker_host}.
|
||||
|
||||
Running untrusted code in a container without gVisor is not recommended. Docker
|
||||
containers are not isolated. They can be abused to gain access to the host
|
||||
system. To mitigate this risk, gVisor can be used to run the container in a
|
||||
sandboxed environment. see: https://gvisor.dev/ for more info.
|
||||
"""
|
||||
|
||||
|
||||
def gvisor_runtime_available(client: Any) -> bool:
    """Tell whether the docker daemon behind ``client`` lists the gVisor runtime.

    Args:
        client: Object whose ``info()`` result is searched for a
            ``'Runtimes'`` entry containing ``'runsc'``.

    Returns:
        True when ``'runsc'`` is among the reported runtimes, False otherwise
        (including when no ``'Runtimes'`` entry is reported).
    """
    logger.debug("verifying availability of gVisor runtime...")
    daemon_info = client.info()
    runtimes = daemon_info['Runtimes'] if 'Runtimes' in daemon_info else ()
    return 'runsc' in runtimes
|
||||
|
||||
|
||||
def _check_gvisor_runtime() -> None:
    """Log a warning when the docker host from the environment lacks gVisor."""
    client = docker.from_env()
    docker_host = client.api.base_url
    # Reuse the client created above instead of opening a second one.
    if not gvisor_runtime_available(client):
        logger.warning(GVISOR_WARNING.format(docker_host=docker_host))
|
||||
|
||||
|
||||
_check_gvisor_runtime()
|
||||
|
||||
from .tool import DockerWrapper
|
@ -1,103 +0,0 @@
|
||||
"""This module defines template images and helpers for common docker images."""
|
||||
|
||||
from enum import Enum
|
||||
from typing import Optional, List, Type, Union
|
||||
from pydantic import BaseModel, Extra, validator
|
||||
|
||||
|
||||
|
||||
class BaseImage(BaseModel, extra=Extra.forbid):
    """Base docker image template class.

    Fields other than ``name`` and ``tag`` are exported verbatim by
    :meth:`dict`; ``name`` and ``tag`` are exported combined as ``image``.
    """
    tty: bool = False
    stdin_open: bool = True
    name: str
    tag: Optional[str] = 'latest'
    default_command: Optional[List[str]] = None
    stdin_command: Optional[List[str]] = None
    network: str = 'none'

    def dict(self, *args, **kwargs):
        """Return the model as a dict with ``name``/``tag`` replaced by ``image``.

        Arguments are forwarded to ``BaseModel.dict``.
        """
        d = super().dict(*args, **kwargs)
        # ``pop`` rather than ``del``: the caller may have excluded these keys
        # through ``include``/``exclude``.
        d.pop('name', None)
        d.pop('tag', None)
        d['image'] = self.image_name
        return d

    @property
    def image_name(self) -> str:
        """Image reference: ``name:tag``, or just ``name`` when no tag is set."""
        if not self.tag:
            return self.name
        return f'{self.name}:{self.tag}'
|
||||
|
||||
|
||||
|
||||
class ShellTypes(str, Enum):
    """Known shell aliases mapped to the absolute path of their binary."""

    bash = '/bin/bash'
    sh = '/bin/sh'
    zsh = '/bin/zsh'
|
||||
|
||||
|
||||
class Shell(BaseImage):
    """Shell image focused on running shell commands.

    A shell image can be created by passing a shell alias such as `sh` or `bash`
    or by passing the full path to the shell binary. Either form may be given
    as a plain string or as the first element of a command list.
    """
    name: str = 'alpine'
    default_command: List[str] = [ShellTypes.sh.value, '-c']
    stdin_command: List[str] = [ShellTypes.sh.value, '-i']

    @staticmethod
    def _expand_shell(value, flag: str):
        """Turn a shell alias/path or command list into a command list.

        A string becomes ``[shell_path, flag]``; in a list only the first
        element is resolved. Anything else is returned untouched so that the
        regular field validation can reject it.
        """
        def resolve(shell: str) -> str:
            member = ShellTypes.__members__.get(shell)
            return member.value if member is not None else shell

        if isinstance(value, str):
            return [resolve(value), flag]
        if isinstance(value, (list, tuple)) and value and isinstance(value[0], str):
            return [resolve(value[0]), *value[1:]]
        return value

    # ``pre=True``: the alias must be expanded before the List[str] check,
    # otherwise a plain string such as 'bash' is rejected outright.
    @validator('default_command', pre=True)
    def validate_default_command(cls, value: Union[str, List[str]]) -> List[str]:
        """Validate shell type."""
        return cls._expand_shell(value, '-c')

    @validator('stdin_command', pre=True)
    def validate_stdin_command(cls, value: Union[str, List[str]]) -> List[str]:
        """Validate shell type."""
        return cls._expand_shell(value, '-i')
|
||||
|
||||
# example using base image to construct python image
|
||||
class Python(BaseImage):
    """Template for the ``python`` docker image.

    For stdin-driven sessions the interpreter is started with
    ``python3 -iq`` so that stdin stays open.
    """

    name: str = 'python'
    default_command: List[str] = ['python3', '-c']
    stdin_command: List[str] = ['python3', '-iq']
|
||||
|
||||
|
||||
def get_image_template(image_name: str = 'shell') -> Union[str, Type[BaseImage]]:
    """Helper to get an image template from a string.

    It looks in this module for a ``BaseImage`` subclass whose name equals
    ``image_name.capitalize()`` and returns that class. If no such class is
    found, it returns the image name unchanged.

    .. code-block:: python

        >>> image = get_image_template('python')
        >>> assert image is Python
    """
    import importlib
    import inspect

    wanted = image_name.capitalize()

    def is_template(obj: object) -> bool:
        # Restrict to BaseImage subclasses: the module namespace also holds
        # unrelated classes (e.g. ``Enum``) that must not match by name.
        return (
            inspect.isclass(obj)
            and issubclass(obj, BaseImage)
            and obj.__name__ == wanted
        )

    matches = inspect.getmembers(importlib.import_module(__name__), is_template)
    if matches:
        return matches[0][1]
    return image_name
|
||||
|
@ -1,110 +0,0 @@
|
||||
"""Low level socket IO for docker API."""
|
||||
import struct
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
SOCK_BUF_SIZE = 1024
|
||||
|
||||
class DockerSocket:
    """Wrapper around docker API's socket object. Can be used as a context manager."""

    _timeout: int = 5

    def __init__(self, socket, timeout: int = _timeout):
        """Wrap ``socket`` and apply ``timeout`` (seconds) to its underlying ``_sock``."""
        self.socket = socket
        self.socket._sock.settimeout(timeout)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Shut down and close the underlying socket, then the wrapper object."""
        logger.debug("closing socket...")
        try:
            self.socket._sock.shutdown(2)  # 2 = SHUT_RDWR
        except OSError as err:
            # A failed shutdown must not prevent the sockets from being closed.
            logger.debug(f"socket shutdown failed: {err}")
        self.socket._sock.close()
        self.socket.close()

    def sendall(self, data: bytes) -> None:
        """Send all of ``data`` on the underlying socket."""
        self.socket._sock.sendall(data)

    def setblocking(self, flag: bool) -> None:
        """Set the blocking mode of the underlying socket."""
        self.socket._sock.setblocking(flag)

    def recv(self) -> Any:
        """Wrapper for socket.recv that does buffered read.

        Reads frames of the multiplexed attach stream until no more data is
        immediately available. Each frame starts with an 8 byte header::

            header := [8]byte{STREAM_TYPE, 0, 0, 0, SIZE1, SIZE2, SIZE3, SIZE4}

        STREAM_TYPE is 0 (stdin, written on stdout), 1 (stdout) or 2 (stderr);
        SIZE1..SIZE4 are the payload size as a big endian uint32. The payload
        of that size follows the header.

        When the TTY setting is enabled the stream is not multiplexed; that
        mode is not handled here (TODO).

        Returns:
            A list of ``(stream_type, payload)`` tuples, one per frame.

        Raises:
            ValueError: If the stream ends in the middle of a header or payload.

        The first header read uses the socket's configured timeout; a timeout
        raised there is not caught and propagates to the caller.
        """
        chunks = []

        while True:
            header = b''
            try:
                # strip the header
                # the first recv is blocking to wait for the container to start
                header = self.socket._sock.recv(8)
            except BlockingIOError:
                break

            # Every following read is non-blocking so the loop stops as soon
            # as the container has nothing more to say.
            self.socket._sock.setblocking(False)

            if header == b'':
                break
            if len(header) < 8:
                # Unpacking a truncated header would fail with an opaque
                # struct.error; report it like any other truncated read.
                raise ValueError("incomplete read from container output")
            stream_type, size = struct.unpack("!BxxxI", header)

            payload = b''
            while size:
                chunk = b''
                try:
                    chunk = self.socket._sock.recv(min(size, SOCK_BUF_SIZE))
                except BlockingIOError:
                    # NOTE(review): the frame is recorded with a partial
                    # payload when the rest is not available yet.
                    break
                if chunk == b'':
                    raise ValueError("incomplete read from container output")
                payload += chunk
                size -= len(chunk)
            chunks.append((stream_type, payload))

        return chunks
|
@ -1,449 +0,0 @@
|
||||
# TODO!: use pexpect with containers
|
||||
# TODO: add default expect pattern to image template
|
||||
# TODO: pass max reads parameters for read trials
|
||||
# NOTE: spawning with tty true or not gives slightly different stdout format
|
||||
# NOTE: echo=False works when tty is disabled and only stdin is connected
|
||||
|
||||
import shlex
|
||||
import os
|
||||
import io
|
||||
import tarfile
|
||||
import time
|
||||
import pandas as pd # type: ignore
|
||||
import docker
|
||||
import socket
|
||||
|
||||
from typing import Any, Dict, Optional, Union, Type
|
||||
from pydantic import BaseModel, Extra, root_validator, Field
|
||||
from docker.errors import APIError, ContainerError # type: ignore
|
||||
|
||||
from .images import Shell, BaseImage, get_image_template
|
||||
from . import gvisor_runtime_available
|
||||
from .socket_io import DockerSocket
|
||||
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_default_params = {
|
||||
# the only required parameter to be able to attach.
|
||||
'stdin_open': True,
|
||||
}
|
||||
|
||||
|
||||
def _get_command(query: str, **kwargs: Dict) -> str:
|
||||
"""Build an escaped command from a query string and keyword arguments."""
|
||||
cmd = query
|
||||
if 'default_command' in kwargs:
|
||||
cmd = shlex.join([*kwargs.get('default_command'), query]) # type: ignore
|
||||
|
||||
return cmd
|
||||
|
||||
|
||||
class DockerWrapper(BaseModel, extra=Extra.allow):
|
||||
"""Executes arbitrary commands or payloads on containers and returns the output.
|
||||
|
||||
Args:
|
||||
image (str | Type[BaseImage]): Docker image to use for execution. The
|
||||
image can be a string or a subclass of images.BaseImage.
|
||||
default_command (List[str]): Default command to use when creating the container.
|
||||
"""
|
||||
|
||||
_docker_client: docker.DockerClient = None # type: ignore
|
||||
_params: Dict = Field(default_factory=Shell().dict(), skip=True)
|
||||
image: Union[str, Type[BaseImage]] = Field(default_factory=Shell, skip=True)
|
||||
from_env: Optional[bool] = Field(default=True, skip=True)
|
||||
|
||||
# @property
|
||||
# def image_name(self) -> str:
|
||||
# """The image name that will be used when creating a container."""
|
||||
# return self._params.image
|
||||
#
|
||||
def __init__(self, **kwargs):
    """Initialize docker client.

    When ``from_env`` is truthy a client is created from the environment and,
    if that daemon reports the gVisor runtime, ``runtime='runsc'`` is added to
    the container parameters.
    """
    super().__init__(**kwargs)

    if self.from_env:
        self._docker_client = docker.from_env()
        # Probe the client just created rather than opening a second one.
        if gvisor_runtime_available(self._docker_client):
            self._params['runtime'] = 'runsc'
|
||||
|
||||
# if not isinstance(self.image, str) and issubclass(self.image, BaseImage):
|
||||
# self._params = {**self._params, **self.image().dict()}
|
||||
#
|
||||
# # if the user defined a custom image not pre registerd already we should
|
||||
# # not use the custom command
|
||||
# elif isinstance(self.image, str):
|
||||
# self._params = {**_default_params(), **{'image': self.image}}
|
||||
|
||||
@property
def client(self) -> docker.DockerClient:  # type: ignore
    """The docker client held by this wrapper."""
    docker_client = self._docker_client
    return docker_client
|
||||
|
||||
@property
def info(self) -> Any:
    """Return the result of the docker client's ``info()`` call."""
    docker_client = self._docker_client
    return docker_client.info()
|
||||
|
||||
# @validator("image", pre=True, always=True)
|
||||
# def validate_image(cls, value):
|
||||
# if value is None:
|
||||
# raise ValueError("image is required")
|
||||
# if isinstance(value, str) :
|
||||
# image = get_image(value)
|
||||
# if isinstance(image, BaseImage):
|
||||
# return image
|
||||
# else:
|
||||
# #set default params to base ones
|
||||
# if issubclass(value, BaseImage):
|
||||
# return value
|
||||
# else:
|
||||
# raise ValueError("image must be a string or a subclass of images.BaseImage")
|
||||
|
||||
@root_validator()
def validate_all(cls, values: Dict) -> Dict:
    """Validate the image and build the container parameters in ``_params``.

    ``image`` may be a string (a registered template name or a custom image
    name), a ``BaseImage`` instance, or a ``BaseImage`` subclass. Values that
    are not declared with ``skip=True`` are merged on top of the image
    parameters.

    Raises:
        ValueError: If ``image`` is missing or of an unsupported type.
    """
    image = values.get("image")
    if image is None:
        raise ValueError("image is required")

    if isinstance(image, str):
        # try to get image
        template = get_image_template(image)
        if isinstance(template, str):
            # user wants a custom image, we should use default params
            params = {**_default_params, 'image': image}
        else:
            # user wants a pre registered image, we should use the image params
            params = template().dict()
    elif isinstance(image, BaseImage):
        params = image.dict()
    elif isinstance(image, type) and issubclass(image, BaseImage):
        # The field type also allows the template class itself.
        params = image().dict()
    else:
        raise ValueError("image must be a string or a subclass of images.BaseImage")

    fields = cls.__fields__

    def field_filter(x):
        """Keep undeclared values and declared fields not marked ``skip``."""
        if x[0] == '_params':
            return False
        field = fields.get(x[0], None)
        if not field:
            return True
        return not field.field_info.extra.get('skip', False)

    filtered_fields: Dict[Any, Any] = dict(filter(field_filter, values.items()))  # type: ignore
    values["_params"] = {**params, **filtered_fields}

    return values
|
||||
|
||||
def _clean_kwargs(self, kwargs: dict) -> dict:
|
||||
kwargs.pop('default_command', None)
|
||||
kwargs.pop('stdin_command', None)
|
||||
return kwargs
|
||||
|
||||
|
||||
|
||||
#FIX: default shell command should be different in run vs exec mode
|
||||
def run(self, query: str, **kwargs: Any) -> str:
    """Run arbitrary shell command inside a container.

    This method will concatenate the registered default command with the provided
    query.

    Args:
        query (str): The command to run.
        **kwargs: Pass extra parameters to DockerClient.container.run.
            ``remove`` defaults to True but may be overridden; ``image`` is
            always taken from the wrapper's own parameters.

    Returns:
        The decoded, stripped output of the container; ``"STDERR: ..."`` on a
        ContainerError and ``"ERROR"`` on an APIError.
    """
    kwargs = {**self._params, **kwargs}
    image = self._params.get('image')
    kwargs.pop('image', None)

    cmd = _get_command(query, **kwargs)
    self._clean_kwargs(kwargs)
    # A hard-coded ``remove=True`` next to ``**kwargs`` raised TypeError as
    # soon as the caller supplied ``remove`` themselves.
    kwargs.setdefault('remove', True)

    logger.debug(f"running command {cmd}")
    logger.debug(f"with params {kwargs}")
    try:
        result = self._docker_client.containers.run(image, cmd, **kwargs)
        return result.decode('utf-8').strip()
    except ContainerError as e:
        return f"STDERR: {e}"

    # TODO: handle docker APIError ?
    except APIError as e:
        logger.debug(f"APIError: {e}")
        return "ERROR"
|
||||
|
||||
def _flush_prompt(self, _socket):
    """Read and discard whatever the socket currently holds, then restore blocking mode."""
    discarded = _socket.recv()
    _socket.setblocking(True)
    logger.debug(f"flushed output: {discarded}")
|
||||
|
||||
def _massage_output_streams(self, output):
    """Group ``(stream_type, payload)`` frames by stream and decode them.

    Args:
        output: Iterable of ``(stream_type, payload_bytes)`` tuples. Stream
            type 1 is reported as ``'stdout'``; every other type as ``'stderr'``.

    Returns:
        Dict mapping ``'stdout'`` / ``'stderr'`` to the concatenated text of
        that stream. A stream without frames has no key.
    """
    frames_by_stream: Dict[str, list] = {}
    for stream_type, chunk in output:
        stream_name = 'stdout' if stream_type == 1 else 'stderr'
        frames_by_stream.setdefault(stream_name, []).append(chunk)

    # Join the bytes before decoding so that a multi-byte character split
    # across two frames is decoded correctly; undecodable bytes are replaced
    # instead of aborting the whole call.
    payload = {
        stream_name: b''.join(frames).decode('utf-8', errors='replace')
        for stream_name, frames in frames_by_stream.items()
    }
    logger.debug(f"payload: {payload}")
    return payload
|
||||
|
||||
|
||||
# TODO: document dif between run and exec_run
|
||||
def exec_run(self, query: str, timeout: int = 5,
             delay: float = 0.5,
             with_stderr: bool = False,
             flush_prompt: bool = False,
             **kwargs: Any) -> str:
    """Run a shell command inside an ephemeral container.

    This will create a container, run the command, and then remove the
    container. the input is sent to the container's stdin through a socket
    using Docker API. It effectively simulates a tty session.

    Args:
        query (str): The command to execute.
        timeout (int): The timeout for receiving from the attached stdin.
        delay (float): The delay in seconds before running the command.
        with_stderr (bool): If True, the stderr will be included in the output
        flush_prompt (bool): If True, the prompt will be flushed before running the command.
        **kwargs: Pass extra parameters to DockerClient.container.exec_run.

    Returns:
        The stripped stdout of the command; stdout and stderr when
        ``with_stderr`` is set and both are present; ``"STDERR: ..."`` when
        only stderr was produced; ``"ERROR: timeout"`` on a receive timeout;
        ``"ERROR"`` when nothing was received.
    """
    # it is necessary to open stdin to keep the container running after it's started
    # the attach_socket will hold the connection open until the container is stopped or
    # the socket is closed.

    # NOTE: some images like python need to be launched with custom
    # parameters to keep stdin open. For example python image needs to be
    # started with the command `python3 -i`

    # Named parameters can never show up in **kwargs, so no filtering of
    # local names is needed here.
    kwargs = {**self._params, **kwargs}

    # exec_run requires flags for stdin so we use `stdin_command` as
    # a default command for creating the container. It has to be read
    # before _clean_kwargs() strips it from the parameters.
    stdin_command = kwargs.get('stdin_command')
    kwargs = self._clean_kwargs(kwargs)
    if stdin_command is not None and 'command' not in kwargs:
        if not isinstance(stdin_command, list):
            raise TypeError("stdin_command must be a list of strings")
        kwargs['command'] = shlex.join(stdin_command)

    # TODO: handle both output mode for tty=True/False
    logger.debug(f"creating container with params {kwargs}")

    container = self._docker_client.containers.create(**kwargs)
    try:
        container.start()

        # get underlying socket
        # important to set 'stream' or attach API does not work
        raw_socket = container.attach_socket(params={'stdout': 1, 'stderr': 1,
                                                     'stdin': 1, 'stream': 1})

        with DockerSocket(raw_socket, timeout=timeout) as _socket:
            # flush the output buffer (if any prompt)
            if flush_prompt:
                self._flush_prompt(_socket)

            # TEST: make sure the container is ready ? use a blocking first call
            raw_input = f"{query}\n".encode('utf-8')
            _socket.sendall(raw_input)

            # NOTE: delay ensures that the command is executed after the input is sent
            time.sleep(delay)

            try:
                output = _socket.recv()
            except socket.timeout:
                return "ERROR: timeout"
    finally:
        # Always tear the container down, including on timeout or error,
        # so that no ephemeral container is left behind.
        try:
            container.kill()
        except APIError:
            pass
        container.remove(force=True)

    if not output:
        logger.warning("no output")
        return "ERROR"

    # output is stored in a list of tuples (stream_type, payload)
    payload = self._massage_output_streams(output)

    # NOTE: stderr might contain only the prompt
    if 'stdout' in payload and 'stderr' in payload and with_stderr:
        return f"STDOUT:\n {payload['stdout'].strip()}\nSTDERR:\n {payload['stderr']}"
    elif 'stderr' in payload and 'stdout' not in payload:
        return f"STDERR: {payload['stderr']}"
    else:
        return payload['stdout'].strip()
|
||||
|
||||
|
||||
def exec_attached(self, query: str, container: str,
                  delay: float = 0.5,
                  timeout: int = 5,
                  with_stderr: bool = False,
                  flush_prompt: bool = False,
                  **kwargs: Any) -> str:
    """Attach to an existing container and send ``query`` to its stdin.

    This method is very similar to exec_run. It only differs in that it
    attaches to an already specified container instead of creating a new
    one for each query.

    Args:
        query (str): The command to execute; a trailing newline is appended
            before it is sent.
        container (str): The container to attach to (looked up with
            ``containers.get``).
        delay (float): Seconds to sleep after sending the query and before
            reading the output.
        timeout (int): The timeout handed to the DockerSocket wrapper.
        with_stderr (bool): If True and both streams were captured, the
            result contains stdout and stderr.
        flush_prompt (bool): If True, the prompt is flushed before the
            query is sent.
        **kwargs: Extra parameters. They are merged over the instance
            defaults and cleaned; in this method they only end up in the
            debug log.

    Returns:
        str: The captured output, ``"ERROR: timeout"`` if reading timed
        out, or ``"ERROR"`` if the container could not be fetched or no
        output was captured.
    """
    # Drop extra keywords that collide with this method's local names.
    # A new dict is built on purpose: deleting entries while iterating over
    # kwargs.keys() raises "dictionary changed size during iteration".
    reserved = set(locals())
    kwargs = {key: value for key, value in kwargs.items()
              if key not in reserved}

    kwargs = {**self._params, **kwargs}
    kwargs = self._clean_kwargs(kwargs)

    logger.debug(f"attaching to container {container} with params {kwargs}")

    try:
        _container = self._docker_client.containers.get(container)
    except Exception as e:
        logger.error(f"container {container}: {e}")
        return "ERROR"

    # important to set 'stream' or attach API does not work
    _socket = _container.attach_socket(params={'stdout': 1, 'stderr': 1,
                                               'stdin': 1, 'stream': 1})

    with DockerSocket(_socket, timeout=timeout) as _socket:
        # flush the output buffer (if any prompt)
        if flush_prompt:
            self._flush_prompt(_socket)

        raw_input = f"{query}\n".encode('utf-8')
        _socket.sendall(raw_input)

        # NOTE: delay ensures that the command is executed after the input
        # is sent
        time.sleep(delay)

        try:
            output = _socket.recv()
        except socket.timeout:
            return "ERROR: timeout"

    if output is None:
        logger.warning("no output")
        return "ERROR"

    # presumably a mapping keyed by 'stdout' / 'stderr' - confirm against
    # _massage_output_streams
    payload = self._massage_output_streams(output)

    has_stdout = 'stdout' in payload
    has_stderr = 'stderr' in payload

    # NOTE: stderr might contain only the prompt
    if has_stdout and has_stderr and with_stderr:
        return f"STDOUT:\n {payload['stdout'].strip()}\nSTDERR:\n {payload['stderr']}"
    if has_stderr and not has_stdout:
        return f"STDERR: {payload['stderr']}"
    if not has_stdout:
        # Neither stream was captured: report it like the `output is None`
        # case instead of failing with KeyError on payload['stdout'].
        logger.warning("no output")
        return "ERROR"
    return payload['stdout'].strip()
|
||||
|
||||
|
||||
|
||||
#WIP method that will copy the given payload to the container filesystem then
|
||||
# invoke the command on the file and return the output
|
||||
def run_file(self, payload: bytes, filename: Optional[str] = None,
             **kwargs: Any) -> str:
    """Copy ``payload`` into a container and run it there with ``sh``.

    WIP: instead of creating an ephemeral container from ``kwargs`` this
    method still reuses the first container returned by
    ``containers.list()`` and leaves it running afterwards.

    Args:
        payload (bytes): Content of the script to execute.
        filename (Optional[str]): Name to store the payload under inside
            ``/tmp``. Leading path components are stripped. Defaults to
            ``payload``.
        **kwargs: Extra parameters merged over the instance defaults. They
            are prepared for container creation, which is not wired in yet.

    Returns:
        str: The decoded output of the command, or ``"ERROR"`` if no
        container is available or the payload could not be copied.
    """
    # Drop extra keywords that collide with this method's local names.
    # A new dict is built on purpose: deleting entries while iterating over
    # kwargs.keys() raises "dictionary changed size during iteration".
    reserved = set(locals())
    kwargs = {key: value for key, value in kwargs.items()
              if key not in reserved}

    kwargs = {**self._params, **kwargs}
    # keep the cleaned mapping, as the sibling methods do
    kwargs = self._clean_kwargs(kwargs)
    kwargs['command'] = '/bin/sh'

    dest_dir = '/tmp'
    member_name = 'payload'
    if filename is not None:
        # strip all leading path components; fall back to the default name
        # when nothing is left (e.g. filename ends with a slash)
        member_name = os.path.basename(filename) or member_name
    k_file_location = f'{dest_dir}/{member_name}'

    # WIP: container creation from kwargs is not enabled yet, reuse the
    # first listed container.
    running = self._docker_client.containers.list()
    if not running:
        logger.error("no running container available to run the payload")
        return "ERROR"
    container = running[0]
    logger.debug(f"using container {container.short_id}")

    # copy the payload to the container
    try:
        # The archive member is named after the target file and extracted
        # into dest_dir so that it really ends up at k_file_location, which
        # is the path executed below.
        archive = io.BytesIO()
        with tarfile.TarFile(fileobj=archive, mode='w') as tar:
            tarinfo = tarfile.TarInfo(name=member_name)
            tarinfo.size = len(payload)
            tarinfo.mtime = int(time.time())
            tar.addfile(tarinfo, io.BytesIO(payload))
        archive.seek(0)

        container.put_archive(path=dest_dir, data=archive)
    except APIError as e:
        logger.error(f"Error: {e}")
        return "ERROR"

    # execute the command
    exit_code, out = container.exec_run(['sh', k_file_location])
    logger.debug(f"exit_code: {exit_code}")
    logger.debug(f"out: {out}")

    if out is None:
        return ""
    if isinstance(out, bytes):
        return out.decode('utf-8', errors='replace')
    return str(out)
|
@ -1,322 +0,0 @@
|
||||
"""Wrapper around Atlas by Nomic."""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import uuid
|
||||
from typing import Any, Iterable, List, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.embeddings.base import Embeddings
|
||||
from langchain.vectorstores.base import VectorStore
|
||||
|
||||
logger = logging.getLogger()
|
||||
|
||||
|
||||
class AtlasDB(VectorStore):
    """Wrapper around Atlas: Nomic's neural database and rhizomatic instrument.

    To use, you should have the ``nomic`` python package installed.

    Example:
        .. code-block:: python

                from langchain.vectorstores import AtlasDB
                from langchain.embeddings.openai import OpenAIEmbeddings

                embeddings = OpenAIEmbeddings()
                vectorstore = AtlasDB(
                    "my_project", embeddings, api_key="<your nomic api key>"
                )
    """

    # Name of the record field that carries the unique document id.
    _ATLAS_DEFAULT_ID_FIELD = "atlas_id"

    def __init__(
        self,
        name: str,
        embedding_function: Optional[Embeddings] = None,
        api_key: Optional[str] = None,
        description: str = "A description for your project",
        is_public: bool = True,
        reset_project_if_exists: bool = False,
    ) -> None:
        """
        Initialize the Atlas Client

        Args:
            name (str): The name of your project. If the project already exists,
                it will be loaded.
            embedding_function (Optional[Embeddings]): An optional embeddings
                object used for embedding your data (its ``embed_documents``
                method is called). If None, the project is created with the
                "text" modality and raw text is uploaded instead.
            api_key (str): Your nomic API key
            description (str): A description for your project.
            is_public (bool): Whether your project is publicly accessible.
                True by default.
            reset_project_if_exists (bool): Whether to reset this project if it
                already exists. Default False.
                Generally useful during development and testing.

        Raises:
            ValueError: If the ``nomic`` package cannot be imported or no
                ``api_key`` is given.
        """
        try:
            import nomic
            from nomic import AtlasProject
        except ImportError:
            raise ValueError(
                "Could not import nomic python package. "
                "Please it install it with `pip install nomic`."
            )

        if api_key is None:
            raise ValueError("No API key provided. Sign up at atlas.nomic.ai!")
        nomic.login(api_key)

        self._embedding_function = embedding_function
        modality = "text"
        if self._embedding_function is not None:
            modality = "embedding"

        # Check if the project exists, create it if not
        self.project = AtlasProject(
            name=name,
            description=description,
            modality=modality,
            is_public=is_public,
            reset_project_if_exists=reset_project_if_exists,
            unique_id_field=AtlasDB._ATLAS_DEFAULT_ID_FIELD,
        )
        self.project._latest_project_state()

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        refresh: bool = True,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts (Iterable[str]): Texts to add to the vectorstore.
            metadatas (Optional[List[dict]], optional): Optional list of metadatas.
                The dicts are copied, never modified in place.
            ids (Optional[List[str]]): An optional list of ids.
            refresh(bool): Whether or not to refresh indices with the updated data.
                Default True.

        Returns:
            List[str]: List of IDs of the added texts.

        Raises:
            ValueError: If the first metadata dict already has a ``text`` key.
        """

        if (
            metadatas is not None
            and len(metadatas) > 0
            and "text" in metadatas[0].keys()
        ):
            raise ValueError("Cannot accept key text in metadata!")

        texts = list(texts)
        if ids is None:
            ids = [str(uuid.uuid1()) for _ in texts]

        id_field = AtlasDB._ATLAS_DEFAULT_ID_FIELD

        # Embed first (when configured) so a failing embedding call happens
        # before anything is validated or uploaded.
        embeddings = None
        if self._embedding_function is not None:
            _embeddings = self._embedding_function.embed_documents(texts)
            embeddings = np.stack(_embeddings)

        # One record per document. Each record stores its OWN text (the text
        # upload path used to store the complete list of texts in every
        # record) and is a fresh dict so the caller's metadata is untouched.
        if metadatas is None:
            data = [{id_field: ids[i], "text": text} for i, text in enumerate(texts)]
        else:
            data = [
                {**metadata, id_field: ids[i], "text": texts[i]}
                for i, metadata in enumerate(metadatas)
            ]

        self.project._validate_map_data_inputs([], id_field=id_field, data=data)

        with self.project.wait_for_project_lock():
            if embeddings is not None:
                # Embedding upload case
                self.project.add_embeddings(embeddings=embeddings, data=data)
            else:
                # Text upload case
                self.project.add_text(data)

        if refresh:
            if len(self.project.indices) > 0:
                with self.project.wait_for_project_lock():
                    self.project.rebuild_maps()

        return ids

    def create_index(self, **kwargs: Any) -> Any:
        """Creates an index in your project.

        See
        https://docs.nomic.ai/atlas_api.html#nomic.project.AtlasProject.create_index
        for full detail.
        """
        with self.project.wait_for_project_lock():
            return self.project.create_index(**kwargs)

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """Run similarity search with AtlasDB

        Args:
            query (str): Query text to search for.
            k (int): Number of results to return. Defaults to 4.

        Returns:
            List[Document]: List of documents most similar to the query text.

        Raises:
            NotImplementedError: If no embedding function was configured.
        """
        if self._embedding_function is None:
            raise NotImplementedError(
                "AtlasDB requires an embedding_function for text similarity search!"
            )

        _embedding = self._embedding_function.embed_documents([query])[0]
        embedding = np.array(_embedding).reshape(1, -1)
        with self.project.wait_for_project_lock():
            neighbors, _ = self.project.projections[0].vector_search(
                queries=embedding, k=k
            )
            # A single query was submitted, so only the first row of
            # neighbours is relevant.
            datas = self.project.get_data(ids=neighbors[0])

        # Build one document per fetched record. Enumerating `neighbors`
        # itself walks the per-query rows (presumably a single row here -
        # confirm against the nomic API) and dropped all but the first hit.
        return [Document(page_content=data["text"], metadata=data) for data in datas]

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Optional[Embeddings] = None,
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        name: Optional[str] = None,
        api_key: Optional[str] = None,
        description: str = "A description for your project",
        is_public: bool = True,
        reset_project_if_exists: bool = False,
        index_kwargs: Optional[dict] = None,
        **kwargs: Any,
    ) -> AtlasDB:
        """Create an AtlasDB vectorstore from raw texts.

        Args:
            texts (List[str]): The list of texts to ingest.
            name (str): Name of the project to create.
            api_key (str): Your nomic API key,
            embedding (Optional[Embeddings]): Embedding function. Defaults to None.
            metadatas (Optional[List[dict]]): List of metadatas. Defaults to None.
            ids (Optional[List[str]]): Optional list of document IDs. If None,
                ids will be auto created
            description (str): A description for your project.
            is_public (bool): Whether your project is publicly accessible.
                True by default.
            reset_project_if_exists (bool): Whether to reset this project if it
                already exists. Default False.
                Generally useful during development and testing.
            index_kwargs (Optional[dict]): Dict of kwargs for index creation.
                See https://docs.nomic.ai/atlas_api.html

        Returns:
            AtlasDB: Nomic's neural database and finest rhizomatic instrument

        Raises:
            ValueError: If ``name`` or ``api_key`` is None.
        """
        if name is None or api_key is None:
            raise ValueError("`name` and `api_key` cannot be None.")

        # Inject relevant kwargs; caller supplied values win over defaults
        all_index_kwargs = {"name": name + "_index", "indexed_field": "text"}
        if index_kwargs is not None:
            all_index_kwargs.update(index_kwargs)

        # Build project
        atlasDB = cls(
            name,
            embedding_function=embedding,
            api_key=api_key,
            # forward the caller's description instead of a fixed string
            description=description,
            is_public=is_public,
            reset_project_if_exists=reset_project_if_exists,
        )
        with atlasDB.project.wait_for_project_lock():
            atlasDB.add_texts(texts=texts, metadatas=metadatas, ids=ids)
            atlasDB.create_index(**all_index_kwargs)
        return atlasDB

    @classmethod
    def from_documents(
        cls,
        documents: List[Document],
        embedding: Optional[Embeddings] = None,
        ids: Optional[List[str]] = None,
        name: Optional[str] = None,
        api_key: Optional[str] = None,
        persist_directory: Optional[str] = None,
        description: str = "A description for your project",
        is_public: bool = True,
        reset_project_if_exists: bool = False,
        index_kwargs: Optional[dict] = None,
        **kwargs: Any,
    ) -> AtlasDB:
        """Create an AtlasDB vectorstore from a list of documents.

        Args:
            name (str): Name of the collection to create.
            api_key (str): Your nomic API key,
            documents (List[Document]): List of documents to add to the vectorstore.
            embedding (Optional[Embeddings]): Embedding function. Defaults to None.
            ids (Optional[List[str]]): Optional list of document IDs. If None,
                ids will be auto created
            persist_directory (Optional[str]): Accepted but not used by this
                method.
            description (str): A description for your project.
            is_public (bool): Whether your project is publicly accessible.
                True by default.
            reset_project_if_exists (bool): Whether to reset this project if
                it already exists. Default False.
                Generally useful during development and testing.
            index_kwargs (Optional[dict]): Dict of kwargs for index creation.
                See https://docs.nomic.ai/atlas_api.html

        Returns:
            AtlasDB: Nomic's neural database and finest rhizomatic instrument

        Raises:
            ValueError: If ``name`` or ``api_key`` is None.
        """
        if name is None or api_key is None:
            raise ValueError("`name` and `api_key` cannot be None.")
        texts = [doc.page_content for doc in documents]
        metadatas = [doc.metadata for doc in documents]
        return cls.from_texts(
            name=name,
            api_key=api_key,
            texts=texts,
            embedding=embedding,
            metadatas=metadatas,
            ids=ids,
            description=description,
            is_public=is_public,
            reset_project_if_exists=reset_project_if_exists,
            index_kwargs=index_kwargs,
        )
|
@ -1,211 +0,0 @@
|
||||
"""Wrapper around Activeloop Deep Lake."""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import uuid
|
||||
from typing import Any, Iterable, List, Optional, Sequence
|
||||
|
||||
import numpy as np
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.embeddings.base import Embeddings
|
||||
from langchain.vectorstores.base import VectorStore
|
||||
|
||||
logger = logging.getLogger()
|
||||
|
||||
|
||||
def L2_search(
    query_embedding: np.ndarray, data_vectors: np.ndarray, k: int = 4
) -> list:
    """Brute-force nearest-neighbour lookup using the Euclidean (L2) norm.

    Args:
        query_embedding: The vector to compare against; it is subtracted
            from ``data_vectors`` with numpy broadcasting.
        data_vectors: Candidate vectors, one per row (the norm is taken
            along axis 1).
        k: Maximum number of row indices to return.

    Returns:
        list: Row indices of ``data_vectors`` ordered from the closest to
        the farthest candidate, truncated to ``k`` entries.
    """
    # Per-row offset from the query, reduced to one distance per candidate.
    offsets = data_vectors - query_embedding
    l2_per_row = np.linalg.norm(offsets, axis=1)

    # Rank every candidate by distance, then keep only the leading k.
    ranking = np.argsort(l2_per_row)
    closest = ranking[:k]
    return closest.tolist()
|
||||
|
||||
|
||||
class DeepLake(VectorStore):
    """Wrapper around Deep Lake, a data lake for deep learning applications.

    It not only stores embeddings, but also the original data and queries with
    version control automatically enabled.

    It is more than just a vector store. You can use the dataset to fine-tune
    your own LLM models or use it for other downstream tasks.

    We implement naive similarity search, but it can be extended with Tensor
    Query Language (TQL for production use cases) over billion rows.

    To use, you should have the ``deeplake`` python package installed.

    Example:
        .. code-block:: python

                from langchain.vectorstores import DeepLake
                from langchain.embeddings.openai import OpenAIEmbeddings

                embeddings = OpenAIEmbeddings()
                vectorstore = DeepLake(
                    "langchain_store", embedding_function=embeddings
                )
    """

    # Dataset location used when the caller does not provide one.
    _LANGCHAIN_DEFAULT_DEEPLAKE_PATH = "mem://langchain"

    def __init__(
        self,
        dataset_path: str = _LANGCHAIN_DEFAULT_DEEPLAKE_PATH,
        token: Optional[str] = None,
        embedding_function: Optional[Embeddings] = None,
    ) -> None:
        """Initialize with Deep Lake client.

        Loads the dataset at ``dataset_path`` if it exists, otherwise creates
        an empty one with ``text``, ``metadata``, ``embedding`` and ``ids``
        tensors.

        Args:
            dataset_path (str): Location of the dataset.
            token (Optional[str]): Token passed to the deeplake calls.
            embedding_function (Optional[Embeddings]): Embeddings object whose
                ``embed_documents`` / ``embed_query`` methods are used. If
                None, no embeddings are computed.

        Raises:
            ValueError: If the ``deeplake`` package cannot be imported.
        """

        try:
            import deeplake
        except ImportError:
            raise ValueError(
                "Could not import deeplake python package. "
                "Please it install it with `pip install deeplake`."
            )
        self._deeplake = deeplake

        if deeplake.exists(dataset_path, token=token):
            self.ds = deeplake.load(dataset_path, token=token)
            logger.warning(
                f"Deep Lake Dataset in {dataset_path} already exists, "
                f"loading from the storage"
            )
            self.ds.summary()
        else:
            self.ds = deeplake.empty(dataset_path, token=token, overwrite=True)
            with self.ds:
                self.ds.create_tensor("text", htype="text")
                self.ds.create_tensor("metadata", htype="json")
                self.ds.create_tensor("embedding", htype="generic")
                self.ds.create_tensor("ids", htype="text")

        self._embedding_function = embedding_function

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts (Iterable[str]): Texts to add to the vectorstore.
            metadatas (Optional[List[dict]], optional): Optional list of metadatas.
            ids (Optional[List[str]], optional): Optional list of IDs.

        Returns:
            List[str]: List of IDs of the added texts.
        """

        # Materialise the iterable BEFORE anything else walks it: generating
        # the ids from `texts` directly would exhaust a one-shot iterator
        # and leave nothing to store.
        text_list = list(texts)

        if ids is None:
            ids = [str(uuid.uuid1()) for _ in text_list]

        if self._embedding_function is None:
            embeddings: Sequence[Optional[List[float]]] = [None] * len(text_list)
        else:
            embeddings = self._embedding_function.embed_documents(text_list)

        if metadatas is None:
            metadatas_to_use: Sequence[Optional[dict]] = [None] * len(text_list)
        else:
            metadatas_to_use = metadatas

        # zip() stops at the shortest sequence, so surplus entries are dropped
        elements = zip(text_list, embeddings, metadatas_to_use, ids)

        @self._deeplake.compute
        def ingest(sample_in: list, sample_out: list) -> None:
            # Map one (text, embedding, metadata, id) tuple onto the tensors
            # created in __init__.
            s = {
                "text": sample_in[0],
                "embedding": sample_in[1],
                "metadata": sample_in[2],
                "ids": sample_in[3],
            }
            sample_out.append(s)

        ingest().eval(list(elements), self.ds)
        self.ds.commit()

        return ids

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to query.

        With an embedding function the ``k`` nearest stored embeddings (L2
        distance) are returned. Without one, every entry whose text contains
        ``query`` is returned.

        Args:
            query (str): Query text.
            k (int): Number of nearest neighbours to return. Defaults to 4.
                NOTE(review): not applied in the substring fallback.

        Returns:
            List[Document]: The matching documents.
        """
        if self._embedding_function is None:
            self.ds.summary()
            ds_view = self.ds.filter(lambda x: query in x["text"].data()["value"])
        else:
            query_emb = np.array(self._embedding_function.embed_query(query))
            embeddings = self.ds.embedding.numpy()
            indices = L2_search(query_emb, embeddings, k=k)
            ds_view = self.ds[indices]

        docs = [
            Document(
                page_content=el["text"].data()["value"],
                metadata=el["metadata"].data()["value"],
            )
            for el in ds_view
        ]
        return docs

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Optional[Embeddings] = None,
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        dataset_path: str = _LANGCHAIN_DEFAULT_DEEPLAKE_PATH,
        **kwargs: Any,
    ) -> DeepLake:
        """Create a Deep Lake dataset from raw texts.

        If a persistent ``dataset_path`` is specified, the dataset will be
        stored there. Otherwise, the data will be ephemeral in-memory.

        Args:
            dataset_path (str): - The full path to the dataset. Can be:
                - a Deep Lake cloud path of the form ``hub://username/datasetname``.
                    To write to Deep Lake cloud datasets,
                    ensure that you are logged in to Deep Lake
                    (use 'activeloop login' from command line)
                - an s3 path of the form ``s3://bucketname/path/to/dataset``.
                    Credentials are required in the environment.
                - a local file system path of the form ``./path/to/dataset`` or
                    ``~/path/to/dataset`` or ``path/to/dataset``.
                - a memory path of the form ``mem://path/to/dataset`` which doesn't
                    save the dataset but keeps it in memory instead.
                    Should be used only for testing as it does not persist.
            texts (List[str]): List of texts to add.
            embedding (Optional[Embeddings]): Embedding function. Defaults to None.
            metadatas (Optional[List[dict]]): List of metadatas. Defaults to None.
            ids (Optional[List[str]]): List of document IDs. Defaults to None.

        Returns:
            DeepLake: Deep Lake dataset.
        """
        deeplake_dataset = cls(
            dataset_path=dataset_path,
            embedding_function=embedding,
        )
        deeplake_dataset.add_texts(texts=texts, metadatas=metadatas, ids=ids)
        return deeplake_dataset

    def delete_dataset(self) -> None:
        """Delete the collection."""
        self.ds.delete()

    def persist(self) -> None:
        """Persist the collection."""
        self.ds.flush()
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue