{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Atlas\n",
    "\n",
    "\n",
    ">[Atlas](https://docs.nomic.ai/index.html) is a platform by `Nomic` for interacting with both small and internet-scale unstructured datasets.\n",
    "\n",
    "This notebook shows you how to use functionality related to the `AtlasDB` vectorstore."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "%pip install spacy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true
    },
    "scrolled": true,
    "tags": []
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "# Use the kernel's own interpreter so the model is installed into the same\n",
    "# environment that the %pip cells target.\n",
    "!{sys.executable} -m spacy download en_core_web_sm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "%pip install nomic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import getpass\n",
    "import os\n",
    "import time\n",
    "\n",
    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
    "from langchain.text_splitter import SpacyTextSplitter\n",
    "from langchain.vectorstores import AtlasDB\n",
    "from langchain.document_loaders import TextLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Never commit an API key with the notebook: read it from the environment,\n",
    "# and fall back to an interactive prompt whose input is not echoed or saved.\n",
    "ATLAS_TEST_API_KEY = os.environ.get('ATLAS_TEST_API_KEY') or getpass.getpass('Nomic Atlas API key: ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "loader = TextLoader('../../../state_of_the_union.txt')\n",
    "documents = loader.load()\n",
    "text_splitter = SpacyTextSplitter(separator='|')\n",
    "texts = []\n",
    "for doc in text_splitter.split_documents(documents):\n",
    "    # Split each chunk again on the same '|' separator given to the splitter.\n",
    "    texts.extend(doc.page_content.split('|'))\n",
    "\n",
    "texts = [e.strip() for e in texts]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "db = AtlasDB.from_texts(\n",
    "    texts=texts,\n",
    "    name='test_index_' + str(time.time()),  # unique name for your vector store\n",
    "    description='test_index',  # a description for your vector store\n",
    "    api_key=ATLAS_TEST_API_KEY,\n",
    "    index_kwargs={'build_topic_model': True},\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "db.project.wait_for_project_lock()"
   ]
  },
  { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " test_index_1677255228.136989\n", "
\n", " A description for your project 508 datums inserted.\n", "
\n", " 1 index built.\n", "
Projections\n", "
\n", "\n", "

Projection ID: db996d77-8981-48a0-897a-ff2c22bbf541

\n", "
\n", "
Hide embedded project
\n", "
\n", " Explore on atlas.nomic.ai\n", "
\n", "
\n", " \n", " \n", "\n", " \n", " \n", " \n", " " ], "text/plain": [ "AtlasProject: <{'id': 'ee2354a3-7f9a-4c6b-af43-b0cda09d7198', 'owner': '9c29afbb-a002-4d49-958e-ecf5ae1351ac', 'project_name': 'test_index_1677255228.136989', 'creator': 'auth0|63efc4b5462246f4d9a6ecf2', 'description': 'A description for your project', 'opensearch_index_id': 'f61fb8dd-0abf-4f31-9130-41870e443902', 'is_public': True, 'project_fields': ['atlas_id', 'text'], 'unique_id_field': 'atlas_id', 'modality': 'text', 'total_datums_in_project': 508, 'created_timestamp': '2023-02-24T16:13:50.313363+00:00', 'atlas_indices': [{'id': 'b1b01833-0964-4597-a4bc-a2d60700949d', 'project_id': 'ee2354a3-7f9a-4c6b-af43-b0cda09d7198', 'index_name': 'test_index_1677255228.136989_index', 'indexed_field': 'text', 'created_timestamp': '2023-02-24T16:13:52.957101+00:00', 'updated_timestamp': '2023-02-24T16:14:03.469621+00:00', 'atoms': ['charchunk', 'document'], 'colorable_fields': [], 'embedders': [{'id': '7ec0868a-4eed-4414-a482-25cce9803e1b', 'atlas_index_id': 'b1b01833-0964-4597-a4bc-a2d60700949d', 'ready': True, 'model_name': 'NomicEmbed', 'hyperparameters': {'norm': 'both', 'batch_size': 20, 'polymerize_by': 'charchunk', 'dataset_buffer_size': 1000}}], 'nearest_neighbor_indices': [{'id': '86f8e3ff-e07c-4678-a4d7-144db4b0301d', 'index_name': 'NomicOrganize', 'ready': True, 'hyperparameters': {'dim': 384, 'space': 'l2'}, 'atom_strategies': ['document']}], 'projections': [{'id': 'db996d77-8981-48a0-897a-ff2c22bbf541', 'projection_name': 'NomicProject', 'ready': True, 'hyperparameters': {'spread': 1.0, 'n_epochs': 50, 'n_neighbors': 15}, 'atom_strategies': ['document'], 'created_timestamp': '2023-02-24T16:13:52.979561+00:00', 'updated_timestamp': '2023-02-24T16:14:03.466309+00:00'}]}], 'insert_update_delete_lock': False}>" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "db.project" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", 
"language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" } }, "nbformat": 4, "nbformat_minor": 4 }