2023-02-27 06:11:38 +00:00
|
|
|
{
|
|
|
|
"cells": [
|
|
|
|
{
|
2023-05-02 21:20:01 +00:00
|
|
|
"attachments": {},
|
2023-02-27 06:11:38 +00:00
|
|
|
"cell_type": "markdown",
|
2023-03-10 00:31:14 +00:00
|
|
|
"metadata": {},
|
2023-02-27 06:11:38 +00:00
|
|
|
"source": [
|
|
|
|
"# AtlasDB\n",
|
|
|
|
"\n",
|
2023-04-29 02:26:50 +00:00
|
|
|
"This notebook shows you how to use the `AtlasDB` vector store.\n",
|
|
|
|
"\n",
|
2023-05-02 21:20:01 +00:00
|
|
|
"[Atlas](https://docs.nomic.ai/index.html) is a platform by Nomic for interacting with both small and internet-scale unstructured datasets."
|
2023-04-29 02:26:50 +00:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": null,
|
|
|
|
"metadata": {
|
|
|
|
"tags": []
|
|
|
|
},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"!pip install spacy"
|
2023-02-27 06:11:38 +00:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-03-10 00:31:14 +00:00
|
|
|
"execution_count": null,
|
2023-02-27 06:11:38 +00:00
|
|
|
"metadata": {
|
|
|
|
"pycharm": {
|
2023-03-10 00:31:14 +00:00
|
|
|
"is_executing": true
|
2023-04-29 02:26:50 +00:00
|
|
|
},
|
|
|
|
"scrolled": true,
|
|
|
|
"tags": []
|
2023-02-27 06:11:38 +00:00
|
|
|
},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
2023-04-29 02:26:50 +00:00
|
|
|
"!python3 -m spacy download en_core_web_sm"
|
2023-02-27 06:11:38 +00:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-03-10 00:31:14 +00:00
|
|
|
"execution_count": null,
|
2023-04-29 02:26:50 +00:00
|
|
|
"metadata": {
|
|
|
|
"tags": []
|
|
|
|
},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"!pip install nomic"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 6,
|
2023-02-27 06:11:38 +00:00
|
|
|
"metadata": {
|
2023-03-10 00:31:14 +00:00
|
|
|
"pycharm": {
|
|
|
|
"is_executing": true
|
2023-03-27 02:49:46 +00:00
|
|
|
},
|
2023-04-29 02:26:50 +00:00
|
|
|
"tags": []
|
2023-03-10 00:31:14 +00:00
|
|
|
},
|
|
|
|
"outputs": [],
|
2023-02-27 06:11:38 +00:00
|
|
|
"source": [
|
2023-04-29 02:26:50 +00:00
|
|
|
"import time\n",
|
|
|
|
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
|
|
|
|
"from langchain.text_splitter import SpacyTextSplitter\n",
|
|
|
|
"from langchain.vectorstores import AtlasDB\n",
|
|
|
|
"from langchain.document_loaders import TextLoader"
|
2023-02-27 06:11:38 +00:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-04-29 02:26:50 +00:00
|
|
|
"execution_count": 7,
|
|
|
|
"metadata": {
|
|
|
|
"tags": []
|
|
|
|
},
|
2023-02-27 06:11:38 +00:00
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"ATLAS_TEST_API_KEY = '7xDPkYXSYDc1_ErdTPIcoAR9RNd8YDlkS3nVNXcVoIMZ6'"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-04-29 02:26:50 +00:00
|
|
|
"execution_count": 8,
|
|
|
|
"metadata": {
|
|
|
|
"tags": []
|
|
|
|
},
|
2023-02-27 06:11:38 +00:00
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
2023-03-27 02:49:46 +00:00
|
|
|
"loader = TextLoader('../../../state_of_the_union.txt')\n",
|
2023-02-27 06:11:38 +00:00
|
|
|
"documents = loader.load()\n",
|
|
|
|
"text_splitter = SpacyTextSplitter(separator='|')\n",
|
|
|
|
"texts = []\n",
|
|
|
|
"for doc in text_splitter.split_documents(documents):\n",
|
|
|
|
" texts.extend(doc.page_content.split('|'))\n",
|
|
|
|
" \n",
|
|
|
|
"texts = [e.strip() for e in texts]"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-03-10 00:31:14 +00:00
|
|
|
"execution_count": null,
|
|
|
|
"metadata": {
|
|
|
|
"pycharm": {
|
|
|
|
"is_executing": true
|
2023-04-29 02:26:50 +00:00
|
|
|
},
|
|
|
|
"tags": []
|
2023-03-10 00:31:14 +00:00
|
|
|
},
|
|
|
|
"outputs": [],
|
2023-02-27 06:11:38 +00:00
|
|
|
"source": [
|
|
|
|
"db = AtlasDB.from_texts(texts=texts,\n",
|
2023-03-10 00:31:14 +00:00
|
|
|
" name='test_index_'+str(time.time()), # unique name for your vector store\n",
|
|
|
|
" description='test_index', #a description for your vector store\n",
|
2023-02-27 06:11:38 +00:00
|
|
|
" api_key=ATLAS_TEST_API_KEY,\n",
|
|
|
|
" index_kwargs={'build_topic_model': True})"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-03-10 00:31:14 +00:00
|
|
|
"execution_count": null,
|
2023-03-27 02:49:46 +00:00
|
|
|
"metadata": {},
|
2023-03-10 00:31:14 +00:00
|
|
|
"outputs": [],
|
2023-02-27 06:11:38 +00:00
|
|
|
"source": [
|
2023-03-10 00:31:14 +00:00
|
|
|
"db.project.wait_for_project_lock()"
|
2023-03-27 02:49:46 +00:00
|
|
|
]
|
2023-02-27 06:11:38 +00:00
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 7,
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/html": [
|
|
|
|
"\n",
|
|
|
|
" <strong><a href=\"https://atlas.nomic.ai/dashboard/project/ee2354a3-7f9a-4c6b-af43-b0cda09d7198\">test_index_1677255228.136989</strong></a>\n",
|
|
|
|
" <br>\n",
|
|
|
|
" A description for your project 508 datums inserted.\n",
|
|
|
|
" <br>\n",
|
|
|
|
" 1 index built.\n",
|
|
|
|
" <br><strong>Projections</strong>\n",
|
|
|
|
"<ul>\n",
|
|
|
|
"<li>test_index_1677255228.136989_index. Status Completed. <a target=\"_blank\" href=\"https://atlas.nomic.ai/map/ee2354a3-7f9a-4c6b-af43-b0cda09d7198/db996d77-8981-48a0-897a-ff2c22bbf541\">view online</a></li></ul><hr><script>\n",
|
|
|
|
" destroy = function() {\n",
|
|
|
|
" document.getElementById(\"iframedb996d77-8981-48a0-897a-ff2c22bbf541\").remove()\n",
|
|
|
|
" }\n",
|
|
|
|
" </script>\n",
|
|
|
|
"\n",
|
|
|
|
" <h4>Projection ID: db996d77-8981-48a0-897a-ff2c22bbf541</h4>\n",
|
|
|
|
" <div class=\"actions\">\n",
|
|
|
|
" <div id=\"hide\" class=\"action\" onclick=\"destroy()\">Hide embedded project</div>\n",
|
|
|
|
" <div class=\"action\" id=\"out\">\n",
|
|
|
|
" <a href=\"https://atlas.nomic.ai/map/ee2354a3-7f9a-4c6b-af43-b0cda09d7198/db996d77-8981-48a0-897a-ff2c22bbf541\" target=\"_blank\">Explore on atlas.nomic.ai</a>\n",
|
|
|
|
" </div>\n",
|
|
|
|
" </div>\n",
|
|
|
|
" \n",
|
|
|
|
" <iframe class=\"iframe\" id=\"iframedb996d77-8981-48a0-897a-ff2c22bbf541\" allow=\"clipboard-read; clipboard-write\" src=\"https://atlas.nomic.ai/map/ee2354a3-7f9a-4c6b-af43-b0cda09d7198/db996d77-8981-48a0-897a-ff2c22bbf541\">\n",
|
|
|
|
" </iframe>\n",
|
|
|
|
"\n",
|
|
|
|
" <style>\n",
|
|
|
|
" .iframe {\n",
|
|
|
|
" /* vh can be **very** large in vscode ipynb. */\n",
|
|
|
|
" height: min(75vh, 66vw);\n",
|
|
|
|
" width: 100%;\n",
|
|
|
|
" }\n",
|
|
|
|
" </style>\n",
|
|
|
|
" \n",
|
|
|
|
" <style>\n",
|
|
|
|
" .actions {\n",
|
|
|
|
" display: block;\n",
|
|
|
|
" }\n",
|
|
|
|
" .action {\n",
|
|
|
|
" min-height: 18px;\n",
|
|
|
|
" margin: 5px;\n",
|
|
|
|
" transition: all 500ms ease-in-out;\n",
|
|
|
|
" }\n",
|
|
|
|
" .action:hover {\n",
|
|
|
|
" cursor: pointer;\n",
|
|
|
|
" }\n",
|
|
|
|
" #hide:hover::after {\n",
|
|
|
|
" content: \" X\";\n",
|
|
|
|
" }\n",
|
|
|
|
" #out:hover::after {\n",
|
|
|
|
" content: \"\";\n",
|
|
|
|
" }\n",
|
|
|
|
" </style>\n",
|
|
|
|
" "
|
|
|
|
],
|
|
|
|
"text/plain": [
|
|
|
|
"AtlasProject: <{'id': 'ee2354a3-7f9a-4c6b-af43-b0cda09d7198', 'owner': '9c29afbb-a002-4d49-958e-ecf5ae1351ac', 'project_name': 'test_index_1677255228.136989', 'creator': 'auth0|63efc4b5462246f4d9a6ecf2', 'description': 'A description for your project', 'opensearch_index_id': 'f61fb8dd-0abf-4f31-9130-41870e443902', 'is_public': True, 'project_fields': ['atlas_id', 'text'], 'unique_id_field': 'atlas_id', 'modality': 'text', 'total_datums_in_project': 508, 'created_timestamp': '2023-02-24T16:13:50.313363+00:00', 'atlas_indices': [{'id': 'b1b01833-0964-4597-a4bc-a2d60700949d', 'project_id': 'ee2354a3-7f9a-4c6b-af43-b0cda09d7198', 'index_name': 'test_index_1677255228.136989_index', 'indexed_field': 'text', 'created_timestamp': '2023-02-24T16:13:52.957101+00:00', 'updated_timestamp': '2023-02-24T16:14:03.469621+00:00', 'atoms': ['charchunk', 'document'], 'colorable_fields': [], 'embedders': [{'id': '7ec0868a-4eed-4414-a482-25cce9803e1b', 'atlas_index_id': 'b1b01833-0964-4597-a4bc-a2d60700949d', 'ready': True, 'model_name': 'NomicEmbed', 'hyperparameters': {'norm': 'both', 'batch_size': 20, 'polymerize_by': 'charchunk', 'dataset_buffer_size': 1000}}], 'nearest_neighbor_indices': [{'id': '86f8e3ff-e07c-4678-a4d7-144db4b0301d', 'index_name': 'NomicOrganize', 'ready': True, 'hyperparameters': {'dim': 384, 'space': 'l2'}, 'atom_strategies': ['document']}], 'projections': [{'id': 'db996d77-8981-48a0-897a-ff2c22bbf541', 'projection_name': 'NomicProject', 'ready': True, 'hyperparameters': {'spread': 1.0, 'n_epochs': 50, 'n_neighbors': 15}, 'atom_strategies': ['document'], 'created_timestamp': '2023-02-24T16:13:52.979561+00:00', 'updated_timestamp': '2023-02-24T16:14:03.466309+00:00'}]}], 'insert_update_delete_lock': False}>"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 7,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"db.project"
|
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"metadata": {
|
|
|
|
"kernelspec": {
|
|
|
|
"display_name": "Python 3 (ipykernel)",
|
|
|
|
"language": "python",
|
|
|
|
"name": "python3"
|
|
|
|
},
|
|
|
|
"language_info": {
|
|
|
|
"codemirror_mode": {
|
|
|
|
"name": "ipython",
|
|
|
|
"version": 3
|
|
|
|
},
|
|
|
|
"file_extension": ".py",
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
"name": "python",
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
"pygments_lexer": "ipython3",
|
2023-04-29 02:26:50 +00:00
|
|
|
"version": "3.10.6"
|
2023-02-27 06:11:38 +00:00
|
|
|
}
|
|
|
|
},
|
|
|
|
"nbformat": 4,
|
2023-04-29 02:26:50 +00:00
|
|
|
"nbformat_minor": 4
|
2023-03-10 00:31:14 +00:00
|
|
|
}
|