nomic: init pkg (#16853)

Co-authored-by: Lance Martin <lance@langchain.dev>
5 months ago · 17e886388b
parent 2e5949b6f8
commit 17e886388b
24 changed files with 2179 additions and 0 deletions
--- a/.github/workflows/_integration_test.yml
+++ b/.github/workflows/_integration_test.yml
@ -56,6 +56,7 @@ jobs:
          GOOGLE_SEARCH_API_KEY: ${{ secrets.GOOGLE_SEARCH_API_KEY }}
          GOOGLE_CSE_ID: ${{ secrets.GOOGLE_CSE_ID }}
          EXA_API_KEY: ${{ secrets.EXA_API_KEY }}
+          NOMIC_API_KEY: ${{ secrets.NOMIC_API_KEY }}
        run: |
          make integration_tests

--- a/.github/workflows/_release.yml
+++ b/.github/workflows/_release.yml
@ -175,6 +175,7 @@ jobs:
          GOOGLE_SEARCH_API_KEY: ${{ secrets.GOOGLE_SEARCH_API_KEY }}
          GOOGLE_CSE_ID: ${{ secrets.GOOGLE_CSE_ID }}
          EXA_API_KEY: ${{ secrets.EXA_API_KEY }}
+          NOMIC_API_KEY: ${{ secrets.NOMIC_API_KEY }}
        run: make integration_tests
        working-directory: ${{ inputs.working-directory }}

--- a/cookbook/nomic_embeddings.ipynb
+++ b/cookbook/nomic_embeddings.ipynb
@ -0,0 +1,301 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "d8da6094-30c7-43f3-a608-c91717b673db",
+   "metadata": {},
+   "source": [
+    "# Nomic Embeddings\n",
+    "\n",
+    "Nomic has released a new embedding model with strong performance for long context retrieval (8k context window).\n",
+    "\n",
+    "## Signup\n",
+    "\n",
+    "To get your API token, run:\n",
+    "```\n",
+    "! nomic login\n",
+    "```\n",
+    "\n",
+    "Then log in with your generated API token:\n",
+    "```\n",
+    "! nomic login <token>\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f737ec15-e9ab-4629-b54c-24be69e8b60b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! nomic login"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "8ab7434a-2930-42b5-9164-dc2c03abe232",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! nomic login token"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a3501e2a-4686-4b95-8a1c-f19e035ea354",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! pip install -U langchain-nomic"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "134475f2-f256-4c13-9712-c55783e6a4e2",
+   "metadata": {},
+   "source": [
+    "## Document Loading\n",
+    "\n",
+    "Let's test 3 interesting blog posts."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "01c4d270-171e-45c2-a1b6-e350faa74117",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_community.document_loaders import WebBaseLoader\n",
+    "\n",
+    "urls = [\n",
+    "    \"https://lilianweng.github.io/posts/2023-06-23-agent/\",\n",
+    "    \"https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/\",\n",
+    "    \"https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/\",\n",
+    "]\n",
+    "\n",
+    "docs = [WebBaseLoader(url).load() for url in urls]\n",
+    "docs_list = [item for sublist in docs for item in sublist]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "75ab7f74-873c-4d84-af5a-5cf19c61239d",
+   "metadata": {},
+   "source": [
+    "## Splitting \n",
+    "\n",
+    "Long context retrieval "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "f512e128-629e-4304-926f-94fe5c999527",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.text_splitter import CharacterTextSplitter\n",
+    "\n",
+    "text_splitter = CharacterTextSplitter.from_tiktoken_encoder(\n",
+    "    chunk_size=7500, chunk_overlap=100\n",
+    ")\n",
+    "doc_splits = text_splitter.split_documents(docs_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "d2a69cf0-e3ab-4c92-a1d0-10da45c08b3b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The document is 6562 tokens\n",
+      "The document is 3037 tokens\n",
+      "The document is 6092 tokens\n",
+      "The document is 1050 tokens\n",
+      "The document is 6933 tokens\n",
+      "The document is 5560 tokens\n"
+     ]
+    }
+   ],
+   "source": [
+    "import tiktoken\n",
+    "\n",
+    "encoding = tiktoken.get_encoding(\"cl100k_base\")\n",
+    "encoding = tiktoken.encoding_for_model(\"gpt-3.5-turbo\")\n",
+    "for d in doc_splits:\n",
+    "    print(\"The document is %s tokens\" % len(encoding.encode(d.page_content)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c58d1e9b-e98e-4bd9-b52f-4dfc2a4e69f4",
+   "metadata": {},
+   "source": [
+    "## Index \n",
+    "\n",
+    "Nomic embeddings [here](https://docs.nomic.ai/reference/endpoints/nomic-embed-text). "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "76447866-bf8b-412b-93bc-d6ea8ec35952",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "from langchain_community.vectorstores import Chroma\n",
+    "from langchain_core.output_parsers import StrOutputParser\n",
+    "from langchain_core.runnables import RunnableLambda, RunnablePassthrough\n",
+    "from langchain_nomic import NomicEmbeddings\n",
+    "from langchain_nomic.embeddings import NomicEmbeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "15b3eab2-2689-49d4-8cb0-67ef2adcbc49",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add to vectorDB\n",
+    "vectorstore = Chroma.from_documents(\n",
+    "    documents=doc_splits,\n",
+    "    collection_name=\"rag-chroma\",\n",
+    "    embedding=NomicEmbeddings(model=\"nomic-embed-text-v1\"),\n",
+    ")\n",
+    "retriever = vectorstore.as_retriever()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "41131122-3591-4566-aac1-ed19d496820a",
+   "metadata": {},
+   "source": [
+    "## RAG Chain\n",
+    "\n",
+    "We can use the Mistral `v0.2`, which is [fine-tuned for 32k context](https://x.com/dchaplot/status/1734198245067243629?s=20).\n",
+    "\n",
+    "We can [use Ollama](https://ollama.ai/library/mistral) -\n",
+    "```\n",
+    "ollama pull mistral:instruct\n",
+    "```\n",
+    "\n",
+    "We can also run [GPT-4 128k](https://openai.com/blog/new-models-and-developer-products-announced-at-devday). "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "1397de64-5b4a-4001-adc5-570ff8d31ff6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_community.chat_models import ChatOllama\n",
+    "from langchain_core.prompts import ChatPromptTemplate\n",
+    "from langchain_openai import ChatOpenAI\n",
+    "\n",
+    "# Prompt\n",
+    "template = \"\"\"Answer the question based only on the following context:\n",
+    "{context}\n",
+    "\n",
+    "Question: {question}\n",
+    "\"\"\"\n",
+    "prompt = ChatPromptTemplate.from_template(template)\n",
+    "\n",
+    "# LLM API\n",
+    "model = ChatOpenAI(temperature=0, model=\"gpt-4-1106-preview\")\n",
+    "\n",
+    "# Local LLM\n",
+    "ollama_llm = \"mistral:instruct\"\n",
+    "model_local = ChatOllama(model=ollama_llm)\n",
+    "\n",
+    "# Chain\n",
+    "chain = (\n",
+    "    {\"context\": retriever, \"question\": RunnablePassthrough()}\n",
+    "    | prompt\n",
+    "    | model_local\n",
+    "    | StrOutputParser()\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "1548e00c-1ff6-4e88-aa13-69badf2088fb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "' Agents, especially those used in artificial intelligence and natural language processing, can have different types of memory. Here are some common types:\\n\\n1. **Short-term memory** or working memory: This is a small capacity, high-turnover memory that holds information temporarily while the agent processes it. Short-term memory is essential for tasks requiring attention and quick response, such as parsing sentences or following instructions.\\n\\n2. **Long-term memory**: This is a large capacity, low-turnover memory where agents store information for extended periods. Long-term memory enables learning from experiences, accessing past knowledge, and improving performance over time.\\n\\n3. **Explicit memory** or declarative memory: Agents use explicit memory to store and recall facts, concepts, and rules that can be expressed in natural language. This type of memory is crucial for problem solving and reasoning.\\n\\n4. **Implicit memory** or procedural memory: Implicit memory refers to the acquisition and retention of skills and habits. The agent learns through repeated experiences without necessarily being aware of it.\\n\\n5. **Connectionist memory**: Connectionist memory, also known as neural networks, is inspired by the structure and function of biological brains. Connectionist models learn and store information in interconnected nodes or artificial neurons. This type of memory enables the model to recognize patterns and generalize knowledge.\\n\\n6. **Hybrid memory systems**: Many advanced agents employ a combination of different memory types to maximize their learning potential and performance. These hybrid systems can integrate short-term, long-term, explicit, implicit, and connectionist memories.'"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Question\n",
+    "chain.invoke(\"What are the types of agent memory?\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5ec5b4c3-757d-44df-92ea-dd5f08017dd6",
+   "metadata": {},
+   "source": [
+    "**Mistral**\n",
+    "\n",
+    "Trace: 24k prompt tokens.\n",
+    "\n",
+    "* https://smith.langchain.com/public/3e04d475-ea08-4ee3-ae66-6416a93d8b08/r\n",
+    "\n",
+    "--- \n",
+    "\n",
+    "Some considerations are noted in the [needle in a haystack analysis](https://twitter.com/GregKamradt/status/1722386725635580292?lang=en):\n",
+    "\n",
+    "* LLMs may suffer with retrieval from large context depending on where the information is placed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6ffb6b63-17ee-42d8-b1fb-d6a866e98458",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/docs/integrations/providers/nomic.ipynb
+++ b/docs/docs/integrations/providers/nomic.ipynb
@ -0,0 +1,53 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Nomic\n",
+    "\n",
+    "Nomic currently offers two products:\n",
+    "\n",
+    "- Atlas: their Visual Data Engine\n",
+    "- GPT4All: their Open Source Edge Language Model Ecosystem\n",
+    "\n",
+    "Currently, you can import their hosted [embedding model](/docs/integrations/text_embedding/nomic) as follows:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "id": "y8ku6X96sebl"
+   },
+   "outputs": [],
+   "source": [
+    "from langchain_nomic import NomicEmbeddings"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
--- a/docs/docs/integrations/text_embedding/nomic.ipynb
+++ b/docs/docs/integrations/text_embedding/nomic.ipynb
@ -0,0 +1,132 @@
+{
+ "cells": [
+  {
+   "cell_type": "raw",
+   "id": "afaf8039",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "sidebar_label: Nomic\n",
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e49f1e0d",
+   "metadata": {},
+   "source": [
+    "# NomicEmbeddings\n",
+    "\n",
+    "This notebook covers how to get started with Nomic embedding models.\n",
+    "\n",
+    "## Installation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c3bef91",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# install package\n",
+    "!pip install -U langchain-nomic"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2b4f3e15",
+   "metadata": {},
+   "source": [
+    "## Environment Setup\n",
+    "\n",
+    "Make sure to set the following environment variables:\n",
+    "\n",
+    "- `NOMIC_API_KEY`\n",
+    "\n",
+    "## Usage"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "62e0dbc3",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain_nomic.embeddings import NomicEmbeddings\n",
+    "\n",
+    "embeddings = NomicEmbeddings(model=\"nomic-embed-text-v1\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "12fcfb4b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embeddings.embed_query(\"My query to look up\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1f2e6104",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embeddings.embed_documents(\n",
+    "    [\"This is a content of the document\", \"This is another document\"]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "46739f68",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# async embed query\n",
+    "await embeddings.aembed_query(\"My query to look up\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e48632ea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# async embed documents\n",
+    "await embeddings.aembed_documents(\n",
+    "    [\"This is a content of the document\", \"This is another document\"]\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/libs/partners/nomic/.gitignore
+++ b/libs/partners/nomic/.gitignore
@ -0,0 +1 @@
+__pycache__
--- a/libs/partners/nomic/LICENSE
+++ b/libs/partners/nomic/LICENSE
@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 LangChain, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/libs/partners/nomic/Makefile
+++ b/libs/partners/nomic/Makefile
@ -0,0 +1,61 @@
+.PHONY: all format lint test tests integration_tests docker_tests help extended_tests
+
+# Default target executed when no arguments are given to make.
+all: help
+
+# Define a variable for the test file path.
+TEST_FILE ?= tests/unit_tests/
+
+integration_tests: TEST_FILE = tests/integration_tests/
+
+test integration_tests:
+	poetry run pytest $(TEST_FILE)
+
+tests:
+	poetry run pytest $(TEST_FILE)
+
+
+######################
+# LINTING AND FORMATTING
+######################
+
+# Define a variable for Python and notebook files.
+PYTHON_FILES=.
+MYPY_CACHE=.mypy_cache
+lint format: PYTHON_FILES=.
+lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=libs/partners/nomic --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$')
+lint_package: PYTHON_FILES=langchain_nomic
+lint_tests: PYTHON_FILES=tests
+lint_tests: MYPY_CACHE=.mypy_cache_test
+
+lint lint_diff lint_package lint_tests:
+	poetry run ruff .
+	poetry run ruff format $(PYTHON_FILES) --diff
+	poetry run ruff --select I $(PYTHON_FILES)
+	mkdir $(MYPY_CACHE); poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE)
+
+format format_diff:
+	poetry run ruff format $(PYTHON_FILES)
+	poetry run ruff --select I --fix $(PYTHON_FILES)
+
+spell_check:
+	poetry run codespell --toml pyproject.toml
+
+spell_fix:
+	poetry run codespell --toml pyproject.toml -w
+
+check_imports: $(shell find langchain_nomic -name '*.py')
+	poetry run python ./scripts/check_imports.py $^
+
+######################
+# HELP
+######################
+
+help:
+	@echo '----'
+	@echo 'check_imports				- check imports'
+	@echo 'format                       - run code formatters'
+	@echo 'lint                         - run linters'
+	@echo 'test                         - run unit tests'
+	@echo 'tests                        - run unit tests'
+	@echo 'test TEST_FILE=<test_file>   - run all tests in file'
--- a/libs/partners/nomic/README.md
+++ b/libs/partners/nomic/README.md
@ -0,0 +1,23 @@
+# langchain-nomic
+
+This package contains the LangChain integration with Nomic.
+
+## Installation
+
+```bash
+pip install -U langchain-nomic
+```
+
+Configure credentials by setting the following environment variable:
+
+* `NOMIC_API_KEY`: your nomic API key
+
+## Embeddings
+
+`NomicEmbeddings` class exposes embeddings from Nomic.
+
+```python
+from langchain_nomic import NomicEmbeddings
+
+embeddings = NomicEmbeddings(model="nomic-embed-text-v1")
+embeddings.embed_query("What is the meaning of life?")
--- a/libs/partners/nomic/langchain_nomic/__init__.py
+++ b/libs/partners/nomic/langchain_nomic/__init__.py
@ -0,0 +1,5 @@
+from langchain_nomic.embeddings import NomicEmbeddings
+
+__all__ = [
+    "NomicEmbeddings",
+]
--- a/libs/partners/nomic/langchain_nomic/embeddings.py
+++ b/libs/partners/nomic/langchain_nomic/embeddings.py
@ -0,0 +1,71 @@
+import os
+from typing import List, Optional
+
+import nomic  # type: ignore
+from langchain_core.embeddings import Embeddings
+
+
+class NomicEmbeddings(Embeddings):
+    """LangChain `Embeddings` implementation backed by the `nomic` SDK.
+
+    Example:
+        .. code-block:: python
+
+            from langchain_nomic import NomicEmbeddings
+
+            model = NomicEmbeddings(model="nomic-embed-text-v1")
+    """
+
+    def __init__(
+        self,
+        *,
+        model: str,
+        nomic_api_key: Optional[str] = None,
+    ):
+        """Initialize NomicEmbeddings model.
+
+        Args:
+            model: model name (required, keyword-only)
+            nomic_api_key: optionally, set the Nomic API key. Uses the NOMIC_API_KEY
+                environment variable by default.
+        """
+        _api_key = nomic_api_key or os.environ.get("NOMIC_API_KEY")
+        if _api_key:  # log in only when a key was passed or found in the environment
+            nomic.login(_api_key)
+        self.model = model
+
+    def embed(self, texts: List[str], *, task_type: str) -> List[List[float]]:
+        """Embed texts with the configured model and return one vector per text.
+
+        Args:
+            texts: list of texts to embed
+            task_type: the task type to use when embedding. One of `search_query`,
+                `search_document`, `classification`, `clustering`
+        """
+        # TODO: do this via nomic.embed when fixed in nomic sdk
+        from nomic import embed
+
+        output = embed.text(texts=texts, model=self.model, task_type=task_type)
+        return output["embeddings"]
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Embed search docs using the `search_document` task type.
+
+        Args:
+            texts: list of texts to embed as documents
+        """
+        return self.embed(
+            texts=texts,
+            task_type="search_document",
+        )
+
+    def embed_query(self, text: str) -> List[float]:
+        """Embed query text using the `search_query` task type.
+
+        Args:
+            text: query text
+        """
+        return self.embed(
+            texts=[text],
+            task_type="search_query",
+        )[0]  # a single text was sent, so return its only embedding
--- a/libs/partners/nomic/langchain_nomic/py.typed
+++ b/libs/partners/nomic/langchain_nomic/py.typed
--- a/libs/partners/nomic/poetry.lock
+++ b/libs/partners/nomic/poetry.lock
--- a/libs/partners/nomic/pyproject.toml
+++ b/libs/partners/nomic/pyproject.toml
@ -0,0 +1,94 @@
+[tool.poetry]
+name = "langchain-nomic"
+version = "0.0.1"
+description = "An integration package connecting Nomic and LangChain"
+authors = []
+readme = "README.md"
+repository = "https://github.com/langchain-ai/langchain"
+license = "MIT"
+
+[tool.poetry.urls]
+"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/partners/nomic"
+
+[tool.poetry.dependencies]
+python = ">=3.8.1,<4.0"
+langchain-core = ">=0.0.12"
+nomic = "^3.0.7"
+
+[tool.poetry.group.test]
+optional = true
+
+[tool.poetry.group.test.dependencies]
+pytest = "^7.3.0"
+freezegun = "^1.2.2"
+pytest-mock  = "^3.10.0"
+syrupy = "^4.0.2"
+pytest-watcher = "^0.3.4"
+pytest-asyncio = "^0.21.1"
+langchain-core = {path = "../../core", develop = true}
+
+[tool.poetry.group.codespell]
+optional = true
+
+[tool.poetry.group.codespell.dependencies]
+codespell = "^2.2.0"
+
+[tool.poetry.group.test_integration]
+optional = true
+
+[tool.poetry.group.test_integration.dependencies]
+
+[tool.poetry.group.lint]
+optional = true
+
+[tool.poetry.group.lint.dependencies]
+ruff = "^0.1.5"
+
+[tool.poetry.group.typing.dependencies]
+mypy = "^0.991"
+langchain-core = {path = "../../core", develop = true}
+
+[tool.poetry.group.dev]
+optional = true
+
+[tool.poetry.group.dev.dependencies]
+langchain-core = {path = "../../core", develop = true}
+
+[tool.ruff]
+select = [
+  "E",  # pycodestyle
+  "F",  # pyflakes
+  "I",  # isort
+]
+
+[tool.mypy]
+disallow_untyped_defs = "True"
+
+[tool.coverage.run]
+omit = [
+    "tests/*",
+]
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.pytest.ini_options]
+# --strict-markers will raise errors on unknown marks.
+# https://docs.pytest.org/en/7.1.x/how-to/mark.html#raising-errors-on-unknown-marks
+#
+# https://docs.pytest.org/en/7.1.x/reference/reference.html
+# --strict-config       any warnings encountered while parsing the `pytest`
+#                       section of the configuration file raise errors.
+#
+# https://github.com/tophat/syrupy
+# --snapshot-warn-unused    Prints a warning on unused snapshots rather than fail the test suite.
+addopts = "--snapshot-warn-unused --strict-markers --strict-config --durations=5"
+# Registering custom markers.
+# https://docs.pytest.org/en/7.1.x/example/markers.html#registering-markers
+markers = [
+  "requires: mark tests as requiring a specific library",
+  "asyncio: mark tests as requiring asyncio",
+  "compile: mark placeholder test used to compile integration tests without running them",
+]
+asyncio_mode = "auto"
--- a/libs/partners/nomic/scripts/check_imports.py
+++ b/libs/partners/nomic/scripts/check_imports.py
@ -0,0 +1,17 @@
+import sys
+import traceback
+from importlib.machinery import SourceFileLoader
+
+if __name__ == "__main__":
+    files = sys.argv[1:]
+    has_failure = False
+    for file in files:
+        try:
+            SourceFileLoader("x", file).load_module()
+        except Exception:
+            has_faillure = True
+            print(file)
+            traceback.print_exc()
+            print()
+
+    sys.exit(1 if has_failure else 0)
--- a/libs/partners/nomic/scripts/check_pydantic.sh
+++ b/libs/partners/nomic/scripts/check_pydantic.sh
@ -0,0 +1,27 @@
+#!/bin/bash
+#
+# This script searches for lines starting with "import pydantic" or "from pydantic"
+# in tracked files within a Git repository.
+#
+# Usage: ./scripts/check_pydantic.sh /path/to/repository
+
+# Check if a path argument is provided
+if [ $# -ne 1 ]; then
+  echo "Usage: $0 /path/to/repository"
+  exit 1
+fi
+
+repository_path="$1"
+
+# Search for lines matching the pattern within the specified repository
+result=$(git -C "$repository_path" grep -E '^import pydantic|^from pydantic')
+
+# Check if any matching lines were found
+if [ -n "$result" ]; then
+  echo "ERROR: The following lines need to be updated:"
+  echo "$result"
+  echo "Please replace the code with an import from langchain_core.pydantic_v1."
+  echo "For example, replace 'from pydantic import BaseModel'"
+  echo "with 'from langchain_core.pydantic_v1 import BaseModel'"
+  exit 1
+fi
--- a/libs/partners/nomic/scripts/lint_imports.sh
+++ b/libs/partners/nomic/scripts/lint_imports.sh
@ -0,0 +1,17 @@
+#!/bin/bash
+
+set -eu
+
+# Initialize a variable to keep track of errors
+errors=0
+
+# Fail if anything imports from langchain or langchain_experimental. A grep with no match exits non-zero, but `set -e` ignores that because grep is not the last command of its `&&` list.
+git --no-pager grep '^from langchain\.' . && errors=$((errors+1))
+git --no-pager grep '^from langchain_experimental\.' . && errors=$((errors+1))
+
+# Decide on an exit status based on the errors
+if [ "$errors" -gt 0 ]; then
+    exit 1
+else
+    exit 0
+fi
--- a/libs/partners/nomic/tests/__init__.py
+++ b/libs/partners/nomic/tests/__init__.py
--- a/libs/partners/nomic/tests/integration_tests/__init__.py
+++ b/libs/partners/nomic/tests/integration_tests/__init__.py
--- a/libs/partners/nomic/tests/integration_tests/test_compile.py
+++ b/libs/partners/nomic/tests/integration_tests/test_compile.py
@ -0,0 +1,7 @@
+import pytest
+
+
+@pytest.mark.compile
+def test_placeholder() -> None:
+    """Used for compiling integration tests without running any real tests."""
+    pass
--- a/libs/partners/nomic/tests/integration_tests/test_embeddings.py
+++ b/libs/partners/nomic/tests/integration_tests/test_embeddings.py
@ -0,0 +1,19 @@
+"""Test Nomic embeddings."""
+from langchain_nomic.embeddings import NomicEmbeddings
+
+
+def test_langchain_nomic_embedding_documents() -> None:
+    """Test Nomic document embeddings."""
+    documents = ["foo bar"]
+    embedding = NomicEmbeddings(model="nomic-embed-text-v1")
+    output = embedding.embed_documents(documents)
+    assert len(output) == 1  # one embedding per input document
+    assert len(output[0]) > 0  # the embedding vector is non-empty
+
+
+def test_langchain_nomic_embedding_query() -> None:
+    """Test Nomic query embeddings."""
+    document = "foo bar"
+    embedding = NomicEmbeddings(model="nomic-embed-text-v1")
+    output = embedding.embed_query(document)
+    assert len(output) > 0  # the embedding vector is non-empty
--- a/libs/partners/nomic/tests/unit_tests/__init__.py
+++ b/libs/partners/nomic/tests/unit_tests/__init__.py
--- a/libs/partners/nomic/tests/unit_tests/test_embeddings.py
+++ b/libs/partners/nomic/tests/unit_tests/test_embeddings.py
@ -0,0 +1,9 @@
+"""Test embedding model integration."""
+
+
+from langchain_nomic.embeddings import NomicEmbeddings
+
+
+def test_initialization() -> None:
+    """Test that the embedding model can be constructed without raising."""
+    NomicEmbeddings(model="nomic-embed-text-v1")
--- a/libs/partners/nomic/tests/unit_tests/test_imports.py
+++ b/libs/partners/nomic/tests/unit_tests/test_imports.py
@ -0,0 +1,9 @@
+from langchain_nomic import __all__
+
+EXPECTED_ALL = [
+    "NomicEmbeddings",
+]
+
+
+def test_all_imports() -> None:
+    assert sorted(EXPECTED_ALL) == sorted(__all__)  # sort both so name order is irrelevant