langchain/libs/community/langchain_community/embeddings/oracleai.py

# Authors:
#   Harichandan Roy (hroy)
#   David Jiang (ddjiang)
#
# -----------------------------------------------------------------------------
# oracleai.py
# -----------------------------------------------------------------------------

from __future__ import annotations

import json
import logging
import traceback
from typing import TYPE_CHECKING, Any, Dict, List, Optional

from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import BaseModel, Extra

if TYPE_CHECKING:
    from oracledb import Connection

logger = logging.getLogger(__name__)

"""OracleEmbeddings class"""


class OracleEmbeddings(BaseModel, Embeddings):
    """Generate embeddings through an Oracle Database connection.

    Each text is embedded by running ``dbms_vector_chain.utl_to_embeddings``
    on ``conn``; ``params`` is serialized to JSON and bound as the embedding
    parameters of that call.
    """

    # Oracle connection used for every database call made by this class.
    conn: Any
    # Embedding parameters; serialized with ``json.dumps`` and bound to the
    # ``json(:params)`` argument of ``utl_to_embeddings``.
    params: Dict[str, Any]
    # Optional proxy; when truthy it is passed to ``utl_http.set_proxy``
    # before any text is embedded.
    proxy: Optional[str] = None

    def __init__(self, **kwargs: Any):
        """Build the model from keyword arguments (``conn``, ``params``, ``proxy``)."""
        super().__init__(**kwargs)

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    @staticmethod
    def load_onnx_model(
        conn: Connection, dir: str, onnx_file: str, model_name: str
    ) -> None:
        """Load an ONNX model to Oracle Database.

        Any existing model called ``model_name`` is dropped first (with
        ``force => true``), then the ONNX file is loaded as an embedding model.

        Required privileges:
            1 - the user needs to have the create procedure, create mining
                model and create any directory privileges.
            2 - grant create procedure, create mining model,
                create any directory to <user>;

        Args:
            conn: Oracle Connection,
            dir: Oracle Directory,
            onnx_file: ONNX file name,
            model_name: Name of the model.

        Raises:
            ValueError: If any argument is ``None``.
            Exception: Any error raised by the database call is logged and
                re-raised unchanged.
        """

        # Bound before the ``try`` so the cleanup below never touches an
        # undefined name when validation or ``conn.cursor()`` fails.
        cursor = None
        try:
            if conn is None or dir is None or onnx_file is None or model_name is None:
                raise ValueError("Invalid input")

            cursor = conn.cursor()
            cursor.execute(
                """
                begin
                    dbms_data_mining.drop_model(model_name => :model, force => true);
                    SYS.DBMS_VECTOR.load_onnx_model(:path, :filename, :model, 
                        json('{"function" : "embedding", 
                            "embeddingOutput" : "embedding", 
                            "input": {"input": ["DATA"]}}'));
                end;""",
                path=dir,
                filename=onnx_file,
                model=model_name,
            )

        except Exception as ex:
            logger.info(f"An exception occurred :: {ex}")
            traceback.print_exc()
            raise
        finally:
            # Close the cursor on success and on failure, but only if it was
            # actually opened.
            if cursor is not None:
                cursor.close()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Compute doc embeddings using an OracleEmbeddings.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings; one entry is appended for every row the
            database returns for each input text. ``None`` is returned
            unchanged when ``texts`` is ``None``.

        Raises:
            ImportError: If the ``oracledb`` package is not installed.
            Exception: Any error raised while embedding is logged and
                re-raised unchanged.
        """

        try:
            import oracledb
        except ImportError as e:
            raise ImportError(
                "Unable to import oracledb, please install with "
                "`pip install -U oracledb`."
            ) from e

        if texts is None:
            # Kept for backward compatibility: ``None`` in, ``None`` out.
            return None

        embeddings: List[List[float]] = []
        # Bound before the ``try`` so the cleanup below never touches an
        # undefined name when ``self.conn.cursor()`` fails.
        cursor = None
        try:
            # returns strings or bytes instead of a locator
            # NOTE: this is a process-wide oracledb default, not a
            # per-connection setting.
            oracledb.defaults.fetch_lobs = False
            cursor = self.conn.cursor()

            if self.proxy:
                cursor.execute(
                    "begin utl_http.set_proxy(:proxy); end;", proxy=self.proxy
                )

            # The parameters do not change between texts; serialize them once.
            params_json = json.dumps(self.params)

            for text in texts:
                cursor.execute(
                    "select t.* "
                    "from dbms_vector_chain.utl_to_embeddings(:content, "
                    "json(:params)) t",
                    content=text,
                    params=params_json,
                )

                for row in cursor:
                    if row is None:
                        embeddings.append([])
                    else:
                        rdata = json.loads(row[0])
                        # dereference string as array
                        vec = json.loads(rdata["embed_vector"])
                        embeddings.append(vec)

            return embeddings
        except Exception as ex:
            logger.info(f"An exception occurred :: {ex}")
            traceback.print_exc()
            raise
        finally:
            # Close the cursor on success and on failure, but only if it was
            # actually opened.
            if cursor is not None:
                cursor.close()

    def embed_query(self, text: str) -> List[float]:
        """Compute query embedding using an OracleEmbeddings.

        Args:
            text: The text to embed.

        Returns:
            Embedding for the text, i.e. the first entry returned by
            ``embed_documents([text])``.
        """
        return self.embed_documents([text])[0]


# uncomment the following code block to run the test

"""
# A sample unit test.

''' get the Oracle connection '''
conn = oracledb.connect(
    user="",
    password="",
    dsn="")
print("Oracle connection is established...")

''' params '''
embedder_params = {"provider":"database", "model":"demo_model"}
proxy = ""

''' instance '''
embedder = OracleEmbeddings(conn=conn, params=embedder_params, proxy=proxy)

embed = embedder.embed_query("Hello World!")
print(f"Embedding generated by OracleEmbeddings: {embed}")

conn.close()
print("Connection is closed.")

"""
community[minor]: Oraclevs integration (#21123) Thank you for contributing to LangChain! - Oracle AI Vector Search: Oracle AI Vector Search is designed for Artificial Intelligence (AI) workloads and allows you to query data based on semantics, rather than keywords. One of the biggest benefits of Oracle AI Vector Search is that semantic search on unstructured data can be combined with relational search on business data in one single system. This is not only powerful but also significantly more effective because you don't need to add a specialized vector database, eliminating the pain of data fragmentation between multiple systems. - Oracle AI Vector Search is designed for Artificial Intelligence (AI) workloads and allows you to query data based on semantics, rather than keywords. One of the biggest benefits of Oracle AI Vector Search is that semantic search on unstructured data can be combined with relational search on business data in one single system. This is not only powerful but also significantly more effective because you don't need to add a specialized vector database, eliminating the pain of data fragmentation between multiple systems. This pull request adds the following functionality: Oracle AI Vector Search: Vector Store; Oracle AI Vector Search: Document Loader; Oracle AI Vector Search: Document Splitter; Oracle AI Vector Search: Summary; Oracle AI Vector Search: Oracle Embeddings. - We have added unit tests and have our own local unit test suite which verifies that all the code is correct. We have made sure to add guides for each of the components and one end-to-end guide that shows how the entire thing runs. - We have made sure that make format and make lint run clean. Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. 
- Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, hwchase17. --------- Co-authored-by: skmishraoracle <shailendra.mishra@oracle.com> Co-authored-by: hroyofc <harichandan.roy@oracle.com> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 5 months ago			`# Authors:`
			`# Harichandan Roy (hroy)`
			`# David Jiang (ddjiang)`
			`#`
			`# -----------------------------------------------------------------------------`
			`# oracleai.py`
			`# -----------------------------------------------------------------------------`

			`from __future__ import annotations`

			`import json`
			`import logging`
			`import traceback`
			`from typing import TYPE_CHECKING, Any, Dict, List, Optional`

			`from langchain_core.embeddings import Embeddings`
			`from langchain_core.pydantic_v1 import BaseModel, Extra`

			`if TYPE_CHECKING:`
			`from oracledb import Connection`

			`logger = logging.getLogger(__name__)`

			`"""OracleEmbeddings class"""`


			`class OracleEmbeddings(BaseModel, Embeddings):`
			`"""Get Embeddings"""`

			`"""Oracle Connection"""`
			`conn: Any`
			`"""Embedding Parameters"""`
			`params: Dict[str, Any]`
			`"""Proxy"""`
			`proxy: Optional[str] = None`

			`def __init__(self, **kwargs: Any):`
			`super().__init__(**kwargs)`

			`class Config:`
			`"""Configuration for this pydantic object."""`

			`extra = Extra.forbid`

			`"""`
			`1 - user needs to have create procedure,`
			`create mining model, create any directory privilege.`
			`2 - grant create procedure, create mining model,`
			`create any directory to <user>;`
			`"""`

			`@staticmethod`
			`def load_onnx_model(`
			`conn: Connection, dir: str, onnx_file: str, model_name: str`
			`) -> None:`
			`"""Load an ONNX model to Oracle Database.`
			`Args:`
			`conn: Oracle Connection,`
			`dir: Oracle Directory,`
			`onnx_file: ONNX file name,`
			`model_name: Name of the model.`
			`"""`

			`try:`
			`if conn is None or dir is None or onnx_file is None or model_name is None:`
			`raise Exception("Invalid input")`

			`cursor = conn.cursor()`
			`cursor.execute(`
			`"""`
			`begin`
			`dbms_data_mining.drop_model(model_name => :model, force => true);`
			`SYS.DBMS_VECTOR.load_onnx_model(:path, :filename, :model,`
			`json('{"function" : "embedding",`
			`"embeddingOutput" : "embedding",`
			`"input": {"input": ["DATA"]}}'));`
			`end;""",`
			`path=dir,`
			`filename=onnx_file,`
			`model=model_name,`
			`)`

			`cursor.close()`

			`except Exception as ex:`
			`logger.info(f"An exception occurred :: {ex}")`
			`traceback.print_exc()`
			`cursor.close()`
			`raise`

			`def embed_documents(self, texts: List[str]) -> List[List[float]]:`
			`"""Compute doc embeddings using an OracleEmbeddings.`
			`Args:`
			`texts: The list of texts to embed.`
			`Returns:`
			`List of embeddings, one for each input text.`
			`"""`

			`try:`
			`import oracledb`
			`except ImportError as e:`
			`raise ImportError(`
			`"Unable to import oracledb, please install with "`
			"`pip install -U oracledb`."
			`) from e`

			`if texts is None:`
			`return None`

			`embeddings: List[List[float]] = []`
			`try:`
			`# returns strings or bytes instead of a locator`
			`oracledb.defaults.fetch_lobs = False`
			`cursor = self.conn.cursor()`

			`if self.proxy:`
			`cursor.execute(`
			`"begin utl_http.set_proxy(:proxy); end;", proxy=self.proxy`
			`)`

			`for text in texts:`
			`cursor.execute(`
			`"select t.* "`
			`+ "from dbms_vector_chain.utl_to_embeddings(:content, "`
			`+ "json(:params)) t",`
			`content=text,`
			`params=json.dumps(self.params),`
			`)`

			`for row in cursor:`
			`if row is None:`
			`embeddings.append([])`
			`else:`
			`rdata = json.loads(row[0])`
			`# dereference string as array`
			`vec = json.loads(rdata["embed_vector"])`
			`embeddings.append(vec)`

			`cursor.close()`
			`return embeddings`
			`except Exception as ex:`
			`logger.info(f"An exception occurred :: {ex}")`
			`traceback.print_exc()`
			`cursor.close()`
			`raise`

			`def embed_query(self, text: str) -> List[float]:`
			`"""Compute query embedding using an OracleEmbeddings.`
			`Args:`
			`text: The text to embed.`
			`Returns:`
			`Embedding for the text.`
			`"""`
			`return self.embed_documents([text])[0]`


			`# uncomment the following code block to run the test`

			`"""`
			`# A sample unit test.`

			`''' get the Oracle connection '''`
			`conn = oracledb.connect(`
			`user="",`
			`password="",`
			`dsn="")`
			`print("Oracle connection is established...")`

			`''' params '''`
			`embedder_params = {"provider":"database", "model":"demo_model"}`
			`proxy = ""`

			`''' instance '''`
			`embedder = OracleEmbeddings(conn=conn, params=embedder_params, proxy=proxy)`

			`embed = embedder.embed_query("Hello World!")`
			`print(f"Embedding generated by OracleEmbeddings: {embed}")`

			`conn.close()`
			`print("Connection is closed.")`

			`"""`