from typing import Any, Dict, List, Optional

import numpy as np
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import BaseModel, Extra, root_validator

# Name of the multilingual encoder used when no language is configured.
LASER_MULTILINGUAL_MODEL: str = "laser2"


class LaserEmbeddings(BaseModel, Embeddings):
    """LASER Language-Agnostic SEntence Representations.

    LASER is a Python library developed by the Meta AI Research team
    and used for creating multilingual sentence embeddings for over
    147 languages as of 2/25/2024.

    See more documentation at:
    * https://github.com/facebookresearch/LASER/
    * https://github.com/facebookresearch/LASER/tree/main/laser_encoders
    * https://arxiv.org/abs/2205.12654

    To use this class, you must install the `laser_encoders` Python package:

    `pip install laser_encoders`

    Example:
        .. code-block:: python

            embeddings = LaserEmbeddings(lang="eng_Latn")
            vectors = embeddings.embed_documents(["Hello", "World"])

    This class delegates to the ``laser_encoders`` pipeline, i.e. it is
    equivalent to:

        .. code-block:: python

            from laser_encoders import LaserEncoderPipeline

            encoder = LaserEncoderPipeline(lang="eng_Latn")
            embeddings = encoder.encode_sentences(["Hello", "World"])
    """

    lang: Optional[str] = None
    """The language or language code you'd like to use.

    If empty, this implementation will default to using an earlier
    multilingual LASER encoder model (called laser2).
    Find the list of supported languages at
    https://github.com/facebookresearch/flores/blob/main/flores200/README.md#languages-in-flores-200
    """

    # Populated by ``validate_environment``; not a user-facing field.
    _encoder_pipeline: Any  # : :meta private:

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that laser_encoders is installed and build the encoder.

        Args:
            values: The field values collected by pydantic for this model.

        Returns:
            ``values`` with the constructed pipeline stored under
            ``"_encoder_pipeline"``.

        Raises:
            ImportError: If the ``laser_encoders`` package cannot be imported.
        """
        # Only the import is guarded: a failure while *constructing* the
        # pipeline must not be reported as "package not installed".
        try:
            from laser_encoders import LaserEncoderPipeline
        except ImportError as e:
            raise ImportError(
                "Could not import 'laser_encoders' Python package. "
                "Please install it with `pip install laser_encoders`."
            ) from e

        lang = values.get("lang")
        if lang:
            encoder_pipeline = LaserEncoderPipeline(lang=lang)
        else:
            # No (or empty) language: fall back to the multilingual model.
            encoder_pipeline = LaserEncoderPipeline(laser=LASER_MULTILINGUAL_MODEL)
        values["_encoder_pipeline"] = encoder_pipeline
        return values

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for documents using LASER.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text. An empty ``texts``
            yields an empty list.
        """
        if not texts:
            # Nothing to encode; avoid handing the encoder an empty batch.
            return []

        embeddings: np.ndarray
        embeddings = self._encoder_pipeline.encode_sentences(texts)
        return embeddings.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Generate single query text embeddings using LASER.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        query_embeddings: np.ndarray
        # The encoder works on batches, so wrap the query in a one-item list
        # and unwrap the single resulting row.
        query_embeddings = self._encoder_pipeline.encode_sentences([text])
        return query_embeddings.tolist()[0]