# langchain/libs/community/langchain_community/cross_encoders/huggingface.py

from typing import Any, Dict, List, Tuple

from langchain_core.pydantic_v1 import BaseModel, Extra, Field

from langchain_community.cross_encoders.base import BaseCrossEncoder

DEFAULT_MODEL_NAME = "BAAI/bge-reranker-base"


class HuggingFaceCrossEncoder(BaseModel, BaseCrossEncoder):
    """HuggingFace cross encoder models.

    Wraps a ``sentence_transformers.CrossEncoder`` instance, created at
    construction time from ``model_name`` and ``model_kwargs``, and exposes it
    through :meth:`score`.

    Example:
        .. code-block:: python

            from langchain_community.cross_encoders import HuggingFaceCrossEncoder

            model_name = "BAAI/bge-reranker-base"
            model_kwargs = {'device': 'cpu'}
            hf = HuggingFaceCrossEncoder(
                model_name=model_name,
                model_kwargs=model_kwargs
            )
    """

    client: Any  #: :meta private:
    model_name: str = DEFAULT_MODEL_NAME
    """Model name to use."""
    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Keyword arguments to pass to the model."""

    def __init__(self, **kwargs: Any):
        """Initialize the sentence_transformer.

        Args:
            **kwargs: Field values for this model (``model_name``,
                ``model_kwargs``); validated by the pydantic base class.

        Raises:
            ImportError: If the ``sentence_transformers`` package is not
                installed.
        """
        super().__init__(**kwargs)
        # Imported lazily so that the dependency is only required when this
        # class is actually instantiated.
        try:
            import sentence_transformers

        except ImportError as exc:
            raise ImportError(
                "Could not import sentence_transformers python package. "
                "Please install it with `pip install sentence-transformers`."
            ) from exc

        self.client = sentence_transformers.CrossEncoder(
            self.model_name, **self.model_kwargs
        )

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    def score(self, text_pairs: List[Tuple[str, str]]) -> List[float]:
        """Compute similarity scores using a HuggingFace transformer model.

        Args:
            text_pairs: The list of text pairs to score the similarity of.

        Returns:
            One score per pair. When the model emits a single score per pair,
            the result of ``client.predict`` is returned as is. When it emits
            two scores per pair, a list holding the second (relevant) score of
            each pair is returned.
        """
        scores = self.client.predict(text_pairs)
        # Some models, e.g. bert-multilingual-passage-reranking-msmarco, give
        # two scores per pair (not_relevant, relevant) with respect to the
        # query.
        if len(scores.shape) > 1:
            # Keep only the relevant score. Build a real list rather than a
            # lazy ``map`` so the result can be sized, indexed and iterated
            # more than once, as the ``List[float]`` annotation promises.
            return [pair_scores[1] for pair_scores in scores]
        return scores
langchain[minor], community[minor]: add CrossEncoderReranker with HuggingFaceCrossEncoder and SagemakerEndpointCrossEncoder (#13687) - Description: Support reranking based on cross encoder models available from HuggingFace. - Added `CrossEncoder` schema - Implemented `HuggingFaceCrossEncoder` and `SagemakerEndpointCrossEncoder` - Implemented `CrossEncoderReranker` that performs similar functionality to `CohereRerank` - Added `cross-encoder-reranker.ipynb` to demonstrate how to use it. Please let me know if anything else needs to be done to make it visible on the table-of-contents navigation bar on the left, or on the card list on [retrievers documentation page](https://python.langchain.com/docs/integrations/retrievers). - Issue: N/A - Dependencies: None other than the existing ones. --------- Co-authored-by: Kenny Choe <kchoe@amazon.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 2024-03-31 20:51:31 +00:00			`from typing import Any, Dict, List, Tuple`

			`from langchain_core.pydantic_v1 import BaseModel, Extra, Field`

			`from langchain_community.cross_encoders.base import BaseCrossEncoder`

			`DEFAULT_MODEL_NAME = "BAAI/bge-reranker-base"`


			`class HuggingFaceCrossEncoder(BaseModel, BaseCrossEncoder):`
			`"""HuggingFace cross encoder models.`

			`Example:`
			`.. code-block:: python`

			`from langchain_community.cross_encoders import HuggingFaceCrossEncoder`

			`model_name = "BAAI/bge-reranker-base"`
			`model_kwargs = {'device': 'cpu'}`
			`hf = HuggingFaceCrossEncoder(`
			`model_name=model_name,`
			`model_kwargs=model_kwargs`
			`)`
			`"""`

			`client: Any #: :meta private:`
			`model_name: str = DEFAULT_MODEL_NAME`
			`"""Model name to use."""`
			`model_kwargs: Dict[str, Any] = Field(default_factory=dict)`
			`"""Keyword arguments to pass to the model."""`

			`def __init__(self, **kwargs: Any):`
			`"""Initialize the sentence_transformer."""`
			`super().__init__(**kwargs)`
			`try:`
			`import sentence_transformers`

			`except ImportError as exc:`
			`raise ImportError(`
			`"Could not import sentence_transformers python package. "`
			"Please install it with `pip install sentence-transformers`."
			`) from exc`

			`self.client = sentence_transformers.CrossEncoder(`
			`self.model_name, **self.model_kwargs`
			`)`

			`class Config:`
			`"""Configuration for this pydantic object."""`

			`extra = Extra.forbid`

			`def score(self, text_pairs: List[Tuple[str, str]]) -> List[float]:`
			`"""Compute similarity scores using a HuggingFace transformer model.`

			`Args:`
			`text_pairs: The list of text text_pairs to score the similarity.`

			`Returns:`
			`List of scores, one for each pair.`
			`"""`
			`scores = self.client.predict(text_pairs)`
[Community]: HuggingFaceCrossEncoder `score` accounting for <not-relevant score,relevant score> pairs. (#22578) - Description: Some of the Cross-Encoder models provide scores in pairs, i.e., <not-relevant score (higher means the document is less relevant to the query), relevant score (higher means the document is more relevant to the query)>. However, the `HuggingFaceCrossEncoder` `score` method does not currently take into account the pair situation. This PR addresses this issue by modifying the method to consider only the relevant score if score is being provided in pair. The reason for focusing on the relevant score is that the compressors select the top-n documents based on relevance. - Issue: #22556 - Please also refer to this [comment](https://github.com/UKPLab/sentence-transformers/issues/568#issuecomment-729153075) 2024-06-14 15:28:24 +00:00			`# Somes models e.g bert-multilingual-passage-reranking-msmarco`
			`# gives two score not_relevant and relevant as compare with the query.`
			`if len(scores.shape) > 1: # we are going to get the relevant scores`
			`scores = map(lambda x: x[1], scores)`
langchain[minor], community[minor]: add CrossEncoderReranker with HuggingFaceCrossEncoder and SagemakerEndpointCrossEncoder (#13687) - Description: Support reranking based on cross encoder models available from HuggingFace. - Added `CrossEncoder` schema - Implemented `HuggingFaceCrossEncoder` and `SagemakerEndpointCrossEncoder` - Implemented `CrossEncoderReranker` that performs similar functionality to `CohereRerank` - Added `cross-encoder-reranker.ipynb` to demonstrate how to use it. Please let me know if anything else needs to be done to make it visible on the table-of-contents navigation bar on the left, or on the card list on [retrievers documentation page](https://python.langchain.com/docs/integrations/retrievers). - Issue: N/A - Dependencies: None other than the existing ones. --------- Co-authored-by: Kenny Choe <kchoe@amazon.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 2024-03-31 20:51:31 +00:00			`return scores`