from __future__ import annotations

import logging
from typing import Any, Callable, Iterator, List, Mapping, Optional

from langchain_core.callbacks import CallbackManagerForLLMRun
from langchain_core.language_models.llms import LLM
from langchain_core.outputs import GenerationChunk
from langchain_core.pydantic_v1 import Extra

DEFAULT_MODEL_ID = "mlx-community/quantized-gemma-2b"

logger = logging.getLogger(__name__)


class MLXPipeline(LLM):
    """MLX Pipeline API.

    To use, you should have the ``mlx-lm`` python package installed.

    Example using from_model_id:
        .. code-block:: python

            from langchain_community.llms import MLXPipeline

            pipe = MLXPipeline.from_model_id(
                model_id="mlx-community/quantized-gemma-2b",
                pipeline_kwargs={"max_tokens": 10, "temp": 0.7},
            )

    Example passing model and tokenizer in directly:
        .. code-block:: python

            from langchain_community.llms import MLXPipeline
            from mlx_lm import load

            model_id = "mlx-community/quantized-gemma-2b"
            model, tokenizer = load(model_id)
            pipe = MLXPipeline(model=model, tokenizer=tokenizer)
    """

    model_id: str = DEFAULT_MODEL_ID
    """Model name to use."""
    model: Any  #: :meta private:
    """Model."""
    tokenizer: Any  #: :meta private:
    """Tokenizer."""
    tokenizer_config: Optional[dict] = None
    """
    Configuration parameters specifically for the tokenizer.
    Defaults to None; an empty dictionary is used if none is provided.
    """
    adapter_file: Optional[str] = None
    """
    Path to the adapter file. If provided, applies LoRA layers to the model.
    Defaults to None.
    """
    lazy: bool = False
    """
    If False, eval the model parameters to make sure they are
    loaded in memory before returning; otherwise they will be loaded
    when needed. Default: ``False``
    """
    pipeline_kwargs: Optional[dict] = None
    """
    Keyword arguments passed to the pipeline. Defaults include:

        - temp (float): Temperature for generation, default is 0.0.
        - max_tokens (int): Maximum tokens to generate, default is 100.
        - verbose (bool): Whether to output verbose logging, default is False.
        - formatter (Optional[Callable]): A callable to format the output.
          Default is None.
        - repetition_penalty (Optional[float]): The penalty factor for
          repeated sequences, default is None.
        - repetition_context_size (Optional[int]): Size of the context
          for applying repetition penalty, default is None.
        - top_p (float): The cumulative probability threshold for
          top-p filtering, default is 1.0.
    """
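
    # A minimal sketch of overriding these defaults (values are illustrative,
    # not recommendations); keys other than the ones listed above are ignored
    # by this wrapper:
    #
    #     pipe = MLXPipeline.from_model_id(
    #         model_id="mlx-community/quantized-gemma-2b",
    #         pipeline_kwargs={"temp": 0.7, "max_tokens": 64, "top_p": 0.95},
    #     )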

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    @classmethod
    def from_model_id(
        cls,
        model_id: str,
        tokenizer_config: Optional[dict] = None,
        adapter_file: Optional[str] = None,
        lazy: bool = False,
        pipeline_kwargs: Optional[dict] = None,
        **kwargs: Any,
    ) -> MLXPipeline:
        """Construct the pipeline object from model_id and task."""
        try:
            from mlx_lm import load
        except ImportError:
            raise ImportError(
                "Could not import mlx_lm python package. "
                "Please install it with `pip install mlx_lm`."
            )
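
        # Note: ``mlx_lm.load`` accepts either a Hugging Face Hub repo id or a
        # local path. When ``adapter_file`` is supplied it also applies the
        # LoRA adapter weights, and ``lazy`` controls whether parameters are
        # materialized immediately or on first use (behavior as of the
        # ``mlx-lm`` version this module was written against).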

        tokenizer_config = tokenizer_config or {}
        if adapter_file:
            model, tokenizer = load(model_id, tokenizer_config, adapter_file, lazy)
        else:
            model, tokenizer = load(model_id, tokenizer_config, lazy=lazy)

        _pipeline_kwargs = pipeline_kwargs or {}
        return cls(
            model_id=model_id,
            model=model,
            tokenizer=tokenizer,
            tokenizer_config=tokenizer_config,
            adapter_file=adapter_file,
            lazy=lazy,
            pipeline_kwargs=_pipeline_kwargs,
            **kwargs,
        )

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {
            "model_id": self.model_id,
            "tokenizer_config": self.tokenizer_config,
            "adapter_file": self.adapter_file,
            "lazy": self.lazy,
            "pipeline_kwargs": self.pipeline_kwargs,
        }

    @property
    def _llm_type(self) -> str:
        return "mlx_pipeline"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        try:
            from mlx_lm import generate
        except ImportError:
            raise ImportError(
                "Could not import mlx_lm python package. "
                "Please install it with `pip install mlx_lm`."
            )

        # Fall back to an empty dict so a pipeline constructed without
        # ``pipeline_kwargs`` does not fail on the ``.get`` calls below.
        pipeline_kwargs = kwargs.get("pipeline_kwargs", self.pipeline_kwargs) or {}
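
        # ``pipeline_kwargs`` passed at call time take precedence over those
        # set on the instance; anything missing falls back to the defaults
        # below. For example (illustrative only):
        #
        #     pipe.invoke("Hello", pipeline_kwargs={"max_tokens": 20})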

        temp: float = pipeline_kwargs.get("temp", 0.0)
        max_tokens: int = pipeline_kwargs.get("max_tokens", 100)
        verbose: bool = pipeline_kwargs.get("verbose", False)
        formatter: Optional[Callable] = pipeline_kwargs.get("formatter", None)
        repetition_penalty: Optional[float] = pipeline_kwargs.get(
            "repetition_penalty", None
        )
        repetition_context_size: Optional[int] = pipeline_kwargs.get(
            "repetition_context_size", None
        )
        top_p: float = pipeline_kwargs.get("top_p", 1.0)

        return generate(
            model=self.model,
            tokenizer=self.tokenizer,
            prompt=prompt,
            temp=temp,
            max_tokens=max_tokens,
            verbose=verbose,
            formatter=formatter,
            repetition_penalty=repetition_penalty,
            repetition_context_size=repetition_context_size,
            top_p=top_p,
        )
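
    # Streaming is exposed through the standard ``stream`` API of the base
    # ``LLM`` class; a minimal usage sketch (prompt is illustrative only):
    #
    #     for chunk in pipe.stream("Tell me a joke"):
    #         print(chunk, end="", flush=True)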

    def _stream(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[GenerationChunk]:
        try:
            import mlx.core as mx
            from mlx_lm.utils import generate_step
        except ImportError:
            raise ImportError(
                "Could not import mlx_lm python package. "
                "Please install it with `pip install mlx_lm`."
            )

        # As in ``_call``, fall back to an empty dict when no
        # ``pipeline_kwargs`` were provided.
        pipeline_kwargs = kwargs.get("pipeline_kwargs", self.pipeline_kwargs) or {}

        temp: float = pipeline_kwargs.get("temp", 0.0)
        max_new_tokens: int = pipeline_kwargs.get("max_tokens", 100)
        repetition_penalty: Optional[float] = pipeline_kwargs.get(
            "repetition_penalty", None
        )
        repetition_context_size: Optional[int] = pipeline_kwargs.get(
            "repetition_context_size", None
        )
        top_p: float = pipeline_kwargs.get("top_p", 1.0)
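
        # Streaming sketch: the prompt is encoded to token ids, handed to
        # ``generate_step`` one token at a time, and the tokenizer's
        # incremental detokenizer turns those ids back into text segments
        # that are yielded as soon as they become available.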

        prompt = self.tokenizer.encode(prompt, return_tensors="np")
        prompt_tokens = mx.array(prompt[0])

        eos_token_id = self.tokenizer.eos_token_id
        detokenizer = self.tokenizer.detokenizer
        detokenizer.reset()
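
        # ``generate_step`` keeps yielding tokens indefinitely; zipping it
        # with ``range(max_new_tokens)`` caps the number of generated tokens.
        # Generation also ends early on the EOS token or when a yielded
        # segment exactly matches one of the ``stop`` strings.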

        for (token, prob), n in zip(
            generate_step(
                prompt=prompt_tokens,
                model=self.model,
                temp=temp,
                repetition_penalty=repetition_penalty,
                repetition_context_size=repetition_context_size,
                top_p=top_p,
            ),
            range(max_new_tokens),
        ):
            # identify text to yield
            detokenizer.add_token(token)
            detokenizer.finalize()
            text: Optional[str] = detokenizer.last_segment

            # yield text, if any
            if text:
                chunk = GenerationChunk(text=text)
                yield chunk
                if run_manager:
                    run_manager.on_llm_new_token(chunk.text)

            # break if stop sequence found
            if token == eos_token_id or (stop is not None and text in stop):
                break