From fd1061e7bf2c21bbede5f52f81ea7696aa84eed9 Mon Sep 17 00:00:00 2001
From: Shengsheng Huang
Date: Fri, 26 Apr 2024 03:58:18 +0800
Subject: [PATCH] community[patch]: add more data types support to ipex-llm llm integration (#20833)

- **Description**:
  - **add support for more data types**: by default `IpexLLM` loads the model in int4 format. This PR adds support for more data types, such as `sym_int5`, `sym_int8`, etc. Data formats like NF3, NF4, FP4 and FP8 are only supported on GPU and will be added in a future PR.
  - Fix a small issue in saving/loading, update API docs
- **Dependencies**: `ipex-llm` library
- **Document**: In `docs/docs/integrations/llms/ipex_llm.ipynb`, added instructions for saving/loading the low-bit model.
- **Tests**: added new test cases to `libs/community/tests/integration_tests/llms/test_ipex_llm.py`, added config params.
- **Contribution maintainer**: @shane-huang

---
 docs/docs/integrations/llms/ipex_llm.ipynb    | 129 ++++++++++++---
 .../langchain_community/llms/bigdl_llm.py     |  35 +++-
 .../langchain_community/llms/ipex_llm.py      | 153 ++++++++++++------
 .../integration_tests/llms/test_bigdl_llm.py  |  28 +++-
 .../integration_tests/llms/test_ipex_llm.py   |  81 ++++++++--
 5 files changed, 342 insertions(+), 84 deletions(-)

diff --git a/docs/docs/integrations/llms/ipex_llm.ipynb b/docs/docs/integrations/llms/ipex_llm.ipynb
index 25519b2d92..32d768d807 100644
--- a/docs/docs/integrations/llms/ipex_llm.ipynb
+++ b/docs/docs/integrations/llms/ipex_llm.ipynb
@@ -6,9 +6,9 @@
    "source": [
     "# IPEX-LLM\n",
     "\n",
-    "> [IPEX-LLM](https://github.com/intel-analytics/ipex-llm/) is a low-bit LLM optimization library on Intel XPU (Xeon/Core/Flex/Arc/Max). It can make LLMs run extremely fast and consume much less memory on Intel platforms. It is open sourced under Apache 2.0 License.\n",
+    "> [IPEX-LLM](https://github.com/intel-analytics/ipex-llm/) is a PyTorch library for running LLMs on Intel CPU and GPU (e.g., local PC with iGPU, discrete GPU such as Arc, Flex and Max) with very low latency. \n",
     "\n",
-    "This example goes over how to use LangChain to interact with IPEX-LLM for text generation. \n"
+    "This example goes over how to use LangChain to interact with `ipex-llm` for text generation. \n"
    ]
   },
   {
@@ -49,7 +49,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Usage"
+    "## Basic Usage"
    ]
   },
   {
@@ -58,9 +58,20 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import warnings\n",
+    "\n",
     "from langchain.chains import LLMChain\n",
     "from langchain_community.llms import IpexLLM\n",
-    "from langchain_core.prompts import PromptTemplate"
+    "from langchain_core.prompts import PromptTemplate\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\", category=UserWarning, message=\".*padding_mask.*\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Specify the prompt template for your model. In this example, we use the [vicuna-1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) model. If you're working with a different model, choose a proper template accordingly."
+   ]
+  },
   {
@@ -77,7 +88,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Load Model: "
+    "Load the model locally with `IpexLLM.from_model_id`. It loads the model directly in its Hugging Face format and converts it automatically to low-bit format for inference."
] }, { @@ -88,7 +99,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "27c08180714a44c7ab766624d5054163", + "model_id": "897501860fe4452b836f816c72d955dd", "version_major": 2, "version_minor": 0 }, @@ -103,7 +114,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-03-27 00:58:43,670 - INFO - Converting the current model to sym_int4 format......\n" + "2024-04-24 21:20:12,461 - INFO - Converting the current model to sym_int4 format......\n" ] } ], @@ -130,13 +141,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "/opt/anaconda3/envs/shane-langchain2/lib/python3.9/site-packages/langchain_core/_api/deprecation.py:117: LangChainDeprecationWarning: The function `run` was deprecated in LangChain 0.1.0 and will be removed in 0.2.0. Use invoke instead.\n", + "/opt/anaconda3/envs/shane-langchain-3.11/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: The class `LLMChain` was deprecated in LangChain 0.1.17 and will be removed in 0.3.0. Use RunnableSequence, e.g., `prompt | llm` instead.\n", " warn_deprecated(\n", - "/opt/anaconda3/envs/shane-langchain2/lib/python3.9/site-packages/transformers/generation/utils.py:1369: UserWarning: Using `max_length`'s default (4096) to control the generation length. This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.\n", - " warnings.warn(\n", - "/opt/anaconda3/envs/shane-langchain2/lib/python3.9/site-packages/ipex_llm/transformers/models/llama.py:218: UserWarning: Passing `padding_mask` is deprecated and will be removed in v4.37.Please make sure use `attention_mask` instead.`\n", - " warnings.warn(\n", - "/opt/anaconda3/envs/shane-langchain2/lib/python3.9/site-packages/ipex_llm/transformers/models/llama.py:218: UserWarning: Passing `padding_mask` is deprecated and will be removed in v4.37.Please make sure use `attention_mask` instead.`\n", + "/opt/anaconda3/envs/shane-langchain-3.11/lib/python3.11/site-packages/transformers/generation/utils.py:1369: UserWarning: Using `max_length`'s default (4096) to control the generation length. This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.\n", " warnings.warn(\n" ] }, @@ -144,10 +151,6 @@ "name": "stdout", "output_type": "stream", "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "AI stands for \"Artificial Intelligence.\" It refers to the development of computer systems that can perform tasks that typically require human intelligence, such as visual perception, speech recognition, decision-making, and language translation. AI can be achieved through a combination of techniques such as machine learning, natural language processing, computer vision, and robotics. 
The ultimate goal of AI research is to create machines that can think and learn like humans, and can even exceed human capabilities in certain areas.\n" ] } @@ -156,15 +159,99 @@ "llm_chain = LLMChain(prompt=prompt, llm=llm)\n", "\n", "question = \"What is AI?\"\n", - "output = llm_chain.run(question)" + "output = llm_chain.invoke(question)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save/Load Low-bit Model\n", + "Alternatively, you might save the low-bit model to disk once and use `from_model_id_low_bit` instead of `from_model_id` to reload it for later use - even across different machines. It is space-efficient, as the low-bit model demands significantly less disk space than the original model. And `from_model_id_low_bit` is also more efficient than `from_model_id` in terms of speed and memory usage, as it skips the model conversion step." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To save the low-bit model, use `save_low_bit` as follows." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "saved_lowbit_model_path = \"./vicuna-7b-1.5-low-bit\" # path to save low-bit model\n", + "llm.model.save_low_bit(saved_lowbit_model_path)\n", + "del llm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the model from saved lowbit model path as follows. \n", + "> Note that the saved path for the low-bit model only includes the model itself but not the tokenizers. If you wish to have everything in one place, you will need to manually download or copy the tokenizer files from the original model's directory to the location where the low-bit model is saved." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-04-24 21:20:35,874 - INFO - Converting the current model to sym_int4 format......\n" + ] + } + ], + "source": [ + "llm_lowbit = IpexLLM.from_model_id_low_bit(\n", + " model_id=saved_lowbit_model_path,\n", + " tokenizer_id=\"lmsys/vicuna-7b-v1.5\",\n", + " # tokenizer_name=saved_lowbit_model_path, # copy the tokenizers to saved path if you want to use it this way\n", + " model_kwargs={\"temperature\": 0, \"max_length\": 64, \"trust_remote_code\": True},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the loaded model in Chains:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/anaconda3/envs/shane-langchain-3.11/lib/python3.11/site-packages/transformers/generation/utils.py:1369: UserWarning: Using `max_length`'s default (4096) to control the generation length. This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AI stands for \"Artificial Intelligence.\" It refers to the development of computer systems that can perform tasks that typically require human intelligence, such as visual perception, speech recognition, decision-making, and language translation. AI can be achieved through a combination of techniques such as machine learning, natural language processing, computer vision, and robotics. 
The ultimate goal of AI research is to create machines that can think and learn like humans, and can even exceed human capabilities in certain areas.\n" + ] + } + ], + "source": [ + "llm_chain = LLMChain(prompt=prompt, llm=llm_lowbit)\n", + "\n", + "question = \"What is AI?\"\n", + "output = llm_chain.invoke(question)" + ] } ], "metadata": { @@ -183,7 +270,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/libs/community/langchain_community/llms/bigdl_llm.py b/libs/community/langchain_community/llms/bigdl_llm.py index f225587a73..3181e5f540 100644 --- a/libs/community/langchain_community/llms/bigdl_llm.py +++ b/libs/community/langchain_community/llms/bigdl_llm.py @@ -23,6 +23,10 @@ class BigdlLLM(IpexLLM): cls, model_id: str, model_kwargs: Optional[dict] = None, + *, + tokenizer_id: Optional[str] = None, + load_in_4bit: bool = True, + load_in_low_bit: Optional[str] = None, **kwargs: Any, ) -> LLM: """ @@ -31,6 +35,8 @@ class BigdlLLM(IpexLLM): Args: model_id: Path for the huggingface repo id to be downloaded or the huggingface checkpoint folder. + tokenizer_id: Path for the huggingface repo id to be downloaded or + the huggingface checkpoint folder which contains the tokenizer. model_kwargs: Keyword arguments to pass to the model and tokenizer. kwargs: Extra arguments to pass to the model and tokenizer. @@ -52,12 +58,27 @@ class BigdlLLM(IpexLLM): "Please install it with `pip install --pre --upgrade bigdl-llm[all]`." ) + if load_in_low_bit is not None: + logger.warning( + """`load_in_low_bit` option is not supported in BigdlLLM and + is ignored. For more data types support with `load_in_low_bit`, + use IpexLLM instead.""" + ) + + if not load_in_4bit: + raise ValueError( + "BigdlLLM only supports loading in 4-bit mode, " + "i.e. load_in_4bit = True. " + "Please install it with `pip install --pre --upgrade bigdl-llm[all]`." + ) + _model_kwargs = model_kwargs or {} + _tokenizer_id = tokenizer_id or model_id try: - tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs) + tokenizer = AutoTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs) except Exception: - tokenizer = LlamaTokenizer.from_pretrained(model_id, **_model_kwargs) + tokenizer = LlamaTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs) try: model = AutoModelForCausalLM.from_pretrained( @@ -86,6 +107,8 @@ class BigdlLLM(IpexLLM): cls, model_id: str, model_kwargs: Optional[dict] = None, + *, + tokenizer_id: Optional[str] = None, **kwargs: Any, ) -> LLM: """ @@ -94,6 +117,8 @@ class BigdlLLM(IpexLLM): Args: model_id: Path for the bigdl-llm transformers low-bit model folder. + tokenizer_id: Path for the huggingface repo id or local model folder + which contains the tokenizer. model_kwargs: Keyword arguments to pass to the model and tokenizer. kwargs: Extra arguments to pass to the model and tokenizer. 
@@ -117,10 +142,12 @@ class BigdlLLM(IpexLLM):
             )
 
         _model_kwargs = model_kwargs or {}
+        _tokenizer_id = tokenizer_id or model_id
+
         try:
-            tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs)
+            tokenizer = AutoTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs)
         except Exception:
-            tokenizer = LlamaTokenizer.from_pretrained(model_id, **_model_kwargs)
+            tokenizer = LlamaTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs)
 
         try:
             model = AutoModelForCausalLM.load_low_bit(model_id, **_model_kwargs)
diff --git a/libs/community/langchain_community/llms/ipex_llm.py b/libs/community/langchain_community/llms/ipex_llm.py
index ed03770180..a8e60fe6b1 100644
--- a/libs/community/langchain_community/llms/ipex_llm.py
+++ b/libs/community/langchain_community/llms/ipex_llm.py
@@ -42,6 +42,10 @@ class IpexLLM(LLM):
         cls,
         model_id: str,
         model_kwargs: Optional[dict] = None,
+        *,
+        tokenizer_id: Optional[str] = None,
+        load_in_4bit: bool = True,
+        load_in_low_bit: Optional[str] = None,
         **kwargs: Any,
     ) -> LLM:
         """
@@ -50,52 +54,29 @@
         Args:
             model_id: Path for the huggingface repo id to be downloaded or
                       the huggingface checkpoint folder.
+            tokenizer_id: Path for the huggingface repo id to be downloaded or
+                          the huggingface checkpoint folder which contains the tokenizer.
+            load_in_4bit: Whether to load the model in 4-bit precision.
+                          Unused if `load_in_low_bit` is not None.
+            load_in_low_bit: Which low-bit precision to use when loading the model.
+                          Example values: 'sym_int4', 'asym_int4', 'fp4', 'nf4', 'fp8', etc.
+                          Overrides `load_in_4bit` if specified.
             model_kwargs: Keyword arguments to pass to the model and tokenizer.
             kwargs: Extra arguments to pass to the model and tokenizer.
 
         Returns:
             An object of IpexLLM.
-        """
-        try:
-            from ipex_llm.transformers import (
-                AutoModel,
-                AutoModelForCausalLM,
-            )
-            from transformers import AutoTokenizer, LlamaTokenizer
-        except ImportError:
-            raise ValueError(
-                "Could not import ipex-llm or transformers. "
-                "Please install it with `pip install --pre --upgrade ipex-llm[all]`."
-            )
-
-        _model_kwargs = model_kwargs or {}
-
-        try:
-            tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs)
-        except Exception:
-            tokenizer = LlamaTokenizer.from_pretrained(model_id, **_model_kwargs)
-
-        try:
-            model = AutoModelForCausalLM.from_pretrained(
-                model_id, load_in_4bit=True, **_model_kwargs
-            )
-        except Exception:
-            model = AutoModel.from_pretrained(
-                model_id, load_in_4bit=True, **_model_kwargs
-            )
-
-        if "trust_remote_code" in _model_kwargs:
-            _model_kwargs = {
-                k: v for k, v in _model_kwargs.items() if k != "trust_remote_code"
-            }
+        """
 
-        return cls(
+        return cls._load_model(
             model_id=model_id,
-            model=model,
-            tokenizer=tokenizer,
-            model_kwargs=_model_kwargs,
-            **kwargs,
+            tokenizer_id=tokenizer_id,
+            low_bit_model=False,
+            load_in_4bit=load_in_4bit,
+            load_in_low_bit=load_in_low_bit,
+            model_kwargs=model_kwargs,
+            kwargs=kwargs,
         )
 
     @classmethod
@@ -103,6 +84,8 @@
         cls,
         model_id: str,
         model_kwargs: Optional[dict] = None,
+        *,
+        tokenizer_id: Optional[str] = None,
         **kwargs: Any,
     ) -> LLM:
         """
@@ -111,12 +94,36 @@
         Args:
             model_id: Path for the ipex-llm transformers low-bit model folder.
+            tokenizer_id: Path for the huggingface repo id or local model folder
+                          which contains the tokenizer.
             model_kwargs: Keyword arguments to pass to the model and tokenizer.
             kwargs: Extra arguments to pass to the model and tokenizer.
 
         Returns:
             An object of IpexLLM.
""" + + return cls._load_model( + model_id=model_id, + tokenizer_id=tokenizer_id, + low_bit_model=True, + load_in_4bit=False, # not used for low-bit model + load_in_low_bit=None, # not used for low-bit model + model_kwargs=model_kwargs, + kwargs=kwargs, + ) + + @classmethod + def _load_model( + cls, + model_id: str, + tokenizer_id: Optional[str] = None, + load_in_4bit: bool = False, + load_in_low_bit: Optional[str] = None, + low_bit_model: bool = False, + model_kwargs: Optional[dict] = None, + kwargs: Optional[dict] = None, + ) -> Any: try: from ipex_llm.transformers import ( AutoModel, @@ -126,26 +133,62 @@ class IpexLLM(LLM): except ImportError: raise ValueError( - "Could not import ipex-llm or transformers. " - "Please install it with `pip install --pre --upgrade ipex-llm[all]`." + "Could not import ipex-llm. " + "Please install `ipex-llm` properly following installation guides: " + "https://github.com/intel-analytics/ipex-llm?tab=readme-ov-file#install-ipex-llm." ) _model_kwargs = model_kwargs or {} - try: - tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs) - except Exception: - tokenizer = LlamaTokenizer.from_pretrained(model_id, **_model_kwargs) + kwargs = kwargs or {} + + _tokenizer_id = tokenizer_id or model_id try: - model = AutoModelForCausalLM.load_low_bit(model_id, **_model_kwargs) + tokenizer = AutoTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs) except Exception: - model = AutoModel.load_low_bit(model_id, **_model_kwargs) + tokenizer = LlamaTokenizer.from_pretrained(_tokenizer_id, **_model_kwargs) + # restore model_kwargs if "trust_remote_code" in _model_kwargs: _model_kwargs = { k: v for k, v in _model_kwargs.items() if k != "trust_remote_code" } + # load model with AutoModelForCausalLM and falls back to AutoModel on failure. 
+        load_kwargs = {
+            "use_cache": True,
+            "trust_remote_code": True,
+        }
+
+        if not low_bit_model:
+            if load_in_low_bit is not None:
+                load_function_name = "from_pretrained"
+                load_kwargs["load_in_low_bit"] = load_in_low_bit  # type: ignore
+            else:
+                load_function_name = "from_pretrained"
+                load_kwargs["load_in_4bit"] = load_in_4bit
+        else:
+            load_function_name = "load_low_bit"
+
+        try:
+            # Attempt to load with AutoModelForCausalLM
+            model = cls._load_model_general(
+                AutoModelForCausalLM,
+                load_function_name=load_function_name,
+                model_id=model_id,
+                load_kwargs=load_kwargs,
+                model_kwargs=_model_kwargs,
+            )
+        except Exception:
+            # Fallback to AutoModel if there's an exception
+            model = cls._load_model_general(
+                AutoModel,
+                load_function_name=load_function_name,
+                model_id=model_id,
+                load_kwargs=load_kwargs,
+                model_kwargs=_model_kwargs,
+            )
+
         return cls(
             model_id=model_id,
             model=model,
@@ -154,6 +197,25 @@
             **kwargs,
         )
 
+    @staticmethod
+    def _load_model_general(
+        model_class: Any,
+        load_function_name: str,
+        model_id: str,
+        load_kwargs: dict,
+        model_kwargs: dict,
+    ) -> Any:
+        """General function to attempt to load a model."""
+        try:
+            load_function = getattr(model_class, load_function_name)
+            return load_function(model_id, **{**load_kwargs, **model_kwargs})
+        except Exception as e:
+            logger.error(
+                f"Failed to load model using "
+                f"{model_class.__name__}.{load_function_name}: {e}"
+            )
+            raise  # re-raise so the caller can fall back to AutoModel
+
     @property
     def _identifying_params(self) -> Mapping[str, Any]:
         """Get the identifying parameters."""
diff --git a/libs/community/tests/integration_tests/llms/test_bigdl_llm.py b/libs/community/tests/integration_tests/llms/test_bigdl_llm.py
index d214df8429..8d3340ce2e 100644
--- a/libs/community/tests/integration_tests/llms/test_bigdl_llm.py
+++ b/libs/community/tests/integration_tests/llms/test_bigdl_llm.py
@@ -1,23 +1,43 @@
 """Test BigdlLLM"""
+
+import os
+
+import pytest
 from langchain_core.outputs import LLMResult
 
 from langchain_community.llms.bigdl_llm import BigdlLLM
 
+model_ids_to_test = os.getenv("TEST_BIGDLLLM_MODEL_IDS") or ""
+skip_if_no_model_ids = pytest.mark.skipif(
+    not model_ids_to_test,
+    reason="TEST_BIGDLLLM_MODEL_IDS environment variable not set.",
+)
+model_ids_to_test = [model_id.strip() for model_id in model_ids_to_test.split(",")]  # type: ignore
+
 
-def test_call() -> None:
+@skip_if_no_model_ids
+@pytest.mark.parametrize(
+    "model_id",
+    model_ids_to_test,
+)
+def test_call(model_id: str) -> None:
     """Test valid call to bigdl-llm."""
     llm = BigdlLLM.from_model_id(
-        model_id="lmsys/vicuna-7b-v1.5",
+        model_id=model_id,
         model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True},
     )
     output = llm.invoke("Hello!")
     assert isinstance(output, str)
 
 
-def test_generate() -> None:
+@skip_if_no_model_ids
+@pytest.mark.parametrize(
+    "model_id",
+    model_ids_to_test,
+)
+def test_generate(model_id: str) -> None:
     """Test valid call to bigdl-llm."""
     llm = BigdlLLM.from_model_id(
-        model_id="lmsys/vicuna-7b-v1.5",
+        model_id=model_id,
         model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True},
     )
     output = llm.generate(["Hello!"])
diff --git a/libs/community/tests/integration_tests/llms/test_ipex_llm.py b/libs/community/tests/integration_tests/llms/test_ipex_llm.py
index a98bbf14be..163458029c 100644
--- a/libs/community/tests/integration_tests/llms/test_ipex_llm.py
+++ b/libs/community/tests/integration_tests/llms/test_ipex_llm.py
@@ -1,25 +1,88 @@
 """Test IPEX LLM"""
+
+import os
+from typing import Any
+
+import pytest
 from
langchain_core.outputs import LLMResult -from langchain_community.llms.ipex_llm import IpexLLM +from langchain_community.llms import IpexLLM + +model_ids_to_test = os.getenv("TEST_IPEXLLM_MODEL_IDS") or "" +skip_if_no_model_ids = pytest.mark.skipif( + not model_ids_to_test, reason="TEST_IPEXLLM_MODEL_IDS environment variable not set." +) +model_ids_to_test = [model_id.strip() for model_id in model_ids_to_test.split(",")] # type: ignore -def test_call() -> None: - """Test valid call to ipex-llm.""" +def load_model(model_id: str) -> Any: llm = IpexLLM.from_model_id( - model_id="lmsys/vicuna-7b-v1.5", + model_id=model_id, model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True}, ) - output = llm.invoke("Hello!") - assert isinstance(output, str) + return llm -def test_generate() -> None: - """Test valid call to ipex-llm.""" +def load_model_more_types(model_id: str, load_in_low_bit: str) -> Any: llm = IpexLLM.from_model_id( - model_id="lmsys/vicuna-7b-v1.5", + model_id=model_id, + load_in_low_bit=load_in_low_bit, model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True}, ) + return llm + + +@skip_if_no_model_ids +@pytest.mark.parametrize( + "model_id", + model_ids_to_test, +) +def test_call(model_id: str) -> None: + """Test valid call.""" + llm = load_model(model_id) + output = llm.invoke("Hello!") + assert isinstance(output, str) + + +@skip_if_no_model_ids +@pytest.mark.parametrize( + "model_id", + model_ids_to_test, +) +def test_asym_int4(model_id: str) -> None: + """Test asym int4 data type.""" + llm = load_model_more_types(model_id=model_id, load_in_low_bit="asym_int4") + output = llm.invoke("Hello!") + assert isinstance(output, str) + + +@skip_if_no_model_ids +@pytest.mark.parametrize( + "model_id", + model_ids_to_test, +) +def test_generate(model_id: str) -> None: + """Test valid generate.""" + llm = load_model(model_id) output = llm.generate(["Hello!"]) assert isinstance(output, LLMResult) assert isinstance(output.generations, list) + + +@skip_if_no_model_ids +@pytest.mark.parametrize( + "model_id", + model_ids_to_test, +) +def test_save_load_lowbit(model_id: str) -> None: + """Test save and load lowbit model.""" + saved_lowbit_path = "/tmp/saved_model" + llm = load_model(model_id) + llm.model.save_low_bit(saved_lowbit_path) + del llm + loaded_llm = IpexLLM.from_model_id_low_bit( + model_id=saved_lowbit_path, + tokenizer_id=model_id, + model_kwargs={"temperature": 0, "max_length": 16, "trust_remote_code": True}, + ) + output = loaded_llm.invoke("Hello!") + assert isinstance(output, str)
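
---

For reference, a minimal end-to-end sketch of the options this PR introduces, mirroring the notebook changes above. It assumes `ipex-llm` is installed and the model is available locally or downloadable; the model id, prompt, save path, and the `sym_int8` choice are illustrative and can be swapped for any supported low-bit format (e.g. `sym_int5`, `asym_int4`):

```python
from langchain_community.llms import IpexLLM

# Load a Hugging Face model and convert it to a low-bit format other than the
# default sym_int4, using the new `load_in_low_bit` option added in this PR.
llm = IpexLLM.from_model_id(
    model_id="lmsys/vicuna-7b-v1.5",
    load_in_low_bit="sym_int8",
    model_kwargs={"temperature": 0, "max_length": 64, "trust_remote_code": True},
)
print(llm.invoke("What is AI?"))

# Save the converted low-bit weights once...
saved_path = "./vicuna-7b-1.5-low-bit"
llm.model.save_low_bit(saved_path)
del llm

# ...and reload them later with `from_model_id_low_bit`, which skips the
# conversion step. `tokenizer_id` points at the original repo because
# save_low_bit does not copy the tokenizer files.
llm_lowbit = IpexLLM.from_model_id_low_bit(
    model_id=saved_path,
    tokenizer_id="lmsys/vicuna-7b-v1.5",
    model_kwargs={"temperature": 0, "max_length": 64, "trust_remote_code": True},
)
print(llm_lowbit.invoke("What is AI?"))
```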