From 84bcc8090ce8fedb5641db4c3d13671c9e27ed2e Mon Sep 17 00:00:00 2001
From: Aleksandr Borzunov <hxrussia@gmail.com>
Date: Thu, 6 Apr 2023 15:10:02 +0000
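Subject: [PATCH] Remove cpufeature from setup.cfg

Drop the cpufeature dependency together with the code that relied on it:

- setup.cfg: remove cpufeature>=0.2.0 from install_requires.
- LMHead: remove the "auto" branch that imported cpufeature on x86_64
  to check for AVX512 and set use_chunked_forward from the result.
- DistributedBloomConfig: use_chunked_forward is now a plain bool that
  defaults to True, instead of Union[str, bool] defaulting to "auto".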
---
 setup.cfg                          |  1 -
 src/petals/bloom/modeling_utils.py | 10 ----------
 src/petals/client/remote_model.py  |  3 +--
 3 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index c485cd5..055f1c0 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -41,7 +41,6 @@ install_requires =
     tensor_parallel==1.0.23
     humanfriendly
     async-timeout>=4.0.2
-    cpufeature>=0.2.0
     packaging>=20.9
 
 [options.extras_require]
diff --git a/src/petals/bloom/modeling_utils.py b/src/petals/bloom/modeling_utils.py
index eddbb9d..f68b25b 100644
--- a/src/petals/bloom/modeling_utils.py
+++ b/src/petals/bloom/modeling_utils.py
@@ -29,16 +29,6 @@ class LMHead(nn.Module):
         self.word_embeddings = word_embeddings
 
         self.use_chunked_forward = config.use_chunked_forward
-        if self.use_chunked_forward == "auto":
-            if platform.machine() == "x86_64":
-                # Import of cpufeature may crash on non-x86_64 machines
-                from cpufeature import CPUFeature
-
-                # If the CPU supports AVX512, plain bfloat16 is ~10x faster than chunked_forward().
-                # Otherwise, it's ~8x slower.
-                self.use_chunked_forward = not (CPUFeature["AVX512f"] and CPUFeature["OS_AVX512"])
-            else:
-                self.use_chunked_forward = True
         self.chunked_forward_step = config.chunked_forward_step
         self._bf16_warning_shown = False
 
diff --git a/src/petals/client/remote_model.py b/src/petals/client/remote_model.py
index 937cd9c..42fee3e 100644
--- a/src/petals/client/remote_model.py
+++ b/src/petals/client/remote_model.py
@@ -44,8 +44,7 @@ class DistributedBloomConfig(BloomConfig):
     tuning_mode: Optional[str] = None  # One of the finetune options: [None, 'shallow_ptune', 'deep_ptune', 'adapters']
 
     # This settings matter for running the client with dtype bfloat16 on CPU.
-    # If the CPU doesn't support AVX512, chunked_forward() significantly speeds up computations.
-    use_chunked_forward: Union[str, bool] = "auto"
+    use_chunked_forward: bool = True
     chunked_forward_step: int = 16384