|
|
|
import fcntl
|
|
|
|
import os
|
|
|
|
import shutil
|
|
|
|
from contextlib import contextmanager
|
|
|
|
from pathlib import Path
|
|
|
|
from typing import Optional
|
|
|
|
|
|
|
|
import huggingface_hub
|
|
|
|
from hivemind.utils.logging import get_logger
|
|
|
|
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
|
|
DEFAULT_CACHE_DIR = os.getenv("PETALS_CACHE", Path(Path.home(), ".cache", "petals"))
|
|
|
|
|
|
|
|
BLOCKS_LOCK_FILE = "blocks.lock"
|
|
|
|
|
|
|
|
|
|
|
|
@contextmanager
|
|
|
|
def _blocks_lock(cache_dir: Optional[str], mode: int):
|
|
|
|
if cache_dir is None:
|
|
|
|
cache_dir = DEFAULT_CACHE_DIR
|
|
|
|
lock_path = Path(cache_dir, BLOCKS_LOCK_FILE)
|
|
|
|
|
|
|
|
os.makedirs(lock_path.parent, exist_ok=True)
|
|
|
|
with open(lock_path, "wb+") as lock_fd:
|
|
|
|
fcntl.flock(lock_fd.fileno(), mode)
|
|
|
|
# The OS will release the lock when lock_fd is closed or the process is killed
|
|
|
|
yield
|
|
|
|
|
|
|
|
|
|
|
|
def allow_cache_reads(cache_dir: Optional[str]):
|
|
|
|
"""Allows simultaneous reads, guarantees that blocks won't be removed along the way (shared lock)"""
|
|
|
|
return _blocks_lock(cache_dir, fcntl.LOCK_SH)
|
|
|
|
|
|
|
|
|
|
|
|
def allow_cache_writes(cache_dir: Optional[str]):
|
|
|
|
"""Allows saving new blocks and removing the old ones (exclusive lock)"""
|
|
|
|
return _blocks_lock(cache_dir, fcntl.LOCK_EX)
|
|
|
|
|
|
|
|
|
|
|
|
def free_disk_space_for(
|
|
|
|
size: int,
|
|
|
|
*,
|
|
|
|
cache_dir: Optional[str],
|
|
|
|
max_disk_space: Optional[int],
|
|
|
|
os_quota: int = 1024**3, # Minimal space we should leave to keep OS function normally
|
|
|
|
):
|
|
|
|
if cache_dir is None:
|
|
|
|
cache_dir = DEFAULT_CACHE_DIR
|
|
|
|
cache_info = huggingface_hub.scan_cache_dir(cache_dir)
|
|
|
|
|
|
|
|
available_space = shutil.disk_usage(cache_dir).free - os_quota
|
|
|
|
if max_disk_space is not None:
|
|
|
|
available_space = min(available_space, max_disk_space - cache_info.size_on_disk)
|
Add LLaMA support (#323)
This PR:
1. **Abolishes the model conversion procedure.** Now, models are downloaded directly from original repositories like https://huggingface.co/bigscience/bloom. Servers download only shards with blocks to be hosted, and clients download only shards with input/output embeddings and layernorms.
- BLOOM is loaded from `bigscience/bloom`, but we use the DHT prefix `bigscience/bloom-petals` for backward compatibility. Same with smaller BLOOMs and BLOOMZ.
- LLaMA can be loaded from any repo like `username/llama-65b-hf`, but we use the DHT prefix `llama-65b-hf` (without the username) to accomodate blocks from different repos (there're a few of them with minor differences, such as `Llama` vs. `LLaMA` in the class name).
2. **Refactors the client to generalize it for multiple models.** Now, we have `petals.models` packages that contain model-specific code (e.g. `petals.models.bloom`, `petals.models.llama`). General code (e.g. CPU-efficient LM head, p-tuning) is kept in `petals.client`.
3. **Introduces** `WrappedLlamaBlock`, `DistributedLlamaConfig`, `DistributedLlamaForCausalLM`, `DistributedLlamaForSequenceClassification`, and `DistributedLlamaModel` compatible with Petals functionality (p-tuning, adapters, etc.).
4. **Introduces** `AutoDistributedConfig` that automatically chooses the correct config class (`DistributedLlamaConfig` or `DistributedBloomConfig`). The refactored configs contain all model-specific info for both clients and servers.
Upgrade instructions:
- Remove disk caches for blocks in old (converted) format to save disk space. That is, remove `~/.cache/petals/model--bigscience--bloom-petals` and `~/.cache/petals/model--bigscience--bloomz-petals` directories (if present).
11 months ago
|
|
|
|
|
|
|
gib = 1024**3
|
|
|
|
logger.debug(f"Disk space: required {size / gib:.1f} GiB, available {available_space / gib:.1f} GiB")
|
|
|
|
if size <= available_space:
|
|
|
|
return
|
|
|
|
|
|
|
|
cached_files = [file for repo in cache_info.repos for revision in repo.revisions for file in revision.files]
|
|
|
|
|
|
|
|
# Remove as few least recently used files as possible
|
|
|
|
removed_files = []
|
|
|
|
freed_space = 0
|
|
|
|
extra_space_needed = size - available_space
|
|
|
|
for file in sorted(cached_files, key=lambda file: file.blob_last_accessed):
|
|
|
|
os.remove(file.file_path) # Remove symlink
|
|
|
|
os.remove(file.blob_path) # Remove contents
|
|
|
|
|
|
|
|
removed_files.append(file)
|
|
|
|
freed_space += file.size_on_disk
|
|
|
|
if freed_space >= extra_space_needed:
|
|
|
|
break
|
|
|
|
if removed_files:
|
|
|
|
logger.info(f"Removed {len(removed_files)} files to free {freed_space / gib:.1f} GiB of disk space")
|
|
|
|
logger.debug(f"Removed paths: {[str(file.file_path) for file in removed_files]}")
|
|
|
|
|
|
|
|
if freed_space < extra_space_needed:
|
|
|
|
raise RuntimeError(
|
|
|
|
f"Insufficient disk space to load a block. Please free {(extra_space_needed - freed_space) / gib:.1f} GiB "
|
|
|
|
f"on the volume for {cache_dir} or increase --max_disk_space if you set it manually"
|
|
|
|
)
|