petals/src/petals/models/llama/block.py

Add LLaMA support (#323)

This PR:

1. **Abolishes the model conversion procedure.** Now, models are downloaded directly from original repositories like https://huggingface.co/bigscience/bloom. Servers download only shards with blocks to be hosted, and clients download only shards with input/output embeddings and layernorms.
   - BLOOM is loaded from `bigscience/bloom`, but we use the DHT prefix `bigscience/bloom-petals` for backward compatibility. Same with smaller BLOOMs and BLOOMZ.
   - LLaMA can be loaded from any repo like `username/llama-65b-hf`, but we use the DHT prefix `llama-65b-hf` (without the username) to accommodate blocks from different repos (there are a few of them with minor differences, such as `Llama` vs. `LLaMA` in the class name).
2. **Refactors the client to generalize it for multiple models.** Now, we have `petals.models` packages that contain model-specific code (e.g. `petals.models.bloom`, `petals.models.llama`). General code (e.g. CPU-efficient LM head, p-tuning) is kept in `petals.client`.
3. **Introduces** `WrappedLlamaBlock`, `DistributedLlamaConfig`, `DistributedLlamaForCausalLM`, `DistributedLlamaForSequenceClassification`, and `DistributedLlamaModel` compatible with Petals functionality (p-tuning, adapters, etc.).
4. **Introduces** `AutoDistributedConfig` that automatically chooses the correct config class (`DistributedLlamaConfig` or `DistributedBloomConfig`). The refactored configs contain all model-specific info for both clients and servers.

Upgrade instructions:

- Remove disk caches for blocks in the old (converted) format to save disk space. That is, remove the `~/.cache/petals/model--bigscience--bloom-petals` and `~/.cache/petals/model--bigscience--bloomz-petals` directories (if present).
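A minimal usage sketch of the client classes named in the PR text above; it is not part of `block.py`. The repo name `username/llama-65b-hf` is the placeholder from the PR description, and the sketch assumes the refactored package re-exports these classes at the top level of `petals`.

# Hypothetical usage sketch (not part of this file). Assumes `petals` re-exports the
# classes introduced in PR #323; "username/llama-65b-hf" is a placeholder repo name.
import torch
from transformers import LlamaTokenizer
from petals import AutoDistributedConfig, DistributedLlamaForCausalLM

MODEL_NAME = "username/llama-65b-hf"  # placeholder

# AutoDistributedConfig is expected to pick DistributedLlamaConfig for a LLaMA repo
# (and DistributedBloomConfig for a BLOOM repo), per item 4 of the PR description.
config = AutoDistributedConfig.from_pretrained(MODEL_NAME)

tokenizer = LlamaTokenizer.from_pretrained(MODEL_NAME)
model = DistributedLlamaForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float32)

# Remote generation over the swarm; WrappedLlamaBlock (the file below) is what servers
# run for each hosted transformer block.
inputs = tokenizer("A cat sat on", return_tensors="pt")["input_ids"]
outputs = model.generate(inputs, max_new_tokens=5)
print(tokenizer.decode(outputs[0]))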
"""
LLaMA intermediate layer
Based on https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
See commit history for authorship.
"""
from typing import Optional, Tuple
import torch
from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaModel


class WrappedLlamaBlock(LlamaDecoderLayer):
    def forward(
        self,
        hidden_states: torch.Tensor,
        *args,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        layer_past: Optional[Tuple[torch.Tensor]] = None,
        use_cache: bool = False,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        batch_size, seq_length, _ = hidden_states.shape

        seq_length_with_past = seq_length
        past_key_values_length = 0

        # The incoming KV cache (layer_past) uses the BLOOM-style layout; convert it to the
        # layout LlamaDecoderLayer expects before calling the parent forward()
        past_key_value = layer_past
        if past_key_value is not None:
            past_key_values_length = past_key_value[0].shape[2]
            seq_length_with_past = seq_length_with_past + past_key_values_length
            past_key_value = self._reorder_cache_from_bloom_to_llama(past_key_value, batch_size, past_key_values_length)

        if position_ids is None:
            device = hidden_states.device
            position_ids = torch.arange(
                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
            )
            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
        else:
            position_ids = position_ids.view(-1, seq_length).long()

        # embed positions
        if attention_mask is None:
            attention_mask = torch.ones(
                (batch_size, seq_length_with_past), dtype=torch.bool, device=hidden_states.device
            )
        attention_mask = LlamaModel._prepare_decoder_attention_mask(
            None, attention_mask, (batch_size, seq_length), hidden_states, past_key_values_length
        )

        outputs = super().forward(
            hidden_states,
            *args,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            use_cache=use_cache,
            **kwargs,
        )

        if use_cache:
            # Convert the updated KV cache back to the BLOOM-style layout before returning it
            present_key_value = outputs[-1]
            present_key_value = self._reorder_cache_from_llama_to_bloom(
                present_key_value, batch_size, seq_length_with_past
            )
            outputs = outputs[:-1] + (present_key_value,)

        return outputs

    def _reorder_cache_from_bloom_to_llama(
        self, key_value: Tuple[torch.Tensor], batch_size: int, seq_length: int
    ) -> Tuple[torch.Tensor]:
        # BLOOM layout: keys [batch * num_heads, head_dim, seq_length], values [batch * num_heads, seq_length, head_dim]
        # LLaMA layout: both [batch, num_heads, seq_length, head_dim]
        key_states, value_states = key_value
        key_states = key_states.permute(0, 2, 1)
        key_states = key_states.view(batch_size, self.self_attn.num_heads, seq_length, self.self_attn.head_dim)
        value_states = value_states.view(*key_states.shape)
        return (key_states, value_states)

    def _reorder_cache_from_llama_to_bloom(
        self, key_value: Tuple[torch.Tensor], batch_size: int, seq_length: int
    ) -> Tuple[torch.Tensor]:
        # Inverse of the above: fold the head dimension into the batch dimension and transpose the keys
        key_states, value_states = key_value
        value_states = value_states.view(batch_size * self.self_attn.num_heads, seq_length, self.self_attn.head_dim)
        key_states = key_states.view(*value_states.shape)
        key_states = key_states.permute(0, 2, 1)
        return (key_states, value_states)