@@ -19,6 +19,7 @@ from transformers import (
 from manifest.api.models.model import Model
 
 MODEL_REGISTRY = {
     "EleutherAI/gpt-neo-125M": GPTNeoForCausalLM,
     "EleutherAI/gpt-neo-1.3B": GPTNeoForCausalLM,
     "EleutherAI/gpt-neo-2.7B": GPTNeoForCausalLM,
+    "EleutherAI/gpt-j-6B": GPTJForCausalLM,
@@ -303,44 +304,105 @@ class HuggingFaceModel(Model):
             the returned gold choice
         """
         max_input_len = self.pipeline.max_length
-        # Adapted from https://github.com/bigscience-workshop/t-zero
-        tokenized_inputs = self.pipeline.tokenizer(
-            prompt,
-            padding="longest",
-            max_length=max_input_len,
-            truncation=True,
-            add_special_tokens=False,
-        )
-        # Get max target length
-        max_target_len = max(
-            [
-                len(self.pipeline.tokenizer(ans_choi)["input_ids"])
+        if self.is_encdec:
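+            # Encoder-decoder scoring: encode the prompt once and rank each
+            # gold choice as a decoder target.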
+            # Adapted from https://github.com/bigscience-workshop/t-zero
+            tokenized_inputs = self.pipeline.tokenizer(
+                prompt,
+                padding="longest",
+                max_length=max_input_len,
+                truncation=True,
+                add_special_tokens=False,
+            )
+            # Get max target length
+            max_target_len = max(
+                [
+                    len(self.pipeline.tokenizer(ans_choi)["input_ids"])
+                    for ans_choi in gold_choices
+                ]
+            )
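+            # Right-pad each choice to the longest target length so all
+            # choices can be batched together.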
+            tokenized_targets = [
+                self.pipeline.tokenizer(
+                    ans_choi,
+                    # padding is on the right here.
+                    padding="max_length",
+                    max_length=min(max_target_len, max_input_len),
+                    truncation=True,
+                )
                 for ans_choi in gold_choices
             ]
-        )
-        tokenized_targets = [
-            self.pipeline.tokenizer(
-                ans_choi,
-                # padding is on the right here.
-                padding="max_length",
-                max_length=min(max_target_len, max_input_len),
+            # Repeat input ids for each choice to form a batch
+            features = {
+                k: [tokenized_inputs[k] for _ in range(len(gold_choices))]
+                for k in tokenized_inputs.keys()
+            }
+            # Add choice tokens + mask
+            features["labels"] = [
+                tokenized_targets[k]["input_ids"] for k in range(len(gold_choices))
+            ]
+            features["labels_attention_mask"] = [
+                tokenized_targets[k]["attention_mask"] for k in range(len(gold_choices))
+            ]
+        else:
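+            # Decoder-only models: append each choice to the prompt and score
+            # the choice tokens in place.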
+            tokenized_inputs = self.pipeline.tokenizer(
+                prompt,
+                max_length=max_input_len,
                 truncation=True,
+                add_special_tokens=False,
             )
-            for ans_choi in gold_choices
-        ]
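+            # Tokenize each choice on its own; it is concatenated onto the
+            # prompt ids below.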
+            tokenized_targets = [
+                self.pipeline.tokenizer(
+                    # Add starting whitespace for gpt
+                    ans_choi if ans_choi.startswith((" ", "\n")) else f" {ans_choi}",
+                    max_length=max_input_len,
+                    truncation=True,
+                )
+                for ans_choi in gold_choices
+            ]
+            features = {
+                k: [] for k in list(tokenized_inputs.keys()) + ["labels_attention_mask"]
+            }
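+            # Concatenate prompt and choice ids per feature key, tracking the
+            # longest concatenated sequence.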
+            max_effective_input_len = 0
+            for tokenized_targ in tokenized_targets:
+                for k in tokenized_inputs.keys():
+                    # Make sure to leave room for the outputs
+                    features[k].append(
+                        tokenized_inputs[k][
+                            : min(
+                                len(tokenized_inputs[k]),
+                                max_input_len - len(tokenized_targ[k]),
+                            )
+                        ]
+                        + tokenized_targ[k]
+                    )
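+                    # Track the longest prompt+choice sequence for the manual
+                    # padding step below.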
+                    max_effective_input_len = max(
+                        max_effective_input_len, len(features[k][-1])
+                    )
+                # Manually add labels_attention_mask
+                features["labels_attention_mask"].append(
+                    [0]
+                    * min(
+                        len(tokenized_inputs["input_ids"]),
+                        max_input_len - len(tokenized_targ["input_ids"]),
+                    )
+                    + [1] * len(tokenized_targ["input_ids"])
+                )
-        # Repeat input ids for each choice to form a batch
-        features = {
-            k: [tokenized_inputs[k] for _ in range(len(gold_choices))]
-            for k in tokenized_inputs.keys()
-        }
-        # Add choice tokens + mask
-        features["labels"] = [
-            tokenized_targets[k]["input_ids"] for k in range(len(gold_choices))
-        ]
-        features["labels_attention_mask"] = [
-            tokenized_targets[k]["attention_mask"] for k in range(len(gold_choices))
-        ]
+            # Manually pad to max effective length
+            for k in features.keys():
+                for i in range(len(features[k])):
+                    if k == "input_ids":
+                        features[k][i] += [self.pipeline.tokenizer.pad_token_id] * (
+                            max_effective_input_len - len(features[k][i])
+                        )
+                    elif k in ["attention_mask", "labels_attention_mask"]:
+                        features[k][i] += [0] * (
+                            max_effective_input_len - len(features[k][i])
+                        )
+                    else:
+                        raise ValueError(f"Unknown key {k} for decoder only models")
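+            # Labels are the full prompt+choice ids; labels_attention_mask
+            # marks which positions belong to the choice.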
+            features["labels"] = features["input_ids"]
         # Convert to tensors
         tensor_features = {}
         for k in features:
@@ -356,6 +418,7 @@ class HuggingFaceModel(Model):
         ]
         stacked_logits = torch.vstack(logits)
+        # Compute most likely option
         # Adapted from https://github.com/bigscience-workshop/t-zero
         masked_log_probs = tensor_features["labels_attention_mask"].unsqueeze(
             -1
         ) * torch.log_softmax(stacked_logits, dim=-1)