cyrilvallez HF Staff committed on
Commit
30636bc
·
verified ·
1 Parent(s): a4fcf5b

Update custom_generate/generate.py

Browse files
Files changed (1) hide show
  1. custom_generate/generate.py +19 -34
custom_generate/generate.py CHANGED
@@ -325,25 +325,18 @@ def _contrastive_search(
325
 
326
  if not sequential:
327
  # Replicates the new past_key_values to match the `top_k` candidates
328
- past = model_kwargs["past_key_values"]
329
- # If it is a static cache, modify it in-place layer after layer to save memory
330
- if isinstance(past, DynamicCache) or (
331
- isinstance(past, EncoderDecoderCache)
332
- and isinstance(past.self_attention_cache, DynamicCache)
333
- ):
334
- past.batch_repeat_interleave(top_k)
335
  else:
336
- new_key_values = []
337
- for layer in past:
338
- items = []
339
- # item is either the key or the value matrix
340
- for item in layer:
341
- items.append(item.repeat_interleave(top_k, dim=0))
342
- new_key_values.append(tuple(items))
343
-
344
- past = tuple(new_key_values)
345
-
346
- model_kwargs["past_key_values"] = past
347
 
348
  if sequential:
349
  all_outputs = []
@@ -477,15 +470,10 @@ def _contrastive_search(
477
  ):
478
  next_past_key_values.batch_select_indices(augmented_idx)
479
  else:
480
- new_key_values = []
481
- for layer in next_past_key_values:
482
- items = []
483
- # item is either the key or the value matrix
484
- for item in layer:
485
- items.append(item[augmented_idx, ...])
486
- new_key_values.append(tuple(items))
487
-
488
- next_past_key_values = tuple(new_key_values)
489
 
490
  logit_for_next_step = torch.stack(torch.split(logits, top_k))[
491
  range(batch_size), selected_idx, :
@@ -569,13 +557,10 @@ def _contrastive_search(
569
  ):
570
  model_kwargs["past_key_values"].crop(-1)
571
  else:
572
- past_key_values = []
573
- for layer in model_kwargs["past_key_values"]:
574
- layer_past_key_values = []
575
- for item in layer:
576
- layer_past_key_values.append(item[..., :-1, :])
577
- past_key_values.append(tuple(layer_past_key_values))
578
- model_kwargs["past_key_values"] = tuple(past_key_values)
579
 
580
  if model.config.is_encoder_decoder:
581
  return GenerateEncoderDecoderOutput(
 
325
 
326
  if not sequential:
327
  # Replicates the new past_key_values to match the `top_k` candidates
328
+ if isinstance(outputs["past_key_values"], DynamicCache) or (
329
+ isinstance(outputs["past_key_values"], EncoderDecoderCache)
330
+ and isinstance(
331
+ outputs["past_key_values"].self_attention_cache, DynamicCache
332
+ )
333
+ ):
334
+ model_kwargs["past_key_values"] = model_kwargs["past_key_values"].batch_repeat_interleave(top_k)
335
  else:
336
+ raise ValueError(
337
+ f"Unsupported cache type: {type(outputs['past_key_values'])}. Contrastive search requires "
338
+ "dynamic cache, so set `cache_implementation='dynamic'` in the generation config."
339
+ )
 
 
 
 
 
 
 
340
 
341
  if sequential:
342
  all_outputs = []
 
470
  ):
471
  next_past_key_values.batch_select_indices(augmented_idx)
472
  else:
473
+ raise ValueError(
474
+ f"Unsupported cache type: {type(outputs['past_key_values'])}. Contrastive search requires "
475
+ "dynamic cache, so set `cache_implementation='dynamic'` in the generation config."
476
+ )
 
 
 
 
 
477
 
478
  logit_for_next_step = torch.stack(torch.split(logits, top_k))[
479
  range(batch_size), selected_idx, :
 
557
  ):
558
  model_kwargs["past_key_values"].crop(-1)
559
  else:
560
+ raise ValueError(
561
+ f"Unsupported cache type: {type(outputs['past_key_values'])}. Contrastive search requires "
562
+ "dynamic cache, so set `cache_implementation='dynamic'` in the generation config."
563
+ )
 
 
 
564
 
565
  if model.config.is_encoder_decoder:
566
  return GenerateEncoderDecoderOutput(