Update modeling_quiet.py
modeling_quiet.py  CHANGED: +40 -6
@@ -1246,6 +1246,9 @@ class QuietForCausalLM(QuietPreTrainedModel):
 
         self.policy_loss_beta = 1e6
         self.embedding_scale = 1e2
+        self.temperature = nn.Parameter(torch.tensor(1.0))
+        self.max_temperature = config.max_temperature
+        self.complexity_factor = config.complexity_factor
         self.reinforce_temperature = 3
         self.base_loss_beta = 1
         self.thinking_usefulness_head = nn.Linear(self.model.config.hidden_size, 1)
@@ -1626,16 +1629,20 @@ class QuietForCausalLM(QuietPreTrainedModel):
         sample_probs_history = []
         action_loglikelihoods_list = []
 
+        complexity_scores = self.compute_complexity_scores(input_ids, attention_mask)
+        temperature = self.temperature * complexity_scores.unsqueeze(-1)
+
         if self.use_end_thought_token or self.use_start_thought_token:
             if not self.use_reparam_for_thought_embeddings:
-                start_embedding = self.start_embedding[0].unsqueeze(0) * self.embedding_scale
-                end_embedding = self.end_embedding[0].unsqueeze(0) * self.embedding_scale
+                start_embedding = self.start_embedding[0].unsqueeze(0) * self.embedding_scale * temperature
+                end_embedding = self.end_embedding[0].unsqueeze(0) * self.embedding_scale * temperature
             else:
-                start_embedding = self.start_embedding * self.embedding_scale
-                end_embedding = self.end_embedding * self.embedding_scale
+                start_embedding = self.start_embedding * self.embedding_scale * temperature
+                end_embedding = self.end_embedding * self.embedding_scale * temperature
             base_embeddings = self.model.embed_tokens.weight
             if self.train_only_thinking_embedding:
                 base_embeddings = base_embeddings.detach()
+
         # # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
         fwd_iters = 1 if self.original_mode else self.n_ahead + self.n_ahead_talk - 1
         for ahead_idx in range(fwd_iters):
@@ -1900,9 +1907,10 @@ class QuietForCausalLM(QuietPreTrainedModel):
                 contains_end = self.use_end_thought_token and (probabilities_2d[..., self.end_token_id].sum() > 0)
                 contains_thought = contains_start or contains_end
 
+
                 if not contains_thought:
                     with torch.set_grad_enabled(not self.train_only_thinking_embedding):
-                        inputs_embeds = probabilities_2d @ (self.model.embed_tokens.weight.to(probabilities.device).to(probabilities.dtype))
+                        inputs_embeds = probabilities_2d @ (self.model.embed_tokens.weight.to(probabilities.device).to(probabilities.dtype) * temperature)
                 else:
                     thought_id = self.start_token_id if contains_start else self.end_token_id
                     cur_thought_embedding = start_embedding if contains_start else end_embedding
@@ -1915,7 +1923,7 @@ class QuietForCausalLM(QuietPreTrainedModel):
                             sampled_end = inputs_embeds.clone().detach()
                     else:
                         inputs_embeds = cur_thought_embedding.unsqueeze(0).repeat(batch_size, seq_len, 1)
-
+                inputs_embeds = inputs_embeds.view(probabilities.size(0), probabilities.size(1), -1).to(self.model.embed_tokens.weight.dtype)
                 inputs_embeds = inputs_embeds.view(probabilities.size(0), probabilities.size(1), -1).to(self.model.embed_tokens.weight.dtype)
 
                 # Predict the usefulness of thinking at each token position
@@ -2127,6 +2135,32 @@ class QuietForCausalLM(QuietPreTrainedModel):
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
+
+    def compute_complexity_scores(self, input_ids, attention_mask):
+        # Compute complexity scores based on input sequence characteristics
+        # Example: Normalize sequence lengths and consider the presence of rare tokens
+        seq_lengths = torch.sum(attention_mask, dim=-1)
+        max_length = torch.max(seq_lengths)
+        length_scores = seq_lengths / max_length
+
+        # Compute the proportion of rare tokens in each sequence
+        rare_token_ids = self.get_rare_token_ids()
+        rare_token_mask = torch.isin(input_ids, rare_token_ids)
+        rare_token_counts = torch.sum(rare_token_mask, dim=-1)
+        rare_token_scores = rare_token_counts / seq_lengths
+
+        # Combine length scores and rare token scores
+        complexity_scores = self.complexity_factor * length_scores + (1 - self.complexity_factor) * rare_token_scores
+        return complexity_scores
+
+    def get_rare_token_ids(self):
+        # Get the IDs of rare tokens based on a predefined frequency threshold
+        frequency_threshold = 1e-4
+        token_counts = torch.bincount(self.model.embed_tokens.weight.argmax(dim=-1))
+        total_tokens = torch.sum(token_counts)
+        rare_token_mask = token_counts / total_tokens < frequency_threshold
+        rare_token_ids = torch.nonzero(rare_token_mask).squeeze(-1)
+        return rare_token_ids
 
 
     def prepare_inputs_for_generation(
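
The change introduces a per-sequence temperature for the thought embeddings: the learned scalar self.temperature is multiplied by a heuristic complexity score for each input sequence, and the result scales the start/end-of-thought embeddings and the soft token-embedding lookup. Note that config.max_temperature and config.complexity_factor are read in the constructor but are not added to the config class in this diff. The sketch below reproduces the complexity scoring and the scaling step outside the model on toy tensors; it is a minimal illustration assuming a hand-picked rare-token set and a hidden size of 8, not the exact code path above.

import torch
import torch.nn as nn

def compute_complexity_scores(input_ids, attention_mask, rare_token_ids, complexity_factor=0.5):
    # Length term: each sequence's length normalized by the longest in the batch.
    seq_lengths = attention_mask.sum(dim=-1)                    # (batch,)
    length_scores = seq_lengths / seq_lengths.max()
    # Rarity term: fraction of each sequence's tokens that fall in the rare set.
    rare_mask = torch.isin(input_ids, rare_token_ids)           # (batch, seq)
    rare_scores = rare_mask.sum(dim=-1) / seq_lengths
    # Same blend as the patch: complexity_factor weights length vs. rarity.
    return complexity_factor * length_scores + (1 - complexity_factor) * rare_scores

# Toy batch of two sequences; the second is padded.
input_ids = torch.tensor([[5, 17, 903, 42, 7],
                          [5, 6, 7, 0, 0]])
attention_mask = torch.tensor([[1, 1, 1, 1, 1],
                               [1, 1, 1, 0, 0]])
rare_token_ids = torch.tensor([903, 904])                       # hand-picked "rare" ids for the demo

scores = compute_complexity_scores(input_ids, attention_mask, rare_token_ids)

# Per-sequence temperature, as in the patch: learned scalar * complexity score.
temperature_param = nn.Parameter(torch.tensor(1.0))             # mirrors self.temperature
temperature = temperature_param * scores.unsqueeze(-1)          # (batch, 1)

# A stand-in for a learned start-of-thought embedding (hidden size 8).
start_embedding = torch.randn(1, 8) * 1e2                       # embedding_scale = 1e2
scaled_start = start_embedding * temperature                    # broadcasts to (batch, 8)
print(scores, temperature.shape, scaled_start.shape)

Because temperature has shape (batch_size, 1), the scaled thought embeddings pick up a batch dimension, so each sequence gets its own effective embedding magnitude rather than a single shared one.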
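The new compute_complexity_scores helper depends on get_rare_token_ids, which, lacking corpus statistics, derives a "rare" set from the embedding matrix itself: it takes the argmax of each embedding row over the hidden dimension, bincounts those argmax indices, and flags indices whose relative frequency falls below 1e-4. As written, the returned indices therefore lie in [0, hidden_size) rather than ranging over the full vocabulary. The sketch below is a standalone re-implementation of that heuristic on a small random matrix so its behaviour can be inspected; the matrix shape and threshold are illustrative only.

import torch

def get_rare_token_ids(embedding_weight, frequency_threshold=1e-4):
    # Per-row argmax over the hidden dimension, then a frequency count of
    # those argmax indices, mirroring the patch.
    token_counts = torch.bincount(embedding_weight.argmax(dim=-1))
    total_tokens = token_counts.sum()
    rare_mask = token_counts / total_tokens < frequency_threshold
    return torch.nonzero(rare_mask).squeeze(-1)

# Tiny stand-in for embed_tokens.weight: vocab_size=1000, hidden_size=16.
# Biasing the first column makes most rows argmax to index 0, so the other
# argmax indices become "rare" under the threshold.
weight = torch.randn(1000, 16)
weight[:, 0] += 4.0
rare_ids = get_rare_token_ids(weight, frequency_threshold=0.01)
print(rare_ids)          # indices in [0, 16), i.e. bounded by the hidden size

If actual token rarity is the goal, the same interface could instead be driven by vocabulary counts gathered from a tokenizer or training corpus; the diff keeps the heuristic self-contained inside the model.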