DeepXR
/

Helion-OSC

@@ -1,142 +1,580 @@
 """
 Helion-OSC Inference Script
 DeepXR/Helion-OSC - Mathematical Coding Language Model
 """
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from typing import Optional, Dict, Any
 class HelionOSCInference:
-    """Inference wrapper for Helion-OSC model"""
-    # The tokenizer and model are loaded once in __init__; generate() is the
-    # single entry point and the two task helpers below only pin its settings.
     def __init__(
         self,
         model_name: str = "DeepXR/Helion-OSC",
         device: Optional[str] = None,
-        load_in_8bit: bool = False
     ):
         """
         Initialize the Helion-OSC model
         Args:
             model_name: HuggingFace model identifier
-            device: Device to load model on (cuda/cpu)
-            load_in_8bit: Whether to load model in 8-bit precision
         """
-        # No explicit device: use CUDA when torch reports it, otherwise CPU.
-        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
-        print(f"Loading Helion-OSC on {self.device}...")
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-        # device_map="auto" is requested only on CUDA; on CPU the model is moved
-        # explicitly after loading (see below).
-        model_kwargs = {"device_map": "auto"} if self.device == "cuda" else {}
-        if load_in_8bit:
             model_kwargs["load_in_8bit"] = True
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            torch_dtype=torch.bfloat16 if self.device == "cuda" else torch.float32,
             **model_kwargs
         )
-        # NOTE(review): a device other than "cuda"/"cpu" gets neither a
-        # device_map nor a .to() call, so the model stays wherever
-        # from_pretrained put it -- confirm this is intended.
-        if self.device == "cpu":
-            self.model = self.model.to(self.device)
-        self.model.eval()
-        print("Model loaded successfully!")
     def generate(
         self,
-        prompt: str,
-        max_length: int = 512,
-        temperature: float = 0.7,
-        top_p: float = 0.95,
-        top_k: int = 50,
-        num_return_sequences: int = 1,
-        do_sample: bool = True,
         **kwargs
-    ) -> str:
         """
-        Generate code or text based on prompt
         Args:
-            prompt: Input prompt
-            max_length: Maximum length of generated text
-            temperature: Sampling temperature
-            top_p: Nucleus sampling parameter
-            top_k: Top-k sampling parameter
-            num_return_sequences: Number of sequences to generate
-            do_sample: Whether to use sampling
-            **kwargs: Extra keyword arguments forwarded to model.generate
         Returns:
-            Generated text
         """
-        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
         with torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
-                max_length=max_length,
-                temperature=temperature,
-                top_p=top_p,
-                top_k=top_k,
-                num_return_sequences=num_return_sequences,
-                do_sample=do_sample,
-                # The EOS token id doubles as the padding token id.
-                pad_token_id=self.tokenizer.eos_token_id,
-                **kwargs
             )
-        # NOTE(review): only outputs[0] is decoded, so any additional sequences
-        # requested through num_return_sequences are dropped. The decoded text
-        # presumably still starts with the prompt -- confirm against the
-        # model.generate output format.
-        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return generated_text
-    def code_generation(self, prompt: str, max_length: int = 1024) -> str:
-        """Optimized for code generation tasks"""
-        # Sampling preset: temperature 0.7, top_p 0.95.
         return self.generate(
             prompt,
             max_length=max_length,
-            temperature=0.7,
-            top_p=0.95,
-            do_sample=True
         )
-    def mathematical_reasoning(self, prompt: str, max_length: int = 512) -> str:
-        """Optimized for mathematical reasoning tasks"""
-        # NOTE(review): do_sample=False is passed together with temperature and
-        # top_p; presumably the sampling knobs are ignored in that mode -- confirm.
         return self.generate(
             prompt,
             max_length=max_length,
-            temperature=0.3,
-            top_p=0.9,
-            do_sample=False
         )
 def main():
-    """Example usage"""
-    # Builds one wrapper with default arguments, then runs three demo prompts
-    # and prints each prompt followed by the returned text.
     # Initialize model
-    helion = HelionOSCInference()
-    # Example 1: Code generation
-    code_prompt = "Write a Python function to calculate the factorial of a number using recursion:"
-    print("\n=== Code Generation ===")
-    print(f"Prompt: {code_prompt}")
-    result = helion.code_generation(code_prompt)
-    print(f"Output:\n{result}\n")
-    # Example 2: Mathematical reasoning
-    math_prompt = "Prove that the sum of first n natural numbers is n(n+1)/2:"
-    print("\n=== Mathematical Reasoning ===")
-    print(f"Prompt: {math_prompt}")
     result = helion.mathematical_reasoning(math_prompt)
-    print(f"Output:\n{result}\n")
-    # Example 3: Algorithm design
-    # Uses generate() directly with a longer max_length instead of a task helper.
-    algo_prompt = "Design an efficient algorithm to find the longest palindromic substring:"
-    print("\n=== Algorithm Design ===")
-    print(f"Prompt: {algo_prompt}")
-    result = helion.generate(algo_prompt, max_length=1024)
-    print(f"Output:\n{result}\n")
 if __name__ == "__main__":

 """
 Helion-OSC Inference Script
 DeepXR/Helion-OSC - Mathematical Coding Language Model
+This module provides comprehensive inference capabilities for the Helion-OSC model,
+including specialized methods for different programming and mathematical tasks.
 """
 import torch
+import json
+import logging
+from typing import Optional, Dict, Any, List, Union
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    GenerationConfig,
+    StoppingCriteria,
+    StoppingCriteriaList
+)
+from dataclasses import dataclass
+import warnings
+# Configure logging
+# Executed at import time: requests INFO level and a
+# "timestamp - logger name - level - message" line format.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+# Module-level logger used by the classes below.
+logger = logging.getLogger(__name__)
+@dataclass
+class GenerationParameters:
+    """Decoding settings for one generation call; the defaults apply when a task has no preset"""
+    # Length limit passed to model.generate as max_length.
+    max_length: int = 2048
+    # Sampling controls.
+    temperature: float = 0.7
+    top_p: float = 0.95
+    top_k: int = 50
+    # Penalties.
+    repetition_penalty: float = 1.05
+    length_penalty: float = 1.0
+    # Decoding mode and number of sequences requested per prompt.
+    do_sample: bool = True
+    num_return_sequences: int = 1
+    early_stopping: bool = False
+class CodeStoppingCriteria(StoppingCriteria):
+    """Custom stopping criteria for code generation"""
+    def __init__(self, stop_sequences: List[str], tokenizer):
+        self.stop_sequences = stop_sequences
+        self.tokenizer = tokenizer
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        decoded = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
+        return any(seq in decoded for seq in self.stop_sequences)
 class HelionOSCInference:
+    """
+    Comprehensive inference wrapper for Helion-OSC model
+    Supports multiple generation modes:
+    - Code generation
+    - Mathematical reasoning
+    - Algorithm design
+    - Code debugging
+    - Documentation generation
+    """
     def __init__(
         self,
         model_name: str = "DeepXR/Helion-OSC",
         device: Optional[str] = None,
+        load_in_8bit: bool = False,
+        load_in_4bit: bool = False,
+        use_flash_attention: bool = True,
+        trust_remote_code: bool = True
     ):
         """
         Initialize the Helion-OSC model
         Args:
             model_name: HuggingFace model identifier
+            device: Device to load model on (cuda/cpu/mps)
+            load_in_8bit: Load model in 8-bit precision
+            load_in_4bit: Load model in 4-bit precision
+            use_flash_attention: Use flash attention for faster inference
+            trust_remote_code: Trust remote code from model repository
         """
+        self.model_name = model_name
+        self.device = self._get_device(device)
+        self.load_in_8bit = load_in_8bit
+        self.load_in_4bit = load_in_4bit
+        logger.info(f"Initializing Helion-OSC on {self.device}...")
+        # Load tokenizer
+        self.tokenizer = self._load_tokenizer(trust_remote_code)
+        # Load model
+        self.model = self._load_model(
+            use_flash_attention=use_flash_attention,
+            trust_remote_code=trust_remote_code
+        )
+        # Load generation configs
+        self.generation_configs = self._load_generation_configs()
+        logger.info("Model loaded successfully!")
+        self._print_model_info()
+    def _get_device(self, device: Optional[str]) -> str:
+        """Determine the best available device"""
+        if device:
+            return device
+        if torch.cuda.is_available():
+            return "cuda"
+        elif torch.backends.mps.is_available():
+            return "mps"
+        return "cpu"
+    def _load_tokenizer(self, trust_remote_code: bool):
+        """Load and configure tokenizer"""
+        logger.info("Loading tokenizer...")
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.model_name,
+            trust_remote_code=trust_remote_code,
+            padding_side="left"
+        )
+        # Ensure pad token is set
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+        return tokenizer
+    def _load_model(self, use_flash_attention: bool, trust_remote_code: bool):
+        """Load and configure model"""
+        logger.info("Loading model...")
+        model_kwargs = {
+            "trust_remote_code": trust_remote_code,
+            "low_cpu_mem_usage": True
+        }
+        # Configure precision and quantization
+        if self.load_in_8bit:
             model_kwargs["load_in_8bit"] = True
+            logger.info("Loading in 8-bit precision")
+        elif self.load_in_4bit:
+            model_kwargs["load_in_4bit"] = True
+            model_kwargs["bnb_4bit_compute_dtype"] = torch.bfloat16
+            model_kwargs["bnb_4bit_use_double_quant"] = True
+            model_kwargs["bnb_4bit_quant_type"] = "nf4"
+            logger.info("Loading in 4-bit precision")
+        else:
+            if self.device == "cuda":
+                model_kwargs["torch_dtype"] = torch.bfloat16
+            else:
+                model_kwargs["torch_dtype"] = torch.float32
+        # Configure device mapping
+        if self.device == "cuda" and not (self.load_in_8bit or self.load_in_4bit):
+            model_kwargs["device_map"] = "auto"
+        # Load model
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
             **model_kwargs
         )
+        # Move to device if needed
+        if self.device != "cuda" or (self.load_in_8bit or self.load_in_4bit):
+            if not (self.load_in_8bit or self.load_in_4bit):
+                model = model.to(self.device)
+        model.eval()
+        # Enable gradient checkpointing for memory efficiency if needed
+        if hasattr(model, 'gradient_checkpointing_enable'):
+            model.gradient_checkpointing_enable()
+        return model
+    def _load_generation_configs(self) -> Dict[str, GenerationParameters]:
+        """Load task-specific generation configurations"""
+        return {
+            "code_generation": GenerationParameters(
+                max_length=4096,
+                temperature=0.7,
+                top_p=0.95,
+                top_k=50,
+                repetition_penalty=1.05,
+                do_sample=True
+            ),
+            "mathematical_reasoning": GenerationParameters(
+                max_length=2048,
+                temperature=0.3,
+                top_p=0.9,
+                top_k=40,
+                repetition_penalty=1.0,
+                do_sample=False
+            ),
+            "code_completion": GenerationParameters(
+                max_length=1024,
+                temperature=0.6,
+                top_p=0.92,
+                top_k=45,
+                repetition_penalty=1.03,
+                do_sample=True
+            ),
+            "algorithm_design": GenerationParameters(
+                max_length=3072,
+                temperature=0.5,
+                top_p=0.93,
+                top_k=50,
+                repetition_penalty=1.08,
+                do_sample=True
+            ),
+            "debugging": GenerationParameters(
+                max_length=2048,
+                temperature=0.4,
+                top_p=0.88,
+                repetition_penalty=1.0,
+                do_sample=False
+            )
+        }
+    def _print_model_info(self):
+        """Print model information"""
+        try:
+            num_params = sum(p.numel() for p in self.model.parameters())
+            logger.info(f"Model parameters: {num_params:,}")
+            logger.info(f"Model dtype: {next(self.model.parameters()).dtype}")
+            logger.info(f"Device: {self.device}")
+        except Exception as e:
+            logger.warning(f"Could not get model info: {e}")
     def generate(
         self,
+        prompt: Union[str, List[str]],
+        task_type: str = "code_generation",
+        custom_params: Optional[GenerationParameters] = None,
+        stop_sequences: Optional[List[str]] = None,
+        return_full_text: bool = False,
         **kwargs
+    ) -> Union[str, List[str]]:
         """
+        Generate text based on prompt
         Args:
+            prompt: Input prompt or list of prompts
+            task_type: Type of task (code_generation, mathematical_reasoning, etc.)
+            custom_params: Custom generation parameters
+            stop_sequences: List of sequences to stop generation
+            return_full_text: Whether to return full text including prompt
+            **kwargs: Additional generation parameters
         Returns:
+            Generated text or list of generated texts
         """
+        # Get generation parameters
+        if custom_params:
+            params = custom_params
+        elif task_type in self.generation_configs:
+            params = self.generation_configs[task_type]
+        else:
+            logger.warning(f"Unknown task type '{task_type}', using default parameters")
+            params = GenerationParameters()
+        # Override with kwargs
+        for key, value in kwargs.items():
+            if hasattr(params, key):
+                setattr(params, key, value)
+        # Tokenize input
+        is_batch = isinstance(prompt, list)
+        inputs = self.tokenizer(
+            prompt,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=self.model.config.max_position_embeddings
+        ).to(self.device)
+        # Setup stopping criteria
+        stopping_criteria = None
+        if stop_sequences:
+            stopping_criteria = StoppingCriteriaList([
+                CodeStoppingCriteria(stop_sequences, self.tokenizer)
+            ])
+        # Generate
         with torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
+                max_length=params.max_length,
+                temperature=params.temperature,
+                top_p=params.top_p,
+                top_k=params.top_k,
+                repetition_penalty=params.repetition_penalty,
+                length_penalty=params.length_penalty,
+                do_sample=params.do_sample,
+                num_return_sequences=params.num_return_sequences,
+                early_stopping=params.early_stopping,
+                pad_token_id=self.tokenizer.pad_token_id,
+                eos_token_id=self.tokenizer.eos_token_id,
+                stopping_criteria=stopping_criteria
             )
+        # Decode outputs
+        generated_texts = []
+        for output in outputs:
+            text = self.tokenizer.decode(output, skip_special_tokens=True)
+            if not return_full_text and not is_batch:
+                # Remove prompt from single generation
+                if isinstance(prompt, str):
+                    text = text[len(prompt):].strip()
+            generated_texts.append(text)
+        return generated_texts if is_batch or params.num_return_sequences > 1 else generated_texts[0]
+    def code_generation(
+        self,
+        prompt: str,
+        language: Optional[str] = None,
+        max_length: int = 4096,
+        **kwargs
+    ) -> str:
+        """
+        Generate code for a given prompt
+        Args:
+            prompt: Code generation prompt
+            language: Programming language (optional)
+            max_length: Maximum length of generated code
+            **kwargs: Additional generation parameters
+        Returns:
+            Generated code
+        """
+        if language:
+            prompt = f"Language: {language}\n{prompt}"
+        return self.generate(
+            prompt,
+            task_type="code_generation",
+            max_length=max_length,
+            **kwargs
+        )
+    def mathematical_reasoning(
+        self,
+        prompt: str,
+        max_length: int = 2048,
+        **kwargs
+    ) -> str:
+        """
+        Solve mathematical problems with step-by-step reasoning
+        Args:
+            prompt: Mathematical problem
+            max_length: Maximum length of solution
+            **kwargs: Additional generation parameters
+        Returns:
+            Mathematical solution with reasoning
+        """
+        return self.generate(
+            prompt,
+            task_type="mathematical_reasoning",
+            max_length=max_length,
+            **kwargs
+        )
+    def algorithm_design(
+        self,
+        prompt: str,
+        include_complexity: bool = True,
+        max_length: int = 3072,
+        **kwargs
+    ) -> str:
+        """
+        Design algorithms with complexity analysis
+        Args:
+            prompt: Algorithm design prompt
+            include_complexity: Whether to include complexity analysis
+            max_length: Maximum length of output
+            **kwargs: Additional generation parameters
+        Returns:
+            Algorithm design with analysis
+        """
+        if include_complexity:
+            prompt += "\n\nPlease include time and space complexity analysis."
         return self.generate(
             prompt,
+            task_type="algorithm_design",
             max_length=max_length,
+            **kwargs
         )
+    def debug_code(
+        self,
+        code: str,
+        error_message: Optional[str] = None,
+        max_length: int = 2048,
+        **kwargs
+    ) -> str:
+        """
+        Debug code and provide fixes
+        Args:
+            code: Code to debug
+            error_message: Optional error message
+            max_length: Maximum length of output
+            **kwargs: Additional generation parameters
+        Returns:
+            Debugging analysis and fixes
+        """
+        prompt = f"Debug the following code:\n\n```\n{code}\n```"
+        if error_message:
+            prompt += f"\n\nError message: {error_message}"
+        prompt += "\n\nProvide a detailed explanation and fixed code."
         return self.generate(
             prompt,
+            task_type="debugging",
+            max_length=max_length,
+            **kwargs
+        )
+    def complete_code(
+        self,
+        code_context: str,
+        max_length: int = 1024,
+        **kwargs
+    ) -> str:
+        """
+        Complete partial code
+        Args:
+            code_context: Partial code to complete
+            max_length: Maximum length of completion
+            **kwargs: Additional generation parameters
+        Returns:
+            Code completion
+        """
+        return self.generate(
+            code_context,
+            task_type="code_completion",
             max_length=max_length,
+            stop_sequences=["\n\n", "```", "###"],
+            **kwargs
         )
+    def batch_generate(
+        self,
+        prompts: List[str],
+        task_type: str = "code_generation",
+        batch_size: int = 4,
+        **kwargs
+    ) -> List[str]:
+        """
+        Generate responses for multiple prompts in batches
+        Args:
+            prompts: List of prompts
+            task_type: Type of task
+            batch_size: Batch size for processing
+            **kwargs: Additional generation parameters
+        Returns:
+            List of generated responses
+        """
+        results = []
+        for i in range(0, len(prompts), batch_size):
+            batch = prompts[i:i + batch_size]
+            batch_results = self.generate(batch, task_type=task_type, **kwargs)
+            if isinstance(batch_results, str):
+                batch_results = [batch_results]
+            results.extend(batch_results)
+        return results
 def main():
+    """Example usage and demonstrations"""
+    print("=" * 80)
+    print("Helion-OSC Inference Examples")
+    print("=" * 80)
     # Initialize model
+    helion = HelionOSCInference(
+        load_in_8bit=False,  # Set to True for lower memory usage
+        load_in_4bit=False   # Set to True for even lower memory usage
+    )
+    # Example 1: Code Generation
+    print("\n" + "=" * 80)
+    print("Example 1: Code Generation")
+    print("=" * 80)
+    code_prompt = """Write a Python function to implement a binary search tree with the following methods:
+- insert(value): Insert a new value
+- search(value): Search for a value
+- delete(value): Delete a value
+- inorder_traversal(): Return inorder traversal
+Include proper documentation and type hints."""
+    print(f"\nPrompt:\n{code_prompt}")
+    print("\nGenerating...")
+    result = helion.code_generation(code_prompt, language="python")
+    print(f"\nGenerated Code:\n{result}")
+    # Example 2: Mathematical Reasoning
+    print("\n" + "=" * 80)
+    print("Example 2: Mathematical Reasoning")
+    print("=" * 80)
+    math_prompt = """Prove that the sum of the first n natural numbers equals n(n+1)/2 using mathematical induction."""
+    print(f"\nPrompt:\n{math_prompt}")
+    print("\nGenerating...")
     result = helion.mathematical_reasoning(math_prompt)
+    print(f"\nSolution:\n{result}")
+    # Example 3: Algorithm Design
+    print("\n" + "=" * 80)
+    print("Example 3: Algorithm Design")
+    print("=" * 80)
+    algo_prompt = """Design an efficient algorithm to find the longest palindromic substring in a given string."""
+    print(f"\nPrompt:\n{algo_prompt}")
+    print("\nGenerating...")
+    result = helion.algorithm_design(algo_prompt, include_complexity=True)
+    print(f"\nAlgorithm:\n{result}")
+    # Example 4: Code Debugging
+    print("\n" + "=" * 80)
+    print("Example 4: Code Debugging")
+    print("=" * 80)
+    buggy_code = """
+def fibonacci(n):
+    if n <= 1:
+        return n
+    return fibonacci(n-1) + fibonacci(n-2)
+# This is too slow for large n
+result = fibonacci(100)
+"""
+    print(f"\nBuggy Code:\n{buggy_code}")
+    print("\nGenerating debugging analysis...")
+    result = helion.debug_code(buggy_code, error_message="Takes too long to compute")
+    print(f"\nDebug Analysis:\n{result}")
+    # Example 5: Batch Processing
+    print("\n" + "=" * 80)
+    print("Example 5: Batch Code Generation")
+    print("=" * 80)
+    batch_prompts = [
+        "Write a Python function to reverse a linked list",
+        "Write a JavaScript function to debounce API calls",
+        "Write a Rust function to parse JSON safely"
+    ]
+    print("\nProcessing batch prompts...")
+    results = helion.batch_generate(batch_prompts, batch_size=2)
+    for i, (prompt, result) in enumerate(zip(batch_prompts, results), 1):
+        print(f"\nPrompt {i}: {prompt}")
+        print(f"Result {i}:\n{result}\n")
+    print("=" * 80)
+    print("Examples completed!")
+    print("=" * 80)
 if __name__ == "__main__":