{
"model_type": "helion-osc",
"architectures": ["HelionOSCForCausalLM"],
"vocab_size": 102400,
"hidden_size": 5120,
"num_hidden_layers": 48,
"num_attention_heads": 40,
"num_key_value_heads": 8,
"intermediate_size": 18432,
"hidden_act": "swiglu",
"max_position_embeddings": 16384,
"initializer_range": 0.02,
"rms_norm_eps": 1e-6,
"use_cache": true,
"pad_token_id": 0,
"bos_token_id": 1,
"eos_token_id": 2,
"tie_word_embeddings": false,
"rope_theta": 10000.0,
"rope_scaling": {
"type": "linear",
"factor": 2.0
},
"attention_bias": false,
"attention_dropout": 0.0,
"mlp_bias": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.40.0",
"model_version": "1.0",
"use_flash_attention": true,
"sliding_window": null,
"gradient_checkpointing": false,
"task_specific_params": {
"code_generation": {
"max_length": 4096,
"temperature": 0.7,
"top_p": 0.95,
"top_k": 50,
"do_sample": true,
"repetition_penalty": 1.05,
"length_penalty": 1.0
},
"mathematical_reasoning": {
"max_length": 2048,
"temperature": 0.3,
"top_p": 0.9,
"top_k": 40,
"do_sample": false,
"repetition_penalty": 1.0,
"length_penalty": 1.2
},
"code_completion": {
"max_length": 1024,
"temperature": 0.6,
"top_p": 0.92,
"top_k": 45,
"do_sample": true,
"repetition_penalty": 1.03,
"stop_sequences": ["\n\n", "```", "###"]
},
"algorithm_design": {
"max_length": 3072,
"temperature": 0.5,
"top_p": 0.93,
"top_k": 50,
"do_sample": true,
"repetition_penalty": 1.08
},
"debugging": {
"max_length": 2048,
"temperature": 0.4,
"top_p": 0.88,
"do_sample": false,
"repetition_penalty": 1.0
}
},
"specialization": {
"domain": "coding_and_mathematics",
"primary_focus": "code_generation_with_mathematical_reasoning",
"verification_enabled": true,
"step_by_step_reasoning": true,
"languages_supported": [
"python",
"javascript",
"typescript",
"java",
"c",
"cpp",
"csharp",
"go",
"rust",
"ruby",
"php",
"swift",
"kotlin",
"scala",
"r",
"sql",
"bash",
"shell"
],
"features": [
"code_generation",
"code_completion",
"bug_detection",
"bug_fixing",
"mathematical_reasoning",
"theorem_proving",
"algorithm_design",
"algorithm_optimization",
"code_refactoring",
"documentation_generation",
"test_generation",
"complexity_analysis"
],
"mathematical_capabilities": [
"arithmetic",
"algebra",
"calculus",
"discrete_mathematics",
"linear_algebra",
"probability",
"statistics",
"number_theory",
"graph_theory",
"combinatorics"
]
},
"training_config": {
"training_precision": "bf16",
"optimizer": "adamw",
"learning_rate": 2e-5,
"warmup_steps": 2000,
"weight_decay": 0.01,
"max_grad_norm": 1.0
},
"quantization_config": {
"quant_method": "bitsandbytes",
"load_in_8bit": false,
"load_in_4bit": false,
"bnb_4bit_compute_dtype": "bfloat16",
"bnb_4bit_use_double_quant": true,
"bnb_4bit_quant_type": "nf4"
},
"generation_config": {
"temperature": 0.7,
"top_p": 0.95,
"top_k": 50,
"do_sample": true,
"max_new_tokens": 2048,
"min_new_tokens": 1,
"num_beams": 1,
"early_stopping": false,
"no_repeat_ngram_size": 3,
"encoder_no_repeat_ngram_size": 0,
"diversity_penalty": 0.0,
"repetition_penalty": 1.05,
"length_penalty": 1.0,
"exponential_decay_length_penalty": null
},
"special_tokens": {
"pad_token": "<|pad|>",
"bos_token": "<|begin_of_text|>",
"eos_token": "<|end_of_text|>",
"unk_token": "<|unk|>",
"code_start_token": "<|code_start|>",
"code_end_token": "<|code_end|>",
"math_start_token": "<|math_start|>",
"math_end_token": "<|math_end|>",
"reasoning_start_token": "<|reasoning_start|>",
"reasoning_end_token": "<|reasoning_end|>",
"explanation_start_token": "<|explanation_start|>",
"explanation_end_token": "<|explanation_end|>"
},
"supported_frameworks": [
"pytorch",
"tensorflow",
"onnx",
"jax"
],
"evaluation_metrics": {
"humaneval_pass_at_1": 0.852,
"humaneval_pass_at_10": 0.928,
"mbpp_pass_at_1": 0.795,
"mbpp_pass_at_10": 0.891,
"gsm8k_accuracy": 0.785,
"math_accuracy": 0.623,
"apps_accuracy": 0.412
},
"hardware_requirements": {
"minimum_vram_gb": 16,
"recommended_vram_gb": 24,
"minimum_ram_gb": 32,
"recommended_ram_gb": 64,
"cpu_cores": 8,
"gpu_support": true,
"multi_gpu_support": true,
"cpu_only_support": true
},
"deployment_options": {
"inference_frameworks": [
"vllm",
"text-generation-inference",
"ollama",
"llama.cpp"
],
"optimization_support": [
"quantization",
"pruning",
"distillation",
"tensorrt",
"onnx_runtime"
]
}
}