|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import torch
|
|
|
from transformers import (
|
|
|
AutoTokenizer,
|
|
|
AutoModelForCausalLM,
|
|
|
TrainingArguments,
|
|
|
Trainer
|
|
|
)
|
|
|
from peft import LoraConfig, get_peft_model
|
|
|
from datasets import load_dataset
|
|
|
import os
|
|
|
|
|
|
|
|
|
# Directory that receives the training checkpoints and the final saved files.
output_dir = "./ominix-personality-v1"

# Hugging Face Hub identifier of the base checkpoint to fine-tune.
model_name = "microsoft/Phi-3-mini-4k-instruct"
|
|
|
|
|
|
|
|
|
print("📥 بارگذاری مدل Phi-3-mini...")

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Half precision is used only when a CUDA device is available; otherwise the
# weights are loaded in full float32.
if torch.cuda.is_available():
    _load_dtype = torch.float16
else:
    _load_dtype = torch.float32

# NOTE(review): trust_remote_code=True allows Python code shipped with the
# model repository to run locally -- only keep it for repositories you trust.
_model_load_options = {
    "torch_dtype": _load_dtype,
    "device_map": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(model_name, **_model_load_options)
|
|
|
|
|
|
|
|
|
print("🧬 تنظیم LoRA برای آموزش سبک...")

# LoRA adapter configuration: rank-8 adapters with alpha 32 and 5% dropout,
# no bias training, set up for causal language modelling.
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    # Phi-3 fuses the query/key/value projections into one linear layer named
    # `qkv_proj`; it has no separate `q_proj`/`v_proj` modules, so targeting
    # those names makes PEFT fail with "target modules not found".
    target_modules=["qkv_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the base model so only the LoRA adapter weights are trainable, then
# report how many parameters will actually be updated.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
|
|
|
|
|
|
|
|
|
print("📚 بارگذاری دادههای شخصیت...")

# The training file location can be overridden with the OMINIX_DATA_PATH
# environment variable; when it is unset the original hard-coded path is used,
# so existing setups keep working unchanged.
_data_path = os.environ.get(
    "OMINIX_DATA_PATH",
    'C:/Users/ir/Desktop/New folder (2)/all/python+/OMINIX-R1-V1/data/data.jsonl',
)

# Load the JSON-lines file as a single "train" split.
dataset = load_dataset('json', data_files=_data_path, split='train')
|
|
|
|
|
|
def formatting_prompts_func(examples):
    """Render each conversation in a batch to a single chat-formatted string.

    Args:
        examples: A batch mapping whose "messages" entry is a sequence of
            conversations to pass to the tokenizer's chat template.

    Returns:
        A dict with one key, "text", holding the rendered (untokenized)
        string for every conversation, in the same order.
    """
    rendered = [
        tokenizer.apply_chat_template(conversation, tokenize=False)
        for conversation in examples["messages"]
    ]
    return {"text": rendered}
|
|
|
|
|
|
# Add the chat-formatted "text" column, processing the rows in batches.
dataset = dataset.map(
    function=formatting_prompts_func,
    batched=True,
)
|
|
|
|
|
|
def tokenize_function(examples):
    """Tokenize a batch of rendered texts and attach causal-LM labels.

    Every text is truncated/padded to exactly 512 tokens. A "labels" column is
    added because the model only returns a loss when labels are supplied;
    without it training cannot compute a loss.

    Args:
        examples: A batch mapping whose "text" entry is a list of strings.

    Returns:
        The tokenizer output (plain Python lists) extended with "labels":
        a copy of "input_ids" where padded positions are replaced by -100 so
        they are ignored by the loss.
    """
    # No return_tensors here: this runs inside a batched `map`, which stores
    # plain lists, so building tensors first is wasted work.
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

    # Use the attention mask (not the pad token id) to find padding, so real
    # tokens that happen to share the pad id are still trained on.
    encoded["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded
|
|
|
|
|
|
# Tokenize in batches and drop the raw columns afterwards. The training
# arguments set remove_unused_columns=False, so any leftover string/list
# columns (e.g. "messages", "text") would be handed to the data collator,
# which cannot convert them to tensors.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)
|
|
|
|
|
|
|
|
|
print("⚙️ تنظیم پارامترهای آموزش...")

# Evaluate CUDA availability once so the precision and optimizer choices
# below stay consistent with each other.
_cuda_available = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir=output_dir,
    # Effective batch size is 1 * 4 = 4 sequences per optimizer step.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    learning_rate=1e-4,
    # Mixed precision only makes sense on a GPU.
    fp16=_cuda_available,
    save_strategy="epoch",
    logging_steps=10,
    report_to="none",
    # The paged 8-bit optimizer comes from bitsandbytes and needs CUDA; fall
    # back to the standard PyTorch AdamW so CPU-only runs do not fail at
    # trainer construction.
    optim="paged_adamw_8bit" if _cuda_available else "adamw_torch",
    # Keep every dataset column as-is; the dataset is expected to contain
    # only model inputs by the time it reaches the trainer.
    remove_unused_columns=False,
)
|
|
|
|
|
|
|
|
|
print("🚀 شروع آموزش Ominix...")

# Gather the trainer inputs in one place, then build the trainer and run the
# full training loop.
_trainer_inputs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_dataset,
    "tokenizer": tokenizer,
}
trainer = Trainer(**_trainer_inputs)

trainer.train()
|
|
|
|
|
|
|
|
|
print("💾 ذخیره مدل نهایی...")

# Persist the model first and the tokenizer second, both into output_dir.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(output_dir)
|
|
|
|
|
|
|
|
|
print("✅ آموزش کامل شد! تست سریع...")

# Single-turn conversation used as a quick smoke test of the tuned model.
test_messages = [
    {"role": "user", "content": "تو کی هستی؟"},
]

# add_generation_prompt=True appends the assistant header so the model is
# cued to answer instead of continuing the user turn. return_dict=True also
# yields the attention mask, which is passed to generate() explicitly.
_encoded_prompt = tokenizer.apply_chat_template(
    test_messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)
input_ids = _encoded_prompt["input_ids"]

# Training leaves the model in train mode; switch to eval so dropout
# (including LoRA dropout) is disabled while sampling.
model.eval()

outputs = model.generate(
    input_ids=input_ids,
    attention_mask=_encoded_prompt["attention_mask"],
    max_new_tokens=200,
    temperature=0.7,
    do_sample=True,
)

# Special tokens are kept in the decoded text so the chat markers stay visible.
print("\n🧪 تست: ", tokenizer.decode(outputs[0], skip_special_tokens=False))
|
|
|
|
|
|
# Closing messages, printed in order once everything has finished.
_closing_lines = (
    f"\n🎉 Ominix با موفقیت آموزش دید و در '{output_dir}' ذخیره شد!",
    "🦅 حالا وقتشه که دنیا رو با شخصیت منحصربهفردت معرفی کنی!",
)
for _line in _closing_lines:
    print(_line)