# ============================================================
# Unit 4: Upload REINFORCE CartPole Model to Hugging Face
# ============================================================
import gymnasium as gym
import numpy as np
import os
import shutil
import torch
import torch.nn as nn
import torch.nn.functional as F

from huggingface_hub import HfApi, create_repo
from torch.distributions import Categorical

# ============================================================
# Configuration (edit these values)
# ============================================================
USERNAME = "ImaghT"  # Replace with your Hugging Face username
MODEL_NAME = "reinforce-CartPole-v1"
MODEL_FILE = "/home/eason/Workspace/Result_DRL/reinforce_cartpole.pth"  # absolute path
ENV_ID = "CartPole-v1"
N_EVAL_EPISODES = 100

repo_id = f"{USERNAME}/{MODEL_NAME}"

# ============================================================
# Policy network definition (identical to the one used in training)
# ============================================================
class Policy(nn.Module):
    def __init__(self, s_size, a_size, h_size=128):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, a_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.softmax(x, dim=1)

# ============================================================
# 1. Load the trained model
# ============================================================
print("Loading trained model...")

if not os.path.exists(MODEL_FILE):
    print(f"❌ Error: Model file '{MODEL_FILE}' not found!")
    print("Please run the training script first to generate the model.")
    raise SystemExit(1)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# weights_only=False runs the full pickle machinery; only load checkpoints you trust.
checkpoint = torch.load(MODEL_FILE, map_location=device, weights_only=False)

# Rebuild the model from the hyperparameters stored in the checkpoint
s_size = checkpoint['s_size']
a_size = checkpoint['a_size']
hidden_size = checkpoint['hidden_size']

policy = Policy(s_size, a_size, hidden_size).to(device)
policy.load_state_dict(checkpoint['policy_state_dict'])
policy.eval()

print(f"✅ Model loaded from {MODEL_FILE}")
print(f"   State size: {s_size}, Action size: {a_size}")
print(f"   Hidden size: {hidden_size}")
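# ------------------------------------------------------------
# For reference: the loading code above assumes the training script saved the
# checkpoint roughly as sketched below. This is illustrative, not the actual
# training code; the key names must match whatever your script wrote.
#
#   torch.save({
#       'policy_state_dict': policy.state_dict(),
#       's_size': s_size,        # observation dimension (4 for CartPole-v1)
#       'a_size': a_size,        # action dimension (2 for CartPole-v1)
#       'hidden_size': h_size,   # hidden-layer width used at training time
#   }, "reinforce_cartpole.pth")
# ------------------------------------------------------------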
# ============================================================
# 2. Create the evaluation environment
# ============================================================
print("\nCreating evaluation environment...")
eval_env = gym.make(ENV_ID)
print(f"✅ Environment {ENV_ID} ready")

# ============================================================
# 3. Run the evaluation
# ============================================================
print("=" * 60)
print(f"Starting Evaluation ({N_EVAL_EPISODES} episodes)...")
print("=" * 60)

episode_rewards = []
episode_lengths = []

for episode in range(N_EVAL_EPISODES):
    state, _ = eval_env.reset()
    episode_reward = 0
    episode_length = 0
    done = False

    while not done:
        with torch.no_grad():
            state_tensor = torch.from_numpy(state).float().unsqueeze(0).to(device)
            probs = policy(state_tensor)  # call the module directly, not .forward()
            m = Categorical(probs)
            action = m.sample().item()  # stochastic evaluation: sample from the policy

        state, reward, terminated, truncated, _ = eval_env.step(action)
        episode_reward += reward
        episode_length += 1
        done = terminated or truncated

    episode_rewards.append(episode_reward)
    episode_lengths.append(episode_length)

    if (episode + 1) % 10 == 0:
        print(f"Episode {episode + 1}/{N_EVAL_EPISODES}: "
              f"Reward = {episode_reward:.2f}, Length = {episode_length}")

# ============================================================
# 4. Compute evaluation statistics
# ============================================================
mean_reward = np.mean(episode_rewards)
std_reward = np.std(episode_rewards)
min_reward = np.min(episode_rewards)
max_reward = np.max(episode_rewards)
mean_length = np.mean(episode_lengths)
score = mean_reward - std_reward

print("\n" + "=" * 60)
print("Evaluation Results:")
print(f"  Mean Reward: {mean_reward:.2f}")
print(f"  Std Reward:  {std_reward:.2f}")
print(f"  Min Reward:  {min_reward:.2f}")
print(f"  Max Reward:  {max_reward:.2f}")
print(f"  Mean Length: {mean_length:.2f}")
print(f"  Score (mean - std): {score:.2f}")
print("  Baseline Required: 350.0")
if score >= 350:
    print("  Status: ✅ PASSED")
else:
    print(f"  Status: ❌ NOT PASSED (need {350 - score:.2f} more points)")
print("=" * 60 + "\n")

# ============================================================
# 5. Create README.md
# ============================================================
# Build the model card with str.format() rather than an f-string, so the
# template's literal '#' Markdown headings never interact with f-string parsing.
readme_template = """---
library_name: reinforce
tags:
- CartPole-v1
- deep-reinforcement-learning
- reinforcement-learning
- policy-gradient
- reinforce
model-index:
- name: REINFORCE
  results:
  - task:
      type: reinforcement-learning
      name: reinforcement-learning
    dataset:
      name: CartPole-v1
      type: CartPole-v1
    metrics:
    - type: mean_reward
      value: {mean_reward:.2f} +/- {std_reward:.2f}
      name: mean_reward
      verified: false
---

# **REINFORCE** Agent playing **CartPole-v1**

This is a trained model of a **REINFORCE** agent playing **CartPole-v1**
using PyTorch and the [Deep Reinforcement Learning Course](https://huggingface.co/deep-rl-course/unit4).

## Algorithm

REINFORCE is a policy gradient method that:
- Directly optimizes the policy π(a|s)
- Uses Monte Carlo sampling to estimate returns
- Updates parameters in the direction of higher expected returns
- Belongs to the family of Policy Gradient methods

## Evaluation Results

| Metric | Value |
|--------|-------|
| Mean Reward | {mean_reward:.2f} |
| Std Reward | {std_reward:.2f} |
| Min Reward | {min_reward:.2f} |
| Max Reward | {max_reward:.2f} |
| Mean Episode Length | {mean_length:.2f} |
| Score (mean - std) | {score:.2f} |
| Evaluation Episodes | {N_EVAL_EPISODES} |

## Usage

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import gymnasium as gym
import numpy as np

class Policy(nn.Module):
    def __init__(self, s_size, a_size, h_size=128):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, a_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.softmax(x, dim=1)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
checkpoint = torch.load("reinforce_cartpole.pth", map_location=device)

policy = Policy(checkpoint['s_size'], checkpoint['a_size'], checkpoint['hidden_size'])
policy.load_state_dict(checkpoint['policy_state_dict'])
policy.eval()

env = gym.make("CartPole-v1")
state, _ = env.reset()

for step in range(1000):
    state_tensor = torch.from_numpy(state).float().unsqueeze(0)
    with torch.no_grad():
        probs = policy(state_tensor)
    action = torch.argmax(probs, dim=1).item()  # greedy action at inference time
    state, reward, terminated, truncated, _ = env.step(action)
    if terminated or truncated:
        state, _ = env.reset()
```

## Training Configuration

- **Algorithm**: REINFORCE (Policy Gradient)
- **Policy Network**: 2-layer MLP (128 hidden units)
- **Optimizer**: Adam
- **Learning Rate**: 0.003
- **Discount Factor (gamma)**: 0.99
- **Training Episodes**: 800
- **Max Steps per Episode**: 1000
- **Device**: {device}
"""

# Render the template with .format() (not an f-string); see the note above.
readme_content = readme_template.format(
    mean_reward=mean_reward,
    std_reward=std_reward,
    min_reward=min_reward,
    max_reward=max_reward,
    mean_length=mean_length,
    score=score,
    N_EVAL_EPISODES=N_EVAL_EPISODES,
    device=device,
)
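# ------------------------------------------------------------
# Optional sanity check (illustrative; assumes PyYAML is available): the course
# leaderboard reads the `model-index` metadata from the card's YAML
# frontmatter, so you could parse it before uploading:
#
#   import yaml
#   frontmatter = readme_content.split("---")[1]
#   print(yaml.safe_load(frontmatter))  # expect library_name, tags, model-index
# ------------------------------------------------------------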
# ============================================================
# 6. Prepare files for upload
# ============================================================
print("Preparing files for upload...")

upload_folder = "./upload_temp"
os.makedirs(upload_folder, exist_ok=True)

# Write README.md
readme_path = os.path.join(upload_folder, "README.md")
with open(readme_path, "w", encoding="utf-8") as f:
    f.write(readme_content)
print("✅ Created README.md")

# Copy the model checkpoint
model_dest = os.path.join(upload_folder, os.path.basename(MODEL_FILE))
shutil.copy(MODEL_FILE, model_dest)
print(f"✅ Copied {MODEL_FILE}")

# Create config.json (double braces escape the literal JSON braces for .format())
config_content = """{{
  "env_id": "{ENV_ID}",
  "algorithm": "REINFORCE",
  "library": "reinforce",
  "s_size": {s_size},
  "a_size": {a_size},
  "hidden_size": {hidden_size},
  "mean_reward": {mean_reward:.2f},
  "std_reward": {std_reward:.2f},
  "evaluation_episodes": {N_EVAL_EPISODES}
}}""".format(
    ENV_ID=ENV_ID,
    s_size=s_size,
    a_size=a_size,
    hidden_size=hidden_size,
    mean_reward=mean_reward,
    std_reward=std_reward,
    N_EVAL_EPISODES=N_EVAL_EPISODES,
)

config_path = os.path.join(upload_folder, "config.json")
with open(config_path, "w", encoding="utf-8") as f:
    f.write(config_content)
print("✅ Created config.json")

# ============================================================
# 7. Upload to Hugging Face
# ============================================================
print(f"\nUploading to {repo_id}...")

api = HfApi()

try:
    create_repo(repo_id, repo_type="model", exist_ok=True)
    print("✅ Repository created/verified")
except Exception as e:
    print(f"⚠️ Repository warning: {e}")

try:
    api.upload_folder(
        folder_path=upload_folder,
        repo_id=repo_id,
        repo_type="model",
        commit_message=(
            f"REINFORCE CartPole - Mean: {mean_reward:.2f}, "
            f"Std: {std_reward:.2f}, Score: {score:.2f}"
        ),
    )
    print(f"\n{'='*60}")
    print("✅ Upload Successful!")
    print(f"{'='*60}")
    print(f"🔗 Model Page: https://huggingface.co/{repo_id}")
    print("🏆 Check Progress: https://huggingface.co/spaces/ThomasSimonini/Check-my-progress-Deep-RL-Course")
    print(f"{'='*60}\n")
    print("📋 Important Information:")
    print(f"  • Environment: {ENV_ID}")
    print("  • Library tag: reinforce")
    print("  • Required score: 350.0")
    print(f"  • Your score: {score:.2f}")
    print(f"  • Status: {'✅ PASSED' if score >= 350 else '❌ FAILED'}")
except Exception as e:
    print(f"\n❌ Upload failed: {e}")
    print("Please check your Hugging Face token and internet connection.")
finally:
    shutil.rmtree(upload_folder)
    print("🧹 Cleaned up temporary files")

print("\n✨ Done!")
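# ------------------------------------------------------------
# Optional follow-up (a minimal sketch): confirm the files landed in the repo
# using huggingface_hub's HfApi.list_repo_files:
#
#   files = api.list_repo_files(repo_id, repo_type="model")
#   print(files)  # expect README.md, config.json and the .pth checkpoint
# ------------------------------------------------------------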