import os
import pickle
from collections import deque

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

ENV_ID = "CartPole-v1"
env = gym.make(ENV_ID)
eval_env = gym.make(ENV_ID)

s_size = env.observation_space.shape[0]
a_size = env.action_space.n

print(f"Environment: {ENV_ID}")
print(f"Observation Space: {s_size}, Action Space: {a_size}")

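# For CartPole-v1 this gives s_size = 4 (cart position, cart velocity,
# pole angle, pole angular velocity) and a_size = 2 (push left / push right).

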
class Policy(nn.Module):
    """Policy network: maps a state to a probability distribution over actions."""

    def __init__(self, s_size, a_size, h_size=128):
        """Initialize the policy network.

        Args:
            s_size: dimensionality of the state space
            a_size: number of discrete actions
            h_size: hidden layer size
        """
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, a_size)

    def forward(self, x):
        """Forward pass.

        Args:
            x: batch of input states

        Returns:
            Action probability distribution for each state in the batch.
        """
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.softmax(x, dim=1)

    def act(self, state):
        """Sample an action from the current policy.

        Args:
            state: current state (NumPy array)

        Returns:
            action: the sampled action (int)
            log_prob: log-probability of that action (used for the gradient update)
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        probs = self.forward(state)
        m = torch.distributions.Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

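# Minimal usage sketch (illustrative only; the `demo_*` names below are
# placeholders and are not used elsewhere in this script):
#
#     demo_policy = Policy(s_size, a_size).to(device)
#     demo_obs, _ = env.reset()
#     demo_action, demo_log_prob = demo_policy.act(demo_obs)
#     # demo_action is a Python int; demo_log_prob is a 1-element tensor that
#     # keeps the computation graph needed for the policy-gradient update.

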
def reinforce(policy, optimizer, n_training_episodes, max_t, gamma, print_every):
    """Train the policy with the REINFORCE algorithm.

    Args:
        policy: policy network
        optimizer: optimizer for the policy parameters
        n_training_episodes: number of training episodes
        max_t: maximum number of steps per episode
        gamma: discount factor
        print_every: logging interval (in episodes)

    Returns:
        scores: list of per-episode scores
    """
    scores_deque = deque(maxlen=100)
    scores = []

    for i_episode in range(1, n_training_episodes + 1):
        saved_log_probs = []
        rewards = []
        state, _ = env.reset()

        # Collect one full episode with the current policy
        for t in range(max_t):
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, terminated, truncated, _ = env.step(action)
            rewards.append(reward)
            if terminated or truncated:
                break

        scores_deque.append(sum(rewards))
        scores.append(sum(rewards))

        # Compute the discounted return-to-go G_t for every timestep,
        # working backwards from the end of the episode
        returns = deque(maxlen=max_t)
        G = 0
        for r in reversed(rewards):
            G = r + gamma * G
            returns.appendleft(G)

        # Standardize the returns to reduce the variance of the gradient estimate
        returns = torch.tensor(list(returns)).to(device)
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        # Policy-gradient loss: -sum_t log pi(a_t | s_t) * G_t
        policy_loss = []
        for log_prob, return_val in zip(saved_log_probs, returns):
            policy_loss.append(-log_prob * return_val)
        policy_loss = torch.cat(policy_loss).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        if i_episode % print_every == 0:
            print(f'Episode {i_episode}\tAverage Score: {np.mean(scores_deque):.2f}')

    return scores

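# The update above is the Monte Carlo policy-gradient (REINFORCE) estimate
#
#     grad_theta J(theta) ~ sum_t grad_theta log pi_theta(a_t | s_t) * G_t,
#
# hence the loss -sum_t log pi_theta(a_t | s_t) * G_t minimized by the optimizer.
# Standardizing G_t is a common variance-reduction trick and is not part of the
# original REINFORCE derivation.

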
def evaluate_policy(policy, eval_env, n_eval_episodes=10):
    """Evaluate the trained policy.

    Args:
        policy: trained policy network
        eval_env: evaluation environment
        n_eval_episodes: number of evaluation episodes

    Returns:
        episode_rewards: list of per-episode rewards
        mean_reward: mean episode reward
        std_reward: standard deviation of the episode rewards
    """
    episode_rewards = []

    for i in range(n_eval_episodes):
        state, _ = eval_env.reset()
        episode_reward = 0
        done = False

        while not done:
            # Greedy evaluation: pick the most probable action
            with torch.no_grad():
                state_tensor = torch.from_numpy(state).float().unsqueeze(0).to(device)
                probs = policy.forward(state_tensor)
                action = torch.argmax(probs, dim=1).item()

            state, reward, terminated, truncated, _ = eval_env.step(action)
            episode_reward += reward
            done = terminated or truncated

        episode_rewards.append(episode_reward)
        print(f"Eval Episode {i+1}: Reward = {episode_reward}")

    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)

    print("\nEvaluation Results:")
    print(f"Mean Reward: {mean_reward:.2f}")
    print(f"Std Reward: {std_reward:.2f}")
    print(f"Score (mean - std): {mean_reward - std_reward:.2f}")

    return episode_rewards, mean_reward, std_reward

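# Note: evaluate_policy is greedy (argmax over the action probabilities), which
# differs from training, where actions are sampled. To evaluate the stochastic
# policy instead, one could sample as in Policy.act (illustrative sketch only):
#
#     m = torch.distributions.Categorical(probs)
#     action = m.sample().item()

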
if __name__ == "__main__":
    # Hyperparameters
    HIDDEN_SIZE = 128
    LEARNING_RATE = 3e-3
    N_TRAINING_EPISODES = 800
    MAX_T = 1000
    GAMMA = 0.99
    PRINT_EVERY = 100

    print("="*60)
    print("Starting REINFORCE Training")
    print("="*60)

    policy = Policy(s_size, a_size, HIDDEN_SIZE).to(device)
    optimizer = optim.Adam(policy.parameters(), lr=LEARNING_RATE)

    print(f"Policy Network: {policy}")
    print(f"Optimizer: Adam (lr={LEARNING_RATE})")
    print(f"Training Episodes: {N_TRAINING_EPISODES}")
    print(f"Max Steps per Episode: {MAX_T}")
    print(f"Discount Factor: {GAMMA}")
    print()

    scores = reinforce(policy, optimizer, N_TRAINING_EPISODES, MAX_T, GAMMA, PRINT_EVERY)

    print("\n" + "="*60)
    print("Training Completed!")
    print("="*60)

    # Save the trained policy together with the metadata needed to rebuild it
    MODEL_PATH = "reinforce_cartpole.pth"
    torch.save({
        'policy_state_dict': policy.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        's_size': s_size,
        'a_size': a_size,
        'hidden_size': HIDDEN_SIZE,
        'scores': scores
    }, MODEL_PATH)
    print(f"✅ Model saved to {MODEL_PATH}")

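    # To reload the checkpoint later (illustrative sketch; loading is not
    # performed anywhere in this script):
    #
    #     ckpt = torch.load(MODEL_PATH, map_location=device)
    #     restored = Policy(ckpt['s_size'], ckpt['a_size'], ckpt['hidden_size']).to(device)
    #     restored.load_state_dict(ckpt['policy_state_dict'])
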
print("\n" + "="*60) |
|
|
print("Evaluating Trained Policy") |
|
|
print("="*60) |
|
|
|
|
|
episode_rewards, mean_reward, std_reward = evaluate_policy(policy, eval_env, n_eval_episodes=10) |
|
|
|
|
|
|
|
|
plt.figure(figsize=(12, 4)) |
|
|
|
|
|
plt.subplot(1, 2, 1) |
|
|
plt.plot(scores) |
|
|
plt.title('Training Scores') |
|
|
plt.xlabel('Episode') |
|
|
plt.ylabel('Score') |
|
|
plt.grid(True) |
|
|
|
|
|
plt.subplot(1, 2, 2) |
|
|
|
|
|
window_size = 100 |
|
|
if len(scores) >= window_size: |
|
|
moving_avg = [np.mean(scores[i:i+window_size]) for i in range(len(scores)-window_size+1)] |
|
|
plt.plot(range(window_size-1, len(scores)), moving_avg) |
|
|
plt.title(f'Moving Average (window={window_size})') |
|
|
plt.xlabel('Episode') |
|
|
plt.ylabel('Average Score') |
|
|
plt.grid(True) |
|
|
|
|
|
plt.tight_layout() |
|
|
plt.savefig('training_results.png', dpi=150, bbox_inches='tight') |
|
|
plt.show() |
|
|
|
|
|
print(f"\n🎉 Final Results:") |
|
|
print(f" Mean Reward: {mean_reward:.2f}") |
|
|
print(f" Std Reward: {std_reward:.2f}") |
|
|
print(f" Score: {mean_reward - std_reward:.2f}") |
|
|
print(f" Required for CartPole-v1: 350.0") |
|
|
|
|
|
if mean_reward - std_reward >= 350: |
|
|
print(f" Status: ✅ PASSED!") |
|
|
else: |
|
|
print(f" Status: ❌ Need {350 - (mean_reward - std_reward):.2f} more points") |
|
|
|