Easy RL Continuous Action Control: Applying the DDPG Algorithm in the Pendulum Environment
Introduction: The Challenge of Continuous Action Spaces and a Solution
Have you ever hit this wall in reinforcement learning practice: once the environment's actions are no longer discrete choices such as left/right or up/down, but require precise control of continuous quantities such as force or angle, traditional Q-learning and DQN no longer apply? In real-world settings such as robot control, autonomous driving, and manipulator operation, continuous action spaces are the norm. This article takes a close look at how Deep Deterministic Policy Gradient (DDPG) overcomes this limitation, and walks through a hands-on case study in the Pendulum (inverted pendulum) environment so that you can tackle continuous-action problems yourself.
After reading this article, you will be able to:
- Understand the core design of DDPG, which combines the strengths of Actor-Critic and DQN
- Master key technical details such as OU noise generation and soft target-network updates
- Implement DDPG independently and obtain stable control in the Pendulum environment
- Analyze how hyperparameters affect performance and tune them
DDPG Algorithm Principles: A Dual-Network Architecture for Continuous Actions
2.1 The Evolution from DQN to DDPG
Algorithm | Action Space | Policy Type | Core Innovation | Limitations
---|---|---|---|---
DQN | Discrete | Off-policy | Experience replay, target network | Cannot directly handle continuous actions
DPG | Continuous | Deterministic policy | Combines the policy gradient with a value function | High variance, low sample efficiency
DDPG | Continuous | Off-policy, deterministic policy | Soft target-network updates, Actor-Critic architecture | Sensitive to hyperparameters, unstable training
DDPG is essentially an extension of the DQN idea to continuous action spaces: by introducing the Actor-Critic framework, it learns a deterministic policy efficiently. Its core innovations are the components described in the next subsection.
2.2 Key Technical Components
2.2.1 Dual-Network Architecture
- Actor network: takes the state s and outputs a deterministic action a = μ(s|θ^μ); responsible for learning the policy
- Critic network: takes the state s and the action a and outputs the value Q(s, a|θ^Q); responsible for evaluating actions
- Target Actor network μ′: slowly tracks the main network's parameters and provides a stable target action a′ = μ′(s′|θ^μ′)
- Target Critic network Q′: slowly tracks the main network's parameters and provides a stable target value y = r + γQ′(s′, a′)
2.2.2 Soft Update Mechanism
DQN uses hard updates (periodically copying the parameters), whereas DDPG updates its target networks softly:
θ^μ′ ← τ·θ^μ + (1−τ)·θ^μ′
θ^Q′ ← τ·θ^Q + (1−τ)·θ^Q′
where τ ∈ [0, 1] is the soft-update coefficient (typically 0.01). This gradual update effectively reduces training oscillation.
2.2.3 Exploration with OU Noise
In a continuous action space, DDPG generates temporally correlated exploration noise with an Ornstein-Uhlenbeck process:
dx = θ(μ − x)dt + σ·dW_t
where θ controls the speed of mean reversion, σ controls the noise magnitude, and dW_t is a Wiener-process increment. The noise magnitude is gradually decayed as training progresses.
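The implementation later in this article advances this process in discrete steps; with a unit time step (Δt = 1), this corresponds to the standard Euler-Maruyama discretization:
x_{t+1} = x_t + θ(μ − x_t)·Δt + σ·√Δt·ε_t,  where ε_t ~ N(0, I)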
Code Implementation: Building DDPG from Scratch
3.1 Environment and Parameter Configuration
Pendulum-v1 environment characteristics:
- State space: 3-dimensional (cos θ, sin θ, and the angular velocity θ̇)
- Action space: 1-dimensional continuous value in [-2, 2], the torque applied to the pendulum
- Reward function: r = -θ² - 0.1·θ̇² - 0.001·a²; the goal is to keep the pendulum upright (θ ≈ 0)
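As a quick sanity check (assuming a standard gym installation with the classic-control environments), you can inspect these spaces directly:

import gym

env = gym.make('Pendulum-v1')
print(env.observation_space)  # Box of shape (3,) with bounds roughly [±1, ±1, ±8]
print(env.action_space)       # Box(-2.0, 2.0, (1,), float32)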
import argparse

def get_args():
    """DDPG hyperparameter configuration"""
    parser = argparse.ArgumentParser(description="DDPG parameters")
    parser.add_argument('--env_name', default='Pendulum-v1', type=str)
    parser.add_argument('--train_eps', default=300, type=int)         # number of training episodes
    parser.add_argument('--test_eps', default=20, type=int)           # number of test episodes
    parser.add_argument('--max_steps', default=200, type=int)         # max steps per episode (Pendulum-v1 truncates at 200); read later as cfg['max_steps']
    parser.add_argument('--gamma', default=0.99, type=float)          # discount factor
    parser.add_argument('--critic_lr', default=1e-3, type=float)      # Critic learning rate
    parser.add_argument('--actor_lr', default=1e-4, type=float)       # Actor learning rate
    parser.add_argument('--memory_capacity', default=8000, type=int)  # replay buffer capacity
    parser.add_argument('--batch_size', default=128, type=int)        # batch size
    parser.add_argument('--tau', default=1e-2, type=float)            # soft-update coefficient
    parser.add_argument('--hidden_dim', default=256, type=int)        # hidden layer dimension
    parser.add_argument('--device', default='cpu', type=str)          # 'cpu' or 'cuda'; read later as cfg['device']
    return parser.parse_args([])
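The DDPG class below indexes the configuration like a dictionary (for example cfg['gamma']), while argparse returns a Namespace. One simple way to bridge the two (the original glue code is not shown, so this is an assumption) is to convert it with vars():

cfg = vars(get_args())           # argparse.Namespace -> plain dict
print(cfg['gamma'], cfg['tau'])  # 0.99 0.01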
3.2 Core Network Implementations
3.2.1 Actor Network
import torch
import torch.nn as nn
import torch.nn.functional as F
class Actor(nn.Module):
    def __init__(self, n_states, n_actions, hidden_dim=256, init_w=3e-3):
        super(Actor, self).__init__()
        self.linear1 = nn.Linear(n_states, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, n_actions)
        # Initialize the output layer with small weights so that initial actions are not too large
        self.linear3.weight.data.uniform_(-init_w, init_w)
        self.linear3.bias.data.uniform_(-init_w, init_w)

    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = torch.tanh(self.linear3(x))  # output in [-1, 1]; mapped to the real action range later
        return x
3.2.2 Critic Network
class Critic(nn.Module):
    def __init__(self, n_states, n_actions, hidden_dim=256, init_w=3e-3):
        super(Critic, self).__init__()
        # The state and action are concatenated and passed through a shared MLP
        self.linear1 = nn.Linear(n_states + n_actions, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)
        # Initialize the output layer with small weights
        self.linear3.weight.data.uniform_(-init_w, init_w)
        self.linear3.bias.data.uniform_(-init_w, init_w)

    def forward(self, state, action):
        # Concatenate state and action as the input
        x = torch.cat([state, action], 1)
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = self.linear3(x)  # Q-value output
        return x
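A quick shape check helps confirm that the two networks wire together correctly. The snippet below is a hypothetical standalone test using the Pendulum dimensions (n_states=3, n_actions=1):

import torch

actor = Actor(n_states=3, n_actions=1)
critic = Critic(n_states=3, n_actions=1)
s = torch.randn(32, 3)   # a batch of 32 states
a = actor(s)             # shape (32, 1), values in [-1, 1]
q = critic(s, a)         # shape (32, 1), Q-value estimates
print(a.shape, q.shape)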
3.3 Experience Replay and OU Noise
3.3.1 Replay Buffer
from collections import deque
import random
class ReplayBuffer:
    def __init__(self, capacity: int):
        self.buffer = deque(maxlen=capacity)

    def push(self, transitions):
        """Store a transition (s, a, r, s', done)"""
        self.buffer.append(transitions)

    def sample(self, batch_size: int):
        """Randomly sample a batch of transitions"""
        if batch_size > len(self.buffer):
            batch_size = len(self.buffer)
        batch = random.sample(self.buffer, batch_size)
        return zip(*batch)  # returns (s_list, a_list, r_list, s'_list, done_list)

    def __len__(self):
        return len(self.buffer)
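Usage is straightforward. The sketch below (hypothetical values, mirroring how train() pushes transitions later) stores a few transitions and draws a batch:

import numpy as np

buffer = ReplayBuffer(capacity=8000)
for _ in range(5):
    s, s_next = np.random.randn(3), np.random.randn(3)
    a = np.random.uniform(-2, 2, size=1)
    buffer.push((s, a, -1.0, s_next, False))
states, actions, rewards, next_states, dones = buffer.sample(batch_size=4)
print(len(states))  # 4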
3.3.2 OU Noise Generator
import numpy as np

class OUNoise(object):
    """Ornstein-Uhlenbeck noise process"""
    def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
        self.mu = mu                      # long-run mean
        self.theta = theta                # mean-reversion rate
        self.sigma = max_sigma            # current noise magnitude
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period  # horizon over which sigma is annealed
        self.action_dim = action_space.shape[0]
        self.low = action_space.low
        self.high = action_space.high
        self.reset()

    def reset(self):
        """Reset the noise state"""
        self.obs = np.ones(self.action_dim) * self.mu

    def evolve_obs(self):
        """Advance the noise process by one step"""
        x = self.obs
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.obs = x + dx
        return self.obs

    def get_action(self, action, t=0):
        """Add noise to the action and clip it to the valid range"""
        ou_obs = self.evolve_obs()
        # Anneal sigma over time; with the default min_sigma == max_sigma this is effectively a no-op
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
        return np.clip(action + ou_obs, self.low, self.high)
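To see the temporally correlated exploration this process produces, the short standalone sketch below (hypothetical, not part of the training loop) draws a few consecutive noise samples for the Pendulum action space:

import gym
import numpy as np

env = gym.make('Pendulum-v1')
noise = OUNoise(env.action_space)
noise.reset()
samples = [noise.evolve_obs().copy() for _ in range(5)]
print(samples)  # consecutive values drift smoothly rather than jumping independently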
3.4 The DDPG Agent
import copy
import numpy as np
import torch.optim as optim

class DDPG:
    def __init__(self, models, memories, cfg):
        self.device = torch.device(cfg['device'])
        # Initialize the online networks
        self.critic = models['critic'].to(self.device)
        self.actor = models['actor'].to(self.device)
        # Target networks must be separate copies, not references to the same module
        self.target_critic = copy.deepcopy(self.critic)
        self.target_actor = copy.deepcopy(self.actor)
        # Synchronize the target networks with the online networks
        for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data)
        # Optimizers
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=cfg['critic_lr'])
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=cfg['actor_lr'])
        self.memory = memories['memory']
        self.batch_size = cfg['batch_size']
        self.gamma = cfg['gamma']  # discount factor
        self.tau = cfg['tau']      # soft-update coefficient

    def sample_action(self, state):
        """Sample an action during training (exploration noise is added by the caller)"""
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        action = self.actor(state)
        return action.detach().cpu().numpy()[0, 0]

    @torch.no_grad()
    def predict_action(self, state):
        """Predict a deterministic action at test time (no gradients)"""
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        action = self.actor(state)
        return action.cpu().numpy()[0, 0]

    def update(self):
        """Update the network parameters"""
        if len(self.memory) < self.batch_size:
            return  # not enough experience yet
        # Sample a batch of transitions
        state, action, reward, next_state, done = self.memory.sample(self.batch_size)
        # Convert to tensors
        state = torch.FloatTensor(np.array(state)).to(self.device)
        next_state = torch.FloatTensor(np.array(next_state)).to(self.device)
        action = torch.FloatTensor(np.array(action)).to(self.device)
        reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
        done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)
        # Actor loss (policy gradient): maximizing Q(s, μ(s)) equals minimizing its negative
        actor_loss = self.critic(state, self.actor(state))
        actor_loss = -actor_loss.mean()
        # Critic loss (TD learning)
        next_action = self.target_actor(next_state)
        target_value = self.target_critic(next_state, next_action.detach())
        expected_value = reward + (1.0 - done) * self.gamma * target_value
        actual_value = self.critic(state, action)
        critic_loss = nn.MSELoss()(actual_value, expected_value.detach())
        # Backpropagation and optimization
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        # Soft-update the target networks
        for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)
        for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)
3.5 Training and Testing Procedure
import gym
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

class NormalizedActions(gym.ActionWrapper):
    """Action wrapper that rescales actions between [-1, 1] and the environment's native range"""
    def action(self, action):
        low_bound = self.action_space.low
        upper_bound = self.action_space.high
        # Map an action in [-1, 1] to the original action space
        action = low_bound + (action + 1.0) * 0.5 * (upper_bound - low_bound)
        return np.clip(action, low_bound, upper_bound)

    def reverse_action(self, action):
        low_bound = self.action_space.low
        upper_bound = self.action_space.high
        # Map an action in the original space back to [-1, 1]
        action = 2 * (action - low_bound) / (upper_bound - low_bound) - 1
        return np.clip(action, -1, 1)
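Because the Actor's tanh output lives in [-1, 1], the environment must be wrapped so those values become valid torques in [-2, 2]. A quick check of the mapping (hypothetical standalone snippet):

env = NormalizedActions(gym.make('Pendulum-v1'))
print(env.action(np.array([-1.0])))  # -> [-2.]
print(env.action(np.array([0.0])))   # -> [0.]
print(env.action(np.array([1.0])))   # -> [2.]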
def train(cfg, env, agent):
    """Training loop"""
    print("Start training!")
    ou_noise = OUNoise(env.action_space)  # initialize the OU noise process
    rewards = []                          # record the reward of every episode
    for i_ep in range(cfg['train_eps']):
        state = env.reset()               # classic gym API: reset() returns only the observation
        ou_noise.reset()                  # reset the noise process
        ep_reward = 0
        for i_step in range(cfg['max_steps']):
            action = agent.sample_action(state)
            action = ou_noise.get_action(action, i_step + 1)  # add exploration noise
            next_state, reward, done, _ = env.step(action)    # classic gym API: step() returns 4 values
            ep_reward += reward
            # Store the transition
            agent.memory.push((state, action, reward, next_state, done))
            # Update the networks
            agent.update()
            state = next_state
            if done:
                break
        rewards.append(ep_reward)
        # Print training progress
        if (i_ep + 1) % 10 == 0:
            print(f"Episode: {i_ep+1}/{cfg['train_eps']}, Reward: {ep_reward:.2f}")
    print("Finished training!")
    return {'rewards': rewards}
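The loop above (and the test loop below) is written against the classic gym interface. If you run it with gymnasium or gym >= 0.26, reset() returns (obs, info) and step() returns five values, so the corresponding lines need a small adaptation (a sketch, not part of the original code):

state, _ = env.reset()
next_state, reward, terminated, truncated, _ = env.step(action)
done = terminated or truncated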
def test(cfg, env, agent):
    """Evaluation loop"""
    print("Start testing!")
    rewards = []
    for i_ep in range(cfg['test_eps']):
        state = env.reset()
        ep_reward = 0
        for i_step in range(cfg['max_steps']):
            action = agent.predict_action(state)  # deterministic action, no noise
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            state = next_state
            if done:
                break
        rewards.append(ep_reward)
        print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Reward: {ep_reward:.2f}")
    print("Finished testing!")
    return {'rewards': rewards}
def smooth(data, weight=0.9):
    """Exponentially smooth a reward curve"""
    smoothed = []
    last = data[0]
    for point in data:
        smoothed_val = last * weight + (1 - weight) * point
        smoothed.append(smoothed_val)
        last = smoothed_val
    return smoothed
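This is a standard exponential moving average; written out with weight w = 0.9:
s_0 = x_0,  s_t = w·s_{t-1} + (1 − w)·x_t
A larger w gives a smoother but more lagged curve.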
def plot_rewards(rewards, cfg):
    """Plot the reward curve"""
    sns.set()
    plt.figure()
    plt.title("DDPG training curve on Pendulum-v1")
    plt.xlabel("Episode")
    plt.ylabel("Reward")
    plt.plot(rewards, label="raw reward")
    plt.plot(smooth(rewards), label="smoothed reward")
    plt.legend()
    plt.show()
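Finally, a minimal entry point ties everything together. The original excerpt does not show this glue code, so treat the following as one reasonable way to assemble and run it (it assumes the classic gym API used throughout the code above):

if __name__ == "__main__":
    cfg = vars(get_args())
    env = NormalizedActions(gym.make(cfg['env_name']))  # wrap so the tanh output maps to [-2, 2]
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    models = {'actor': Actor(n_states, n_actions, cfg['hidden_dim']),
              'critic': Critic(n_states, n_actions, cfg['hidden_dim'])}
    memories = {'memory': ReplayBuffer(cfg['memory_capacity'])}
    agent = DDPG(models, memories, cfg)
    train_res = train(cfg, env, agent)
    plot_rewards(train_res['rewards'], cfg)
    test_res = test(cfg, env, agent)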