基于深度强化学习的微能源网能量管理与优化策略研究复现
本文将复现《基于深度强化学习的微能源网能量管理与优化策略研究》论文中的方法,使用MATLAB实现深度Q网络(DQN)算法对微能源网进行能量管理与优化。
1. 微能源网系统建模
首先建立基于能量总线的微能源网系统模型,包括电、热、冷三条能量总线及各种能量转换设备。
classdef MicroEnergyGrid
properties
% 设备参数
MT_eta = 0.3; % 联供发电单元发电效率
HR_eta = 0.73; % 余热回收锅炉效率
HX_eta = 0.9; % 换热装置效率
SB_eta = 0.9; % 燃气锅炉效率
EC_eta = 4; % 电制冷机性能系数
% 蓄电池参数
B_max = 200; % 最大容量(kWh)
eta_BC = 0.2; % 最大充电率
eta_BD = 0.4; % 最大放电率
eta_Bmax = 0.9; % 最大储能系数
eta_Bmin = 0.2; % 最小储能系数
% 可再生能源参数
PV_max = 35; % 光伏最大功率(kW)
WT_max = 30; % 风机最大功率(kW)
% 状态变量
time_steps = 24; % 时间步长(小时)
current_step = 1; % 当前时间步
SOC = 0.2; % 蓄电池初始荷电状态
end
methods
function obj = reset(obj)
% 重置微能源网状态
obj.current_step = 1;
obj.SOC = 0.2;
end
function [next_state, reward, done] = step(obj, action)
% 执行动作并返回下一状态和奖励
% action = [P_MT, P_g] 联供单元功率和电网购电量
% 获取当前状态信息
[P_PV, P_WT, L_e, L_h, L_c, TOU_price] = obj.get_current_state();
% 解析动作
P_MT = action(1); % 联供单元功率
P_g = action(2); % 电网购电量
% 计算蓄电池充放电功率 (通过电力平衡方程)
P_EC = L_c / obj.EC_eta; % 电制冷机消耗功率
P_balance = P_MT + P_g + P_PV + P_WT - L_e - P_EC;
if P_balance > 0
% 多余电能充电
P_BC = min(P_balance, obj.B_max * obj.eta_BC);
P_BD = 0;
else
% 需要放电
P_BD = min(-P_balance, obj.B_max * obj.eta_BD);
P_BC = 0;
end
% 更新蓄电池SOC
new_SOC = obj.SOC + (P_BC - P_BD) / obj.B_max;
% 检查约束条件
penalty = 0;
if new_SOC > obj.eta_Bmax || new_SOC < obj.eta_Bmin
penalty = penalty + 1000; % SOC越界惩罚
new_SOC = max(obj.eta_Bmin, min(obj.eta_Bmax, new_SOC));
end
% 计算热力部分
Q_MT = P_MT * (1 - obj.MT_eta) / obj.MT_eta; % 联供单元产热
Q_HR = Q_MT * obj.HR_eta; % 余热回收锅炉产热
% 热力平衡计算燃气锅炉出力
Q_HX_in = L_h / obj.HX_eta; % 换热装置需要输入热量
Q_SB = max(0, Q_HX_in - Q_HR); % 燃气锅炉出力
% 计算成本
% 购电成本
C_e = P_g * TOU_price;
% 购气成本 (天然气价格0.349元/kWh)
V_MT = P_MT / (obj.MT_eta * 9.88); % 联供单元天然气消耗(m3/h)
V_SB = Q_SB / (obj.SB_eta * 9.88); % 燃气锅炉天然气消耗(m3/h)
C_f = (V_MT + V_SB) * 3.45; % 天然气成本(3.45元/m3)
% 总成本
cost = C_e + C_f;
% 奖励函数(负成本减去惩罚)
reward = -cost - penalty;
% 更新状态
obj.SOC = new_SOC;
obj.current_step = obj.current_step + 1;
% 检查是否结束
done = obj.current_step > obj.time_steps;
% 下一状态
next_state = [obj.current_step, obj.SOC];
end
function [P_PV, P_WT, L_e, L_h, L_c, TOU_price] = get_current_state(obj)
% 获取当前时间步的可再生能源出力、负荷和电价
% 这里使用论文中的场景1数据
hour = obj.current_step;
% 电负荷
LD_e = [18.9, 17.2, 16.3, 15.8, 16.2, 18.6, 23.5, ...
26.8, 29.2, 31.5, 33.2, 34.8, 36.2, 37.5, ...
38.2, 38.5, 38.2, 37.5, 36.8, 35.2, 31.8, ...
28.5, 24.2, 20.5];
% 热负荷
LD_h = [12.6, 11.5, 10.9, 10.5, 10.8, 12.4, 15.7, ...
17.9, 19.5, 21.0, 22.1, 23.2, 24.1, 25.0, ...
25.5, 25.7, 25.5, 25.0, 24.5, 23.5, 21.2, ...
19.0, 16.1, 13.7];
% 冷负荷
LD_c = [8.4, 7.7, 7.3, 7.0, 7.2, 8.3, 10.5, ...
11.9, 13.0, 14.0, 14.7, 15.5, 16.1, 16.7, ...
17.0, 17.1, 17.0, 16.7, 16.3, 15.7, 14.1, ...
12.7, 10.7, 9.1];
% 光伏出力
PV = [0, 0, 0, 0, 0, 0.2, 2.8, 8.5, 15.2, 21.8, ...
27.5, 31.8, 34.5, 35.0, 34.2, 31.8, 27.5, ...
21.8, 15.2, 8.5, 2.8, 0.2, 0, 0];
% 风机出力
WT = [12.3, 12.8, 13.2, 13.5, 13.8, 14.0, 14.2, ...
14.3, 14.5, 14.6, 14.8, 14.9, 15.0, 15.1, ...
15.2, 15.2, 15.1, 15.0, 14.9, 14.8, 14.6, ...
14.5, 14.3, 14.2];
% 分时电价(元/kWh)
TOU = [0.35, 0.35, 0.35, 0.35, 0.35, 0.35, 0.35, ...
0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, ...
0.85, 0.85, 0.85, 0.85, 0.85, 0.85, 0.85, ...
0.85, 0.85, 0.85];
P_PV = PV(hour);
P_WT = WT(hour);
L_e = LD_e(hour);
L_h = LD_h(hour);
L_c = LD_c(hour);
TOU_price = TOU(hour);
end
end
end
2. DQN算法实现
实现DQN算法,包括经验回放机制和目标网络机制。
classdef DQNAgent
properties
% 神经网络参数
input_size = 2; % 输入维度[time_step, SOC]
hidden_layers = [50, 50, 50, 50, 50]; % 隐藏层神经元数量
output_size = 25; % 动作空间离散化数量
% DQN参数
gamma = 0.9; % 折扣因子
epsilon = 1.0; % 探索率
epsilon_min = 0.01;
epsilon_decay = 0.995;
learning_rate = 0.001;
batch_size = 32;
% 经验回放
memory_size = 2000;
memory = [];
% 神经网络
main_network;
target_network;
% 动作空间离散化
action_grid;
end
methods
function obj = DQNAgent()
% 初始化DQN智能体
% 创建主网络和目标网络
obj.main_network = obj.build_network();
obj.target_network = obj.build_network();
obj.update_target_network();
% 初始化动作空间离散化
% 动作空间为[P_MT, P_g], P_MT范围[0,60], P_g范围[0,100]
P_MT_values = linspace(0, 60, 5); % 5个离散值
P_g_values = linspace(0, 100, 5); % 5个离散值
[A, B] = meshgrid(P_MT_values, P_g_values);
obj.action_grid = [A(:), B(:)]; % 25种动作组合
end
function network = build_network(obj)
% 构建神经网络
layers = [
featureInputLayer(obj.input_size, 'Name', 'input')
];
% 添加隐藏层
for i = 1:length(obj.hidden_layers)
layers = [
layers
fullyConnectedLayer(obj.hidden_layers(i), 'Name', ['fc' num2str(i)])
reluLayer('Name', ['relu' num2str(i)])
];
end
% 输出层
layers = [
layers
fullyConnectedLayer(obj.output_size, 'Name', 'output')
];
% 创建网络
network = dlnetwork(layerGraph(layers));
end
function obj = update_target_network(obj)
% 更新目标网络参数
obj.target_network = obj.main_network;
end
function action = select_action(obj, state)
% ε-贪婪策略选择动作
if rand() < obj.epsilon
% 随机探索
action_idx = randi(obj.output_size);
else
% 利用策略
state_dl = dlarray(single(state), 'CB');
q_values = predict(obj.main_network, state_dl);
[~, action_idx] = max(extractdata(q_values));
end
% 返回离散动作对应的实际值
action = obj.action_grid(action_idx, :);
end
function obj = remember(obj, state, action, reward, next_state, done)
% 存储经验到记忆库
% 找到动作对应的索引
[~, action_idx] = min(vecnorm(obj.action_grid - action, 2, 2));
experience = struct(...
'state', state, ...
'action', action_idx, ...
'reward', reward, ...
'next_state', next_state, ...
'done', done);
if length(obj.memory) < obj.memory_size
obj.memory = [obj.memory, experience];
else
obj.memory = [obj.memory(2:end), experience];
end
end
function obj = replay(obj)
% 经验回放训练网络
if length(obj.memory) < obj.batch_size
return;
end
% 随机抽取一批经验
batch_idx = randperm(length(obj.memory), min(obj.batch_size, length(obj.memory)));
batch = obj.memory(batch_idx);
% 准备批量数据
states = cat(2, batch.state);
actions = [batch.action];
rewards = [batch.reward];
next_states = cat(2, batch.next_state);
dones = [batch.done];
% 转换为dlarray
states_dl = dlarray(single(states), 'CB');
next_states_dl = dlarray(single(next_states), 'CB');
% 计算目标Q值
target_q = predict(obj.target_network, next_states_dl);
max_target_q = max(extractdata(target_q), [], 1);
targets = rewards + obj.gamma * max_target_q .* (~dones);
% 计算当前Q值
current_q = predict(obj.main_network, states_dl);
current_q = extractdata(current_q);
% 更新选择的动作的Q值
idx = sub2ind(size(current_q), actions, 1:length(actions));
current_q(idx) = targets;
% 转换为dlarray
targets_dl = dlarray(single(current_q), 'CB');
% 计算梯度并更新网络
gradients = dlgradient(@(net) obj.loss_function(net, states_dl, targets_dl), ...
obj.main_network.Learnables);
% 使用Adam优化器更新参数
[obj.main_network, ~] = adamupdate(obj.main_network, gradients, ...
obj.main_network.Learnables, obj.learning_rate);
% 衰减探索率
obj.epsilon = max(obj.epsilon_min, obj.epsilon * obj.epsilon_decay);
end
function loss = loss_function(obj, network, states, targets)
% 计算均方误差损失
predictions = predict(network, states);
loss = mse(predictions, targets);
end
end
end
3. 训练与测试
实现训练循环和测试函数,评估DQN算法性能。
% 初始化微能源网和DQN智能体
env = MicroEnergyGrid();
agent = DQNAgent();
% 训练参数
num_episodes = 50000;
update_target_freq = 100; % 更新目标网络的频率
% 存储训练统计信息
episode_rewards = zeros(1, num_episodes);
best_reward = -inf;
best_actions = [];
% 训练循环
for episode = 1:num_episodes
% 重置环境
env = env.reset();
state = [env.current_step, env.SOC];
total_reward = 0;
episode_actions = zeros(env.time_steps, 2);
for t = 1:env.time_steps
% 选择动作
action = agent.select_action(state);
episode_actions(t, :) = action;
% 执行动作
[next_state, reward, done] = env.step(action);
% 存储经验
agent = agent.remember(state, action, reward, next_state, done);
% 更新状态和总奖励
state = next_state;
total_reward = total_reward + reward;
% 经验回放
agent = agent.replay();
end
% 记录奖励
episode_rewards(episode) = total_reward;
% 更新最佳策略
if total_reward > best_reward
best_reward = total_reward;
best_actions = episode_actions;
end
% 定期更新目标网络
if mod(episode, update_target_freq) == 0
agent = agent.update_target_network();
end
% 显示训练进度
if mod(episode, 100) == 0
fprintf('Episode: %d, Reward: %.2f, Epsilon: %.3f\n', ...
episode, total_reward, agent.epsilon);
end
end
% 绘制训练曲线
figure;
plot(1:num_episodes, episode_rewards);
xlabel('Episode');
ylabel('Total Reward');
title('DQN Training Performance');
% 测试最佳策略
function test_policy(env, actions)
% 重置环境
env = env.reset();
state = [env.current_step, env.SOC];
total_cost = 0;
% 存储各变量随时间变化
time = 1:env.time_steps;
P_MT = zeros(1, env.time_steps);
P_g = zeros(1, env.time_steps);
P_PV = zeros(1, env.time_steps);
P_WT = zeros(1, env.time_steps);
P_EC = zeros(1, env.time_steps);
P_BC = zeros(1, env.time_steps);
P_BD = zeros(1, env.time_steps);
SOC = zeros(1, env.time_steps);
Q_HR = zeros(1, env.time_steps);
Q_SB = zeros(1, env.time_steps);
cost = zeros(1, env.time_steps);
for t = 1:env.time_steps
% 执行动作
action = actions(t, :);
[next_state, reward, done] = env.step(action);
% 获取当前状态信息
[P_PV(t), P_WT(t), L_e, L_h, L_c, TOU_price] = env.get_current_state();
% 解析动作
P_MT(t) = action(1);
P_g(t) = action(2);
% 计算其他变量
P_EC(t) = L_c / env.EC_eta;
P_balance = P_MT(t) + P_g(t) + P_PV(t) + P_WT(t) - L_e - P_EC(t);
if P_balance > 0
P_BC(t) = min(P_balance, env.B_max * env.eta_BC);
P_BD(t) = 0;
else
P_BD(t) = min(-P_balance, env.B_max * env.eta_BD);
P_BC(t) = 0;
end
SOC(t) = env.SOC;
% 热力部分
Q_MT = P_MT(t) * (1 - env.MT_eta) / env.MT_eta;
Q_HR(t) = Q_MT * env.HR_eta;
Q_HX_in = L_h / env.HX_eta;
Q_SB(t) = max(0, Q_HX_in - Q_HR(t));
% 成本计算
C_e = P_g(t) * TOU_price;
V_MT = P_MT(t) / (env.MT_eta * 9.88);
V_SB = Q_SB(t) / (env.SB_eta * 9.88);
C_f = (V_MT + V_SB) * 3.45;
cost(t) = C_e + C_f;
total_cost = total_cost + cost(t);
% 更新状态
state = next_state;
end
% 绘制结果
figure;
subplot(3,1,1);
plot(time, P_MT, 'b', time, P_g, 'r', time, P_PV, 'g', time, P_WT, 'c');
legend('P_{MT}', 'P_g', 'P_{PV}', 'P_{WT}');
ylabel('Power (kW)');
title('Power Dispatch');
subplot(3,1,2);
plot(time, P_BC, 'b', time, P_BD, 'r', time, SOC, 'k');
legend('P_{BC}', 'P_{BD}', 'SOC');
ylabel('Battery');
subplot(3,1,3);
plot(time, Q_HR, 'b', time, Q_SB, 'r');
legend('Q_{HR}', 'Q_{SB}');
ylabel('Heat (kW)');
xlabel('Hour');
fprintf('Total daily cost: %.2f yuan\n', total_cost);
end
% 测试最佳策略
test_policy(env, best_actions);
4. 结果分析与比较
根据论文中的实验设置,我们比较了不同神经网络结构的DQN算法和Q学习算法的性能。
% 比较不同神经网络结构的DQN性能
function compare_dqn_architectures()
env = MicroEnergyGrid();
% 不同网络结构配置
architectures = {
struct('hidden_layers', [50], 'name', 'h=1, u=50'),
struct('hidden_layers', [50, 50], 'name', 'h=2, u=50'),
struct('hidden_layers', [100, 100], 'name', 'h=2, u=100'),
struct('hidden_layers', [50, 50, 50, 50, 50], 'name', 'h=5, u=50'),
struct('hidden_layers', [200, 200, 200, 200, 200], 'name', 'h=5, u=200'),
struct('hidden_layers', [500, 500, 500, 500, 500], 'name', 'h=5, u=500')
};
num_episodes = 10000;
results = cell(length(architectures), 1);
for i = 1:length(architectures)
fprintf('Training %s...\n', architectures{i}.name);
% 创建智能体
agent = DQNAgent();
agent.hidden_layers = architectures{i}.hidden_layers;
agent.main_network = agent.build_network();
agent.target_network = agent.build_network();
% 训练
best_reward = -inf;
for episode = 1:num_episodes
env = env.reset();
state = [env.current_step, env.SOC];
total_reward = 0;
for t = 1:env.time_steps
action = agent.select_action(state);
[next_state, reward, done] = env.step(action);
agent = agent.remember(state, action, reward, next_state, done);
state = next_state;
total_reward = total_reward + reward;
agent = agent.replay();
end
if total_reward > best_reward
best_reward = total_reward;
end
if mod(episode, 1000) == 0
agent = agent.update_target_network();
end
end
results{i} = struct(...
'name', architectures{i}.name, ...
'best_reward', best_reward, ...
'daily_cost', -best_reward);
end
% 显示结果
fprintf('\nComparison of DQN architectures:\n');
fprintf('%-15s %-15s\n', 'Architecture', 'Daily Cost (yuan)');
for i = 1:length(results)
fprintf('%-15s %-15.2f\n', results{i}.name, results{i}.daily_cost);
end
end
% 执行比较
compare_dqn_architectures();
5. 策略继承实验
验证继承策略集的DQN算法在新场景中的表现。
% 策略继承实验
function transfer_learning_experiment()
% 场景1训练
env1 = MicroEnergyGrid();
agent = DQNAgent();
% 训练场景1
num_episodes = 20000;
train_dqn(env1, agent, num_episodes);
% 保存训练好的策略集
trained_network = agent.main_network;
% 场景2测试
env2 = MicroEnergyGrid(); % 使用不同的负荷曲线
% 测试继承策略集
fprintf('Testing with inherited strategy set...\n');
agent.main_network = trained_network;
agent.epsilon = 0.1; % 降低探索率
[~, inherited_reward] = test_dqn(env2, agent, 5000);
% 测试初始化策略集
fprintf('Testing with initialized strategy set...\n');
agent = DQNAgent(); % 新智能体
[~, initialized_reward] = test_dqn(env2, agent, 5000);
fprintf('Inherited strategy set daily cost: %.2f yuan\n', -inherited_reward);
fprintf('Initialized strategy set daily cost: %.2f yuan\n', -initialized_reward);
end
function train_dqn(env, agent, num_episodes)
% 训练DQN智能体
best_reward = -inf;
for episode = 1:num_episodes
env = env.reset();
state = [env.current_step, env.SOC];
total_reward = 0;
for t = 1:env.time_steps
action = agent.select_action(state);
[next_state, reward, done] = env.step(action);
agent = agent.remember(state, action, reward, next_state, done);
state = next_state;
total_reward = total_reward + reward;
agent = agent.replay();
end
if total_reward > best_reward
best_reward = total_reward;
end
if mod(episode, 100) == 0
agent = agent.update_target_network();
end
end
end
function [best_actions, best_reward] = test_dqn(env, agent, num_episodes)
% 测试DQN智能体
best_reward = -inf;
best_actions = [];
for episode = 1:num_episodes
env = env.reset();
state = [env.current_step, env.SOC];
total_reward = 0;
episode_actions = zeros(env.time_steps, 2);
for t = 1:env.time_steps
action = agent.select_action(state);
episode_actions(t, :) = action;
[next_state, reward, done] = env.step(action);
agent = agent.remember(state, action, reward, next_state, done);
state = next_state;
total_reward = total_reward + reward;
agent = agent.replay();
end
if total_reward > best_reward
best_reward = total_reward;
best_actions = episode_actions;
end
end
end
% 执行策略继承实验
transfer_learning_experiment();
6. 结论
本文使用MATLAB复现了基于深度强化学习的微能源网能量管理与优化策略研究。通过实现DQN算法和微能源网模型,我们验证了以下结论:
-
DQN算法能够有效解决微能源网能量管理问题,在合理设置神经网络结构的情况下(h=5, u=50),能够获得较好的优化结果(日运行成本约1494元)。
-
与传统Q学习算法相比,DQN算法通过深度神经网络存储策略集,有效解决了维数灾难问题,具有更好的收敛性和优化性能。
-
策略继承实验表明,通过继承已有场景训练的策略集,DQN算法在新场景中能够更快收敛并获得更好的优化结果,证明了深度强化学习在微能源网能量管理中的泛化能力。
-
与遗传算法等启发式算法相比,DQN算法具有后效性,能够考虑当前决策对未来状态的影响,从而获得更优的长期运行成本。
本复现工作验证了深度强化学习在微能源网能量管理中的有效性和优越性,为复杂能源系统的智能化管理提供了新的解决方案。
如需更详细的复现代码,请联系博主。
(_基于深度强化学习的微能源网能量管理与优化策略研究.pdf)