Глубокая оптимизация Q-Learning — Python

Программы на Python
Ответить
Anonymous
 Глубокая оптимизация Q-Learning

Сообщение Anonymous »

Я работаю над диссертацией, в которой мне нужно разработать агент DQN для определения оптимального энергетического плана на следующий день. Цель состоит в том, чтобы сбалансировать максимальное время автономной работы, оптимизацию использования энергии и обеспечение комфорта пользователя за счет использования исторических данных о потреблении энергии, производстве и погоде. Однако, поскольку я относительно новичок в глубоком обучении, текущая политика, изученная агентом, не кажется оптимальной. Буду очень признателен за любые советы, как улучшить результаты и повысить производительность агента.
Вот код:

agent.py
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque

class DQN(nn.Module):
    """Small MLP mapping a state vector to one Q-value per action.

    Architecture: state_size -> 128 -> 128 -> action_size, with ReLU on
    the two hidden layers and a linear output layer.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        self.fc1 = nn.Linear(state_size, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, action_size)

    def forward(self, x):
        # Two hidden layers, then raw (unactivated) Q-values.
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

class DQNAgent:
    """Deep Q-Network agent with experience replay and a target network.

    NOTE(review): in the original paste every span between ``<`` and ``>``
    was stripped (HTML-tag sanitisation), which destroyed the body of
    ``act`` and all of ``replay`` except its epsilon-decay tail. Both are
    reconstructed below following the standard DQN recipe — confirm against
    the original source.
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size

        # Hyperparameters
        self.gamma = 0.95           # discount factor
        self.epsilon = 1.0          # initial exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.999
        self.lr = 0.001
        self.batch_size = 32
        self.memory = deque(maxlen=2000)  # replay buffer

        # Online and target Q-networks
        self.model = DQN(state_size, action_size)
        self.target_model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        self.update_target_model()

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

    def remember(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Epsilon-greedy action selection (reconstructed — see class note)."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state_t = torch.as_tensor(np.asarray(state, dtype=np.float32)).unsqueeze(0)
        with torch.no_grad():
            q_values = self.model(state_t)
        return int(torch.argmax(q_values, dim=1).item())

    def replay(self):
        """One optimisation step on a sampled minibatch (reconstructed).

        Only the final epsilon-decay lines survived the paste; the rest is
        the standard DQN update: TD target from the target network, MSE loss
        on the chosen actions' Q-values.
        """
        if len(self.memory) < self.batch_size:
            return

        minibatch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        states = torch.as_tensor(np.asarray(states, dtype=np.float32))
        actions = torch.as_tensor(actions, dtype=torch.int64).unsqueeze(1)
        rewards = torch.as_tensor(rewards, dtype=torch.float32)
        next_states = torch.as_tensor(np.asarray(next_states, dtype=np.float32))
        dones = torch.as_tensor(dones, dtype=torch.float32)

        # Q(s, a) for the actions actually taken.
        q_pred = self.model(states).gather(1, actions).squeeze(1)
        # Bootstrapped target from the (frozen) target network.
        with torch.no_grad():
            q_next = self.target_model(next_states).max(dim=1).values
        q_target = rewards + self.gamma * q_next * (1.0 - dones)

        loss = F.mse_loss(q_pred, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Surviving tail of the original replay(): decay exploration rate.
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def train_dqn(self, env, episodes, train_days=3, target_update_freq=5000):
        """Train over *episodes*, each covering days 1..train_days of *env*.

        The target network is refreshed every *target_update_freq* env steps;
        one replay/optimisation pass runs at the end of each episode.
        """
        step_count = 0

        for e in range(episodes):
            total_reward = 0

            for day in range(1, train_days + 1):
                state = env.reset(day=day)
                done = False
                day_reward = 0

                while not done:
                    action = self.act(state)
                    next_state, reward, done = env.step(action, day=day)

                    # env.step returns None for the terminal observation.
                    if next_state is not None:
                        self.remember(state, action, reward, next_state, done)
                        state = next_state

                    day_reward += reward
                    step_count += 1

                    if step_count % target_update_freq == 0:
                        self.update_target_model()

                total_reward += day_reward

            self.replay()

            print(f"Episode {e + 1}/{episodes}, Total Reward: {total_reward:.2f}, Epsilon: {self.epsilon:.4f}")

    def test_dqn(self, env, test_day):
        """Greedy (epsilon=0) rollout on *test_day*, printing each action."""
        self.epsilon = 0.0
        state = env.reset(day=test_day, testing_mode=True)
        done = False
        total_reward = 0

        while not done:
            action = self.act(state)
            print(f"Time Step {env.time_step}, Action: {env.get_action_name(action)}")

            # Perform the step
            next_state, reward, done = env.step(action, day=test_day)
            if next_state is not None:
                state = next_state

            total_reward += reward

        print(f"Total Reward for Day {test_day}: {total_reward:.2f}")


reward_function.py

class RewardFunction:
    """Shaped reward for the day-ahead energy-planning environment.

    Combines battery-management incentives (charge/discharge/idle),
    appliance-scheduling comfort terms, and a production/consumption
    balance term into a single scalar per step.

    NOTE(review): the original paste lost every span between ``<`` and
    ``>`` (HTML-tag stripping). The affected comparisons, the entire
    ``_idle_battery`` method, and the prologue of
    ``_manage_appliance_with_running_time`` are reconstructed below and
    individually flagged — verify them against the original source.
    """

    # Reward/penalty magnitudes keyed by severity.
    REWARD = {
        "Low": 0.5,
        "Medium": 0.75,
        "High": 1.0
    }
    PENALTY = {
        "Low": -0.5,
        "Medium": -0.75,
        "High": -1.0
    }

    # Tolerance below which production and consumption count as balanced
    # (reconstructed constant — TODO confirm original value).
    BALANCE_TOLERANCE = 0.01

    def __init__(self, battery_capacity, appliance_preferences):
        # Maximum energy the battery can hold.
        self.battery_capacity = battery_capacity
        # Per-appliance settings: comfort_level, time_intervals,
        # mandatory_usage, running_time (see _manage_appliance_with_running_time).
        self.appliance_preferences = appliance_preferences
        self.max_daily_activations = 2

        # Per-day counters, cleared by reset_daily_counters().
        self.active_appliances = {}
        self.appliance_activation_count = {}
        self.delayed_appliance_penalties = {}
        self.battery_idle_count = 0

        self.charge_count = 0
        self.discharge_count = 0

        self.running_time_tracker = {}

    def reset_daily_counters(self):
        """Clear all per-day counters; called at hour 0 of each day."""
        self.appliance_activation_count = {}
        self.active_appliances = {}
        self.delayed_appliance_penalties = {}
        self.battery_idle_count = 0
        self.charge_count = 0
        self.discharge_count = 0
        self.running_time_tracker = {}

    def calculate_reward(self, action, current_data, hour, battery_charge):
        """Return ``(reward, new_battery_charge)`` for taking *action*.

        Parameters:
            action: 0 = charge, 1 = discharge, 2 = idle; values >= 3 are
                appliance actions (odd = activate, even = delay).
            current_data: dict with 'production' (float) and 'consumption'
                (list of per-appliance floats) for this hour.
            hour: hour of day (0-23); all daily counters reset at hour 0.
            battery_charge: current battery state of charge.
        """
        production = current_data['production']
        consumption = current_data['consumption']
        reward = 0

        if hour == 0:
            self.reset_daily_counters()

        # Battery Management Actions
        if action == 0:    # Charge battery
            reward, battery_charge = self._charge_battery(production, consumption, battery_charge, reward)
        elif action == 1:  # Discharge battery
            reward, battery_charge = self._discharge_battery(battery_charge, production, consumption, reward)
        elif action == 2:  # Idle battery
            reward = self._idle_battery(reward, production, consumption)

        # Appliance Actions
        if action >= 3:
            reward = self._manage_appliance_with_running_time(action, consumption, production, hour, battery_charge, reward)

        # Energy Balancing term applied on every step
        reward = self._balance_energy(production, consumption, reward)

        return reward, battery_charge

    def _charge_battery(self, production, consumption, battery_charge, reward):
        """Reward charging from surplus production; penalise repeated charging."""
        total_consumption = sum(consumption)
        net_production = production - total_consumption  # Excess energy

        # Escalating penalty for every charge action taken today.
        self.charge_count += 1
        reward += self.PENALTY["Medium"] * self.charge_count

        if battery_charge >= self.battery_capacity:
            reward += self.PENALTY['Medium']  # battery already full
        elif net_production > 0 and battery_charge < self.battery_capacity:
            charge_amount = min(net_production, self.battery_capacity - battery_charge)
            battery_charge += charge_amount
            reward += self.REWARD["High"]

        return reward, battery_charge

    def _discharge_battery(self, battery_charge, production, consumption, reward):
        """Reward covering a deficit from the battery; penalise overuse."""
        net_demand = sum(consumption) - production  # Energy deficit

        self.discharge_count += 1
        reward += self.PENALTY["Medium"] * self.discharge_count

        if net_demand > 0 and battery_charge > 0:
            # NOTE(review): discharge_count is incremented a second time here
            # in the original — kept as-is, but it looks unintended.
            self.discharge_count += 1
            discharge_amount = min(net_demand, battery_charge)
            battery_charge -= discharge_amount
            reward += self.REWARD["High"]
        elif battery_charge <= 0:
            # Reconstructed branch (comparison stripped in the paste):
            # penalise discharging an empty battery — TODO confirm.
            reward += self.PENALTY["Medium"]

        return reward, battery_charge

    def _idle_battery(self, reward, production, consumption):
        """Reconstructed (method body lost in the paste).

        Assumed intent: idling is mildly penalised when surplus production
        is being wasted and mildly rewarded otherwise — TODO confirm
        against the original source.
        """
        self.battery_idle_count += 1
        if production - sum(consumption) > 0:
            reward += self.PENALTY["Low"]  # surplus wasted while idle
        else:
            reward += self.REWARD["Low"]   # nothing to store or discharge
        return reward

    def _manage_appliance_with_running_time(self, action, consumption, production, hour, battery_charge, reward):
        """Reward/penalise activating or delaying the appliance for *action*.

        Actions from 3 upward come in pairs per appliance (odd = activate,
        even = delay, per the ``action % 2`` branches). The prologue mapping
        *action* onto an appliance index was lost in the paste and is
        reconstructed here — TODO confirm the exact encoding.
        """
        appliance_list = list(self.appliance_preferences.keys())
        appliance_index = (action - 3) // 2
        if appliance_index >= len(appliance_list):
            return reward

        appliance_name = appliance_list[appliance_index]
        appliance = self.appliance_preferences[appliance_name]
        comfort_level = appliance.get("comfort_level", 1)
        appliance_consumption = consumption[appliance_index]
        preferred_times = appliance.get("time_intervals", [])
        mandatory = appliance.get('mandatory_usage', False)
        running_time = appliance.get('running_time', 1)

        if action % 2 != 0:  # Activate appliance
            if appliance_name in self.running_time_tracker:
                last_activated_hour = self.running_time_tracker[appliance_name]
                if hour < last_activated_hour + running_time:
                    # Penalize reactivation before running time ends
                    reward += self.PENALTY["High"] * comfort_level * self.appliance_activation_count[appliance_name]

            self.running_time_tracker[appliance_name] = hour
            self.appliance_activation_count.setdefault(appliance_name, 0)
            self.appliance_activation_count[appliance_name] += 1

            if self.appliance_activation_count[appliance_name] > self.max_daily_activations:
                reward += self.PENALTY["High"] * comfort_level * self.appliance_activation_count[appliance_name]

            if self._is_in_preferred_time(hour, preferred_times):
                reward += self.REWARD["High"] * comfort_level if mandatory else self.REWARD["Medium"]
            else:
                reward += self.PENALTY["Medium"] * comfort_level if mandatory else self.PENALTY['High'] * comfort_level

            # Pay for the appliance from the battery if possible.
            if battery_charge >= appliance_consumption:
                battery_charge -= appliance_consumption
                reward += self.REWARD["High"]
            else:
                reward += self.PENALTY["Medium"]

        elif action % 2 == 0:  # Delay appliance
            self.delayed_appliance_penalties[appliance_name] = self.delayed_appliance_penalties.get(
                appliance_name, 0) + 1

            if self.delayed_appliance_penalties[appliance_name] > self.max_daily_activations:
                penalty = self.PENALTY["High"] * comfort_level * self.delayed_appliance_penalties[appliance_name]
                reward += penalty * 2 if mandatory else penalty

            if production == 0 and battery_charge == 0:
                # Delaying is sensible when no energy is available at all.
                reward += self.REWARD["Low"]
                self.delayed_appliance_penalties[appliance_name] = 0

        return reward

    def _balance_energy(self, production, consumption, reward):
        """Balance term comparing production against total consumption.

        Comparisons were lost in the paste; reconstructed as: near-zero
        difference rewarded, surplus mildly penalised ("unused production"),
        deficit rewarded — consistent with the surviving original comments.
        TODO confirm thresholds.
        """
        energy_diff = production - sum(consumption)
        if abs(energy_diff) < self.BALANCE_TOLERANCE:
            reward += self.REWARD["Medium"]  # production matches consumption
        elif energy_diff > 0:
            reward += self.PENALTY["Low"]  # Penalty for unused production
        else:
            reward += self.REWARD["High"]  # Reward for managing deficits
        return reward

    def _is_in_preferred_time(self, hour, preferred_times):
        """True if *hour* falls inside any (start, end) interval
        (reconstructed; operators were stripped in the paste)."""
        return any(start <= hour <= end for start, end in preferred_times)
done = True

next_state = self._get_state(day) if not done else None
return next_state, reward, done

def _get_state(self, day, data=False):
    """Build the observation for the current time step of *day*.

    In testing mode the averaged production/consumption profiles are used;
    otherwise the recorded per-day profiles (defaulting to zeros for a
    missing day/appliance). With ``data=True`` a labelled dict is returned
    instead of the flat numpy state vector.
    """
    step = self.time_step
    weather = self.weather_forecast[day][step]

    if self.testing_mode:
        production = self.avg_production[step]
        consumption = [self.avg_consumption[name][step] for name in self.appliance_names]
    else:
        production = self.hourly_productions.get(day, [0] * 24)[step]
        day_consumption = self.appliance_consumptions.get(day, {})
        consumption = [
            day_consumption.get(name, [0] * 24)[step]
            for name in self.appliance_names
        ]

    if data:
        # Labelled view used for reward calculation / inspection.
        return {
            "solar_radiation": weather["solarRadiation"],
            "temperature": weather["temperature"],
            "humidity": weather["humidity"],
            "production": production,
            "consumption": consumption
        }

    # Flat state vector fed to the Q-network.
    return np.array([
        step,
        self.battery_charge,
        self.battery_capacity,
        weather["solarRadiation"],
        weather["temperature"],
        weather["humidity"],
        production,
        *consumption
    ])

def get_action_name(self, action_index):
    """Human-readable label for *action_index*; 'Unknown Action' if unmapped."""
    if action_index in self.actions:
        return self.actions[action_index]
    return "Unknown Action"



Подробнее здесь: https://stackoverflow.com/questions/792 ... timization
Ответить

Быстрый ответ

Изменение регистра текста: 
Смайлики
:) :( :oops: :roll: :wink: :muza: :clever: :sorry: :angel: :read: *x)
Ещё смайлики…
   
К этому ответу прикреплено по крайней мере одно вложение.

Если вы не хотите добавлять вложения, оставьте поля пустыми.

Максимально разрешённый размер вложения: 15 МБ.

Вернуться в «Python»