I'm working on training a RecurrentPPO (RPPO) agent to handle a temperature control system. Here is a snippet of the code.
import gc
import logging
import os
import random
from collections import deque

import gym
import numpy as np
import psutil
from gym import spaces
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

#PredNextState and ComputeZoneReward are helper functions defined elsewhere in the project (not shown here)

class TempControlSeqEnv(gym.Env):
    def __init__(self, curriculum_phase, time_steps=5):
        super(TempControlSeqEnv, self).__init__()
        self.time_steps = time_steps
        self.num_sensors = 6
        self.stage = 0
        self.stage_3_init = []
        self.stage_1_counter = 0
        self.stage_2_counter = 0
        self.stage_3_counter = 0
        self.momentum = np.array([0.0, 0.0], dtype=np.float64)
        self.curriculum_phase = curriculum_phase
        self.X_data = []
        self.X_actions = []
        #step
        self.aux_action = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        #Observation is a sequence: shape (time_steps, 6 TCs + stage)
        self.observation_space = spaces.Dict({
            "obs": spaces.Box(
                low=1200,
                high=1300,
                shape=(time_steps, self.num_sensors + 1),
                dtype=np.float32
            )
        })
        #Actions are +0, +1, +2, +3 for Z2 and Z5Z6
        self.action_space = spaces.MultiDiscrete([4, 4])
        self.max_steps = 24
        self.current_step = 0
        self.history = deque(maxlen=self.time_steps)
        self.state = None
    def reset(self):
        self.current_step = 0
        self.stage = 0
        self.stage_3_init = []
        self.stage_1_counter = 0
        self.stage_2_counter = 0
        self.stage_3_counter = 0
        self.history.clear()
        #Initialize history with random readings
        reading = self._generate_temp_reading()
        self.X_data = [reading.tolist()]
        self.X_actions = []
        for _ in range(self.time_steps):
            self.history.append(np.concatenate([reading, [self.stage]], axis=0))
        self.state = np.array(self.history)
        return {"obs": self.state}

    def get_stage(self):
        return self.stage
    def step(self, action):
        #force no action during phase change
        if self.stage == 1:
            action = np.array([0, 0], dtype=np.int32)
        obs, reward, done, info = self._true_step(action)
        return obs, reward, done, info

    def _true_step(self, action):
        self.current_step += 1
        #Decay the momentum carried over from previous steps
        self.momentum *= 0.4
        #State_finder
        action = np.asarray(action)
        self.aux_action[:] = [0.0, float(action[0]), 0.0, 0.0, float(action[1]), float(action[1])]
        #Append a copy so later in-place updates to aux_action don't overwrite past entries
        if len(self.X_actions) > 0:
            self.X_actions.append(list(self.aux_action))
        else:
            self.X_actions = [list(self.aux_action)]
        timestep_list = list(range(1, self.current_step + 1))
        nested_timestep_list = [[x] for x in timestep_list]
        pred = PredNextState(self.X_data, action, nested_timestep_list, self.momentum)
        new_x = np.add(self.X_data[-1], np.round(pred[-1], 1))
        new_x = np.clip(new_x, 1200, 1300)
        print("new x = ", new_x)
        logging.info(f"Predicted new x(next stage) = {new_x}")
        print("action = ", self.aux_action)
        logging.info(f"Current action = {self.aux_action}")
        self.X_data.append(new_x)
        self.history.append(np.concatenate([new_x, [self.stage]], axis=0))
        del timestep_list, nested_timestep_list, pred
        self.state = np.array(self.history)
        print("stage = ", self.stage)
        logging.info(f"Current stage = {self.stage}")
        print("momentum = ", self.momentum)
        logging.info(f"Current momentum = {self.momentum}")
        #Placeholder for reward function
        #Stage 0, pre phase change
        if self.curriculum_phase == "pretrain":
            if self.stage == 0:
                if self.stage_1_counter > 6:  #assume phase change 6 hours in, configurable
                    self.stage = 1
                else:
                    self.stage_1_counter += 1
                #Reward Z2
                init_Z2 = np.round(self.X_data[0][1])
                target_Z2_stage_1 = 1228
                target_Z2 = init_Z2 + self.current_step
                if target_Z2 > target_Z2_stage_1:
                    target_Z2 = target_Z2_stage_1
                reward_Z2 = ComputeZoneReward(new_x[1] + self.momentum[0], target_Z2, action[0])
                #Reward Z6
                init_Z6 = np.round(self.X_data[0][5])
                target_Z6_stage_1 = 1246
                target_Z6 = init_Z6 + self.current_step
                if target_Z6 > target_Z6_stage_1:
                    target_Z6 = target_Z6_stage_1
                reward_Z6 = ComputeZoneReward(new_x[5] + self.momentum[1], target_Z6, action[1])
                reward = reward_Z2 + reward_Z6
                print("Z2 reward: ", reward_Z2)
                print("Z6 reward: ", reward_Z6)
                logging.info(f"Z2 reward = {reward_Z2}")
                logging.info(f"Z6 reward = {reward_Z6}")
            #Stage 1, during phase change
            elif self.stage == 1:
                if self.stage_2_counter > 2:  #assumes exit phase change after 2 hours and temperature change < 0.5, configurable
                    if (self.X_data[-1][1] - self.X_data[-2][1]) < 0.5:
                        self.stage = 2
                        self.stage_3_init = self.X_data[-1]
                else:
                    self.stage_2_counter += 1
                #No change
                reward = -np.abs(action[0] + action[1])
                print("stage 1 no reward")
                logging.info("stage 1 no reward")
            elif self.stage == 2:
                reward = 0.0
                print("stage 2 during pretrain no reward")
                logging.info("stage 2 during pretrain no reward")
        elif self.curriculum_phase == "stage2_only":
            if self.stage != 2:
                reward = 0.0
                print("stage 0 or 1 after pretrain no reward")
                logging.info("stage 0 or 1 after pretrain no reward")
            #Stage 3, post phase change
            else:
                self.stage_3_counter += 1
                #Reward Z2
                init_Z2 = np.round(self.stage_3_init[1])
                target_Z2_stage_3 = 1239
                target_Z2 = init_Z2 + self.stage_3_counter
                if target_Z2 > target_Z2_stage_3:
                    target_Z2 = target_Z2_stage_3
                reward_Z2 = ComputeZoneReward(new_x[1] + self.momentum[0], target_Z2, action[0])
                #Reward Z6
                init_Z6 = np.round(self.stage_3_init[5])
                target_Z6_stage_3 = 1253
                target_Z6 = init_Z6 + self.stage_3_counter
                if target_Z6 > target_Z6_stage_3:
                    target_Z6 = target_Z6_stage_3
                reward_Z6 = ComputeZoneReward(new_x[5] + self.momentum[1], target_Z6, action[1])
                reward = reward_Z2 + reward_Z6
                print("Z2 reward: ", reward_Z2)
                print("Z6 reward: ", reward_Z6)
                logging.info(f"Z2 reward = {reward_Z2}")
                logging.info(f"Z6 reward = {reward_Z6}")
        print("reward = ", reward)
        logging.info(f"Total reward = {reward}")
        print(psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2, "MB")
        del new_x
        gc.collect()
        done = self.current_step >= self.max_steps
        info = {}
        #Accumulate momentum from this step's action (each action unit adds about 0.33)
        self.momentum += np.round(np.array(action) / 3, 2)
        return {"obs": self.state.astype(np.float32)}, reward, done, info
    #Placeholder for init temperature
    def _generate_temp_reading(self):
        init_temp = [[1216.3, 1218, 1234.6, 1241.8, 1239, 1245.4],
                     [1222.6, 1224.8, 1237.2, 1242.4, 1239.9, 1241.4],
                     [1216.2, 1224, 1240.2, 1242.5, 1239.3, 1243.7],
                     [1215.9, 1219.2, 1241.2, 1241.7, 1241, 1241.9],
                     [1219.5, 1222.4, 1236.5, 1241.1, 1240.4, 1242.8],
                     [1217.9, 1221.1, 1240, 1243.1, 1237.8, 1240.7],
                     [1222.6, 1224.8, 1237.2, 1242.4, 1239.9, 1241.4],
                     [1224.5, 1226.5, 1241.3, 1243.7, 1240.5, 1243.2],
                     [1221.5, 1223.5, 1242.4, 1244.4, 1239.6, 1241.1],
                     [1222.6, 1225.3, 1240.7, 1243.9, 1241.4, 1244.4],
                     [1216.2, 1224, 1240.2, 1242.5, 1239.3, 1243.7],
                     [1219.1, 1226.2, 1239.1, 1241.8, 1240.1, 1244.3],
                     [1218, 1224.7, 1239.4, 1242.6, 1239.3, 1243.5],
                     [1220.8, 1225.7, 1240.6, 1242.7, 1239, 1241.8],
                     [1221.4, 1226.5, 1237.7, 1241.7, 1240.5, 1244.3]]
        return np.array(random.choice(init_temp)).astype(np.float32)

    def set_curriculum_phase(self, phase: str):
        assert phase in ["pretrain", "stage2_only"]
        self.curriculum_phase = phase
        if phase == "pretrain":
            self.max_steps = 10
        else:
            self.max_steps = 24

    def seed(self, seed=None):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def render(self, mode='human'):
        print(f"Step {self.current_step}: {self.history[-1]}")
#Create vectorized environment
env = TempControlSeqEnv(curriculum_phase="pretrain")
vec_env = DummyVecEnv([lambda: env])
vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=True)
vec_env.envs[0].set_curriculum_phase("pretrain")

#Instantiate Recurrent PPO agents with LSTM policy
model_A = RecurrentPPO("MultiInputLstmPolicy", vec_env, verbose=1, tensorboard_log="./ppo_rnn_tb/")
for _ in range(10):
    model_A.learn(2000)
model_A.save("TC_PPO_v0.10_pretrain.zip")

vec_env.envs[0].set_curriculum_phase("stage2_only")
model_B = RecurrentPPO.load("TC_PPO_v0.10_pretrain.zip", vec_env, verbose=2)
for _ in range(10):
    model_B.learn(2000)
model_B.save("TC_PPO_v0.10_stage2.zip")
During training it became clear that the policy gradient loss has been sitting at essentially 0 from the very beginning and does not seem to contribute to learning, and explained_variance also stays close to 0. The value loss, however, decreases steadily. What is causing this problem?
------------------------------------------
| time/ | |
| fps | 0 |
| iterations | 2 |
| time_elapsed | 562 |
| total_timesteps | 256 |
| train/ | |
| approx_kl | 0.0015819944 |
| clip_fraction | 0 |
| clip_range | 0.2 |
| entropy_loss | -2.77 |
| explained_variance | 6.14e-06 |
| learning_rate | 0.0003 |
| loss | 470 |
| n_updates | 10 |
| policy_gradient_loss | -0.00765 |
| value_loss | 965 |
------------------------------------------
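For reference, below is a minimal sanity check (just a sketch, not part of the training run above) that steps the raw TempControlSeqEnv with random actions and prints the resulting rewards; it assumes PredNextState and ComputeZoneReward from my project are importable in the same scope. If the rewards barely vary within an episode, a near-zero policy gradient would be the expected outcome, since the advantages would collapse toward zero.

#Sanity check: step the unwrapped environment with random actions and inspect the rewards
check_env = TempControlSeqEnv(curriculum_phase="pretrain")
check_env.reset()
episode_rewards = []
for _ in range(check_env.max_steps):
    random_action = check_env.action_space.sample()
    _, r, done, _ = check_env.step(random_action)
    episode_rewards.append(r)
    if done:
        break
print("rewards:", episode_rewards)
print("reward std:", np.std(episode_rewards))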