Пространство действий модели обучения с подкреплением одномерное, но на этапе тестирования модель выдаёт двумерное действие.

Пространство действий модели обучения с подкреплением одномерное, но на этапе тестирования модель выдаёт двумерное действие. ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Пространство действий модели обучения с подкреплением одномерное, но на этапе тестирования модель выдаёт двумерное действие.

Цитата

Сообщение Anonymous » 10 янв 2025, 17:09

Я обучил модель PPO с пространством действий `self.action_space = gym.spaces.Box(-1, 1, (1,), data_type)` с помощью RLlib.
Но когда я использую обученную модель и вручную вызываю forward_inference, выходное действие оказывается 2-мерным тензором (как показано на рисунке ниже), что не соответствует одномерному пространству действий, использованному при обучении.
В чем проблема?
Вот рисунок отладки

Вот исходный код
import asyncio
import sys

import gymnasium as gym
import numpy as np
import torch
from ray.rllib.algorithms import Algorithm

from ray.rllib.algorithms.ppo import PPOConfig

# Single dtype shared by the environment's spaces and the observations it emits.
data_type = np.float32

# Passed as `checkpoint_dir` to `algo.save()` in `train()`. Replace None with a
# local folder (for example 'C:\\Users\\24761\\PycharmProjects\\MyFinRL\\checkpoints')
# to choose the location explicitly.
check_point_dir = None

# On Windows, switch asyncio to the selector-based event loop policy.
if sys.platform == "win32":
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

# Define your problem using python and Farama-Foundation's gymnasium API:
class SimpleCorridor(gym.Env):
    """Corridor in which an agent must learn to move right to reach the exit.

    ---------------------
    | S | 1 | 2 | 3 | G |   S=start; G=goal; corridor_length=5
    ---------------------

    The action is a single float in [-1, 1]: a value below -0.1 walks left
    (unless the agent is already at the start), a value above 0.1 walks right,
    and anything in between leaves the agent where it is.
    Observations are floats indicating the current field index, e.g. 0.0 for
    the starting position, 1.0 for the field next to the starting position, etc.
    Rewards are -0.1 for all steps, except when reaching the goal (+1.0).
    """

    def __init__(self, config):
        """Create the corridor.

        Args:
            config: Mapping providing "corridor_length", the field index at
                which the episode terminates.
        """
        self.end_pos = config["corridor_length"]
        self.cur_pos = 0.0
        # One continuous control value; its sign selects the walking direction.
        self.action_space = gym.spaces.Box(-1, 1, (1,), data_type)
        # Fixed, generous bounds that do not depend on the corridor length.
        self.observation_space = gym.spaces.Box(-10000, 10000, (1,), data_type)

    def reset(self, *, seed=None, options=None):
        """Resets the episode.

        Args:
            seed: Optional seed, forwarded to `gym.Env.reset` so the
                environment's random generator is seeded as the gymnasium API
                expects.
            options: Unused; accepted for API compatibility.

        Returns:
            Initial observation of the new episode and an info dict.
        """
        # Without this call a caller-supplied seed would be silently ignored.
        super().reset(seed=seed)
        self.cur_pos = 0.0
        # Return initial observation.
        return np.array([self.cur_pos], data_type), {}

    def step(self, action):
        """Takes a single step in the episode given `action`.

        Args:
            action: The control value; compared against -0.1 and 0.1 to decide
                whether to walk left, walk right or stay.

        Returns:
            New observation, reward, terminated-flag, truncated-flag, info-dict (empty).
        """
        print(action)
        # Walk left, but never past the start of the corridor.
        if action < -0.1 and self.cur_pos > 0:
            self.cur_pos -= 1
        # Walk right.
        elif action > 0.1:
            self.cur_pos += 1
        # Set `terminated` flag when end of corridor (goal) reached.
        terminated = self.cur_pos >= self.end_pos
        # This environment never truncates an episode on its own.
        truncated = False
        # +1.0 when the goal is reached, otherwise -0.1.
        reward = 1.0 if terminated else -0.1
        return np.array([self.cur_pos], data_type), reward, terminated, truncated, {}

def train(epoch: int = 20):
    """Train PPO on `SimpleCorridor`, save a checkpoint and return the module.

    Args:
        epoch: Number of `algo.train()` iterations to run.

    Returns:
        The object returned by `algo.get_module()`, fetched before the
        training loop starts.
    """
    # Corridor with 20 fields; this dict is handed to the env's constructor.
    env_settings = {"corridor_length": 20}

    # Assemble the PPO configuration step by step.
    config = PPOConfig()
    config = config.environment(SimpleCorridor, env_config=env_settings)
    # Collect rollouts with three parallel env runners.
    config = config.env_runners(num_env_runners=3)

    # Build the algorithm and grab its module.
    algo = config.build()
    rl_module = algo.get_module()

    # Walking straight from field 0 to field 20 takes 20 steps, so the best
    # achievable episode return is `19 * (-0.1) + 1.0 = -0.9`.
    for i in range(epoch):
        results = algo.train()
        print(f"Iter: {i}; avg. results={results['env_runners']}")

    # Persist the trained algorithm and report where it was written.
    path_to_checkpoint = algo.save(checkpoint_dir=check_point_dir).checkpoint.path
    message = (
        "An Algorithm checkpoint has been created inside directory: "
        f"'{path_to_checkpoint}'."
    )
    print(message)

    algo.stop()
    return rl_module

def test(check_point_path: str = None, check_point_module = None):
    """Play one greedy episode in `SimpleCorridor` with a trained module.

    Args:
        check_point_path: Path of an Algorithm checkpoint to restore. Only
            used when `check_point_module` is None.
        check_point_module: An already available module exposing
            `forward_inference`; takes precedence over `check_point_path`.

    Raises:
        ValueError: If neither a checkpoint path nor a module is given.
    """
    if check_point_path is None and check_point_module is None:
        raise ValueError("Please provide a checkpoint path")

    if check_point_module is not None:
        rl_module = check_point_module
    else:
        check_point = Algorithm.from_checkpoint(check_point_path)
        rl_module = check_point.get_module()

    # Evaluate on the same corridor length (20) that `train()` uses.
    env = SimpleCorridor({"corridor_length": 20})
    action_dim = env.action_space.shape[0]
    # Get the initial observation (should be: [0.0] for the starting position).
    obs, info = env.reset()
    terminated = truncated = False
    total_reward = 0.0

    # Play one episode.
    while not terminated and not truncated:
        # Run the module on a batch (B=1) holding the current observation.
        inference_result = rl_module.forward_inference(
            {"obs": torch.from_numpy(obs).unsqueeze(0)}
        )
        # "action_dist_inputs" is NOT an action: for a continuous Box space it
        # carries the Gaussian parameters, i.e. `action_dim` means followed by
        # `action_dim` log-stddevs, which is why it has twice as many entries
        # as the action space. Drop the batch dimension first.
        dist_params = inference_result["action_dist_inputs"].numpy()[0]
        # Greedy action = the mean part, clipped to the action-space bounds.
        action = np.clip(
            dist_params[:action_dim],
            env.action_space.low,
            env.action_space.high,
        )
        # Apply the computed action in the environment.
        obs, reward, terminated, truncated, info = env.step(action)
        # Sum up rewards for reporting purposes.
        total_reward += reward

    print(obs)
    # Report results.
    print(f"Played 1 episode; total-reward={total_reward}")

if __name__ == "__main__":
    # Train first, then evaluate the module that training returned.
    trained_module = train()
    test(check_point_module=trained_module)

Подробнее здесь: https://stackoverflow.com/questions/792 ... test-stage

1736518158

Anonymous

Я обучил модель PPO с пространством действий `self.action_space = gym.spaces.Box(-1, 1, (1,), data_type)` с помощью RLlib.
Но когда я использую обученную модель и вручную вызываю [b]forward_inference[/b], выходное действие оказывается [b]2-мерным тензором[/b] (как показано на рисунке ниже), что не соответствует одномерному пространству действий, использованному при обучении.
В чем проблема?
Вот рисунок отладки
[img]https://i.sstatic.net/TpXxcb9J.png[/img]

Вот исходный код
import asyncio
import sys

import gymnasium as gym
import numpy as np
import torch
from ray.rllib.algorithms import Algorithm

from ray.rllib.algorithms.ppo import PPOConfig

# Single dtype shared by the environment's spaces and the observations it emits.
data_type = np.float32

# Passed as `checkpoint_dir` to `algo.save()` in `train()`. Replace None with a
# local folder (for example 'C:\\Users\\24761\\PycharmProjects\\MyFinRL\\checkpoints')
# to choose the location explicitly.
check_point_dir = None

# On Windows, switch asyncio to the selector-based event loop policy.
if sys.platform == "win32":
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

# Define your problem using python and Farama-Foundation's gymnasium API:
class SimpleCorridor(gym.Env):
    """Corridor in which an agent must learn to move right to reach the exit.

    ---------------------
    | S | 1 | 2 | 3 | G |   S=start; G=goal; corridor_length=5
    ---------------------

    The action is a single float in [-1, 1]: a value below -0.1 walks left
    (unless the agent is already at the start), a value above 0.1 walks right,
    and anything in between leaves the agent where it is.
    Observations are floats indicating the current field index, e.g. 0.0 for
    the starting position, 1.0 for the field next to the starting position, etc.
    Rewards are -0.1 for all steps, except when reaching the goal (+1.0).
    """

    def __init__(self, config):
        """Create the corridor.

        Args:
            config: Mapping providing "corridor_length", the field index at
                which the episode terminates.
        """
        self.end_pos = config["corridor_length"]
        self.cur_pos = 0.0
        # One continuous control value; its sign selects the walking direction.
        self.action_space = gym.spaces.Box(-1, 1, (1,), data_type)
        # Fixed, generous bounds that do not depend on the corridor length.
        self.observation_space = gym.spaces.Box(-10000, 10000, (1,), data_type)

    def reset(self, *, seed=None, options=None):
        """Resets the episode.

        Args:
            seed: Optional seed, forwarded to `gym.Env.reset` so the
                environment's random generator is seeded as the gymnasium API
                expects.
            options: Unused; accepted for API compatibility.

        Returns:
            Initial observation of the new episode and an info dict.
        """
        # Without this call a caller-supplied seed would be silently ignored.
        super().reset(seed=seed)
        self.cur_pos = 0.0
        # Return initial observation.
        return np.array([self.cur_pos], data_type), {}

    def step(self, action):
        """Takes a single step in the episode given `action`.

        Args:
            action: The control value; compared against -0.1 and 0.1 to decide
                whether to walk left, walk right or stay.

        Returns:
            New observation, reward, terminated-flag, truncated-flag, info-dict (empty).
        """
        print(action)
        # Walk left, but never past the start of the corridor.
        if action < -0.1 and self.cur_pos > 0:
            self.cur_pos -= 1
        # Walk right.
        elif action > 0.1:
            self.cur_pos += 1
        # Set `terminated` flag when end of corridor (goal) reached.
        terminated = self.cur_pos >= self.end_pos
        # This environment never truncates an episode on its own.
        truncated = False
        # +1.0 when the goal is reached, otherwise -0.1.
        reward = 1.0 if terminated else -0.1
        return np.array([self.cur_pos], data_type), reward, terminated, truncated, {}

def train(epoch: int = 20):
    """Train PPO on `SimpleCorridor`, save a checkpoint and return the module.

    Args:
        epoch: Number of `algo.train()` iterations to run.

    Returns:
        The object returned by `algo.get_module()`, fetched before the
        training loop starts.
    """
    # Corridor with 20 fields; this dict is handed to the env's constructor.
    env_settings = {"corridor_length": 20}

    # Assemble the PPO configuration step by step.
    config = PPOConfig()
    config = config.environment(SimpleCorridor, env_config=env_settings)
    # Collect rollouts with three parallel env runners.
    config = config.env_runners(num_env_runners=3)

    # Build the algorithm and grab its module.
    algo = config.build()
    rl_module = algo.get_module()

    # Walking straight from field 0 to field 20 takes 20 steps, so the best
    # achievable episode return is `19 * (-0.1) + 1.0 = -0.9`.
    for i in range(epoch):
        results = algo.train()
        print(f"Iter: {i}; avg.  results={results['env_runners']}")

    # Persist the trained algorithm and report where it was written.
    path_to_checkpoint = algo.save(checkpoint_dir=check_point_dir).checkpoint.path
    message = (
        "An Algorithm checkpoint has been created inside directory: "
        f"'{path_to_checkpoint}'."
    )
    print(message)

    algo.stop()
    return rl_module

def test(check_point_path: str = None, check_point_module = None):
    """Play one greedy episode in `SimpleCorridor` with a trained module.

    Args:
        check_point_path: Path of an Algorithm checkpoint to restore. Only
            used when `check_point_module` is None.
        check_point_module: An already available module exposing
            `forward_inference`; takes precedence over `check_point_path`.

    Raises:
        ValueError: If neither a checkpoint path nor a module is given.
    """
    if check_point_path is None and check_point_module is None:
        raise ValueError("Please provide a checkpoint path")

    if check_point_module is not None:
        rl_module = check_point_module
    else:
        check_point = Algorithm.from_checkpoint(check_point_path)
        rl_module = check_point.get_module()

    # Evaluate on the same corridor length (20) that `train()` uses.
    env = SimpleCorridor({"corridor_length": 20})
    action_dim = env.action_space.shape[0]
    # Get the initial observation (should be: [0.0] for the starting position).
    obs, info = env.reset()
    terminated = truncated = False
    total_reward = 0.0

    # Play one episode.
    while not terminated and not truncated:
        # Run the module on a batch (B=1) holding the current observation.
        inference_result = rl_module.forward_inference(
            {"obs": torch.from_numpy(obs).unsqueeze(0)}
        )
        # "action_dist_inputs" is NOT an action: for a continuous Box space it
        # carries the Gaussian parameters, i.e. `action_dim` means followed by
        # `action_dim` log-stddevs, which is why it has twice as many entries
        # as the action space. Drop the batch dimension first.
        dist_params = inference_result["action_dist_inputs"].numpy()[0]
        # Greedy action = the mean part, clipped to the action-space bounds.
        action = np.clip(
            dist_params[:action_dim],
            env.action_space.low,
            env.action_space.high,
        )
        # Apply the computed action in the environment.
        obs, reward, terminated, truncated, info = env.step(action)
        # Sum up rewards for reporting purposes.
        total_reward += reward

    print(obs)
    # Report results.
    print(f"Played 1 episode; total-reward={total_reward}")

if __name__ == "__main__":
    # Train first, then evaluate the module that training returned.
    trained_module = train()
    test(check_point_module=trained_module)
 

Подробнее здесь: [url]https://stackoverflow.com/questions/79271203/the-action-space-of-a-reinforcement-model-is-1-dimensional-but-when-test-stage[/url]