import random

import numpy as np
from tqdm import tqdm

# Reward map
original_rewards = np.array([
[ 0, 0, 30, 0, 20, 0, 0, 0, 0, 0],
[ 0, -10, 0, 0, 0, 0, -10, 0, 0, 10],
[ 0, 0, 0, -10, 0, 0, 0, 0, -10, 0],
[ 0, 20, 0, 30, 0, -10, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, -10, 0, 0],
[ 0, 0, -10, 0, 0, 30, 0, 0, 30, 0],
[ 0, 0, 0, -10, 0, 0, 0, 0, 0, 0],
[ 0, -10, 0, 0, 0, 0, 0, -10, 0, 0],
[ 0, 0, 0, -10, 0, 0, 0, 0, 0, 0],
[ 50, 0, 0, 0, 0, -10, 0, 0, 0, 0],
])
# Parameters
n_rows, n_cols = original_rewards.shape
n_states = n_rows * n_cols
n_actions = 4  # up, down, left, right

# Hyperparameters
alpha = 0.1     # learning rate
gamma = 0.95    # discount factor
epsilon = 0.1   # epsilon-greedy exploration rate
episodes = 200000
max_steps = 10
# Initialize the Q-table
Q = np.zeros((n_states, n_actions))

# Action mapping
actions = {
    0: (-1, 0),   # up
    1: ( 1, 0),   # down
    2: ( 0, -1),  # left
    3: ( 0, 1),   # right
}
# Helper functions
def state_to_index(row, col):
    return row * n_cols + col

def index_to_state(index):
    return divmod(index, n_cols)

def step(row, col, action):
    # Move in the chosen direction, clamping to the grid bounds
    dr, dc = actions[action]
    new_row = min(max(row + dr, 0), n_rows - 1)
    new_col = min(max(col + dc, 0), n_cols - 1)
    return new_row, new_col
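# Sanity check (illustrative, assumes the 10x10 grid above):
# state_to_index(3, 7) == 37 and index_to_state(37) == (3, 7)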
# Training
for episode in tqdm(range(episodes), desc="Training"):
    row, col = random.randint(0, n_rows - 1), random.randint(0, n_cols - 1)
    # Consumable rewards (reset at the start of every episode)
    rewards = original_rewards.copy()
    for step_num in range(max_steps):
        state = state_to_index(row, col)
        # Epsilon-greedy policy
        if random.uniform(0, 1) < epsilon:
            action = random.randint(0, n_actions - 1)
        else:
            action = np.argmax(Q[state])
        # Take the action
        next_row, next_col = step(row, col, action)
        next_state = state_to_index(next_row, next_col)
        # Collect the current reward and zero it out to simulate "consumption"
        reward = rewards[next_row, next_col]
        rewards[next_row, next_col] = 0
        # Q-learning update: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
        best_next_q = np.max(Q[next_state])
        Q[state, action] += alpha * (reward + gamma * best_next_q - Q[state, action])
        # Move to the next state
        row, col = next_row, next_col
# Greedy rollout from (0, 0) using the learned Q-table
test_row, test_col = 0, 0
total_reward = 0
print("Best path in 10 steps starting from (0,0):")
rewards = original_rewards.copy()
for _ in range(10):
    state = state_to_index(test_row, test_col)
    action = np.argmax(Q[state])
    test_row, test_col = step(test_row, test_col, action)
    r = rewards[test_row, test_col]
    rewards[test_row, test_col] = 0  # consume the reward
    total_reward += r
    print(f" -> ({test_row},{test_col}) Reward: {r}")

print(f"\nTotal reward: {total_reward}")
print(f"\nRecompensa total: {total_reward}")
< /code>
I really don't know what else to do. I just don't know. Please help!
More details here: https://stackoverflow.com/questions/796 ... er-of-rewa