и после обширного делужи. Отличное спасибо всем, кто может заметить некоторые ошибки! < /P>
инициализация < /h2>
Код: Выделить всё
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import math
SIZE = 20
states = np.zeros((SIZE+1,SIZE+1))
policy = np.zeros((SIZE+1,SIZE+1))
theta = 0.3
gamma = 0.9
< /code>
Оценка < /h2>
Я попытался итерации через все (возвращать, запросить) пары, чтобы все пара (s ', r) могли быть включены. < /p>
def gather_value(state, action):
s1, s2 = state
v_s = 0
for req1 in range(s1+1):
for req2 in range(s2+1):
for ret1 in range(20-s1+1):
for ret2 in range(20-s2+1):
p = calculate_prob(req1, req2, ret1, ret2)
reward, s1_prime, s2_prime = calculate_reward(req1, req2, s1, s2, ret1, ret2, action)
if s1_prime > 20 or s2_prime > 20 or s1_prime < 0 or s2_prime < 0:
continue
v_s = v_s + p * (reward + gamma * states[int(s1_prime), int(s2_prime)])
return v_s
# The probability of the request and return values
def calculate_prob(req1, req2, ret1, ret2):
p1 = math.exp(-3) * 3**req1 / math.factorial(req1)
p2 = math.exp(-4) * 4**req2 / math.factorial(req2)
p3 = math.exp(-3) * 3**ret1 / math.factorial(ret1)
p4 = math.exp(-2) * 2**ret2 / math.factorial(ret2)
return p1 * p2 * p3 * p4
def calculate_reward(req1, req2, s1, s2, ret1, ret2, action):
avail1, avail2 = s1-action, s2+action
avail1 = min(20, max(0, avail1))
avail2 = min(20, max(0, avail2))
rent1, rent2 = min(req1, avail1), min(req2, avail2)
ret = -2 * abs(action)
rental = (rent1 + rent2) * 10
return ret + rental, min(20, s1-rent1+ret1), min(20, s2-rent2+ret2)
< /code>
Тогда, < /p>
def policy_evaluation():
break_loop = False
while not break_loop:
change = 0
for i in range(SIZE+1):
for j in range(SIZE+1):
prev_state = states[i,j]
states[i,j] = gather_value([i,j], policy[i,j])
change = max(change, abs(prev_state - states[i,j]))
if change < theta:
break_loop = True
< /code>
улучшение < /h2>
def determine_action(state):
max_value, max_action = -math.inf, 0
for action in range(-5, 6):
if not feasible(state, action):
continue
value = gather_value(state, action)
# print("outside determine", value, max_value)
if value > max_value:
max_action = action
max_value = value
return max_action
def feasible(state, action):
s1, s2 = state
a = abs(action)
return (
(action > 0 and s1 >= a and s2 + a
def policy_improvement():
policy_stable = True
for i in range(SIZE+1):
for j in range(SIZE+1):
old_action = policy[i,j]
policy[i,j] = determine_action([i,j])
if policy[i,j] != old_action:
policy_stable = False
if policy_stable:
print("Iteration complete! ", states, policy)
return True
return False
def show_graph(graph):
# Create custom colormap matching the screenshot
colors = [
'#1a2c7b', '#2a4ca7', '#3a7bb7', '#5aa8c8', '#8ad2d1',
'#c8e6be', '#f6f9c4', '#f9d77c', '#f9a76a', '#f86a5a', '#e93a4a'
]
cmap = ListedColormap(colors)
# Create plot
plt.figure(figsize=(6, 6))
plt.imshow(graph, cmap=cmap, vmin=-5, vmax=5, origin='lower', extent=[0, 20, 0, 20])
plt.colorbar(ticks=range(-5, 6), label='Cars moved')
plt.xlabel('Cars at location 2')
plt.ylabel('Cars at location 1')
plt.title('Optimal Policy')
plt.show()
< /code>
и < /p>
iter_count = 1
while True:
print("Start iteration: ", iter_count)
policy_evaluation()
if policy_improvement():
break
print("Finished iteration with policy: ", policy)
show_graph(policy)
iter_count += 1
< /code>
Извините за длинный пост. Я отлаживал это почти на день и думал о том, чтобы попросить здесь помощь. Любая помощь очень ценится !! src = "https://i.sstatic.net/echfqjzp.png"/>
Я не могу выяснить, где возникает проблема границы и как градиент в обратном порядке.
Подробнее здесь: https://stackoverflow.com/questions/796 ... code-check