Anonymous
Почему мой ИИ, обучающийся с подкреплением, не увеличивает вознаграждение с течением времени?
Сообщение
Anonymous » 17 янв 2025, 08:20
Я работаю над созданием искусственного интеллекта для обучения с подкреплением для игры Master Mind. Я следовал руководству (
по игровой змее, но менял его, чтобы он работал в моей игре. Я дошел до того, что ИИ может строить догадки, но, похоже, ситуация не улучшается.
Это мой агент.py
Код: Выделить всё
import torch
import random
import numpy as np
from collections import deque
from game import MasterMindAI
from model import Linear_QNet, QTrainer
from helper import plot
MAX_MEMORY = 100_000
BATCH_SIZE = 1000
LR = 0.001
class Agent:
def __init__(self):
self.n_games = 0
self.epsilon = 0 # randomness
self.gamma = 0.9 # discount rate
self.memory = deque(maxlen=MAX_MEMORY) # popleft()
self.model = Linear_QNet(4, 256, 4)
self.trainer = QTrainer(self.model, lr=LR, gamma=self.gamma)
def get_state(self, game):
compare = game.get_compare()
guesses = game.get_guesses()
while len(compare) < 8:
compare.append([-1, -1, -1, -1])
while len(compare) >= 8:
compare.pop()
while len(guesses) < 8:
guesses.append([-1, -1, -1, -1])
while len(guesses) >= 8:
guesses.pop()
# print("compare ", compare)
# print("guesses ", guesses)
state = np.array(compare + guesses,dtype=int)
#print("state", state)
# state = [
# #DOES THIS WORK
# compare,
# guesses
# ]
# return np.array(state, dtype=int)
return state
def remember(self, state, action, reward, next_state, done):
self.memory.append((state, action, reward, next_state, done)) # popleft if MAX_MEMORY is reached
def train_long_memory(self):
if len(self.memory) > BATCH_SIZE:
mini_sample = random.sample(self.memory, BATCH_SIZE) # list of tuples
else:
mini_sample = self.memory
states, actions, rewards, next_states, dones = zip(*mini_sample)
self.trainer.train_step(states, actions, rewards, next_states, dones)
def train_short_memory(self, state, action, reward, next_state, done):
self.trainer.train_step(state, action, reward, next_state, done)
def get_action(self, state):
# random moves: tradeoff exploration / exploitation
self.epsilon = 800 - self.n_games
final_move = [0,0,0,0]
if random.randint(0, 2000) < self.epsilon:
final_move = [random.randint(1, 5) for _ in range(4)]
else:
state_tensor = torch.tensor(state, dtype=torch.float)
prediction = self.model(state_tensor) # Get model prediction
move = torch.argmax(prediction).item() # Choose the highest predicted value
#
# print("move ", move)
# Assuming the output needs to be transformed to 4 integers:
final_move = [
(move % 5) + 1, # First digit (1-5)
((move // 5) % 5) + 1, # Second digit (1-5)
((move // 25) % 5) + 1, # Third digit (1-5)
((move // 125) % 5) + 1 # Fourth digit (1-5)
]
return final_move
def train():
plot_scores = []
plot_mean_scores = []
total_score = 0
plot_reward = []
plot_mean_reward = []
total_reward = 0
record = 10
agent = Agent()
game = MasterMindAI()
game.start_game()
while True:
# get old state
state_old = agent.get_state(game)
# get move
final_move = agent.get_action(state_old)
# perform move and get new state
reward, done, score = game.play_step(final_move)
# print("reward in here", reward)
state_new = agent.get_state(game)
# train short memory
agent.train_short_memory(state_old, final_move, reward, state_new, done)
# remember
agent.remember(state_old, final_move, reward, state_new, done)
if done:
#print("reward", reward)
# train long memory, plot result
game.start_game()
agent.n_games += 1
agent.train_long_memory()
if score < record:
record = score
agent.model.save()
print('Game', agent.n_games, 'Score', score, 'Record:', record)
# plot_scores.append(score)
# total_score += score
# mean_score = total_score / agent.n_games
# plot_mean_scores.append(mean_score)
# plot(plot_scores, plot_mean_scores)
plot_reward.append(reward)
total_reward += reward
mean_reward = total_reward / agent.n_games
plot_mean_reward.append(mean_reward)
plot(plot_reward, plot_mean_reward)
if __name__ == '__main__':
train()
Это model.py
Код: Выделить всё
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os
class Linear_QNet(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super().__init__()
self.linear1 = nn.Linear(input_size, hidden_size)
self.linear2 = nn.Linear(hidden_size, output_size)
def forward(self, x):
x = F.relu(self.linear1(x))
x = self.linear2(x)
return x
def save(self, file_name='model.pth'):
model_folder_path = './model'
if not os.path.exists(model_folder_path):
os.makedirs(model_folder_path)
file_name = os.path.join(model_folder_path, file_name)
torch.save(self.state_dict(), file_name)
class QTrainer:
def __init__(self, model, lr, gamma):
self.lr = lr
self.gamma = gamma
self.model = model
self.optimizer = optim.Adam(model.parameters(), lr=self.lr)
self.criterion = nn.MSELoss()
def train_step(self, state, action, reward, next_state, done):
state = torch.tensor(state, dtype=torch.float)
next_state = torch.tensor(next_state, dtype=torch.float)
action = torch.tensor(action, dtype=torch.long)
reward = torch.tensor(reward, dtype=torch.float)
# (n, x)
if len(state.shape) == 2:
# (1, x)
state = torch.unsqueeze(state, 0)
next_state = torch.unsqueeze(next_state, 0)
action = torch.unsqueeze(action, 0)
reward = torch.unsqueeze(reward, 0)
done = (done, )
# 1: predicted Q values with current state
pred = self.model(state)
target = pred.clone()
for idx in range(len(done)):
Q_new = reward[idx]
if not done[idx]:
Q_new = reward[idx] + self.gamma * torch.max(self.model(next_state[idx]))
target[idx][torch.argmax(action[idx]).item()] = Q_new
#target[idx][action[idx]] = Q_new
# 2: Q_new = r + y * max(next_predicted Q value) -> only do this if not done
# pred.clone()
# preds[argmax(action)] = Q_new
self.optimizer.zero_grad()
loss = self.criterion(target, pred)
loss.backward()
self.optimizer.step()
И мой game.py
Код: Выделить всё
import random
class MasterMindAI:
def start_game(self):
self.hidden = [random.randint(1, 5) for _ in range(4)]
self.guessMax = 8
self.guessMade = 0
self.reward = 0
self.allGuess = []
self.compare = []
def get_guesses(self):
if(self.allGuess == []):
return [[-1,-1,-1,-1]]
else:
return self.allGuess
def get_compare(self):
if(self.compare == []):
return [[-1,-1,-1,-1]]
else:
return self.compare
#Action
def guess_list(self):
self.guessMade += 1
guess = input("Enter four numbers separated by spaces: ")
nums = [int(num) for num in guess.split()]
return nums
def play_step(self,action):
game_over = False
self.compare.insert(0, self.checkHidden(self.hidden, action))
self.guessMade += 1
for num in self.compare[0]:
if num == 2:
self.reward += 3
elif num == 1:
self.reward += 1
self.allGuess.insert(0, action)
if(action == self.hidden or self.guessMade >= self.guessMax ):
game_over = True
if(action == self.hidden):
self.reward += (self.guessMax-self.guessMade) * 20
else:
self.reward += 0
#self.display()
#print("reward ", self.reward)
return self.reward, game_over, self.guessMade
def display(self):
for i in range(self.guessMax - self.guessMade):
print("_ _ _ _ |")
for lis, lis2 in zip(self.allGuess, self.compare):
if(lis != [-1,-1,-1,-1]):
print(*lis, "|", *lis2)
def checkHidden(self, answer, guess):
spot = [0,1,2,3]
ans = answer.copy()
lis = []
if(guess[0] == self.hidden[0]):
lis.append(2)
spot[0] = -1
ans[0] = -1
if(guess[1] == self.hidden[1]):
lis.append(2)
spot[1] = -1
ans[1] = -1
if(guess[2] == self.hidden[2]):
lis.append(2)
spot[2] = -1
ans[2] = -1
if(guess[3] == self.hidden[3]):
lis.append(2)
spot[3] = -1
ans[3] = -1
if 0 in spot:
if guess[0] in ans:
lis.append(1)
ans[0] = -1
if 1 in spot:
if guess[1] in ans:
lis.append(1)
ans[1] = -1
if 2 in spot:
if guess[2] in ans:
lis.append(1)
ans[2] = -1
if 3 in spot:
if guess[3] in ans:
lis.append(1)
ans[3] = -1
while len(lis) < 4:
lis.append(0)
return lis
# def play(self):
# self.start_game()
# self.display()
#play_step
#recent_guess = self.guess_list()
#self.play_step(recent_guess)
#
# while(recent_guess != self.hidden and self.guessMade < self.guessMax):
# self.display()
# #
# recent_guess = self.guess_list()
# self.play_step(recent_guess)
# #
# if (recent_guess == self.hidden):
# reward = (self.guessMax - self.guessMade) * 10
# return 1, reward
# else:
# reward = 0
# return 0, reward
if __name__ == "__main__":
ai = MasterMindAI()
num, reward = ai.play()
Я просто застрял на том, где мне нужно устранить неполадки.
Спасибо за любую помощь
Подробнее здесь:
https://stackoverflow.com/questions/793 ... -over-time
1737091209
Anonymous
Я работаю над созданием искусственного интеллекта для обучения с подкреплением для игры Master Mind. Я следовал руководству ([youtube]L8ypSXwyBds[/youtube] по игровой змее, но менял его, чтобы он работал в моей игре. Я дошел до того, что ИИ может строить догадки, но, похоже, ситуация не улучшается. Это мой агент.py [code]import torch import random import numpy as np from collections import deque from game import MasterMindAI from model import Linear_QNet, QTrainer from helper import plot MAX_MEMORY = 100_000 BATCH_SIZE = 1000 LR = 0.001 class Agent: def __init__(self): self.n_games = 0 self.epsilon = 0 # randomness self.gamma = 0.9 # discount rate self.memory = deque(maxlen=MAX_MEMORY) # popleft() self.model = Linear_QNet(4, 256, 4) self.trainer = QTrainer(self.model, lr=LR, gamma=self.gamma) def get_state(self, game): compare = game.get_compare() guesses = game.get_guesses() while len(compare) < 8: compare.append([-1, -1, -1, -1]) while len(compare) >= 8: compare.pop() while len(guesses) < 8: guesses.append([-1, -1, -1, -1]) while len(guesses) >= 8: guesses.pop() # print("compare ", compare) # print("guesses ", guesses) state = np.array(compare + guesses,dtype=int) #print("state", state) # state = [ # #DOES THIS WORK # compare, # guesses # ] # return np.array(state, dtype=int) return state def remember(self, state, action, reward, next_state, done): self.memory.append((state, action, reward, next_state, done)) # popleft if MAX_MEMORY is reached def train_long_memory(self): if len(self.memory) > BATCH_SIZE: mini_sample = random.sample(self.memory, BATCH_SIZE) # list of tuples else: mini_sample = self.memory states, actions, rewards, next_states, dones = zip(*mini_sample) self.trainer.train_step(states, actions, rewards, next_states, dones) def train_short_memory(self, state, action, reward, next_state, done): self.trainer.train_step(state, action, reward, next_state, done) def get_action(self, state): # random moves: tradeoff exploration / exploitation self.epsilon = 800 - self.n_games final_move = [0,0,0,0] if random.randint(0, 2000) < self.epsilon: final_move = [random.randint(1, 5) for _ in range(4)] else: state_tensor = torch.tensor(state, dtype=torch.float) prediction = self.model(state_tensor) # Get model prediction move = torch.argmax(prediction).item() # Choose the highest predicted value # # print("move ", move) # Assuming the output needs to be transformed to 4 integers: final_move = [ (move % 5) + 1, # First digit (1-5) ((move // 5) % 5) + 1, # Second digit (1-5) ((move // 25) % 5) + 1, # Third digit (1-5) ((move // 125) % 5) + 1 # Fourth digit (1-5) ] return final_move def train(): plot_scores = [] plot_mean_scores = [] total_score = 0 plot_reward = [] plot_mean_reward = [] total_reward = 0 record = 10 agent = Agent() game = MasterMindAI() game.start_game() while True: # get old state state_old = agent.get_state(game) # get move final_move = agent.get_action(state_old) # perform move and get new state reward, done, score = game.play_step(final_move) # print("reward in here", reward) state_new = agent.get_state(game) # train short memory agent.train_short_memory(state_old, final_move, reward, state_new, done) # remember agent.remember(state_old, final_move, reward, state_new, done) if done: #print("reward", reward) # train long memory, plot result game.start_game() agent.n_games += 1 agent.train_long_memory() if score < record: record = score agent.model.save() print('Game', agent.n_games, 'Score', score, 'Record:', record) # plot_scores.append(score) # total_score += score # mean_score = total_score / agent.n_games # plot_mean_scores.append(mean_score) # plot(plot_scores, plot_mean_scores) plot_reward.append(reward) total_reward += reward mean_reward = total_reward / agent.n_games plot_mean_reward.append(mean_reward) plot(plot_reward, plot_mean_reward) if __name__ == '__main__': train() [/code] Это model.py [code]import torch import torch.nn as nn import torch.optim as optim import torch.nn.functional as F import os class Linear_QNet(nn.Module): def __init__(self, input_size, hidden_size, output_size): super().__init__() self.linear1 = nn.Linear(input_size, hidden_size) self.linear2 = nn.Linear(hidden_size, output_size) def forward(self, x): x = F.relu(self.linear1(x)) x = self.linear2(x) return x def save(self, file_name='model.pth'): model_folder_path = './model' if not os.path.exists(model_folder_path): os.makedirs(model_folder_path) file_name = os.path.join(model_folder_path, file_name) torch.save(self.state_dict(), file_name) class QTrainer: def __init__(self, model, lr, gamma): self.lr = lr self.gamma = gamma self.model = model self.optimizer = optim.Adam(model.parameters(), lr=self.lr) self.criterion = nn.MSELoss() def train_step(self, state, action, reward, next_state, done): state = torch.tensor(state, dtype=torch.float) next_state = torch.tensor(next_state, dtype=torch.float) action = torch.tensor(action, dtype=torch.long) reward = torch.tensor(reward, dtype=torch.float) # (n, x) if len(state.shape) == 2: # (1, x) state = torch.unsqueeze(state, 0) next_state = torch.unsqueeze(next_state, 0) action = torch.unsqueeze(action, 0) reward = torch.unsqueeze(reward, 0) done = (done, ) # 1: predicted Q values with current state pred = self.model(state) target = pred.clone() for idx in range(len(done)): Q_new = reward[idx] if not done[idx]: Q_new = reward[idx] + self.gamma * torch.max(self.model(next_state[idx])) target[idx][torch.argmax(action[idx]).item()] = Q_new #target[idx][action[idx]] = Q_new # 2: Q_new = r + y * max(next_predicted Q value) -> only do this if not done # pred.clone() # preds[argmax(action)] = Q_new self.optimizer.zero_grad() loss = self.criterion(target, pred) loss.backward() self.optimizer.step() [/code] И мой game.py [code]import random class MasterMindAI: def start_game(self): self.hidden = [random.randint(1, 5) for _ in range(4)] self.guessMax = 8 self.guessMade = 0 self.reward = 0 self.allGuess = [] self.compare = [] def get_guesses(self): if(self.allGuess == []): return [[-1,-1,-1,-1]] else: return self.allGuess def get_compare(self): if(self.compare == []): return [[-1,-1,-1,-1]] else: return self.compare #Action def guess_list(self): self.guessMade += 1 guess = input("Enter four numbers separated by spaces: ") nums = [int(num) for num in guess.split()] return nums def play_step(self,action): game_over = False self.compare.insert(0, self.checkHidden(self.hidden, action)) self.guessMade += 1 for num in self.compare[0]: if num == 2: self.reward += 3 elif num == 1: self.reward += 1 self.allGuess.insert(0, action) if(action == self.hidden or self.guessMade >= self.guessMax ): game_over = True if(action == self.hidden): self.reward += (self.guessMax-self.guessMade) * 20 else: self.reward += 0 #self.display() #print("reward ", self.reward) return self.reward, game_over, self.guessMade def display(self): for i in range(self.guessMax - self.guessMade): print("_ _ _ _ |") for lis, lis2 in zip(self.allGuess, self.compare): if(lis != [-1,-1,-1,-1]): print(*lis, "|", *lis2) def checkHidden(self, answer, guess): spot = [0,1,2,3] ans = answer.copy() lis = [] if(guess[0] == self.hidden[0]): lis.append(2) spot[0] = -1 ans[0] = -1 if(guess[1] == self.hidden[1]): lis.append(2) spot[1] = -1 ans[1] = -1 if(guess[2] == self.hidden[2]): lis.append(2) spot[2] = -1 ans[2] = -1 if(guess[3] == self.hidden[3]): lis.append(2) spot[3] = -1 ans[3] = -1 if 0 in spot: if guess[0] in ans: lis.append(1) ans[0] = -1 if 1 in spot: if guess[1] in ans: lis.append(1) ans[1] = -1 if 2 in spot: if guess[2] in ans: lis.append(1) ans[2] = -1 if 3 in spot: if guess[3] in ans: lis.append(1) ans[3] = -1 while len(lis) < 4: lis.append(0) return lis # def play(self): # self.start_game() # self.display() #play_step #recent_guess = self.guess_list() #self.play_step(recent_guess) # # while(recent_guess != self.hidden and self.guessMade < self.guessMax): # self.display() # # # recent_guess = self.guess_list() # self.play_step(recent_guess) # # # if (recent_guess == self.hidden): # reward = (self.guessMax - self.guessMade) * 10 # return 1, reward # else: # reward = 0 # return 0, reward if __name__ == "__main__": ai = MasterMindAI() num, reward = ai.play() [/code] Я просто застрял на том, где мне нужно устранить неполадки. Спасибо за любую помощь Подробнее здесь: [url]https://stackoverflow.com/questions/79363697/why-is-my-reinforcement-learning-ai-not-increasing-its-reward-over-time[/url]