import gridworld as gw
import random
import sys


def q_learn(n, select, alpha, gamma):
    h = len(gw.walls)
    w = len(gw.walls[0])

    # initialize Q to all 0's
    # a state is the row and column the agent is in; we store Q((r,c), a)
    # as q[r][c][a]
    q = []
    for r in range(h):
        row = []
        for c in range(w):
            row.append([0.0] * len(gw.actions))
        q.append(row)

    for i in range(n):
        # initial position is bottom left corner
        s = (2, 0)
        while not gw.terminal[s[0]][s[1]]:
            # select an action
            row, col = s
            a = select(q[row][col])

            # observe new state and reward
            new_s = gw.result(s, a)
            new_r, new_c = new_s
            r = gw.reward[new_r][new_c]

            # update Q
            q[row][col][a] += alpha * (r
                                       + gamma * max(q[new_r][new_c][a_prime]
                                                     for a_prime in range(len(gw.actions)))
                                       - q[row][col][a])

            s = new_s

    return q


def print_directional_q(q):
    print('+', end='')
    for entry in q[0]:
        print('----------------------+', end='')
    print()
    for row in q:
        print('| ', end='')
        for entry in row:
            print('      ', '%+.3f' % entry[0], '      ', end='')
            print(' | ', end='')
        print()
        print('| ', end='')
        for entry in row:
            print('%+.3f' % entry[1], '      ', '%+.3f' % entry[2], end='')
            print(' | ', end='')
        print()
        print('| ', end='')
        for entry in row:
            print('      ', '%+.3f' % entry[3], '      ', end='')
            print(' | ', end='')
        print()
        print('+', end='')
        for entry in row:
            print('----------------------+', end='')
        print()


def eps_greedy(eps):
    def fxn(q):
        if random.random() < eps:
            # return random choice with probability epsilon
            return random.choice(range(len(q)))
        else:
            # otherwise return index that maximizes q
            return max(enumerate(q), key=lambda p: p[1])[0]
    return fxn


if __name__ == '__main__':
    if len(sys.argv) < 5:
        # suggest eps = 0.25, alpha = 0.1, gamma = 0.9
        print("USAGE: {python3 | pypy3}", sys.argv[0], "n eps alpha gamma")
        sys.exit(1)

    q = q_learn(int(sys.argv[1]),
                eps_greedy(float(sys.argv[2])),
                float(sys.argv[3]),
                float(sys.argv[4]))
    print_directional_q(q)
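
# ---------------------------------------------------------------------------
# Note on the gridworld module (not shown in this file). The real contents of
# gridworld.py are unknown here; the sketch below is inferred only from how
# gw is used above, and the concrete values (grid size, action order, reward
# magnitudes) are hypothetical placeholders, not the actual module.
#
#   # gridworld.py -- assumed interface, minimal sketch
#   actions = ['up', 'left', 'right', 'down']   # q_learn only uses
#                                               # len(actions); the order shown
#                                               # matches how print_directional_q
#                                               # draws each cell (index 0 on
#                                               # top, 1 left, 2 right, 3 bottom)
#   walls = [[False] * 4 for _ in range(3)]     # walls[r][c]: True if blocked
#   terminal = [[False] * 4 for _ in range(3)]  # terminal[r][c]: episode ends
#   reward = [[-0.04] * 4 for _ in range(3)]    # reward[r][c]: reward for
#                                               # entering (r, c)
#
#   def result(s, a):
#       # successor (row, col) of taking action index a in state s;
#       # possibly stochastic
#       ...
#
# The start state (2, 0) in q_learn implies at least three rows, with row 2
# at the bottom; the 4-column width above is only an example.
# ---------------------------------------------------------------------------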