Reinforcement Learning with Gymnasium in Python
Fouad Trad
Machine Learning Engineer
def compute_q_value(state, action):
    """Return the action-value Q(state, action) under the current value estimates.

    The value is the immediate reward of taking ``action`` in ``state`` plus
    the discounted value of the resulting state:
    ``reward + gamma * compute_state_value(next_state)``.

    Args:
        state: Index of the state, used as a key into ``env.unwrapped.P``.
        action: Index of the action, used as a key into
            ``env.unwrapped.P[state]``.

    Returns:
        The Q-value for the pair, or ``None`` when ``state`` is the terminal
        state (no action value is defined there).
    """
    if state == terminal_state:
        return None

    # Only the first listed transition is read, so this presumably targets a
    # deterministic environment (one outcome per state-action pair) -- confirm
    # before reusing it with stochastic dynamics. The first and last fields of
    # the transition tuple are not needed here and are discarded.
    _, next_state, reward, _ = env.unwrapped.P[state][action][0]
    return reward + gamma * compute_state_value(next_state)
# Q-table: one entry per (state, action) pair, visiting states in the outer
# loop and actions in the inner loop. Entries for the terminal state hold
# None, because compute_q_value returns None there.
Q = {
    (s, a): compute_q_value(s, a)
    for s in range(num_states)
    for a in range(num_actions)
}
print(Q)
{(0, 0): 0, (0, 1): 1, (0, 2): 7, (0, 3): 0,
(1, 0): 0, (1, 1): 5, (1, 2): 8, (1, 3): 7,
(2, 0): 7, (2, 1): 9, (2, 2): 8, (2, 3): 8,
(3, 0): 1, (3, 1): 2, (3, 2): 5, (3, 3): 0,
(4, 0): 1, (4, 1): 3, (4, 2): 9, (4, 3): 7,
(5, 0): 5, (5, 1): 10, (5, 2): 9, (5, 3): 8,
(6, 0): 2, (6, 1): 2, (6, 2): 3, (6, 3): 1,
(7, 0): 2, (7, 1): 3, (7, 2): 10, (7, 3): 5,
(8, 0): None, (8, 1): None, (8, 2): None, (8, 3): None}
Old policy
# Policy improvement: for each state, select the action with the largest
# Q-value (greedy with respect to Q).
improved_policy = {}
# The last state index is skipped: the printed Q-table shows None for every
# action in that (terminal) state, and None values cannot be compared by max().
for state in range(num_states - 1):
    # max() returns the first maximal item, so ties go to the lowest action index.
    max_action = max(range(num_actions), key=lambda action: Q[(state, action)])
    # This assignment must stay inside the loop so every state gets an entry,
    # not only the last one visited.
    improved_policy[state] = max_action
print(improved_policy)
{0: 2, 1: 2, 2: 1,
3: 2, 4: 2, 5: 1,
6: 2, 7: 2}
Reinforcement Learning with Gymnasium in Python