Reinforcement Learning with Gymnasium in Python
Fouad Trad
Machine Learning Engineer
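The code on these slides assumes NumPy, a Gymnasium environment env, and its state and action counts are already defined. A minimal setup sketch, with FrozenLake-v1 as a hypothetical stand-in (the course's actual environment, which the printed policies below suggest has 6 states and 3 actions, is defined elsewhere):

import gymnasium as gym
import numpy as np

env = gym.make("FrozenLake-v1")       # stand-in; the course uses its own environment
num_states = env.observation_space.n   # size of the discrete state space
num_actions = env.action_space.n       # size of the discrete action space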
def generate_episode():
    episode = []
    state, info = env.reset()
    terminated = truncated = False
    while not (terminated or truncated):   # also stop if the episode is truncated
        action = env.action_space.sample()              # random policy
        next_state, reward, terminated, truncated, info = env.step(action)
        episode.append((state, action, reward))
        state = next_state
    return episode
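Each sampled episode is a list of (state, action, reward) tuples, one per step; for example (values are illustrative, not from the course environment):

episode = generate_episode()
print(episode[:3])   # e.g. [(0, 2, 0.0), (0, 1, 0.0), (1, 1, 0.0)]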
First-visit MC averages the return following only the first occurrence of each (state, action) pair within an episode; every-visit MC (below) averages over every occurrence.

def first_visit_mc(num_episodes):
    Q = np.zeros((num_states, num_actions))
    returns_sum = np.zeros((num_states, num_actions))
    returns_count = np.zeros((num_states, num_actions))
    for i in range(num_episodes):
        episode = generate_episode()
        visited_states_actions = set()
        for j, (state, action, reward) in enumerate(episode):
            if (state, action) not in visited_states_actions:
                returns_sum[state, action] += sum([x[2] for x in episode[j:]])
                returns_count[state, action] += 1
                visited_states_actions.add((state, action))
    nonzero_counts = returns_count != 0
    Q[nonzero_counts] = returns_sum[nonzero_counts] / returns_count[nonzero_counts]
    return Q
def every_visit_mc(num_episodes):
    Q = np.zeros((num_states, num_actions))
    returns_sum = np.zeros((num_states, num_actions))
    returns_count = np.zeros((num_states, num_actions))
    for i in range(num_episodes):
        episode = generate_episode()
        for j, (state, action, reward) in enumerate(episode):
            returns_sum[state, action] += sum([x[2] for x in episode[j:]])
            returns_count[state, action] += 1
    nonzero_counts = returns_count != 0
    Q[nonzero_counts] = returns_sum[nonzero_counts] / returns_count[nonzero_counts]
    return Q
def get_policy():
    # Greedy policy: pick the highest-valued action in each state
    policy = {state: np.argmax(Q[state]) for state in range(num_states)}
    return policy
Q = first_visit_mc(1000)
policy_first_visit = get_policy()
print("First-visit policy: \n", policy_first_visit)
Q = every_visit_mc(1000)
policy_every_visit = get_policy()
print("Every-visit policy: \n", policy_every_visit)
First-visit policy:
{0: 2, 1: 2, 2: 1,
3: 2, 4: 2, 5: 0}
Every-visit policy:
{0: 2, 1: 2, 2: 1,
3: 2, 4: 2, 5: 0}