Reinforcement Learning with Gymnasium in Python
Fouad Trad
Machine Learning Engineer
# --- Multi-armed bandit: epsilon-greedy simulation -----------------------
# NOTE(review): `epsilon_greedy()` is defined elsewhere in this material;
# assumed to return an arm index in [0, n_bandits) — confirm against its
# definition.
n_bandits = 4
true_bandit_probs = np.random.rand(n_bandits)  # hidden win probability of each arm

n_iterations = 100000
epsilon = 1.0          # start fully exploratory
min_epsilon = 0.01     # exploration floor
epsilon_decay = 0.999  # multiplicative decay applied each iteration

counts = np.zeros(n_bandits)                       # how many times each bandit was played
values = np.zeros(n_bandits)                       # estimated winning probability of each bandit
rewards = np.zeros(n_iterations)                   # reward history
selected_arms = np.zeros(n_iterations, dtype=int)  # arm selection history

for i in range(n_iterations):
    arm = epsilon_greedy()
    # Bernoulli reward: True (1) with the arm's hidden probability.
    reward = np.random.rand() < true_bandit_probs[arm]
    rewards[i] = reward
    selected_arms[i] = arm
    counts[arm] += 1
    # Incremental-mean update: values[arm] converges to the empirical win rate.
    values[arm] += (reward - values[arm]) / counts[arm]
    # Decay exploration, but never drop below the floor.
    epsilon = max(min_epsilon, epsilon * epsilon_decay)
# --- Plot per-arm selection percentage over time -------------------------
# One-hot encode which arm was chosen at each iteration
# (the slide's repeated init/loop is collapsed into a single pass).
selections_percentage = np.zeros((n_iterations, n_bandits))
selections_percentage[np.arange(n_iterations), selected_arms] = 1

# Running fraction of plays that went to each arm up to iteration i:
# cumulative count of selections divided by the number of iterations so far.
selections_percentage = (
    np.cumsum(selections_percentage, axis=0)
    / np.arange(1, n_iterations + 1).reshape(-1, 1)
)

for arm in range(n_bandits):
    plt.plot(selections_percentage[:, arm], label=f'Bandit #{arm+1}')
plt.xscale('log')  # log x-axis: early exploration and late convergence both visible
plt.title('Bandit Action Choices Over Time')
plt.xlabel('Episode Number')
plt.ylabel('Percentage of Bandit Selections (%)')
plt.legend()
plt.show()
# Report each arm's hidden win probability with a 1-based label.
arm_number = 1
for win_prob in true_bandit_probs:
    print(f"Bandit #{arm_number} -> {win_prob:.2f}")
    arm_number += 1
Bandit #1 -> 0.37
Bandit #2 -> 0.95
Bandit #3 -> 0.73
Bandit #4 -> 0.60
Reinforcement Learning with Gymnasium in Python