Reinforcement Learning with Gymnasium in Python
Fouad Trad
Machine Learning Engineer




# Multi-armed bandit setup: each arm pays out 1 with a fixed hidden probability.
n_bandits = 4
true_bandit_probs = np.random.rand(n_bandits)

# Epsilon-greedy schedule: begin fully exploratory (epsilon = 1.0) and decay
# multiplicatively each iteration, never dropping below min_epsilon.
n_iterations = 100000
epsilon = 1.0
min_epsilon = 0.01
epsilon_decay = 0.999

# How many times each bandit was played
counts = np.zeros(n_bandits)
# Estimated winning probability of each bandit
values = np.zeros(n_bandits)
# Reward history
rewards = np.zeros(n_iterations)
# Arm selection history
selected_arms = np.zeros(n_iterations, dtype=int)
# Main learning loop: pick an arm, observe a Bernoulli reward, update estimates.
for i in range(n_iterations):
    # epsilon_greedy() is defined elsewhere; it returns the index of the arm to pull.
    arm = epsilon_greedy()
    # The pull wins (True -> 1.0) with the arm's true probability.
    reward = np.random.rand() < true_bandit_probs[arm]
    rewards[i] = reward
    selected_arms[i] = arm
    counts[arm] += 1
    # Incremental running mean: new_mean = old_mean + (x - old_mean) / n.
    values[arm] += (reward - values[arm]) / counts[arm]
    # Decay exploration, but never go below the floor.
    epsilon = max(min_epsilon, epsilon * epsilon_decay)
# One row per iteration, one column per bandit; filled in by later steps.
selections_percentage = np.zeros(shape=(n_iterations, n_bandits))

# One row per iteration, one column per bandit.
selections_percentage = np.zeros((n_iterations, n_bandits))
# One-hot encode the chosen arm of every iteration in a single vectorised
# assignment: row i gets a 1 in column selected_arms[i].
selections_percentage[np.arange(n_iterations), selected_arms] = 1

# One row per iteration, one column per bandit.
selections_percentage = np.zeros((n_iterations, n_bandits))
# One-hot encode the chosen arm of every iteration in a single vectorised
# assignment: row i gets a 1 in column selected_arms[i].
selections_percentage[np.arange(n_iterations), selected_arms] = 1
# Running share of selections per bandit: cumulative pick count down the rows
# divided by the number of iterations elapsed so far (1, 2, ..., n_iterations).
# The column-vector reshape broadcasts the divisor across all bandit columns.
selections_percentage = (
    np.cumsum(selections_percentage, axis=0)
    / np.arange(1, n_iterations + 1).reshape(-1, 1)
)


# Plot how the share of selections of each bandit evolves over training.
# Episode numbers are 1-based so the first sample has a valid position on the
# logarithmic x-axis (x = 0 cannot be shown on a log scale).
episodes = np.arange(1, len(selections_percentage) + 1)
for arm in range(n_bandits):
    # selections_percentage holds fractions in [0, 1]; scale by 100 so the
    # plotted values match the "(%)" unit in the y-axis label.
    plt.plot(episodes, selections_percentage[:, arm] * 100, label=f'Bandit #{arm+1}')
plt.xscale('log')
plt.title('Bandit Action Choices Over Time')
plt.xlabel('Episode Number')
plt.ylabel('Percentage of Bandit Selections (%)')
plt.legend()
plt.show()

# Print the hidden win probabilities for comparison with the plot.
for i, prob in enumerate(true_bandit_probs, 1):
    print(f"Bandit #{i} -> {prob:.2f}")
Bandit #1 -> 0.37
Bandit #2 -> 0.95
Bandit #3 -> 0.73
Bandit #4 -> 0.60
Reinforcement Learning with Gymnasium in Python