Deep Reinforcement Learning in Python
Timothée Carayol
Principal Machine Learning Engineer, Komment
def select_action(policy_network, state):
    """Sample an action from the policy; return it with its log-prob and entropy.

    Args:
        policy_network: Callable applied to ``state``. Its output is passed to
            ``Categorical`` as the per-action probabilities.
        state: Input handed to ``policy_network`` unchanged.

    Returns:
        A tuple ``(action, log_prob, entropy)``:
        the sampled action converted with ``.item()``, the log-probability of
        that action reshaped to a one-element tensor, and the entropy of the
        action distribution as returned by ``Categorical.entropy()``.
    """
    action_probs = policy_network(state)
    action_dist = Categorical(action_probs)
    action = action_dist.sample()
    log_prob = action_dist.log_prob(action)
    # Obtain the entropy of the policy
    entropy = action_dist.entropy()
    return (action.item(), log_prob.reshape(1), entropy)
# Subtract the policy entropy, weighted by c_entropy, from the actor loss.
# NOTE(review): with c_entropy > 0 this lowers the loss for higher-entropy
# policies, presumably to encourage exploration — confirm the intended sign.
actor_loss -= c_entropy * entropy
# Note: Categorical.entropy() is in nats; divide by math.log(2) for bits.

# Run 10 episodes, updating the actor and the critic after every step.
for episode in range(10):
    state, info = env.reset()
    done = False
    while not done:
        action, action_log_prob, entropy = select_action(actor, state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # The episode ends on either termination or truncation.
        done = terminated or truncated
        # NOTE(review): calculate_losses is defined elsewhere; the log-prob is
        # passed once here (assumed signature: critic, log_prob, reward,
        # state, next_state, done) — confirm against its definition.
        actor_loss, critic_loss = calculate_losses(
            critic, action_log_prob, reward, state, next_state, done
        )
        # Subtract the entropy term, weighted by c_entropy, from the actor loss.
        actor_loss -= c_entropy * entropy
        actor_optimizer.zero_grad()
        actor_loss.backward()
        actor_optimizer.step()
        critic_optimizer.zero_grad()
        critic_loss.backward()
        critic_optimizer.step()
        state = next_state
Deep Reinforcement Learning in Python