Deep Reinforcement Learning in Python
Timothée Carayol
Principal Machine Learning Engineer, Komment





def select_action(policy_network, state):
  """Sample an action from the policy; return it with its log-prob and entropy.

  Args:
    policy_network: Callable applied to ``state``; its output is passed
      as the first argument to ``Categorical`` (presumably action
      probabilities for ``torch.distributions.Categorical`` -- confirm
      against the file's imports).
    state: Observation, forwarded unchanged to ``policy_network``.

  Returns:
    A tuple ``(action, log_prob, entropy)``:
      * ``action`` -- the sampled action converted with ``.item()``,
      * ``log_prob`` -- log-probability of that action, reshaped to ``(1,)``,
      * ``entropy`` -- entropy of the action distribution.
  """
  action_probs = policy_network(state)
  action_dist = Categorical(action_probs)
  action = action_dist.sample()
  log_prob = action_dist.log_prob(action)
  # Obtain the entropy of the policy
  entropy = action_dist.entropy()
  return (action.item(), log_prob.reshape(1), entropy)
# Entropy bonus: subtract the scaled policy entropy from the actor loss:
#     actor_loss -= c_entropy * entropy
# Note: Categorical.entropy() is in nats; divide by math.log(2) for bits.

# Training loop: 10 episodes, updating actor and critic after every step.
for episode in range(10):
  state, info = env.reset()
  done = False
  while not done:
    # Sample an action together with its log-probability and policy entropy.
    action, action_log_prob, entropy = select_action(actor, state)
    next_state, reward, terminated, truncated, _ = env.step(action)
    # The episode ends on either termination or truncation.
    done = terminated or truncated
    # NOTE(review): action_log_prob is passed twice; calculate_losses is not
    # visible here -- presumably the second slot is the "old" log-prob.
    # Confirm against its signature.
    actor_loss, critic_loss = calculate_losses(critic, action_log_prob, action_log_prob,
                                               reward, state, next_state, done)
    # Apply the entropy bonus before back-propagating the actor loss.
    actor_loss -= c_entropy * entropy
    # Actor update.
    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()
    # Critic update.
    critic_optimizer.zero_grad()
    critic_loss.backward()
    critic_optimizer.step()
    state = next_state
  
Deep Reinforcement Learning in Python