# --- COMA specific parameters ---

# Action selection: actions are chosen by the "multinomial" selector.
# Both epsilon endpoints are 0.0.
# NOTE(review): presumably epsilon is annealed from epsilon_start to
# epsilon_finish over epsilon_anneal_time steps; with equal endpoints the
# anneal time would have no effect — confirm against the action selector.
action_selector: "multinomial"
epsilon_start: 0.0
epsilon_finish: 0.0
epsilon_anneal_time: 100000

# Runner used to collect experience.
# NOTE(review): presumably "parallel" steps several environments at once,
# with batch_size_run being how many — confirm against the runner code.
runner: "parallel"

# All three sizes below are set to the same value (8).
# NOTE(review): presumably buffer capacity, rollouts per run, and training
# batch size, all counted in episodes; equal values would mean every update
# trains only on the most recent rollouts — confirm.
buffer_size: 8
batch_size_run: 8
batch_size: 8

# Environment arguments.
env_args:
  state_last_action: false  # COMA critic adds the last action internally

# Update the target network every `target_update_interval` training steps.
target_update_interval: 16
# NOTE(review): presumably selects a recurrent critic network — confirm
# against the critic implementation.
recurrent_critic: true

# Learning rates: `lr` and a separate `critic_lr` (equal here).
# NOTE(review): presumably `lr` applies to the agent/actor network — confirm.
lr: 0.0005
critic_lr: 0.0005
# NOTE(review): presumably the TD(lambda) coefficient for critic targets;
# 0.0 would mean pure one-step bootstrapping — confirm against the learner.
td_lambda: 0.0

# Use COMA: agents output policy logits and are trained by the actor-critic
# learner, with "coma" selected for both the critic Q-function and the
# baseline.
agent_output_type: "pi_logits"
learner: "actor_critic_learner"
critic_q_fn: "coma"
critic_baseline_fn: "coma"
# NOTE(review): presumably the critic is trained on the whole batch at once,
# repeated critic_train_reps times per update — confirm against the learner.
critic_train_mode: "batch"
critic_train_reps: 4
q_nstep: 0  # 0 corresponds to the default Q target, 1 is r + gamma*Q, etc.

# Name of this algorithm configuration.
# NOTE(review): presumably used to label runs and results — confirm.
name: "coma"
