# Scheduler for training baseline FP32 models of pre-activation ResNet on CIFAR-10
# Applicable to ResNet 20 / 32 / 44 / 56 / 110
# 
# Command line for training (running from the compress_classifier.py directory):
# python compress_classifier.py -a preact_resnet20_cifar --lr 0.1 -p 50 -b 128 <path_to_cifar10_dataset> -j 1 --epochs 200 --compress=../quantization/fp32_baselines/preact_resnet_cifar_base_fp32.yaml --wd=0.0002 --vs=0 --gpus 0
#
# Notes:
#  * In '-a preact_resnet20_cifar', replace '20' with the required depth
#  * '--wd=0.0002': Weight decay of 0.0002 is used
#  * '--vs=0': We train on the entire training dataset, and validate using the test set
#
# Knowledge Distillation:
# -----------------------
# To train these models with knowledge distillation, add the following arguments to the command line:
# --kd-teacher preact_resnet44_cifar --kd-resume <path_to_teacher_model_checkpoint> --kd-temp 5.0 --kd-dw 0.7 --kd-sw 0.3
#
# Notes:
#  * Replace 'preact_resnet44_cifar' with the required teacher model
#  * Use the command line above to train teacher models, which can be used as checkpoints passed to '--kd-resume'
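#  * In this example we're using a distillation temperature of 5.0 ('--kd-temp'). We give a weight of 0.7 to the
#    distillation loss ('--kd-dw'; that is - the loss of the student predictions vs. the teacher's soft targets)
#    and a weight of 0.3 to the student loss ('--kd-sw'; the loss of the student predictions vs. the true labels).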
#  * Note we don't change any of the other training hyper-parameters
#  * More details on knowledge distillation at: 
#    https://nervanasystems.github.io/distiller/schedule/index.html#knowledge-distillation
#
# Experimental results obtained with the hyper-parameters shown above are listed after the YAML schedule.

# Learning-rate scheduler definitions, keyed by instance name. The name
# 'training_lr' is the one referenced from the 'policies' section.
lr_schedulers:
  training_lr:
    class: MultiStepMultiGammaLR
    # NOTE(review): presumably each milestone epoch is paired, by position,
    # with the gamma at the same index (the LR being multiplied by that gamma
    # when the milestone is reached) - confirm against MultiStepMultiGammaLR.
    milestones:
      - 80
      - 120
      - 160
    gammas:
      - 0.1
      - 0.1
      - 0.2

# Scheduling policies: a single entry that binds the 'training_lr' scheduler
# to the epoch range below.
policies:
  - lr_scheduler:
      instance_name: training_lr
    starting_epoch: 0
    # Matches '--epochs 200' in the command line at the top of this file.
    ending_epoch: 200
    # NOTE(review): presumably "invoke once every epoch" - confirm.
    frequency: 1

# The results listed here are based on 4 runs in each configuration. All results are Top-1 accuracy (%):
#
# +-------+--------------+-------------------------+
# |       |              |           FP32          |
# +-------+--------------+-------------------------+
# | Depth | Distillation | Best  | Worst | Average |
# |       | Teacher      |       |       |         |
# +-------+--------------+-------+-------+---------+
# | 20    | None         | 92.4  | 91.91 | 92.2225 |
# +-------+--------------+-------+-------+---------+
# | 20    | 32           | 92.85 | 92.68 | 92.7375 |
# +-------+--------------+-------+-------+---------+
# | 20    | 44           | 93.09 | 92.64 | 92.795  |
# +-------+--------------+-------+-------+---------+
# | 20    | 56           | 92.77 | 92.52 | 92.6475 |
# +-------+--------------+-------+-------+---------+
# | 20    | 110          | 92.87 | 92.66 | 92.7725 |
# +-------+--------------+-------+-------+---------+
# |       |              |       |       |         |
# +-------+--------------+-------+-------+---------+
# | 32    | None         | 93.31 | 92.93 | 93.13   |
# +-------+--------------+-------+-------+---------+
# | 32    | 44           | 93.54 | 93.35 | 93.48   |
# +-------+--------------+-------+-------+---------+
# | 32    | 56           | 93.58 | 93.47 | 93.5125 |
# +-------+--------------+-------+-------+---------+
# | 32    | 110          | 93.6  | 93.29 | 93.4575 |
# +-------+--------------+-------+-------+---------+
# |       |              |       |       |         |
# +-------+--------------+-------+-------+---------+
# | 44    | None         | 94.07 | 93.5  | 93.7425 |
# +-------+--------------+-------+-------+---------+
# | 44    | 56           | 94.08 | 93.58 | 93.875  |
# +-------+--------------+-------+-------+---------+
# | 44    | 110          | 94.13 | 93.75 | 93.95   |
# +-------+--------------+-------+-------+---------+
# |       |              |       |       |         |
# +-------+--------------+-------+-------+---------+
# | 56    | None         | 94.2  | 93.52 | 93.8    |
# +-------+--------------+-------+-------+---------+
# | 56    | 110          | 94.47 | 94.0  | 94.16   |
# +-------+--------------+-------+-------+---------+
# |       |              |       |       |         |
# +-------+--------------+-------+-------+---------+
# | 110   | None         | 94.66 | 94.42 | 94.54   |
# +-------+--------------+-------+-------+---------+
