# Network architecture: sizes of the encoder, decoder and attention modules.
# Encoder related (names suggest layer count and hidden units -- TODO confirm
# against the model's argument parser).
elayers: 12
eunits: 2048
# Decoder related (same naming pattern as the encoder keys).
dlayers: 6
dunits: 2048
# Attention related (presumably attention dimension and number of heads;
# 256 / 4 = 64 per head if the heads split adim evenly -- TODO confirm).
adim: 256
aheads: 4

# Hybrid CTC/attention multi-task settings.
# NOTE(review): ctc_type uses an underscore while most other keys in this file
# are hyphenated; presumably this matches the parser's option name -- confirm.
ctc_type: builtin
mtlalpha: 1.0
# NOTE(review): "mtlbelta" may be a misspelling of "mtlbeta". It is left
# unchanged because the consuming argument parser is not visible from this
# file -- verify which key name the model actually reads.
mtlbelta: 1

# Label smoothing: weight applied to the smoothing term.
lsm-weight: 0.1

# Minibatch related
batch-size: 12
maxlen-in: 30000  # if input length > maxlen-in, the batch size is automatically reduced
maxlen-out: 150 # if output length > maxlen-out, the batch size is automatically reduced
drop-len: 120000 # utterances whose input length exceeds drop-len are dropped

# Optimization related
sortagrad: 0 # feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, any other value: enabled for that many epochs
opt: adam
# NOTE(review): if accum-grad accumulates gradients over that many minibatches,
# the effective batch size is batch-size x accum-grad -- confirm.
accum-grad: 2
grad-clip: 5
patience: 0
epochs: 100
dropout-rate: 0.1

# Waveform-loss related
sample-rate: 8000
# 0.032 * 8000 = 256, i.e. 32 ms if FRAME_RATE is the sample-rate above.
frame-length: 256    # int(0.032 * FRAME_RATE)
# 64 is a quarter of frame-length (0.008 * 8000 at the sample-rate above).
frame-shift: 64

# Joint (separation + ASR) model
# Transformer-specific settings
backend: pytorch
# Python import path of the model class, written as <module>:<class>.
model-module: 'espnet.nets.pytorch_backend.e2e_asr_separate_transformer:E2E'
transformer-input-layer: conv2d  # encoder architecture type
transformer-lr: 1.0
transformer-warmup-steps: 25000
transformer-attn-dropout-rate: 0.0
transformer-length-normalized-loss: false
transformer-init: pytorch
# TasNet-specific settings
# The single-letter keys appear to follow the Conv-TasNet paper's
# hyperparameter notation (N: encoder filters, L: filter length,
# B: bottleneck channels, H: conv-block channels, P: kernel size,
# X: blocks per repeat, R: repeats, C: number of sources) -- TODO confirm
# against the model code.
# 'N' is quoted because a bare N is a boolean literal in YAML 1.1 and strict
# loaders would turn the key into `false`; quoting keeps it the string "N".
'N': 256
L: 20
B: 256
H: 512
P: 3
X: 8
R: 4
C: 2
norm-type: 'gLN'
causal: 0
mask-nonlinear: 'relu'
end-separation-mode: 1
greedy-tf: 1
add-last-silence: 0
pit-without-tf: 0

# Presumably the scheduled-sampling probability (chance of feeding the model's
# own prediction instead of the ground truth during training) -- TODO confirm.
sampling-probability: 0.3

# Report character error rate (CER) and word error rate (WER).
# Booleans use the canonical lowercase form so every YAML loader reads them
# as booleans.
report-cer: true
report-wer: true

# Initialization from pre-trained models: each *-init key is a checkpoint path
# and the matching *-init-mods key is a comma-separated list of module names
# (presumably the modules loaded from that checkpoint -- TODO confirm).
# Paths are relative; presumably resolved against the recipe's working directory.
sep-init: 'exp/pre_trained_tasnet/snapshot.ep.30'
sep-init-mods: 'encoder,separator,mask_conv1x1,decoder'
asr-init: 'exp/pre_trained_asr/model.last10.avg.best'
asr-init-mods: 'encoder,decoder,ctc'
# NOTE(review): fixed-mods uses "sep_"-prefixed names while sep-init-mods uses
# unprefixed ones; presumably these are the joint model's own module names for
# the parts kept frozen -- verify against the model code.
fixed-mods: 'sep_encoder,sep_separator'
global-cmvn: 'conf/global_cmvn.txt'