## Configs for distillation benchmark ##
# Time budgets, smallest to largest. The values line up with seconds
# (e.g. TIME_1H == 60 * 60).
TIME_SUPERFAST = 60
TIME_5M = 5 * 60
TIME_15M = 15 * 60
TIME_30M = 30 * 60
TIME_1H = 60 * 60
TIME_4H = 4 * 60 * 60
TIME_8H = 8 * 60 * 60

# Distillation factors, smallest to largest: time_limit is multiplied by this amount.
# PROFILES below also reuses these constants for its 'distill_size_factor' entries.
DISTILLTIME_FACTOR_SHRUNK = 0.3
DISTILLTIME_FACTOR_SHORTEST = 1
DISTILLTIME_FACTOR_SHORTER = 5
DISTILLTIME_FACTOR_FULL = 10
DISTILLTIME_FACTOR_LONGEST = 20

# Subsample sizes; PROFILES below stores SUBSAMPLE_NONE / SUBSAMPLE_SUPERFAST under 'subsample_size'.
# NOTE(review): None presumably means "do not subsample" - confirm against the consumer.
SUBSAMPLE_NONE = None
SUBSAMPLE_SUPERFAST = 1000
SUBSAMPLE = 10000

# Profile names. Each constant holds its own name as a string and is used as a key of PROFILES.

# Unconstrained profiles.
PROFILE_FAST = 'PROFILE_FAST'
PROFILE_1H = 'PROFILE_1H'  # The default
PROFILE_4H = 'PROFILE_4H'
PROFILE_8H = 'PROFILE_8H'

# Constrained profiles.
PROFILE_5M_CONSTRAINED = 'PROFILE_5M_CONSTRAINED'
PROFILE_15M_CONSTRAINED = 'PROFILE_15M_CONSTRAINED'
PROFILE_30M_CONSTRAINED = 'PROFILE_30M_CONSTRAINED'
PROFILE_1H_CONSTRAINED = 'PROFILE_1H_CONSTRAINED'
PROFILE_4H_CONSTRAINED = 'PROFILE_4H_CONSTRAINED'

# Small-student profiles, plus the list that enumerates them.
PROFILE_5M_SMALLSTUDENT = 'PROFILE_5M_SMALLSTUDENT'
PROFILE_1H_SMALLSTUDENT = 'PROFILE_1H_SMALLSTUDENT'
PROFILE_4H_SMALLSTUDENT = 'PROFILE_4H_SMALLSTUDENT'
SMALL_STUDENT_PROFILES = [
    PROFILE_5M_SMALLSTUDENT,
    PROFILE_1H_SMALLSTUDENT,
    PROFILE_4H_SMALLSTUDENT,
]

# GAN epoch counts, smallest to largest. Among the profiles defined below, only
# FAST_GAN_EPOCHS and SUPERFAST_GAN_EPOCHS are referenced (as 'gan_epochs').
SUPERFAST_GAN_EPOCHS = 2
FAST_GAN_EPOCHS = 5
CONSTRAINED_GAN_EPOCHS = 20
FULL_GAN_EPOCHS = 300


# Benchmark profiles: profile name -> settings dict.
# Every profile sets 'subsample_size', 'time_limits', 'distill_time_factor' and
# 'distill_size_factor'; only some profiles additionally set 'gan_epochs'.
PROFILES = {
    PROFILE_15M_CONSTRAINED: dict(
        subsample_size=SUBSAMPLE_NONE,
        time_limits=TIME_15M,
        distill_time_factor=DISTILLTIME_FACTOR_SHRUNK,
        distill_size_factor=DISTILLTIME_FACTOR_SHORTEST,
        gan_epochs=FAST_GAN_EPOCHS,
    ),
    PROFILE_1H: dict(
        subsample_size=SUBSAMPLE_NONE,
        time_limits=TIME_1H,
        distill_time_factor=DISTILLTIME_FACTOR_FULL,
        distill_size_factor=DISTILLTIME_FACTOR_FULL,
    ),
    PROFILE_1H_CONSTRAINED: dict(
        subsample_size=SUBSAMPLE_NONE,
        time_limits=TIME_1H,
        distill_time_factor=DISTILLTIME_FACTOR_SHORTEST,
        distill_size_factor=DISTILLTIME_FACTOR_FULL,
    ),
    PROFILE_4H: dict(
        subsample_size=SUBSAMPLE_NONE,
        time_limits=TIME_4H,
        distill_time_factor=DISTILLTIME_FACTOR_FULL,
        distill_size_factor=DISTILLTIME_FACTOR_FULL,
    ),
    PROFILE_4H_CONSTRAINED: dict(
        subsample_size=SUBSAMPLE_NONE,
        time_limits=TIME_4H,
        distill_time_factor=DISTILLTIME_FACTOR_SHORTEST,
        distill_size_factor=DISTILLTIME_FACTOR_FULL,
    ),
    PROFILE_8H: dict(
        subsample_size=SUBSAMPLE_NONE,
        time_limits=TIME_8H,
        distill_time_factor=DISTILLTIME_FACTOR_SHORTEST,
        distill_size_factor=DISTILLTIME_FACTOR_FULL,
    ),
    PROFILE_30M_CONSTRAINED: dict(
        subsample_size=SUBSAMPLE_NONE,
        time_limits=TIME_30M,
        distill_time_factor=DISTILLTIME_FACTOR_SHORTEST,
        distill_size_factor=DISTILLTIME_FACTOR_FULL,
    ),
    PROFILE_5M_CONSTRAINED: dict(
        subsample_size=SUBSAMPLE_NONE,
        time_limits=TIME_5M,
        distill_time_factor=DISTILLTIME_FACTOR_SHORTEST,
        distill_size_factor=DISTILLTIME_FACTOR_FULL,
        gan_epochs=FAST_GAN_EPOCHS,
    ),
    PROFILE_FAST: dict(
        subsample_size=SUBSAMPLE_SUPERFAST,
        time_limits=TIME_SUPERFAST,
        distill_time_factor=DISTILLTIME_FACTOR_SHORTEST,
        distill_size_factor=DISTILLTIME_FACTOR_SHORTEST,
        gan_epochs=SUPERFAST_GAN_EPOCHS,
    ),
}

# Each small-student profile starts as an independent (shallow) copy of the
# constrained profile with the same time budget.
PROFILES.update({
    PROFILE_5M_SMALLSTUDENT: PROFILES[PROFILE_5M_CONSTRAINED].copy(),
    PROFILE_1H_SMALLSTUDENT: PROFILES[PROFILE_1H_CONSTRAINED].copy(),
    PROFILE_4H_SMALLSTUDENT: PROFILES[PROFILE_4H_CONSTRAINED].copy(),
})

# Hyperparameter settings keyed by model code ('NN', 'GBM', 'CAT', 'RF').
# NOTE(review): presumably applied when a profile is in SMALL_STUDENT_PROFILES - confirm against the caller.
SMALL_STUDENT_HYPERPARAMS = {
    'NN': {
        'network_type': 'feedforward',
        'layers': [128],
        'max_layer_width': 128,
        'max_embedding_dim': 8,
        'dropout_prob': 0.0,
    },
    'GBM': {'num_boost_round': 100},
    'CAT': {'iterations': 100},
    'RF': {'max_depth': 15},
}


# Bucket names; both constants currently hold the same value.
BUCKET = 'gibbsdist'
GIBBS_BUCKET = 'gibbsdist'

# Subfolder where Gibbs samples are stored.
# OLD: AutomlBenchmarkDatasplits/generated2/
GIBBS_PREFIX = 'AutomlBenchmarkDataSplits/fixed_header_gibbs_sampler/'
GIBBS_PREFIX_REGRESSDATA = 'AutomlBenchmarkDataSplits/gibbs_regs/'

# Each augmented-samples file name must contain either a "smaller" or a "bigger" tag.
# Tags unique to the smaller transformer model:
SMALLER_MODEL_STRINGS = ['cpy1', 'cpp12y0']
# Tags unique to the bigger transformer model:
BIGGER_MODEL_STRINGS = ['cpy2', 'cpp12y6']

# Methods: group tags, each holding its own name as a string.

# NOTE(review): ALL_METHODS is assigned a second time near the end of this file
# (to a list of result-table method names), so after import this string value is
# no longer reachable under that name - confirm which meaning callers expect.
ALL_METHODS = 'ALL_METHODS'
ALL_METHODS_GIBBS_SOME = 'ALL_METHODS_GIBBS_SOME'
ALL_METHODS_NO_GIBBS = 'ALL_METHODS_NO_GIBBS'

ALL_DISTILL = 'ALL_DISTILL'
ALL_DISTILL_GIBBS_SOME = 'ALL_DISTILL_GIBBS_SOME'
ALL_DISTILL_NO_GIBBS = 'ALL_DISTILL_NO_GIBBS'
ALL_DISTILL_SOFT_NOGIBBS = 'ALL_DISTILL_SOFT_NOGIBBS'

# Only the group tags defined above may be specified as a single string; the
# individual method names below must be specified together in a list of length > 1.
DISTILL_BASELINE = 'DISTILL_BASELINE'
DISTILL_SOFT_NONE = 'DISTILL_SOFT_NONE'
DISTILL_HARD_MUNGE = 'DISTILL_HARD_MUNGE'
DISTILL_SOFT_MUNGE = 'DISTILL_SOFT_MUNGE'
DISTILL_SOFT_SPUNGE = 'DISTILL_SOFT_SPUNGE'
DISTILL_SOFT_GAN = 'DISTILL_SOFT_GAN'

# Single-round Gibbs variants: use only R1 / R5 / R10 of Gibbs (from one of the
# models), i.e. only consider one augmented dataset.
DISTILL_GIBBS_R1 = 'DISTILL_GIBBS_R1'
DISTILL_GIBBS_R5 = 'DISTILL_GIBBS_R5'
DISTILL_GIBBS_R10 = 'DISTILL_GIBBS_R10'

# Only try some configurations of Gibbs (1 round, 5 round, 10 round).
DISTILL_GIBBS_SOME = 'DISTILL_GIBBS_SOME'
# Try each possible configuration of Gibbs.
DISTILL_GIBBS_ALL = 'DISTILL_GIBBS_ALL'
# Run extra rounds: r20, r40, r100, r200.
DISTILL_GIBBS_200 = 'DISTILL_GIBBS_200'

DISTILL_METHODS_GIBBS_POSSIBILITIES = [
    DISTILL_GIBBS_R1,
    DISTILL_GIBBS_R5,
    DISTILL_GIBBS_R10,
    DISTILL_GIBBS_200,
    DISTILL_GIBBS_SOME,
    DISTILL_GIBBS_ALL,
]

# Corresponds to the ALL_DISTILL_SOFT_NOGIBBS tag.
DISTILL_METHODS_SOFTAUG_ONLY = [
    DISTILL_SOFT_MUNGE,
    DISTILL_SOFT_SPUNGE,
    DISTILL_SOFT_GAN,
]

# Corresponds to the ALL_DISTILL_NO_GIBBS tag.
DISTILL_METHODS_NO_GIBBS = [
    DISTILL_BASELINE,
    DISTILL_SOFT_NONE,
    DISTILL_HARD_MUNGE,
] + DISTILL_METHODS_SOFTAUG_ONLY

# Corresponds to the ALL_DISTILL tag.
DISTILL_METHODS_LIST = DISTILL_METHODS_NO_GIBBS + [DISTILL_GIBBS_ALL]

# Corresponds to the ALL_DISTILL_GIBBS_SOME tag.
DISTILL_METHODS_GIBBS_SOME = DISTILL_METHODS_NO_GIBBS + [DISTILL_GIBBS_SOME]

FIT = 'FIT'

# LOAD_FROM_TAG overrides FIT if included (only use once): the predictor is
# loaded from a tagged run.
#   Specify it like this: 'LOAD_FROM_TAG__PROFILENAME__TAGNAME' (no slash after TAGNAME).
#   PROFILENAME__TAGNAME must be formatted as e.g.
#   PROFILE_30M_CONSTRAINED__flt1_Ami6_dstl, where __ is the delimiter.
# If neither FIT nor LOAD_FROM_TAG is specified, the predictor is loaded from the
# file specified in the current TAG (this breaks if that file does not exist).
LOAD_FROM_TAG = 'LOAD_FROM_TAG'

# Delimiter used to split the tag off a LOAD_FROM_TAG specification.
TAG_SPLIT = '__'

# 9 regression datasets from UCI (Bayesian deep learning benchmark).
REGRESSION_DATASETS = [
    'bostonHousing',
    'concrete',
    'energy',
    'kin8nm',
    'naval-propulsion-plant',
    'power-plant',
    'protein-tertiary-structure',
    'wine-quality-red',
    'yacht',
]

# Binary-classification dataset names, all lower-case and in alphabetical order.
# Later in this file membership is tested with the lower-cased dataset name.
BINARY_DATASETS = [
    'adult', 'airlines', 'albert', 'amazon_employee_access',
    'apsfailure', 'australian', 'bank-marketing', 'blood-transfusion',
    'christine', 'credit-g', 'guillermo', 'higgs',
    'jasmine', 'kc1', 'kddcup09_appetency', 'kr-vs-kp',
    'miniboone', 'nomao', 'numerai28.6', 'phoneme',
    'riccardo', 'sylvine',
]
# Multiclass-classification dataset names, all lower-case.
# Usage: is_multiclass = dataset_name.lower() in MULTICLASS_DATASETS
MULTICLASS_DATASETS = [
    'anneal', 'arrhythmia', 'car', 'cnae-9',
    'connect-4', 'covertype', 'dilbert', 'dionis',
    'fabert', 'fashion-mnist', 'helena', 'jannis',
    'jungle_chess_2pcs_raw_endgame_complete',
    'mfeat-factors', 'robert', 'segment', 'shuttle',
    'vehicle', 'volkert',
]

# Metric name constants.
METRIC_ACC = 'accuracy'
METRIC_OG = 'original_metric'

# Datasets flagged as high-memory. The names are capitalised, unlike the
# lower-case entries of BINARY_DATASETS / MULTICLASS_DATASETS.
HIGH_MEM_DATASETS = ['Helena', 'Covertype']

# Reduce memory of sklearn trees.
min_samples_leaf = 10

# Hyperparameters for the high-memory case: no XT, reduced RF.
HIGH_MEM_HYPERPARAMS = {
    'NN': {},
    'GBM': {},
    'CAT': {},
    'KNN': {},
    'RF': {'min_samples_leaf': min_samples_leaf},
    'custom': ['GBM'],
}


## Final lists of datasets used:

# Numerical classification datasets.
# Removed: 'Shuttle' (too easy), 'Dionis' (teacher is bad).
# Consider removing: Amazon, Australian, mfeat-factor, MiniBooNE, nomao.
# Definitely remove: kc1, 'Shuttle' (too easy), 'Dionis'.
OFFICIAL_NUMERICAL_CLASSIFICATION_DATASETS = [
    'Amazon_employee_access',
    'Australian',
    'blood-transfusion',
    'connect-4',
    'Covertype',
    'Helena',
    'higgs',
    'Jannis',
    'jasmine',
    'jungle_chess_2pcs_raw_endgame_complete',
    'mfeat-factors',
    'MiniBooNE',
    'nomao',
    'numerai28.6',
    'phoneme',
    'sylvine',
    'vehicle',
    'segment',
    'Volkert',
]

# Mixed-type classification datasets (consider removing 'adult').
OFFICIAL_MIXED_CLASSIFICATION_DATASETS = ['credit-g', 'adult']

# Regression datasets (9 datasets).
OFFICIAL_REGRESSION_DATASETS = [
    'bostonHousing',
    'concrete',
    'energy',
    'kin8nm',
    'naval-propulsion-plant',
    'power-plant',
    'protein-tertiary-structure',
    'wine-quality-red',
    'yacht',
]

# Concatenation in the order numerical classification, mixed classification, regression.
# Resulting value, for reference:
#   >>> OFFICIAL_DATASETS
#   ['Amazon_employee_access', 'Australian', 'blood-transfusion', 'connect-4', 'Covertype',
#    'Helena', 'higgs', 'Jannis', 'jasmine', 'jungle_chess_2pcs_raw_endgame_complete',
#    'mfeat-factors', 'MiniBooNE', 'nomao', 'numerai28.6', 'phoneme', 'sylvine', 'vehicle',
#    'segment', 'Volkert', 'credit-g', 'adult', 'bostonHousing', 'concrete', 'energy',
#    'kin8nm', 'naval-propulsion-plant', 'power-plant', 'protein-tertiary-structure',
#    'wine-quality-red', 'yacht']
OFFICIAL_DATASETS = (
    OFFICIAL_NUMERICAL_CLASSIFICATION_DATASETS
    + OFFICIAL_MIXED_CLASSIFICATION_DATASETS
    + OFFICIAL_REGRESSION_DATASETS
)

# Official datasets whose lower-cased name is listed in BINARY_DATASETS, in
# sorted() order (plain string comparison, so capitalised names sort first).
# 12 datasets
OFFICIAL_BINARY_DATASETS = sorted(
    name for name in OFFICIAL_DATASETS
    if name.lower() in BINARY_DATASETS
)

# Same selection against MULTICLASS_DATASETS.
# 9 datasets
OFFICIAL_MULTICLASS_DATASETS = sorted(
    name for name in OFFICIAL_DATASETS
    if name.lower() in MULTICLASS_DATASETS
)

# All methods considered in results tables:
# NOTE(review): this rebinds ALL_METHODS, which is defined earlier in this file as
# the string tag 'ALL_METHODS'; after import only this list is visible under that
# name - confirm which meaning callers expect.
ALL_METHODS = [
    # Per-model entries, one row per distillation variant, in the order
    # LightGBM, Catboost, NeuralNet, RandomForest.
    'dstl_BASELINE_LightGBM', 'dstl_BASELINE_Catboost', 'dstl_BASELINE_NeuralNet', 'dstl_BASELINE_RandomForest',
    'dstl_hard_munge_LightGBM', 'dstl_hard_munge_Catboost', 'dstl_hard_munge_NeuralNet', 'dstl_hard_munge_RandomForest',
    'dstl_soft_None_LightGBM', 'dstl_soft_None_Catboost', 'dstl_soft_None_NeuralNet', 'dstl_soft_None_RandomForest',
    'dstl_soft_gan_LightGBM', 'dstl_soft_gan_Catboost', 'dstl_soft_gan_NeuralNet', 'dstl_soft_gan_RandomForest',
    'dstl_soft_munge_LightGBM', 'dstl_soft_munge_Catboost', 'dstl_soft_munge_NeuralNet', 'dstl_soft_munge_RandomForest',
    'dstl_soft_spunge_LightGBM', 'dstl_soft_spunge_Catboost', 'dstl_soft_spunge_NeuralNet', 'dstl_soft_spunge_RandomForest',
    'dstl_GIB_r1_LightGBM', 'dstl_GIB_r1_Catboost', 'dstl_GIB_r1_NeuralNet', 'dstl_GIB_r1_RandomForest',
    'dstl_GIB_r5_LightGBM', 'dstl_GIB_r5_Catboost', 'dstl_GIB_r5_NeuralNet', 'dstl_GIB_r5_RandomForest',
    'dstl_GIB_r10_LightGBM', 'dstl_GIB_r10_Catboost', 'dstl_GIB_r10_NeuralNet', 'dstl_GIB_r10_RandomForest',
    # 'sel_' entries, one per distillation variant.
    'sel_dstl_BASELINE', 'sel_dstl_hard_munge', 'sel_dstl_soft_None',
    'sel_dstl_soft_gan', 'sel_dstl_soft_munge', 'sel_dstl_soft_spunge',
    'sel_dstl_GIB_r1', 'sel_dstl_GIB_r5', 'sel_dstl_GIB_r10',
    # 'avg_' entries, one per distillation variant.
    'avg_dstl_BASELINE', 'avg_dstl_hard_munge', 'avg_dstl_soft_None',
    'avg_dstl_soft_gan', 'avg_dstl_soft_munge', 'avg_dstl_soft_spunge',
    'avg_dstl_GIB_r1', 'avg_dstl_GIB_r5', 'avg_dstl_GIB_r10',
    'teacher',
]







