""" Example script to run auto-sklearn.
    Note: auto-sklearn can only be run on Linux, not Mac.


Install first:

sudo apt-get -yq install swig

pip uninstall xgboost -y
pip install xgboost==0.80
pip install auto-sklearn==0.5.2

screen -h 100000 -L

python





REGRESSION_DATASETS = ['bostonHousing', 'concrete', 'energy', 'kin8nm', 'naval-propulsion-plant', 'power-plant', 'protein-tertiary-structure', 'wine-quality-red', 'yacht']


# Running 4H:
dst_autosklearn_boston
dst_autosklearn_concrete
dst_autosklearn_energy
dst_autosklearn_power-plant
dst_autosklearn_naval-propulsion-plant
dst_autosklearn_kin8nm


# Running test :
dst_autosklearn_protein
dst_autosklearn_wine
dst_autosklearn_yacht



'bostonHousing', 'concrete', 'energy', 'kin8nm', 'naval-propulsion-plant', 'power-plant', 'protein-tertiary-structure', 'wine-quality-red', 'yacht'

"""

import pandas as pd
import numpy as np
import autosklearn.regression
import sklearn.metrics
import time, warnings, os, tempfile, math, shutil
from psutil import virtual_memory

from autogluon_utils.benchmarking.distill_benchmark.configs import *


## Arguments ##
# Settings for a short trial run. The bare string literal right below (labelled
# "real run") records the alternative values: a 4-hour budget and its tag.
dataset_name = 'protein-tertiary-structure'
tag = "testrun"
runtime_sec = 30

""" real run:
runtime_sec =  60 * 60 * 4
tag = "4Hrun1"

"""

## End of Arguments ##

print(f"Args: \n dataset_name: {dataset_name} \n tag: {tag} \n runtime_sec: {runtime_sec}")

# Prespecified configs:
FOLD = 0  # fixed for now, not exposed as an argument
fold_str = str(FOLD)
label_column = '__label__'
train_filename = 'raw_train.csv'
test_filename = 'raw_test.csv'
metadata_csv = 'AutosklearnResults.csv'  # written to the current local directory

# S3 locations, assembled from the bucket, dataset name, fold and run tag.
# BUCKET is not defined in this file; presumably it is supplied by the
# star-import of the benchmark configs -- verify there.
s3_dataset_folder = ''.join(
    ['s3://', BUCKET, '/Regression/RegressionDataframes/', dataset_name, '/fold_', fold_str, '/']
)
s3_results_folder = ''.join(
    ['s3://', BUCKET, '/results/', dataset_name, '/autosklearn_4h/', tag, '/']
)
s3_results_file = s3_results_folder + metadata_csv

# Load data (train first, then test); pd.read_csv hands back a DataFrame and
# would accept a local CSV path just as well.
train_data, test_data = (
    pd.read_csv(s3_dataset_folder + csv_name)
    for csv_name in (train_filename, test_filename)
)
print(train_data.head())
print(test_data.head())

# Separate the label column from the remaining columns and convert both parts
# to plain arrays for auto-sklearn.
X_train = train_data.drop([label_column], axis=1).values
y_train = train_data[label_column].values
X_test = test_data.drop([label_column], axis=1).values
y_test = test_data[label_column].values

# autosklearn configs:
def numTrainedModels(autosk):
    """Return the number of target-algorithm runs reported by auto-sklearn.

    The value is scraped from the text returned by ``autosk.sprint_statistics()``:
    the single line containing "Number of target algorithm runs:" is located and
    the text following that marker is parsed as an integer.

    Args:
        autosk: Object exposing ``sprint_statistics()`` that returns a
            newline-separated summary string (here, a fitted auto-sklearn
            estimator).

    Returns:
        The parsed run count as an ``int``, or ``None`` (after emitting a
        warning) when the marker line is missing, appears more than once, or
        is not followed by a valid integer.
    """
    identifier_str = "Number of target algorithm runs:"  # marker of the line holding the value
    failure_msg = "Failed to get number of trained models from AutoSKLearn; sprint_statistics() returned unexpected formatted string"

    matching_lines = [
        line for line in autosk.sprint_statistics().split("\n")
        if identifier_str in line
    ]
    if len(matching_lines) != 1:
        warnings.warn(failure_msg)
        return None

    # Exactly two pieces means the marker occurs once on the line:
    # whatever precedes it, and the value text that follows it.
    pieces = matching_lines[0].strip().split(identifier_str)
    if len(pieces) != 2:
        warnings.warn(failure_msg)
        return None

    try:
        return int(pieces[1].strip())
    except ValueError:
        # Honour the "returns None" contract instead of crashing the whole
        # benchmark run when the value is not an integer (e.g. "n/a" or empty).
        warnings.warn(failure_msg)
        return None

# Intended location for trained models. Within the visible code this path is
# only consumed by the disabled configuration kept in a string literal below.
output_directory = 'autosklearn_models/'
num_cores = 1
n_jobs = num_cores

# NOTE: the worker count here is the literal 1; the n_jobs variable above is
# not wired into these parameters.
autosk_params = dict(n_jobs=1, time_left_for_this_task=runtime_sec)

print("Fitting auto-sklearn....")
# fit_time spans both construction of the regressor and the fit() call.
t0 = time.time()
autosk = autosklearn.regression.AutoSklearnRegressor(**autosk_params)
autosk.fit(X_train, y_train)
t1 = time.time()
fit_time = t1 - t0

# Model counts are collected before the prediction timer starts, so they do
# not inflate predict_time.
num_models_trained = numTrainedModels(autosk)
num_models_ensemble = len(autosk.get_models_with_weights())

t2 = time.time()
y_pred = autosk.predict(X_test)
t3 = time.time()
predict_time = t3 - t2

# Test RMSE = square root of the mean squared error on the held-out fold.
mse = sklearn.metrics.mean_squared_error(y_test, y_pred)
perf = np.sqrt(mse)

for template, value in (
    ("auto-sklearn test rmse: %s", perf),
    ("Number of models trained during auto-sklearn fit(): %s", num_models_trained),
    ("auto-sklearn ensemble-size used at inference-time: %s", num_models_ensemble),
    ("auto-sklearn fit runtime: %s", fit_time),
    ("auto-sklearn predict runtime: %s", predict_time),
):
    print(template % value)

# Save a one-row metadata file with latency and performance figures:
summary = {
    'test_rmse': perf,
    'predict_time': predict_time,
    'fit_time': fit_time,
    'num_models_trained': num_models_trained,
    'num_models_ensemble': num_models_ensemble,
    'num_train': len(train_data),
    'num_test': len(test_data),
}
metadata = pd.DataFrame({column: [value] for column, value in summary.items()})
metadata.to_csv(metadata_csv, index=False)
print(f"metadata saved to file: {metadata_csv}")
print(metadata)

# Copy to s3 through the aws CLI; any non-zero exit status aborts the script.
exit_val = os.system(f"aws s3 cp {metadata_csv} {s3_results_file}")
if exit_val != 0:
    raise ValueError("copying to s3 failed")
print(f"copied to s3: {s3_results_file}")

print(f"Run completed with Args: \n dataset_name: {dataset_name} \n tag: {tag} \n runtime_sec: {runtime_sec}")







"""
os.environ['JOBLIB_TEMP_FOLDER'] = tempfile.gettempdir()
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'

DEFAULT_ML_MEMORY_LIMIT = 3072  # 3072 is autosklearn defaults
DEFAULT_ENSEMBLE_MEMORY_LIMIT = 1024 # 1024 is autosklearn defaults
mem = virtual_memory()
total_gb = mem.total >> 30
total_memory_limit_mb = (total_gb - 2) * 1000 # Leave 2GB free for OS, as done for h2o.
# when memory is large enough, we should have:
# memory_limit_mb = (cores - 1) * ml_memory_limit_mb + ensemble_memory_limit_mb
ml_memory_limit = max(math.ceil(total_memory_limit_mb/n_jobs),
                      DEFAULT_ML_MEMORY_LIMIT)
if ml_memory_limit >= total_memory_limit_mb - DEFAULT_ENSEMBLE_MEMORY_LIMIT:
    ml_memory_limit = max(total_memory_limit_mb - DEFAULT_ENSEMBLE_MEMORY_LIMIT,
                          DEFAULT_ML_MEMORY_LIMIT)

remaining_memory = total_memory_limit_mb - ml_memory_limit
if DEFAULT_ENSEMBLE_MEMORY_LIMIT > remaining_memory:
    ensemble_memory_limit = DEFAULT_ENSEMBLE_MEMORY_LIMIT
else:
    ensemble_memory_limit = max(math.ceil(remaining_memory - ml_memory_limit),
                                math.ceil(ml_memory_limit / 3.0),  # default proportions
                                DEFAULT_ENSEMBLE_MEMORY_LIMIT)

autosk_params = {
    'n_jobs': n_jobs, # = 1 or set = -1 to use all available cores. If num_cores == -1, autosk.fit() can produce bug: "automl=self._automl[0], IndexError: list index out of range". This is a known bug reported here: https://github.com/automl/auto-sklearn/pull/733
    'seed': 0,
    'time_left_for_this_task': runtime_sec,
    'per_run_time_limit': int(max(min(360, int(runtime_sec*0.99)),
                              runtime_sec/5.0)), # run at least 5 trials if overall runtime > 360. Note: OpenML AutoML benchmark did not set per_run_time_limit because their datasets are smaller, it is crucial for larger datasets. Here it is set as recommended by auto-sklearn authors.
    'ml_memory_limit': ml_memory_limit,
    'ensemble_memory_limit': ensemble_memory_limit,
}
if output_directory is not None:
    autosk_params['tmp_folder'] = output_directory
    autosk_params['delete_tmp_folder_after_terminate'] = False

"""

# Old install: curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install







