""" Script to run h2o on regression dataset (on EC2)


REGRESSION_DATASETS = ['bostonHousing', 'concrete', 'energy', 'kin8nm', 'naval-propulsion-plant', 'power-plant', 'protein-tertiary-structure', 'wine-quality-red', 'yacht']



First run from cmd line:

ec2name=dst_h2o4h_kin8nm
echo $ec2name

aws_connect_name $ec2name

screen -h 100000 -L

python


# still missing:
dst_h2o4h_energy



"""

import os
import subprocess

import h2o
import numpy as np
import pandas as pd
from autogluon import TabularPrediction as task
from autogluon.utils.tabular.ml.constants import BINARY, MULTICLASS, REGRESSION
from autogluon.utils.tabular.utils.loaders import load_pd

from autogluon_utils.benchmarking.baselines.h2o_base.h2o_base import H2OBaseline
from autogluon_utils.benchmarking.distill_benchmark.configs import *


## Arguments ##
# Run-specific settings: which dataset to fit, a tag naming this run's
# results folder, and the time budget handed to H2O.
dataset_name = 'energy'
tag = "4Hrun3"

runtime_sec = 4 * 60 * 60  # four hours, expressed in seconds
## End of Arguments ##

print(f"Args: \n dataset_name: {dataset_name} \n tag: {tag} \n runtime_sec: {runtime_sec}")

# Prespecified configs:
FOLD = 0  # not config for now
fold_str = str(FOLD)
output_directory = 'H2O_models/'  # where to save trained models

# S3 locations. BUCKET is not defined in this file; presumably it is provided
# by the `configs` star-import -- confirm if that module changes.
s3_dataset_folder = ''.join(
    ['s3://', BUCKET, '/Regression/RegressionDataframes/', dataset_name, '/fold_', fold_str, '/']
)
s3_results_folder = ''.join(
    ['s3://', BUCKET, '/results/', dataset_name, '/H2O_4h/', tag, '/']
)
metadata_csv = 'H2Oresults.csv'  # saved in current local directory
s3_results_file = s3_results_folder + metadata_csv

# Names of the per-fold CSV files inside s3_dataset_folder, and the target column.
train_filename, test_filename = 'raw_train.csv', 'raw_test.csv'
label_column = '__label__'

# Settings forwarded to the H2O fit call (and to the AutoGluon evaluator).
num_cores = -1
problem_type = REGRESSION
eval_metric = None  # RMSE by default

# Load data:
# Train and test splits are read from the same fold directory, train first.
train_path = s3_dataset_folder + train_filename
train_data = load_pd.load(train_path)
test_path = s3_dataset_folder + test_filename
test_data = load_pd.load(test_path)  # can be local CSV file as well, returns Pandas DataFrame

# Pull the label column out of the test frame: y_test keeps the labels and
# test_data is left (in place) with the remaining columns only.
y_test = test_data.pop(label_column)

# Quick visual sanity check of both frames.
for split_frame in (train_data, test_data):
    print(split_frame.head())


# Run h2o:
h2o_model = H2OBaseline()

# Fit on the training frame with the settings chosen above; fit() hands back
# two model counts and the fit time, which are reported and saved below.
h2o_fit_args = dict(
    train_data=train_data,
    label_column=label_column,
    problem_type=problem_type,
    eval_metric=eval_metric,
    runtime_sec=runtime_sec,
    num_cores=num_cores,
)
num_models_trained, num_models_ensemble, fit_time = h2o_model.fit(**h2o_fit_args)

# Predict on the label-free test frame; the middle element of the returned
# triple is not used by this script.
y_pred, _, predict_time = h2o_model.predict(
    test_data,
    predict_proba=False,
    pred_class_and_proba=False,
)

# Can use autogluon.tabular.Predictor to evaluate predictions (assuming metric correctly specified):
# the predictor below is fit with GBM limited to 2 boosting rounds and is used
# in this script only for its evaluate_predictions() method.
ag_train_set = task.Dataset(df=train_data)
ag_predictor = task.fit(
    ag_train_set,
    label=label_column,
    problem_type=problem_type,
    eval_metric=eval_metric,
    hyperparameters={'GBM': {'num_boost_round': 2}},
)

perf = ag_predictor.evaluate_predictions(y_test, y_pred)

# Report the headline numbers, one line each, in a fixed order.
for report_template, report_value in (
    ("H2O test performance: %s", perf),
    ("Number of models trained during H2O fit(): %s", num_models_trained),
    ("H2O ensemble-size used at inference-time: %s", num_models_ensemble),
    ("H2O fit runtime: %s", fit_time),
    ("H2O predict runtime: %s", predict_time),
):
    print(report_template % report_value)

# save metadata-file with latency, performance:
# One-row table summarising this run; written to the current directory first
# and then uploaded to the run's S3 results folder.
metadata = pd.DataFrame({
    'test_rmse': [perf],
    'predict_time': [predict_time],
    'fit_time': [fit_time],
    'num_models_trained': [num_models_trained],
    'num_models_ensemble': [num_models_ensemble],
    'num_train': [len(train_data)],
    'num_test': [len(test_data)],
})
metadata.to_csv(metadata_csv, index=False)
print(f"metadata saved to file: {metadata_csv}")
print(metadata)

# copy to s3:
# The AWS CLI is invoked with an argument list (no shell), so spaces or shell
# metacharacters in dataset_name / tag cannot change the command being run.
try:
    exit_val = subprocess.run(
        ["aws", "s3", "cp", metadata_csv, s3_results_file],
        check=False,
    ).returncode
except OSError as err:
    # e.g. the `aws` executable is not installed / not on PATH; surface it as
    # the same ValueError the script raises for any failed upload.
    raise ValueError("copying to s3 failed") from err
if exit_val != 0:
    raise ValueError(f"copying to s3 failed (exit code {exit_val})")
print(f"copied to s3: {s3_results_file}")

print(f"Run completed with Args: \n dataset_name: {dataset_name} \n tag: {tag} \n runtime_sec: {runtime_sec}")





