R"""Script for converting raw synthetic data experiment output to CSV.

Prints the CSV to stdout.
"""
import collections
import csv
import io
import json
import os
import re
from statistics import median
from typing import Dict, List

from absl import app
from absl import flags
from absl import logging

import numpy as np


# absl's global flag container; every flag defined below is read through it.
FLAGS = flags.FLAGS

# Root directory; each method's results live in a subdirectory of it.
flags.DEFINE_string(
    name='data_dir',
    default=None,
    help='Path to directory to read from.',
)

# Per-method subdirectory names. An empty string disables that method.
flags.DEFINE_string(
    name='mgls_subdir',
    default='mgls',
    help=(
        'Subdirectory of data_dir containing results from mGLS experiments. '
        'Set to empty string if not present.'
    ),
)

flags.DEFINE_string(
    name='gradient_descent_subdir',
    default='gradient_descent',
    help=(
        'Subdirectory of data_dir containing results from gradient descent '
        'experiments. Set to empty string if not present.'
    ),
)

flags.DEFINE_string(
    name='random_vertex_subdir',
    default='random_vertex',
    help=(
        'Subdirectory of data_dir containing results from random vertex '
        'experiments. Set to empty string if not present.'
    ),
)

# How the pooled per-run values are collapsed into the single reported score.
flags.DEFINE_enum(
    name='reduction',
    default='median',
    enum_values=['mean', 'median'],
    help=(
        'The reduction to perform to reduce a list of values to a single value '
        'representing it.'
    ),
)


# Key identifying one experiment configuration; used to group result files.
ExpParams = collections.namedtuple(
    'ExpParams',
    ('dataset', 'n_components', 'm', 'N'),
)

# Metrics extracted from result files; fields hold either a single value or a
# list of values depending on the reader / aggregation stage.
ExpResults = collections.namedtuple(
    'ExpResults',
    ('train_loss', 'train_acc', 'val_acc'),
)


def base_read_result_file(filepath: str):
    """Load one JSON result file and extract its experiment parameters.

    Args:
        filepath: Path to a JSON result file; a leading `~` is expanded.

    Returns:
        A pair `(exp_params, results)`. `exp_params` is an `ExpParams` built
        from the `dataset`, `n_components`, `m` and `N` entries of
        `results['config']`; `results` is the complete decoded JSON content.

    Raises:
        KeyError: If `config` or one of the four parameter entries is missing.
    """
    filepath = os.path.expanduser(filepath)
    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # platform's locale-dependent default encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        results = json.load(f)

    config = results['config']
    exp_params = ExpParams(
        dataset=config["dataset"],
        n_components=config["n_components"],
        m=config["m"],
        N=config["N"],
    )

    return exp_params, results


def transpose_exp_results(exp_results: List[ExpResults]) -> ExpResults:
    """Pool a list of per-file results into one `ExpResults` of lists.

    For every input element and every metric field, a list or tuple value is
    flattened into the pooled list, while any other value is appended as a
    single item.

    Args:
        exp_results: The per-file results to combine.

    Returns:
        An `ExpResults` whose three fields are lists of the pooled values.
    """
    pooled = ExpResults(train_loss=[], train_acc=[], val_acc=[])
    for run in exp_results:
        for name in ('train_loss', 'train_acc', 'val_acc'):
            value = getattr(run, name)
            bucket = getattr(pooled, name)
            if isinstance(value, (list, tuple)):
                bucket.extend(value)
            else:
                bucket.append(value)
    return pooled

#####################################################################


def _read_random_vertex_file(filepath: str):
    """Read a random-vertex result file.

    The `train_loss`, `train_acc` and `val_acc` entries of the file are copied
    unchanged into the returned `ExpResults`.

    Returns:
        A pair `(exp_params, exp_results)`.
    """
    exp_params, results = base_read_result_file(filepath)
    return exp_params, ExpResults(
        results['train_loss'],
        results['train_acc'],
        results['val_acc'],
    )


def _read_gradient_descent_file(filepath: str):
    """Read a gradient-descent result file.

    Takes the first element of every `final_losses` entry as the training
    loss, and the last `binary_accuracy` / `val_binary_accuracy` value of
    every entry in `histories` as the training / validation accuracy.

    Returns:
        A pair `(exp_params, exp_results)` where each `exp_results` field is a
        list with one value per entry.
    """
    exp_params, results = base_read_result_file(filepath)
    final_losses = [entry[0] for entry in results['final_losses']]
    histories = results['histories']
    # Index -1 selects the final recorded value of each metric history.
    last_train_accs = [hist['binary_accuracy'][-1] for hist in histories]
    last_val_accs = [hist['val_binary_accuracy'][-1] for hist in histories]
    return exp_params, ExpResults(final_losses, last_train_accs, last_val_accs)


def _read_mgls_file(filepath: str):
    """Read an mGLS result file, keeping only the last recorded metrics.

    Returns:
        A pair `(exp_params, exp_results)` built from the final entries of
        `train_losses`, `train_accuracies` and `val_accuracies`, or
        `(None, None)` when `train_losses` is empty.
    """
    exp_params, results = base_read_result_file(filepath)
    if results['train_losses']:
        return exp_params, ExpResults(
            results['train_losses'][-1],
            results['train_accuracies'][-1],
            results['val_accuracies'][-1],
        )
    # Some of the experiments looked to have empty results for some reason;
    # signal the caller to drop them.
    return None, None


# Maps a method name to the reader that understands that method's file layout.
_METHOD_TO_READ_FILE_FN = dict(
    random_vertex=_read_random_vertex_file,
    gradient_descent=_read_gradient_descent_file,
    mgls=_read_mgls_file,
)


def collect_data(path: str, method: str):
    """Read every result file in a directory and group results by parameters.

    Args:
        path: Directory holding the result files; a leading `~` is expanded.
        method: Key into `_METHOD_TO_READ_FILE_FN` selecting the file reader.

    Returns:
        A dict mapping each `ExpParams` to one `ExpResults` whose fields are
        lists pooling the values of all files that share those parameters.

    Raises:
        KeyError: If `method` has no registered reader.
    """
    read_file = _METHOD_TO_READ_FILE_FN[method]
    path = os.path.expanduser(path)
    grouped = collections.defaultdict(list)
    # os.listdir returns entries in arbitrary order; sorting keeps the pooled
    # value order (and floating-point reductions over it) reproducible.
    for filename in sorted(os.listdir(path)):
        filepath = os.path.join(path, filename)
        # Only regular files can be result files; opening a nested directory
        # would raise inside the reader.
        if not os.path.isfile(filepath):
            logging.warning('Skipping non-file entry: %s', filepath)
            continue
        exp_params, exp_results = read_file(filepath)
        # A reader returns (None, None) for a file it decided to drop.
        if exp_params is None:
            continue
        grouped[exp_params].append(exp_results)
    return {k: transpose_exp_results(v) for k, v in grouped.items()}


#####################################################################

def rows_to_csv(rows) -> str:
    """Serialize an iterable of rows to CSV text.

    Args:
        rows: Iterable of rows, each an iterable of cell values.

    Returns:
        The CSV document as a string, formatted by `csv.writer` with its
        default dialect.
    """
    with io.StringIO() as buffer:
        csv.writer(buffer).writerows(rows)
        return buffer.getvalue()


def process_scores(scores):
    """Reduce a list of scores to a representative value and its spread.

    Args:
        scores: The values to summarize.

    Returns:
        A pair `(score, std)`: `score` is the mean or median of `scores`, as
        selected by the `reduction` flag, and `std` is `np.std(scores)`.

    Raises:
        ValueError: If the `reduction` flag is neither 'mean' nor 'median'.
    """
    reducers = {'mean': np.mean, 'median': median}
    reduction = FLAGS.reduction
    if reduction not in reducers:
        raise ValueError(reduction)
    return reducers[reduction](scores), np.std(scores)


def data_to_csv(data, process_scores=process_scores):
    """Turn grouped experiment results into table rows.

    Args:
        data: Mapping from `ExpParams` to an `ExpResults` of value lists.
        process_scores: Callable reducing a list of values to a
            `(score, std)` pair; defaults to the module-level function.

    Returns:
        A list of rows: the header first, then one row per entry of `data`,
        sorted. Each data row holds the four parameters, the reduced
        score/std pair for each metric, and the number of train-loss values.
    """
    body = sorted(
        [
            params.dataset,
            params.n_components,
            params.m,
            params.N,
            *process_scores(results.train_loss),
            *process_scores(results.train_acc),
            *process_scores(results.val_acc),
            len(results.train_loss),
        ]
        for params, results in data.items()
    )
    header = [
        'ds', 'd', 'm', 'N',
        'train_loss', 'train_loss_std',
        'train_acc', 'train_acc_std',
        'val_acc', 'val_acc_std',
        'n_data_points',
    ]
    return [header, *body]


def get_subdirs_dict():
    """Return the enabled methods and their subdirectory names.

    Returns:
        A dict from method name to the value of its `*_subdir` flag, omitting
        methods whose flag value is empty.
    """
    candidates = (
        ('mgls', FLAGS.mgls_subdir),
        ('gradient_descent', FLAGS.gradient_descent_subdir),
        ('random_vertex', FLAGS.random_vertex_subdir),
    )
    return {method: subdir for method, subdir in candidates if subdir}


def process_method_results(subdirs: Dict[str, str], method: str):
    """Build the table rows for a single method.

    Args:
        subdirs: Mapping from method name to its subdirectory of `data_dir`.
        method: The method whose results should be collected.

    Returns:
        The rows produced by `data_to_csv` for the files found under
        `<data_dir>/<subdirs[method]>`.
    """
    root = os.path.expanduser(FLAGS.data_dir)
    results_dir = os.path.join(root, subdirs[method])
    return data_to_csv(collect_data(results_dir, method))


def main(_):
    """Print one CSV section per enabled method to stdout.

    Each section is a title row, the method's table rows, and a row holding a
    single empty cell as a separator.
    """
    subdirs = get_subdirs_dict()

    rows = []
    for method in subdirs:
        rows += [
            [f'Results using {method}'],
            *process_method_results(subdirs, method),
            [''],
        ]

    print(rows_to_csv(rows))


if __name__ == "__main__":
    # --data_dir defaults to None, which os.path.expanduser cannot handle.
    # Have absl reject a missing value with a usage error up front instead of
    # failing with a TypeError in the middle of processing.
    flags.mark_flag_as_required('data_dir')
    app.run(main)
