from typing import Tuple
import os
import tensorflow as tf
import numpy as np
from .gfp_protein_serializer import deserialize_gfp_sequence


def _bucket_spec(batch_size: int, max_sequence_length: int) -> Tuple[np.ndarray, np.ndarray]:
    """Return ``(bucket_boundaries, bucket_batch_sizes)`` for length bucketing.

    Buckets are 100 wide. ``bucket_by_sequence_length`` requires exactly one
    more batch size than there are boundaries (the extra one is the overflow
    bucket past the last boundary), so the bucket centers are derived from the
    boundaries instead of from a second, independent ``np.arange``. The
    independent ranges disagreed in length whenever
    ``max_sequence_length % 100`` was in ``1..50``, which made TensorFlow
    raise ``ValueError``.

    Args:
        batch_size: Batch size assigned to the last (longest) bucket.
        max_sequence_length: Length up to which 100-wide buckets are created.

    Returns:
        The bucket boundaries and the int32 per-bucket batch sizes, with
        ``len(batch_sizes) == len(boundaries) + 1``.
    """
    boundaries = np.arange(100, max_sequence_length + 100, 100)
    if boundaries.size:
        # One center per bounded bucket, plus one for the overflow bucket.
        centers = np.append(boundaries - 50, boundaries[-1] + 50)
    else:
        # No boundaries: everything lands in a single bucket.
        centers = np.array([50])
    # Scale inversely with the bucket's center length so shorter sequences get
    # proportionally larger batches; the cast truncates toward zero.
    batch_sizes = np.asarray(centers[-1] / centers * batch_size, np.int32)
    return boundaries, batch_sizes


def get_gfp_data(directory: str, batch_size: int, max_sequence_length: int) -> \
        Tuple[tf.data.Dataset, tf.data.Dataset, tf.data.Dataset]:
    """Build the train/valid/test GFP datasets, bucketed by protein length.

    Reads ``gfp3_{train,valid,test}.tfrecords`` from
    ``<directory>/supervised/gfp3``, deserializes every record with
    ``deserialize_gfp_sequence`` and batches the examples with
    ``tf.data.experimental.bucket_by_sequence_length`` keyed on each example's
    ``'protein_length'`` entry.

    Args:
        directory: Root data directory containing ``supervised/gfp3``.
        batch_size: Batch size of the longest bucket; shorter buckets get
            proportionally larger batches. Also used as the parallelism of the
            deserialization ``map`` (see the note in the code).
        max_sequence_length: Length up to which 100-wide buckets are created.

    Returns:
        The ``(train, valid, test)`` datasets. Only the training dataset is
        shuffled.

    Raises:
        FileNotFoundError: If any of the three TFRecord files is missing; the
            missing path is the exception argument.
    """
    data_dir = os.path.join(directory, 'supervised', 'gfp3')
    train_file, valid_file, test_file = (
        os.path.join(data_dir, 'gfp3_{}.tfrecords'.format(split))
        for split in ('train', 'valid', 'test'))

    # Check eagerly so a missing file is reported here rather than when the
    # dataset is first iterated.
    for path in (train_file, valid_file, test_file):
        if not os.path.exists(path):
            raise FileNotFoundError(path)

    # Identical for all three splits, so compute once.
    bucket_boundaries, bucket_batch_sizes = _bucket_spec(batch_size, max_sequence_length)

    def prepare_dataset(dataset: tf.data.Dataset, shuffle: bool) -> tf.data.Dataset:
        """Deserialize, optionally shuffle, and bucket-batch one split."""
        # NOTE(review): the second argument of Dataset.map is
        # num_parallel_calls, so batch_size acts as the degree of parallelism
        # here, not as a batch size. Kept as-is; confirm this is intended.
        dataset = dataset.map(deserialize_gfp_sequence, num_parallel_calls=batch_size)
        # No length filtering is done: examples longer than the last boundary
        # are placed in the final overflow bucket.
        dataset = dataset.shuffle(1024) if shuffle else dataset.prefetch(1024)
        batch_fun = tf.data.experimental.bucket_by_sequence_length(
            lambda example: example['protein_length'],
            bucket_boundaries,
            bucket_batch_sizes)
        return dataset.apply(batch_fun)

    train_data = prepare_dataset(tf.data.TFRecordDataset(train_file), shuffle=True)
    valid_data = prepare_dataset(tf.data.TFRecordDataset(valid_file), shuffle=False)
    test_data = prepare_dataset(tf.data.TFRecordDataset(test_file), shuffle=False)

    return train_data, valid_data, test_data
