{
  "nbformat": 4,
  "nbformat_minor": 5,
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.8"
    },
    "colab": {
      "name": "german.ipynb",
      "provenance": [],
      "collapsed_sections": []
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "UeupksW9JqX7"
      },
      "source": [
        "### German Loan Dataset\n",
        "This dataset is provided by the UC Irvine ML Repository and can be found [here](https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)).  The inputs are demographic and financial data about an individual and the task is to predict whether the individual is a high or low risk of defaulting on a loan.  This notebook can run without a GPU and will take a few minutes per trial, depending on how many training iterations you want.  \n"
      ],
      "id": "UeupksW9JqX7"
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "casual-avatar"
      },
      "source": [
        "from pandas import read_csv\n",
        "import numpy as np\n",
        "from sklearn.preprocessing import LabelEncoder\n",
        "from sklearn.preprocessing import OneHotEncoder\n",
        "from sklearn.preprocessing import MinMaxScaler\n",
        "from sklearn.compose import ColumnTransformer\n",
        "from sklearn.linear_model import LogisticRegression as LogReg\n",
        "\n",
        "from time import time\n",
        "import sys\n",
        "\n",
        "# load the dataset\n",
        "def load_dataset(full_path, sep=','):\n",
        "    '''Read a header-less delimited file and split it into inputs and an encoded target.\n",
        "\n",
        "    Rows with missing values are dropped.  The last column is taken as the\n",
        "    target and label-encoded; every other column is an input.\n",
        "\n",
        "    Returns (inputs, target, cat_ix, num_ix): the inputs as a numpy array,\n",
        "    the encoded target, and the column labels of the object/bool inputs\n",
        "    and of the int64/float64 inputs.\n",
        "    '''\n",
        "    frame = read_csv(full_path, sep=sep, header=None).dropna()\n",
        "    # with header=None the columns are numbered, so the target is the highest label\n",
        "    target_col = len(frame.columns) - 1\n",
        "    inputs = frame.drop(target_col, axis=1)\n",
        "    # partition the input columns by dtype\n",
        "    categorical_cols = inputs.select_dtypes(include=['object', 'bool']).columns\n",
        "    numerical_cols = inputs.select_dtypes(include=['int64', 'float64']).columns\n",
        "    # map the target classes onto integer codes\n",
        "    encoded_target = LabelEncoder().fit_transform(frame[target_col])\n",
        "    return inputs.values, encoded_target, categorical_cols, numerical_cols\n",
        " \n",
        "# define the location of the dataset\n",
        "train_path = 'german.data'\n",
        "# load the full dataset; the file is space-separated\n",
        "X, Y, cat_ix, num_ix = load_dataset(train_path, sep=' ')\n",
        "\n",
        "# one-hot encode the categorical columns and min-max scale the numerical ones;\n",
        "# sparse_threshold=0 asks for a dense output array\n",
        "ct = ColumnTransformer(\n",
        "    transformers=[\n",
        "        ('c', OneHotEncoder(), cat_ix),\n",
        "        ('n', MinMaxScaler(), num_ix),\n",
        "    ],\n",
        "    sparse_threshold=0,\n",
        ")\n",
        "# get encoded data inputs\n",
        "data = ct.fit_transform(X)"
      ],
      "id": "casual-avatar",
      "execution_count": 1,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "xm4-LfTCJ-iR"
      },
      "source": [
        "### Dataset\n",
        "\n",
        "Define a custom PyTorch dataset that splits the examples into female and male groups, our chosen attributes.  "
      ],
      "id": "xm4-LfTCJ-iR"
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "sunset-afternoon",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 237
        },
        "outputId": "508cf4a9-77fc-42c5-a263-55d5f8f88614"
      },
      "source": [
        "import torch\n",
        "from torch.utils.data import Dataset, DataLoader, Subset, RandomSampler\n",
        "\n",
        "class GermanDataset(Dataset):\n",
        "    '''Map-style dataset over the encoded rows, with the rows grouped by gender.\n",
        "\n",
        "    data    : the transformed (encoded) feature rows\n",
        "    labels  : the label of each row\n",
        "    indices : list of two 1-D index arrays into the rows;\n",
        "              indices[0] holds the female rows, indices[1] the male rows\n",
        "    '''\n",
        "\n",
        "    def __init__(self, raw_data, transformed_data, labels):\n",
        "        self.data = transformed_data\n",
        "        self.labels = labels\n",
        "        # column 8 of the raw data carries the codes used to derive gender: 0=female, 1=male\n",
        "        status = raw_data[:, 8]\n",
        "        female = (status == 'A92') | (status == 'A95')\n",
        "        male = (status == 'A91') | (status == 'A93') | (status == 'A94')\n",
        "        # flatnonzero always yields a 1-D array; argwhere(...).squeeze() collapsed\n",
        "        # to a 0-d array whenever a group held exactly one row, which cannot be\n",
        "        # used as a list of indices\n",
        "        self.indices = [np.flatnonzero(female), np.flatnonzero(male)]\n",
        "\n",
        "    def __len__(self):\n",
        "        return len(self.data)\n",
        "\n",
        "    def __getitem__(self, idx):\n",
        "        return self.data[idx], self.labels[idx]\n",
        "\n",
        "# generate a random 70/30 training/testing split over however many rows were loaded\n",
        "n_total = len(X)\n",
        "n_train = (7 * n_total) // 10\n",
        "perm = np.random.permutation(n_total)\n",
        "train_rows, test_rows = perm[:n_train], perm[n_train:]\n",
        "# the labels are bound to `Y` by the loading cell (indexing `y` raised a NameError)\n",
        "X_test, y_test, data_test = X[test_rows], Y[test_rows], data[test_rows]\n",
        "X_train, y_train, data_train = X[train_rows], Y[train_rows], data[train_rows]\n",
        "trainset = GermanDataset(X_train, data_train, y_train)\n",
        "testset = GermanDataset(X_test, data_test, y_test)"
      ],
      "id": "sunset-afternoon",
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "error",
          "ename": "NameError",
          "evalue": "ignored",
          "traceback": [
            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
            "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
            "\u001b[0;32m<ipython-input-2-8c349f8b0c8e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     19\u001b[0m \u001b[0;31m#generate random 70/30 training testing split\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     20\u001b[0m \u001b[0mperm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpermutation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 21\u001b[0;31m \u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata_test\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mperm\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m700\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mperm\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m700\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mperm\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m700\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     22\u001b[0m \u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata_train\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mperm\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m700\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mperm\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m700\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mperm\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m700\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     23\u001b[0m \u001b[0mtrainset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mGermanDataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;31mNameError\u001b[0m: name 'y' is not defined"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "xebR7-a-J91k"
      },
      "source": [
        "### Oracle\n",
        "\n",
        "Define our sampling oracle, which can provide samples of (feature, label) pairs of a specified attribute, sampled uniformly at random with replacement over all training examples of that attribute.  \n"
      ],
      "id": "xebR7-a-J91k"
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xSnKd0BmMppD"
      },
      "source": [
        "class Oracle_german():\n",
        "    '''Sampling oracle over a dataset that exposes per-attribute index lists.\n",
        "\n",
        "    Calling the oracle returns a batch of (features, labels) drawn at random,\n",
        "    with replacement, from the rows of the requested attribute.  In naive mode\n",
        "    a single stream over the whole dataset is used and the requested attribute\n",
        "    is ignored.\n",
        "    '''\n",
        "\n",
        "    def __init__(self, dset, device=None, batch_size=50, naive=False):\n",
        "        self.device = device if device is not None else (\n",
        "            'cuda' if torch.cuda.is_available() else 'cpu')\n",
        "        self.batch_size = batch_size\n",
        "        self.naive = naive\n",
        "        self.att_indices = dset.indices\n",
        "\n",
        "        def endless_batches(source):\n",
        "            # random with-replacement sampler over `source`; num_samples is how\n",
        "            # many samples are drawn before the iterator ends\n",
        "            sampler = RandomSampler(source, replacement=True,\n",
        "                                    num_samples=sys.maxsize)\n",
        "            loader = DataLoader(source, batch_size=self.batch_size,\n",
        "                                sampler=sampler, shuffle=False, num_workers=0)\n",
        "            return iter(loader)\n",
        "\n",
        "        if self.naive:\n",
        "            # one stream over every row, regardless of attribute\n",
        "            self.attribute_iterators = [endless_batches(dset)]\n",
        "        else:\n",
        "            # one stream per attribute, restricted to that attribute's rows\n",
        "            self.attribute_iterators = [\n",
        "                endless_batches(Subset(dset, rows)) for rows in self.att_indices\n",
        "            ]\n",
        "\n",
        "    def __call__(self, att=0, batch_size=None, return_idx=False):\n",
        "        '''Draw a batch for attribute `att`, or return its row indices.\n",
        "\n",
        "        att        : attribute to sample from (forced to 0 in naive mode)\n",
        "        batch_size : number of samples to return; None returns one loader batch\n",
        "        return_idx : if True, return the stored index list instead of samples;\n",
        "                     these index the original, complete dataset\n",
        "        '''\n",
        "        stream_id = 0 if self.naive else att\n",
        "        if return_idx:\n",
        "            return self.att_indices[stream_id]\n",
        "\n",
        "        stream = self.attribute_iterators[stream_id]\n",
        "        first_x, first_y = next(stream)\n",
        "        if batch_size is None:\n",
        "            return first_x.to(self.device), first_y.to(self.device)\n",
        "\n",
        "        # pull further loader batches, then trim to the requested size; the\n",
        "        # surplus samples are discarded, which is harmless for with-replacement draws\n",
        "        xs, ys = [first_x], [first_y]\n",
        "        for _ in range(batch_size // self.batch_size):\n",
        "            more_x, more_y = next(stream)\n",
        "            xs.append(more_x)\n",
        "            ys.append(more_y)\n",
        "        X = torch.cat(xs)[:batch_size]\n",
        "        Y = torch.cat(ys)[:batch_size]\n",
        "        return X.to(self.device), Y.to(self.device)"
      ],
      "id": "xSnKd0BmMppD",
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "amKQdYpdKVZH"
      },
      "source": [
        "Runs the algorithm: on each round, fit a logistic regression model to the \n",
        "accumulated training set and then evaluate its performance on the \"validation\" set `features2`; use that performance and the specified scheme to select the next attribute to sample from and add to the training set.  Repeat for `TT` iterations, periodically recording test accuracy.  "
      ],
      "id": "amKQdYpdKVZH"
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "seven-privilege"
      },
      "source": [
        "def Experiment0(model, train_oracle, test_data, test_labels, features1, labels1, \n",
        "                features2, labels2, attributes, \n",
        "                num_test, n0, TT=500, rule='ucb0', param=None, \n",
        "                forced=True, verbose=True):\n",
        "    '''\n",
        "    Run one trial of the adaptive sampling loop.\n",
        "\n",
        "    On every round the model is refit on the accumulated training set and\n",
        "    scored per attribute on the validation sets; the selection rule then picks\n",
        "    the attribute to sample next.  One new sample is added to the training set\n",
        "    and one to the chosen attribute's validation set.\n",
        "\n",
        "    model        : object exposing fit(X, y) and predict(X)\n",
        "    train_oracle : callable(att=..., batch_size=...) returning (features, labels)\n",
        "    test_data    : per-attribute test features, indexed by attribute\n",
        "    test_labels  : per-attribute test labels, indexed by attribute\n",
        "    features1    : training features, grown by one row per round\n",
        "    labels1      : training labels, grown by one entry per round\n",
        "    features2    : list of per-attribute validation features (updated in place)\n",
        "    labels2      : list of per-attribute validation labels (updated in place)\n",
        "    attributes :list of ints, 0,1,... corresponding to the attribute classes\n",
        "    num_test     : test accuracy is recorded every TT//num_test rounds\n",
        "    n0           : initial sample count per attribute\n",
        "    TT           : the rounds run for t in range(m*n0, TT)\n",
        "    rule  :string   possible options are\n",
        "                  ucb0 -- standard UCB\n",
        "                  emp  -- empirical, i.e., greedy \n",
        "                  eps   -- epsilon greedy (with epsilon=param)\n",
        "                  unif -- cycle through the attributes in turn\n",
        "    param        : exploration parameter for ucb0 and eps (0.1 when None)\n",
        "    forced       : if truthy, an attribute whose count is below sqrt(t) is\n",
        "                   chosen ahead of the rule\n",
        "    verbose      : currently unused\n",
        "\n",
        "    Returns (total_acc, pi): total_acc has shape (m, n) with the test accuracy\n",
        "    of every attribute at every checkpoint; pi is the share of the sample\n",
        "    counts held by each attribute.\n",
        "    '''\n",
        "    # 2 for 'gender', 4 for 'both'\n",
        "    m = len(attributes)\n",
        "    # number of rounds between two test checkpoints\n",
        "    test_every = TT // num_test\n",
        "    n = (TT - m*n0) // test_every\n",
        "    k = 0\n",
        "    # track accuracy and mixture distribution for each attribute over time\n",
        "    total_acc = np.zeros((m, n))\n",
        "    Pi = n0 * np.ones(m)\n",
        "    if param is None:\n",
        "        param = 0.1\n",
        "    # start the main loop\n",
        "    for t in range(m*n0, TT):\n",
        "        # fit the model to the training set\n",
        "        model.fit(features1, labels1)\n",
        "\n",
        "        # compute accuracy per attribute on the second set\n",
        "        acc = np.zeros(m)\n",
        "        for att in range(m):\n",
        "            x, y = features2[att], labels2[att]\n",
        "            yhat = torch.tensor(model.predict(x))\n",
        "            correct = (yhat == y).sum()\n",
        "            acc[att] = correct / len(y)\n",
        "\n",
        "        # Point Selection Rule: the attribute with the lowest index value U wins\n",
        "        U = np.zeros(m)\n",
        "        if rule == 'ucb0':  # Optimistic Rule\n",
        "            U = acc - param/np.sqrt(Pi)\n",
        "        elif rule == 'emp':  # Empirical Rule\n",
        "            U = acc\n",
        "        elif rule == 'eps':\n",
        "            if np.random.random() < param:  # randomly choose one of the attributes\n",
        "                U[np.random.randint(0, m)] = -1\n",
        "            else:  # empirical sampling otherwise\n",
        "                U = acc\n",
        "\n",
        "        # choose the attribute with lower value of the index function U\n",
        "        # unless an attribute has fallen under the forcing threshold\n",
        "        threshold = np.sqrt(t)\n",
        "        if forced and any(Pi < threshold):\n",
        "            for att in attributes:\n",
        "                if Pi[att] < threshold:\n",
        "                    at = att\n",
        "                    break\n",
        "        else:\n",
        "            val = float('inf')\n",
        "            for i, u in enumerate(U):\n",
        "                if u < val:\n",
        "                    at = i\n",
        "                    val = u\n",
        "        if rule == 'unif':\n",
        "            # cycle over the m attributes; the former hard-coded `t % 4`\n",
        "            # indexed past the end of Pi whenever m < 4\n",
        "            at = t % m\n",
        "        Pi[at] += 1\n",
        "\n",
        "        # record accuracy of current model on a test set; when m*n0 is not\n",
        "        # aligned with test_every one checkpoint more than n can come up, so\n",
        "        # stop recording once total_acc is full\n",
        "        if t % test_every == test_every - 1 and k < n:\n",
        "            for j, att in enumerate(attributes):\n",
        "                total_acc[j, k] = (model.predict(test_data[att]) == test_labels[att]).mean()\n",
        "            k += 1\n",
        "\n",
        "        # draw two samples from the chosen attribute: the first joins the\n",
        "        # training set, the second joins that attribute's validation set\n",
        "        xt, yt = train_oracle(att=at, batch_size=2)\n",
        "        features1 = torch.cat((features1, xt[0:1]), dim=0)\n",
        "        labels1 = torch.cat((labels1, yt[0:1]), dim=0)\n",
        "        features2[at] = torch.cat((features2[at], xt[1:2]), dim=0)\n",
        "        labels2[at] = torch.cat((labels2[at], yt[1:2]), dim=0)\n",
        "\n",
        "    return total_acc, Pi/Pi.sum()"
      ],
      "id": "seven-privilege",
      "execution_count": 4,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "62GULKR6KXEk"
      },
      "source": [
        "Specify the parameters of the experiment here.  Options for `rule` (the sampling scheme) are `ucb0`, `emp` for empirical/greedy, `eps` for epsilon-greedy, `unif`, and `naive`, which is what we referred to as `Uncurated` in the paper.  `TT` specifies the number of training iterations to run and `param` specifies the exploration parameter for both UCB and epsilon-greedy.  The run returns the test accuracy over time for each trial in `acc`, and the final mixture distribution selected by the sampling scheme for each trial in `pi`.  \n",
        "\n",
        "The additional code in this instantiation of the algorithm is due to the very small set of data available here and the lack of provided training/test splits.  So on each trial we generate a new, random train/test split.  "
      ],
      "id": "62GULKR6KXEk"
    },
    {
      "cell_type": "code",
      "metadata": {
        "tags": [],
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "great-figure",
        "outputId": "8be638c7-d214-4308-9d2a-7730d5ac00df"
      },
      "source": [
        "# Run the experiment \n",
        "def runExperiment0(X, Y, data, num_test,\n",
        "    attributes,\n",
        "    n0=10,\n",
        "    rule='ucb0',\n",
        "    forced=True,\n",
        "    param=0.2,\n",
        "    n_trials=7,\n",
        "    TT=500,\n",
        "    verbose=False\n",
        "):\n",
        "    '''\n",
        "    Run `n_trials` independent trials of Experiment0, each on a fresh random\n",
        "    70/30 train/test split.\n",
        "\n",
        "    X          : raw (un-encoded) rows, used to group the rows by attribute\n",
        "    Y          : labels\n",
        "    data       : encoded feature rows, aligned with X and Y\n",
        "    num_test   : test accuracy is recorded every TT//num_test rounds\n",
        "    attributes : attribute classes, 0,1,...\n",
        "    n0         : initial training and validation samples drawn per attribute\n",
        "    rule, forced, param, TT, verbose : passed through to Experiment0;\n",
        "                 rule='naive' additionally makes the oracle sample from the\n",
        "                 whole training set instead of per attribute\n",
        "    n_trials   : number of trials\n",
        "\n",
        "    Returns (acc, pi) with shapes (1, n_trials, m, n) and (1, n_trials, m).\n",
        "    '''\n",
        "    start_time = time()\n",
        "    m = len(attributes)\n",
        "    n = (TT-m*n0) // (TT//num_test)\n",
        "    # derive the flag from `rule` rather than reading a notebook-level global\n",
        "    naive = (rule == 'naive')\n",
        "    # split sizes follow the number of rows actually loaded\n",
        "    n_total = len(X)\n",
        "    n_train = (7 * n_total) // 10\n",
        "    # accuracy and mixture dist for each attribute, over all trials, each TT//num_test epochs\n",
        "    acc = np.zeros((1,n_trials,m,n))\n",
        "    pi = np.zeros((1,n_trials,m))\n",
        "    for i in range(n_trials):\n",
        "        print('Starting trial {}/{} with rule = {}, forced = {}'.format(\n",
        "           i+1, n_trials, rule, forced))\n",
        "        # generate new random train/test split\n",
        "        perm = np.random.permutation(n_total)\n",
        "        train_rows, test_rows = perm[:n_train], perm[n_train:]\n",
        "        X_test, y_test, data_test = X[test_rows], Y[test_rows], data[test_rows]\n",
        "        X_train, y_train, data_train = X[train_rows], Y[train_rows], data[train_rows]\n",
        "        trainset = GermanDataset(X_train, data_train, y_train)\n",
        "        testset = GermanDataset(X_test, data_test, y_test)\n",
        "\n",
        "        # the sampled tensors are handed to a scikit-learn model, so keep them on the CPU\n",
        "        train_oracle = Oracle_german(trainset, device='cpu', naive=naive)\n",
        "        # per-attribute test sets, taken straight from the dataset's index lists\n",
        "        test_data = [data_test[testset.indices[j]] for j in range(m)]\n",
        "        test_labels = [y_test[testset.indices[j]] for j in range(m)]\n",
        "\n",
        "        # Initialize the model\n",
        "        model = LogReg(max_iter=5000)\n",
        "        # Get the initial training and validation sets (n0 samples each per attribute)\n",
        "        train_x, train_y = [], []\n",
        "        features2, labels2 = [], []\n",
        "        for att in range(m):\n",
        "            x, y = train_oracle(att=att, batch_size=n0)\n",
        "            train_x.append(x)\n",
        "            train_y.append(y)\n",
        "            x, y = train_oracle(att=att, batch_size=n0)\n",
        "            features2.append(x)\n",
        "            labels2.append(y)\n",
        "        features1 = torch.cat(train_x, dim=0)\n",
        "        labels1 = torch.cat(train_y, dim=0)\n",
        "\n",
        "        # Run the experiment to get one set of accuracies and pi vector\n",
        "        temp_acc, pi_temp = Experiment0(model=model, train_oracle=train_oracle, test_data=test_data, test_labels=test_labels,\n",
        "            features1=features1, labels1=labels1, features2=features2, labels2=labels2,\n",
        "            num_test=num_test, attributes=attributes, n0=n0,  TT=TT,\n",
        "            rule=rule, param=param, forced=forced, verbose=verbose\n",
        "        )\n",
        "        acc[0,i] = temp_acc\n",
        "        pi[0,i] = pi_temp\n",
        "        end_time=time()\n",
        "        print('Completed {} trials in {:.2f} seconds \\n \\n'.format(i+1, -start_time+end_time)) \n",
        "\n",
        "\n",
        "    return acc, pi\n",
        "  \n",
        "\n",
        "# experiment configuration\n",
        "TT = 700\n",
        "num_test = 70\n",
        "rule = 'ucb0'\n",
        "# True only for the uncurated baseline\n",
        "naive = rule == 'naive'\n",
        "forced = True\n",
        "# number of independent trials; this setting used to be ignored in favour of `n`\n",
        "trials = 10\n",
        "param = 0.1\n",
        "# `n` is kept as an alias of the trial count\n",
        "n = trials\n",
        "\n",
        "\n",
        "acc, pi = runExperiment0(X, Y, data, num_test=num_test, attributes=range(2), rule=rule, TT=TT, n_trials=trials, param=param, forced=forced)\n",
        "\n",
        "\n",
        "\n"
      ],
      "id": "great-figure",
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Starting trial 1/10 with rule = ucb0, forced = True\n",
            "Completed 1 trials in 13.93 seconds \n",
            " \n",
            "\n",
            "Starting trial 2/10 with rule = ucb0, forced = True\n",
            "Completed 2 trials in 28.19 seconds \n",
            " \n",
            "\n",
            "Starting trial 3/10 with rule = ucb0, forced = True\n",
            "Completed 3 trials in 41.22 seconds \n",
            " \n",
            "\n",
            "Starting trial 4/10 with rule = ucb0, forced = True\n",
            "Completed 4 trials in 54.52 seconds \n",
            " \n",
            "\n",
            "Starting trial 5/10 with rule = ucb0, forced = True\n",
            "Completed 5 trials in 69.05 seconds \n",
            " \n",
            "\n",
            "Starting trial 6/10 with rule = ucb0, forced = True\n",
            "Completed 6 trials in 82.74 seconds \n",
            " \n",
            "\n",
            "Starting trial 7/10 with rule = ucb0, forced = True\n",
            "Completed 7 trials in 95.97 seconds \n",
            " \n",
            "\n",
            "Starting trial 8/10 with rule = ucb0, forced = True\n",
            "Completed 8 trials in 109.67 seconds \n",
            " \n",
            "\n",
            "Starting trial 9/10 with rule = ucb0, forced = True\n",
            "Completed 9 trials in 123.34 seconds \n",
            " \n",
            "\n",
            "Starting trial 10/10 with rule = ucb0, forced = True\n",
            "Completed 10 trials in 137.59 seconds \n",
            " \n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
}