{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 71
    },
    "colab_type": "code",
    "id": "P7V1GOVFKmNu",
    "outputId": "95de6ccf-1a19-4c40-b198-6207233a144d"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.\n",
      "  import pandas.util.testing as tm\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "from numpy import genfromtxt\n",
    "from sklearn import svm,preprocessing\n",
    "from sklearn.metrics import accuracy_score\n",
    "import random\n",
    "import matplotlib.pyplot as plt\n",
    "import xgboost as xgb\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn import svm\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "import statsmodels.api as sm\n",
    "import numpy.linalg as la\n",
    "import scipy.io as sio\n",
    "import pickle\n",
    "from cvxopt import matrix, solvers\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "UeVYYRMXdoML"
   },
   "outputs": [],
   "source": [
    "def preprocess(W):\n",
    "    u = np.min(W,axis = 0)\n",
    "    v = np.max(W,axis = 0)\n",
    "    l = W.shape[0]\n",
    "    t = W.shape[1]\n",
    "    for i in range(l):\n",
    "        for j in range(t):\n",
    "            W[i,j] = W[i,j] - u[j]\n",
    "            W[i,j]/=(v[j]-u[j])\n",
    "    return W"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "orVUL-X2RI-X"
   },
   "source": [
    "**Code for datasets:**\n",
    "\n",
    "The next two sections generate the `datasets.p` (pickle) and `datasets.mat` (MATLAB) files used for training and testing. You do not need to run them again if those files are already available."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 34
    },
    "colab_type": "code",
    "id": "o9XGDWirRH59",
    "outputId": "7cc7cab5-01fa-46b4-f2d3-60e4018697ed"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1372, 5)\n"
     ]
    }
   ],
   "source": [
    "def _load_scaled(path):\n",
    "    \"\"\"Read a whitespace-separated numeric text file and min-max scale it.\n",
    "\n",
    "    Every line of the file becomes one row of floats; the resulting array\n",
    "    is passed through preprocess() and returned.\n",
    "    \"\"\"\n",
    "    # The context manager closes the handle even if parsing a line fails.\n",
    "    with open(path, 'r') as file:\n",
    "        rows = [[float(x) for x in line.split()] for line in file]\n",
    "    return preprocess(np.array(rows))\n",
    "\n",
    "\n",
    "# List of (name, scaled array) pairs.\n",
    "datasets = []\n",
    "\n",
    "# Ringnorm\n",
    "ringnorm_data = _load_scaled('/content/ringnorm.data')\n",
    "print(ringnorm_data)\n",
    "datasets.append(('ringnorm', ringnorm_data))\n",
    "\n",
    "# twonorm\n",
    "twonorm_data = _load_scaled('/content/twonorm.data')\n",
    "datasets.append(('twonorm', twonorm_data))\n",
    "\n",
    "# Dictionary holding the scaled arrays under fixed keys.\n",
    "mat_file = dict()\n",
    "mat_file['ringnorm_data'] = ringnorm_data\n",
    "mat_file['twonorm_data'] = twonorm_data\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "HLcIk7ibonj3"
   },
   "source": [
    "\n",
    "\n",
    "```\n",
    "# This is formatted as code\n",
    "```\n",
    "\n",
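    "Generate biased train/test splits using the Sugiyama sampling technique: for each trial a random feature is chosen, test points are accepted with probability $\\min(1, 4x^2)$ on that (scaled) feature, and the training points are then drawn uniformly from the indices that were not yet visited."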
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "colab_type": "code",
    "id": "yPlUBFFDEd4b",
    "outputId": "1210f8e7-aa7b-49e8-fa9d-09cf22682289"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "trial 0\n",
      "258\n",
      "trial 1\n",
      "234\n",
      "trial 2\n",
      "416\n",
      "trial 3\n",
      "263\n",
      "trial 4\n",
      "242\n",
      "trial 5\n",
      "536\n",
      "trial 6\n",
      "250\n",
      "trial 7\n",
      "230\n",
      "trial 8\n",
      "233\n",
      "trial 9\n",
      "245\n",
      "trial 10\n",
      "499\n",
      "trial 11\n",
      "253\n",
      "trial 12\n",
      "226\n",
      "trial 13\n",
      "221\n",
      "trial 14\n",
      "226\n",
      "trial 15\n",
      "556\n",
      "trial 16\n",
      "215\n",
      "trial 17\n",
      "465\n",
      "trial 18\n",
      "244\n",
      "trial 19\n",
      "229\n",
      "trial 20\n",
      "229\n",
      "trial 21\n",
      "229\n",
      "trial 22\n",
      "239\n",
      "trial 23\n",
      "484\n",
      "trial 24\n",
      "247\n",
      "trial 25\n",
      "230\n",
      "trial 26\n",
      "527\n",
      "trial 27\n",
      "455\n",
      "trial 28\n",
      "454\n",
      "trial 29\n",
      "495\n",
      "trial 30\n",
      "521\n",
      "trial 31\n",
      "230\n",
      "trial 32\n",
      "239\n",
      "trial 33\n",
      "531\n",
      "trial 34\n",
      "448\n",
      "trial 35\n",
      "553\n",
      "trial 36\n",
      "431\n",
      "trial 37\n",
      "246\n",
      "trial 38\n",
      "510\n",
      "trial 39\n",
      "560\n",
      "trial 40\n",
      "219\n",
      "trial 41\n",
      "233\n",
      "trial 42\n",
      "226\n",
      "trial 43\n",
      "495\n",
      "trial 44\n",
      "424\n",
      "trial 45\n",
      "220\n",
      "trial 46\n",
      "255\n",
      "trial 47\n",
      "249\n",
      "trial 48\n",
      "219\n",
      "trial 49\n",
      "598\n",
      "trial 50\n",
      "441\n",
      "trial 51\n",
      "235\n",
      "trial 52\n",
      "255\n",
      "trial 53\n",
      "530\n",
      "trial 54\n",
      "533\n",
      "trial 55\n",
      "587\n",
      "trial 56\n",
      "474\n",
      "trial 57\n",
      "228\n",
      "trial 58\n",
      "214\n",
      "trial 59\n",
      "242\n",
      "trial 60\n",
      "240\n",
      "trial 61\n",
      "238\n",
      "trial 62\n",
      "258\n",
      "trial 63\n",
      "462\n",
      "trial 64\n",
      "219\n",
      "trial 65\n",
      "242\n",
      "trial 66\n",
      "241\n",
      "trial 67\n",
      "456\n",
      "trial 68\n",
      "255\n",
      "trial 69\n",
      "447\n",
      "trial 70\n",
      "478\n",
      "trial 71\n",
      "226\n",
      "trial 72\n",
      "428\n",
      "trial 73\n",
      "227\n",
      "trial 74\n",
      "461\n",
      "trial 75\n",
      "240\n",
      "trial 76\n",
      "217\n",
      "trial 77\n",
      "233\n",
      "trial 78\n",
      "224\n",
      "trial 79\n",
      "459\n",
      "trial 80\n",
      "267\n",
      "trial 81\n",
      "229\n",
      "trial 82\n",
      "441\n",
      "trial 83\n",
      "451\n",
      "trial 84\n",
      "220\n",
      "trial 85\n",
      "228\n",
      "trial 86\n",
      "246\n",
      "trial 87\n",
      "231\n",
      "trial 88\n",
      "404\n",
      "trial 89\n",
      "429\n",
      "trial 90\n",
      "541\n",
      "trial 91\n",
      "253\n",
      "trial 92\n",
      "455\n",
      "trial 93\n",
      "438\n",
      "trial 94\n",
      "217\n",
      "trial 95\n",
      "241\n",
      "trial 96\n",
      "220\n",
      "trial 97\n",
      "254\n",
      "trial 98\n",
      "220\n",
      "trial 99\n",
      "406\n"
     ]
    }
   ],
   "source": [
    "trials = 100 # trials per dataset\n",
    "training_points = 100 # size of training dataset\n",
    "testing_points = 500 # size of testing dataset\n",
    "\n",
    "# NOTE(review): neither `random` nor `np.random` is seeded in this cell, so\n",
    "# the generated splits differ from run to run.\n",
    "for name, data in datasets:\n",
    "  trial = 0\n",
    "  training_sets = []\n",
    "  testing_sets = []\n",
    "  F = data.shape[1] # number of columns\n",
    "  G = data.shape[0] # number of rows\n",
    "\n",
    "  while trial < trials:\n",
    "    print(\"trial {}\".format(trial))\n",
    "    # NOTE(review): c ranges over every column of `data`, including any\n",
    "    # label column the file may contain - confirm this is intended.\n",
    "    c = random.randint(0,F-1) #choosing a random feature index for a trial\n",
    "    indices = np.random.permutation(G) #shuffling the indices\n",
    "    test_inds = []\n",
    "    i = 0\n",
    "    # Walk the shuffled rows and accept each one into the test set with\n",
    "    # probability min(1, 4*x^2), where x is the row's value in column c.\n",
    "    while len(test_inds) < testing_points:\n",
    "      if i >= G:\n",
    "        # Previously surfaced as a bare out-of-bounds IndexError.\n",
    "        raise IndexError(\n",
    "            'dataset {!r}: only {} of {} test points accepted after '\n",
    "            'visiting all {} rows (feature {})'.format(\n",
    "                name, len(test_inds), testing_points, G, c))\n",
    "      u = indices[i]\n",
    "      x = data[u,c]\n",
    "      prob = min(1,4*x*x)\n",
    "      v = np.random.random()\n",
    "      if v < prob:\n",
    "        test_inds.append(u)\n",
    "      i+=1\n",
    "    test_inds = np.array(test_inds)\n",
    "    print(i)\n",
    "    # Training points come only from rows the acceptance loop never visited,\n",
    "    # so they cannot overlap the test set.\n",
    "    remaining = indices[i:]\n",
    "    if len(remaining) < training_points:\n",
    "      raise ValueError(\n",
    "          'dataset {!r}: only {} unvisited rows left, need {} training '\n",
    "          'points'.format(name, len(remaining), training_points))\n",
    "    train_inds = np.random.choice(remaining, training_points, replace = False)\n",
    "\n",
    "    trial += 1\n",
    "    training_sets.append(train_inds)\n",
    "    testing_sets.append(test_inds)\n",
    "  # One row of indices per trial.\n",
    "  train_inds = np.array(training_sets)\n",
    "  test_inds = np.array(testing_sets)\n",
    "  mat_file['train_inds_{}'.format(name)] = train_inds\n",
    "  mat_file['test_inds_{}'.format(name)] = test_inds\n",
    "sio.savemat('datasets.mat', mat_file)\n",
    "# Close the pickle file explicitly so its contents are flushed to disk.\n",
    "with open('datasets.p', 'wb') as dsets:\n",
    "  pickle.dump(mat_file, dsets)\n"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "Dataset_Generator_Experiment_3",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
