{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "CovariateShiftExperiements",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "id": "P7V1GOVFKmNu",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import numpy as np\n",
        "from numpy import genfromtxt\n",
        "from sklearn import svm\n",
        "from sklearn.metrics import accuracy_score\n",
        "import random\n",
        "import matplotlib.pyplot as plt\n",
        "import xgboost as xgb\n",
        "from xgboost import XGBClassifier\n",
        "from sklearn.tree import DecisionTreeClassifier\n",
        "from sklearn import svm\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "import statsmodels.api as sm\n",
        "import numpy.linalg as la\n",
        "import scipy.io as sio\n",
        "import pickle\n",
        "from cvxopt import matrix, solvers\n",
        "from sklearn.decomposition import PCA\n",
        "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Mz7JVAEghPjJ",
        "colab_type": "text"
      },
      "source": [
        "We implement all of the methods using a V-matrix framework. Note that for all previous methods, the V-matrix is diagonal. This is because all previous methods assign an importance or weight to each point, whose lose is signified by the diagonal. Our contribution is to use Vapnik's framework, which utilizes the entire matrix. Kleip is run on the original Matlab code and the weights are imported."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "6iaJVt_LXXDG",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def V_matrix_Huang(T,X,n_t):\n",
        "  l = X.shape[0]\n",
        "  m = T.shape[0]\n",
        "  V = np.identity(l)\n",
        "  P = matrix(gram_matrix(X))\n",
        "  K = Kernel.gaussian(.1)\n",
        "  sum_T = [0 for i in range(l)]\n",
        "  for i in range(l):\n",
        "    for t in range(m):\n",
        "      sum_T[i]+=K(T[t],X[i])\n",
        "    sum_T[i] = -1*l/m*sum_T[i]  \n",
        "  q = matrix(sum_T)\n",
        "  G = [[0]*l for i in range(2*l+2)]\n",
        "  for i in range(l):\n",
        "    G[i][i] = 1\n",
        "    G[i+1][i] = -1\n",
        "  for j in range(l):\n",
        "    G[2*l][j] = 1\n",
        "    G[2*l+1][j] = -1\n",
        " \n",
        "  G = matrix(np.array(G),tc ='d')\n",
        "  B = 1000\n",
        "  eps = 1-1/np.sqrt(l)\n",
        "  h = [0 for i in range(2*l+2)]\n",
        "  for i in range(l):\n",
        "    h[i] = B\n",
        "  h[2*l] = l+l*eps\n",
        "  h[2*l+1] = l-l*eps   \n",
        "  h = matrix(h)\n",
        "  solvers.options['show_progress'] = False\n",
        "  sol = solvers.qp(P,q,G,h)\n",
        "  solution = np.array(sol['x'])\n",
        "  diagonal = [solution[i][0] for i in range(l)]\n",
        "  V = np.diag(diagonal)\n",
        "  return V \n",
        "\n",
        "def V_matrix_new(T,X,n_t):\n",
        "  l = X.shape[0]\n",
        "  V = np.identity(l) # our V\n",
        "  for i in range(l):\n",
        "    for j in range(l):\n",
        "      V[i,j] = 0\n",
        "      for t in T:\n",
        "        u = 1\n",
        "        for d in range(T.shape[1]):\n",
        "          if t[d] < np.maximum(X[i,d],X[j,d]):\n",
        "            u = 0\n",
        "            break\n",
        "        V[i,j]+=u    \n",
        "      V[i,j] = V[i,j]/n_t \n",
        "  #P,Q,R = la.svd(V)\n",
        "  #Q_hat = np.diag([Q[i] + .01 for i in range(l)])\n",
        "  #V = np.matmul(np.matmul(P,Q_hat),R) #Regularizing V in high dimensions     \n",
        "  return V\n",
        "\n",
        "def V_matrix_LDA(T, X, n_t, lda):\n",
        "  X_r = lda.transform(X)\n",
        "  T_r = lda.transform(T)\n",
        "\n",
        "  l = X.shape[0]\n",
        "  V = np.identity(l) # our V\n",
        "  for i in range(l):\n",
        "    for j in range(l):\n",
        "      V[i,j] = 0\n",
        "      for t in T_r:\n",
        "        u = 1\n",
        "        for d in range(T_r.shape[1]):\n",
        "          if t[d] < np.maximum(X_r[i,d],X_r[j,d]):\n",
        "            u = 0\n",
        "            break\n",
        "        V[i,j]+=u    \n",
        "      V[i,j] = V[i,j]/n_t \n",
        "  #P,Q,R = la.svd(V)\n",
        "  #Q_hat = np.diag([Q[i] + .01 for i in range(l)])\n",
        "  #V = np.matmul(np.matmul(P,Q_hat),R) #Regularizing V in high dimensions     \n",
        "  return V\n",
        "\n",
        "\n",
        "\n",
        "def V_matrix_add(T,X,n_t):\n",
        "  l = X.shape[0]\n",
        "  V = np.identity(l) # our V\n",
        "  for i in range(l):\n",
        "    for j in range(l):\n",
        "      V[i,j] = 0\n",
        "      for t in T:\n",
        "        u = 0\n",
        "        for d in range(T.shape[1]):\n",
        "          if t[d] >= np.maximum(X[i,d],X[j,d]):\n",
        "            u += 1\n",
        "        V[i,j]+=u    \n",
        "      V[i,j] = V[i,j]/n_t \n",
        "  #P,Q,R = la.svd(V)\n",
        "  #Q_hat = np.diag([Q[i] for i in range(l)])\n",
        "  #V = np.matmul(np.matmul(P,Q_hat),R) #Regularizing V in high dimensions     \n",
        "  return V\n",
        "\n",
        "def Vapnik(X):\n",
        "  l = X.shape[0]\n",
        "  V = np.array([[1]*l for i in range(l)]) \n",
        "  for i in range(l):\n",
        "    for j in range(l):\n",
        "      for d in range(X.shape[1]):\n",
        "        V[i,j]*= 10-np.maximum(X[i,d],X[j,d])\n",
        "  P,Q,R = la.svd(V)\n",
        "  #Q_hat = np.diag([Q[i]+0.001 for i in range(l)])\n",
        "  #V = np.matmul(np.matmul(P,Q_hat),R) #Regularizing V in high dimensions \n",
        "  return V\n",
        "\n",
        "def unif_kernel(x,X,h):\n",
        "  n = X.shape[0]\n",
        "  val = 0\n",
        "  for i in range(n):\n",
        "    if la.norm((x-X[i])/h,1) <=1:\n",
        "      val+= 0.5  \n",
        "  q = val/(n*h)\n",
        "  return q\n",
        "\n",
        "def gauss_kernel(x,X,h):\n",
        "  n = X.shape[0]\n",
        "  val = 0\n",
        "  for i in range(n):\n",
        "    val+=1/(np.sqrt(2*np.pi))*(np.exp(-la.norm((x-X[i])/h)**2/2))\n",
        "  q = val/(n*h)\n",
        "  return q\n",
        "\n",
        "def V_matrix_Shimodaira(T,X,n_t,h):\n",
        "  l = X.shape[0]\n",
        "  V = np.identity(l)\n",
        "  for i in range(l):\n",
        "    l_1 = gauss_kernel(X[i],T,h)\n",
        "    l_0 = gauss_kernel(X[i],X,h)\n",
        "    V[i,i] = l_1/l_0\n",
        "  return V\n",
        "\n",
        "def V_matrix_Sugiyama(T,X,n_t,h,tau):\n",
        "  l = X.shape[0]\n",
        "  V = np.identity(l)\n",
        "  for i in range(l):\n",
        "    l_1 = gauss_kernel(X[i],T,h)\n",
        "    l_0 = gauss_kernel(X[i],X,h)\n",
        "    V[i,i] = (l_1/l_0)**tau\n",
        "  return V\n",
        "\n",
        "def V_matrix_Makoto(T,X,n_t,h,a):\n",
        "  l = X.shape[0]\n",
        "  V = np.identity(l)\n",
        "  for i in range(l):\n",
        "    l_1 = gauss_kernel(X[i],T,h)\n",
        "    l_0 = gauss_kernel(X[i],X,h)\n",
        "    V[i,i] = l_1/(a*l_0+(1-a)*l_1)\n",
        "  return V  \n",
        "\n",
        "def test_methods_review(T,X,Y,Y_test,V):\n",
        "  U = gram_matrix(X)# get the gram matrix for the given training set X\n",
        "  g = 0.1  \n",
        "  A, c = A_estimate(V,U,Y,g)\n",
        "  p = []\n",
        "  p_exp = Y_test\n",
        "  for t in range(T.shape[0]):\n",
        "    U_l = style_K(X,T[t]) #U_l = # kernel for the test set\n",
        "    p.append(p_estimate(A,c,U_l))\n",
        "  p = np.array(p)\n",
        "  e = la.norm(p_exp-p)**2/T.shape[0]\n",
        "  return e\n",
        "\n",
        "\n",
        "#Estimate of A and c\n",
        "def A_estimate(V,U,Y,g):\n",
        "  l = V.shape[0]\n",
        "  I = np.identity(l)\n",
        "  O = np.ones((l,1))\n",
        "  a_1 = np.matmul(V,U)\n",
        "  a_2 = np.matmul(V,Y.reshape(l,1))\n",
        "  a_3 = np.matmul(V,O)\n",
        "  W = np.linalg.inv(a_1+g*I) # g:regularization constant\n",
        "  A_b = np.matmul(W,a_2).reshape(l,1) # from equation 14\n",
        "  A_c = np.matmul(W,a_3).reshape(l,1) # from equation 14\n",
        "  c_1 = np.matmul(O.transpose(),np.matmul(a_1,A_b)-a_2) # numerator in equation 14\n",
        "  c_2 = np.matmul(O.transpose(),np.matmul(a_1,A_c)-a_3)# denominator in equation 14\n",
        "  c = c_1/c_2 # from equation 14\n",
        "  A = A_b - c[0,0]*A_c\n",
        "  return A,c\n",
        "\n",
        "#Conditional probability estimate for a given x\n",
        "def p_estimate(A,c,U_l):\n",
        "  p = np.matmul(A.transpose(),U_l)+c\n",
        "  if p[0,0] > 0.5:\n",
        "    return 1\n",
        "  else:\n",
        "    return 0  \n",
        "\n",
        "def gram_matrix(X):\n",
        "   l = X.shape[0]\n",
        "   U = np.zeros((l,l))\n",
        "   K = Kernel.gaussian(.1)         \n",
        "   for i in range(l):\n",
        "     for j in range(l):\n",
        "       U[i,j] = K(X[i],X[j])\n",
        "   return U\n",
        "\n",
        "def style_K(X,t):\n",
        "   l = X.shape[0]\n",
        "   U_l = np.zeros((l,1))\n",
        "   K = Kernel.gaussian(.1)\n",
        "   for i in range(l):\n",
        "     U_l[i] = K(t,X[i])    \n",
        "   return U_l  \n",
        "\n",
        "class Kernel(object):\n",
        "    \"\"\"Implements list of kernels from\n",
        "    http://en.wikipedia.org/wiki/Support_vector_machine\n",
        "    \"\"\"\n",
        "    @staticmethod\n",
        "    def linear():\n",
        "        def f(x, y):\n",
        "            return np.inner(x, y)\n",
        "        return f\n",
        "\n",
        "    @staticmethod\n",
        "    def gaussian(sigma):\n",
        "        def f(x, y):\n",
        "            #exponent = -np.sqrt(sigma*la.norm(x-y)**2)\n",
        "            exponent = -sigma*la.norm(x-y)**2\n",
        "            return np.exp(exponent)\n",
        "        return f\n",
        "\n",
        "    @staticmethod\n",
        "    def _polykernel(dimension, offset):\n",
        "        def f(x, y):\n",
        "            return (offset + np.dot(x, y)) ** dimension\n",
        "        return f\n",
        "\n",
        "    @staticmethod\n",
        "    def inhomogenous_polynomial(dimension):\n",
        "        return Kernel._polykernel(dimension=dimension, offset=1.0)\n",
        "\n",
        "    @staticmethod\n",
        "    def homogenous_polynomial(dimension):\n",
        "        return Kernel._polykernel(dimension=dimension, offset=0.0)\n",
        "\n",
        "    @staticmethod\n",
        "    def hyperbolic_tangent(kappa, c):\n",
        "        def f(x, y):\n",
        "            return np.tanh(kappa * np.dot(x, y) + c)\n",
        "        return f\n",
        "\n",
        "def instance(X,T,Y,Y_test, kliep_weights, ulsif_weights):\n",
        "  l = X.shape[0]\n",
        "  n_t = T.shape[0]\n",
        "  h = 2.0\n",
        "  tau = 0.5\n",
        "  alpha = 0.5 \n",
        "  \n",
        "  #kpca = KernelPCA(n_components=9, kernel=\"rbf\", fit_inverse_transform=True, gamma=10)\n",
        "  #kpca.fit(T_sub)\n",
        "\n",
        "  V_1 = np.identity(l)                      # Unweighted\n",
        "  #V_2 = Vapnik(X)                           # Vapnik\n",
        "  V_3 = V_matrix_new(T,X,n_t)            # Proposed\n",
        "  V_4 = V_matrix_add(T, X, n_t)   # Additive\n",
        "  #V_4 = V_matrix_Shimodaira(T,X,n_t,h)  # Shimodaira \n",
        "  #V_5 = V_matrix_Sugiyama(T,X,n_t,h,tau)# Sugiyama\n",
        "  V_6 = V_matrix_Makoto(T,X,n_t,h,alpha)# Makoto\n",
        "  V_7 = V_matrix_Huang(T,X,n_t)         # Huang\n",
        "  V_8 = np.diag(kliep_weights)              # KLIEP\n",
        "  V_9 = np.diag(ulsif_weights)              # ULSIF\n",
        "\n",
        "  error_1 = test_methods_review(T,X,Y,Y_test,V_1)\n",
        "  print(\"Error Unweighted: {}\".format(error_1))\n",
        "  #error_2 = test_methods_review(T,X,Y,Y_test,V_2)\n",
        "  error_3 = test_methods_review(T,X,Y,Y_test,V_3)\n",
        "  print(\"Error Proposed: {}\".format(error_3))\n",
        "  error_4 = test_methods_review(T,X,Y,Y_test,V_4)\n",
        "  print(\"Error Proposed Additive: {}\".format(error_4))\n",
        "  #error_5 = test_methods_review(T,X,Y,Y_test,V_5)\n",
        "  error_6 = test_methods_review(T,X,Y,Y_test,V_6)\n",
        "  print(\"Error Makoto: {}\".format(error_6))\n",
        "  error_7 = test_methods_review(T,X,Y,Y_test,V_7)\n",
        "  print(\"Error Huang: {}\".format(error_7))\n",
        "  error_8 = test_methods_review(T,X,Y,Y_test,V_8)\n",
        "  print(\"Error KLIEP: {}\".format(error_8))\n",
        "  error_9 = test_methods_review(T,X,Y,Y_test,V_9)\n",
        "  print(\"Error ULSIF: {}\".format(error_9))\n",
        "  #error_10 = test_methods_review(T,X,Y,Y_test,V_10)\n",
        "  #print(\"Error KPCA: {}\".format(error_10))\n",
        "  return [error_1, error_3, error_4, error_6, error_7, error_8, error_9]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "hXPPk33GOcwH",
        "colab_type": "text"
      },
      "source": [
        "Load previously generated datasets:"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "R5DYY1rSw57E",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "\n",
        "# For Norm-Biased Experiment\n",
        "dsets_read = open('multi_datasets.p', 'rb')\n",
        "dataset_info = pickle.load(dsets_read)\n",
        "matlab = sio.loadmat('matlab_multifeature.mat')\n",
        "\n",
        "#For Single-Feature Bias Experiment\n",
        "#dsets_read = open('single_datasets.p', 'rb')\n",
        "#dataset_info = pickle.load(dsets_read)\n",
        "#matlab = sio.loadmat('matlab_single.mat')\n",
        "\n",
        "# Set dataset equal to cancer, twonorm, ringnorm, diabetes, banknote\n",
        "dataset = 'ringnorm'\n",
        "dat = dataset_info['{}'.format(dataset)]\n",
        "tr_ind = dataset_info['train_inds_{}'.format(dataset)]\n",
        "te_ind = dataset_info['test_inds_{}'.format(dataset)]\n",
        "kliep_Vs = matlab['kliep_{}'.format(dataset)]\n",
        "ulsif_Vs = matlab['ulsif_{}'.format(dataset)]\n",
        "\n",
        "datasets = [(dataset, dat, tr_ind, te_ind, kliep_Vs, ulsif_Vs)]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "BpIuJzqE5qzT",
        "colab_type": "code",
        "outputId": "ffcf3a4f-d303-4dfc-9afc-7b8f97396c65",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 646
        }
      },
      "source": [
        "def table_format(avg_err, std_err):\n",
        "  out = \"$\"\n",
        "  for avg, std in zip(avg_err, std_err):\n",
        "    out += \"{:.3f}({:.3f})$ & $\".format(avg, std)\n",
        "  out += \"$\"\n",
        "  return out\n",
        "\n",
        "\n",
        "for name, data, train_inds, test_inds, kliep, ulsif in datasets:\n",
        "  errs = []\n",
        "  trial = 0\n",
        "  training_sets = []\n",
        "  for trial in range(1,len(train_inds)):\n",
        "    tr_inds = train_inds[trial]\n",
        "    te_inds = test_inds[trial]\n",
        "    train = data[tr_inds, :]\n",
        "    test = data[te_inds, :]\n",
        "    \n",
        "    X_train = train[:, :-1]\n",
        "    y_train = train[:, -1]\n",
        "\n",
        "    X_test = test[:, :-1]\n",
        "    y_test = test[:, -1]\n",
        "\n",
        "    errors = instance(X_train, X_test, y_train, y_test, kliep[trial], ulsif[trial])\n",
        "    errors = errors / errors[0]\n",
        "    print(errors)\n",
        "    errs.append(errors)\n",
        "    avg_errors = np.mean(errs, axis=0)\n",
        "    std_errors = np.std(errs, axis = 0)\n",
        "    print(\"CUMULATIVE {}\\n{}\\n{}\".format(trial, name, table_format(avg_errors, std_errors)))"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Error Unweighted: 0.23799999999999996\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:181: RuntimeWarning: invalid value encountered in true_divide\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "stream",
          "text": [
            "Error Proposed: 0.512\n",
            "Error Proposed Additive: 0.22400000000000003\n",
            "Error Makoto: 0.23799999999999996\n",
            "Error Huang: 0.316\n",
            "Error KLIEP: 0.242\n",
            "Error ULSIF: 0.23799999999999996\n",
            "[1.         2.1512605  0.94117647 1.         1.32773109 1.01680672\n",
            " 1.        ]\n",
            "CUMULATIVE 1\n",
            "ringnorm\n",
            "$1.000(0.000)$ & $2.151(0.000)$ & $0.941(0.000)$ & $1.000(0.000)$ & $1.328(0.000)$ & $1.017(0.000)$ & $1.000(0.000)$ & $$\n",
            "Error Unweighted: 0.196\n",
            "Error Proposed: 0.48799999999999993\n",
            "Error Proposed Additive: 0.15200000000000002\n",
            "Error Makoto: 0.196\n",
            "Error Huang: 0.22199999999999998\n",
            "Error KLIEP: 0.182\n",
            "Error ULSIF: 0.182\n",
            "[1.         2.48979592 0.7755102  1.         1.13265306 0.92857143\n",
            " 0.92857143]\n",
            "CUMULATIVE 2\n",
            "ringnorm\n",
            "$1.000(0.000)$ & $2.321(0.169)$ & $0.858(0.083)$ & $1.000(0.000)$ & $1.230(0.098)$ & $0.973(0.044)$ & $0.964(0.036)$ & $$\n",
            "Error Unweighted: 0.166\n",
            "Error Proposed: 0.474\n",
            "Error Proposed Additive: 0.176\n",
            "Error Makoto: 0.166\n",
            "Error Huang: 0.25399999999999995\n",
            "Error KLIEP: 0.166\n",
            "Error ULSIF: 0.172\n",
            "[1.         2.85542169 1.06024096 1.         1.53012048 1.\n",
            " 1.03614458]\n",
            "CUMULATIVE 3\n",
            "ringnorm\n",
            "$1.000(0.000)$ & $2.499(0.288)$ & $0.926(0.117)$ & $1.000(0.000)$ & $1.330(0.162)$ & $0.982(0.038)$ & $0.988(0.045)$ & $$\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ZPWnt0zad_JB",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        ""
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}