{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "yAq6aHVh5oCH"
   },
   "source": [
    "Licensed under the Apache License, Version 2.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "VpEpTFph2ysp"
   },
   "outputs": [],
   "source": [
    "from __future__ import division\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import json\n",
    "import os,sys\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.model_selection import train_test_split\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "F-u1ecNmMiX3"
   },
   "source": [
    "## Overview\n",
    "### Pre-process Law School Admissions Council Dataset (LSAC) \n",
    "\n",
    "Download the Law School dataset from: (http://www.seaphe.org/databases.php), convert SAS file to CSV, and save it in the `./fairness_without_demographics/data/law_school` folder.\n",
    "\n",
    "Input: ./fairness_without_demographics/data/law_school/lsac.csv\n",
    "\n",
    "Outputs: train.csv, test.csv, mean_std.json, vocabulary.json, IPS_exampleweights_with_label.json, IPS_exampleweights_without_label.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "oyFyRbFk7zox"
   },
   "outputs": [],
   "source": [
    "pd.options.display.float_format = '{:,.2f}'.format\n",
    "dataset_base_dir = './fairness_without_demographics/data/law_school/'\n",
    "dataset_file_name = 'lsac.csv'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "PgWxzZeyKog3"
   },
   "source": [
    "### Processing original dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "chhYs8357xyU"
   },
   "outputs": [],
   "source": [
    "file_path = os.path.join(dataset_base_dir,dataset_file_name)\n",
    "with open(file_path, \"r\") as file_name:\n",
    "  temp_df = pd.read_csv(file_name)\n",
    "\n",
    "# Columns of interest  \n",
    "df = temp_df[['zfygpa','zgpa','DOB_yr','parttime','gender','race','tier','fam_inc','lsat','ugpa','pass_bar','index6040']].copy()\n",
    "renameColumns={'gender':'sex',\n",
    "               'index6040':'weighted_lsat_ugpa',\n",
    "               'fam_inc':'family_income',\n",
    "               'tier':'cluster_tier',\n",
    "               'parttime':'isPartTime'}\n",
    "target_variable = 'pass_bar'\n",
    "target_value = 'Passed'\n",
    "\n",
    "# Renaming columns\n",
    "df = df.rename(columns = renameColumns)\n",
    "columns = renameColumns.values()\n",
    "\n",
    "# NaN in 'pass_bar' refer to dropouts. Considering NaN as failing the bar.\n",
    "df['pass_bar'] = df['pass_bar'].fillna(value=0.0)\n",
    "df['pass_bar'] = df.apply(lambda x: 'Passed' if x['pass_bar']==1.0 else 'Failed_or_not_attempted', axis=1).astype('category')\n",
    "\n",
    "df['zfygpa'] = df['zfygpa'].fillna(value=0.0)\n",
    "df['zgpa'] = df['zgpa'].fillna(value=0.0)\n",
    "df['DOB_yr'] = df['DOB_yr'].fillna(value=0.0)\n",
    "df = df.dropna()\n",
    "\n",
    "# Binarize target_variable\n",
    "df['isPartTime'] = df.apply(lambda x: 'Yes' if x['isPartTime']==1.0 else 'No', axis=1).astype('category')\n",
    "\n",
    "# Process protected-column values\n",
    "race_dict = {3.0:'Black',7.0:'White'}\n",
    "sex_dict = {'female':'Female','male':'Male'}\n",
    "df['race'] = df.apply(lambda x: race_dict[x['race']] if x['race'] in race_dict.keys() else 'Other', axis=1).astype('category')\n",
    "df['sex'] = df.apply(lambda x: sex_dict[x['sex']] if x['sex'] in sex_dict.keys() else 'Other', axis=1).astype('category')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "height": 204
    },
    "colab_type": "code",
    "id": "VUdsXMczzBWe",
    "outputId": "d58b81a9-48aa-4145-f44e-bc455601b84e"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>zfygpa</th>\n",
       "      <th>zgpa</th>\n",
       "      <th>DOB_yr</th>\n",
       "      <th>isPartTime</th>\n",
       "      <th>sex</th>\n",
       "      <th>race</th>\n",
       "      <th>cluster_tier</th>\n",
       "      <th>family_income</th>\n",
       "      <th>lsat</th>\n",
       "      <th>ugpa</th>\n",
       "      <th>pass_bar</th>\n",
       "      <th>weighted_lsat_ugpa</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-1.79</td>\n",
       "      <td>0.00</td>\n",
       "      <td>68.00</td>\n",
       "      <td>No</td>\n",
       "      <td>Female</td>\n",
       "      <td>White</td>\n",
       "      <td>2.00</td>\n",
       "      <td>4.00</td>\n",
       "      <td>30.00</td>\n",
       "      <td>3.10</td>\n",
       "      <td>Failed_or_not_attempted</td>\n",
       "      <td>625.79</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.33</td>\n",
       "      <td>1.88</td>\n",
       "      <td>69.00</td>\n",
       "      <td>No</td>\n",
       "      <td>Female</td>\n",
       "      <td>White</td>\n",
       "      <td>4.00</td>\n",
       "      <td>5.00</td>\n",
       "      <td>44.00</td>\n",
       "      <td>3.50</td>\n",
       "      <td>Passed</td>\n",
       "      <td>886.84</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.11</td>\n",
       "      <td>-0.57</td>\n",
       "      <td>69.00</td>\n",
       "      <td>No</td>\n",
       "      <td>Female</td>\n",
       "      <td>White</td>\n",
       "      <td>2.00</td>\n",
       "      <td>4.00</td>\n",
       "      <td>29.00</td>\n",
       "      <td>3.50</td>\n",
       "      <td>Passed</td>\n",
       "      <td>650.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.22</td>\n",
       "      <td>0.95</td>\n",
       "      <td>58.00</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Female</td>\n",
       "      <td>White</td>\n",
       "      <td>3.00</td>\n",
       "      <td>5.00</td>\n",
       "      <td>35.00</td>\n",
       "      <td>3.00</td>\n",
       "      <td>Failed_or_not_attempted</td>\n",
       "      <td>694.74</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.88</td>\n",
       "      <td>0.00</td>\n",
       "      <td>51.00</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Female</td>\n",
       "      <td>White</td>\n",
       "      <td>2.00</td>\n",
       "      <td>4.00</td>\n",
       "      <td>39.00</td>\n",
       "      <td>2.90</td>\n",
       "      <td>Failed_or_not_attempted</td>\n",
       "      <td>747.89</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   zfygpa  zgpa  DOB_yr  ... ugpa                 pass_bar weighted_lsat_ugpa\n",
       "0   -1.79  0.00   68.00  ... 3.10  Failed_or_not_attempted             625.79\n",
       "1    1.33  1.88   69.00  ... 3.50                   Passed             886.84\n",
       "2   -0.11 -0.57   69.00  ... 3.50                   Passed             650.00\n",
       "3    1.22  0.95   58.00  ... 3.00  Failed_or_not_attempted             694.74\n",
       "4    0.88  0.00   51.00  ... 2.90  Failed_or_not_attempted             747.89\n",
       "\n",
       "[5 rows x 12 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {
      "tags": []
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "zKNj_ZV2K_09"
   },
   "source": [
    "### Shuffle and Split into Train (70%) and Test set (30%)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "0ZLM1kXLz3PI"
   },
   "outputs": [],
   "source": [
    "train_df, test_df = train_test_split(df, test_size=0.30, random_state=42)\n",
    "\n",
    "output_file_path = os.path.join(dataset_base_dir,'train.csv')\n",
    "with open(output_file_path, mode=\"w\") as output_file:\n",
    "    train_df.to_csv(output_file,index=False,columns=columns,header=False)\n",
    "    output_file.close()\n",
    "\n",
    "output_file_path = os.path.join(dataset_base_dir,'test.csv')\n",
    "with open(output_file_path, mode=\"w\") as output_file:\n",
    "    test_df.to_csv(output_file,index=False,columns=columns,header=False)\n",
    "    output_file.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "1VQE85STLL46"
   },
   "source": [
    "### Computing Invese propensity weights for each subgroup, and writes to directory.\n",
    "\n",
    "IPS_example_weights_with_label.json: json dictionary of the format\n",
    "        {subgroup_id : inverse_propensity_score,...}. Used by IPS_reweighting_model approach."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "height": 34
    },
    "colab_type": "code",
    "id": "2fkieHul02TL",
    "outputId": "2e58832f-48e6-4d45-940d-e6c04f418c92"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{0: 1.8843151171043293, 1: 2.488618103910016, 2: 36.36986301369863, 3: 25.013458950201883}\n"
     ]
    }
   ],
   "source": [
    "IPS_example_weights_without_label = {\n",
    "  0: (len(train_df))/(len(train_df[(train_df.race != 'Black') & (train_df.sex != 'Female')])), # 00: White Male\n",
    "  1: (len(train_df))/(len(train_df[(train_df.race != 'Black') & (train_df.sex == 'Female')])), # 01: White Female\n",
    "  2: (len(train_df))/(len(train_df[(train_df.race == 'Black') & (train_df.sex != 'Female')])), # 10: Black Male\n",
    "  3: (len(train_df))/(len(train_df[(train_df.race == 'Black') & (train_df.sex == 'Female')]))  # 11: Black Female\n",
    "}\n",
    "  \n",
    "output_file_path = os.path.join(dataset_base_dir,'IPS_example_weights_without_label.json')\n",
    "with open(output_file_path, mode=\"w\") as output_file:\n",
    "    output_file.write(json.dumps(IPS_example_weights_without_label))\n",
    "    output_file.close()\n",
    "\n",
    "print(IPS_example_weights_without_label)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "height": 34
    },
    "colab_type": "code",
    "id": "Dm15uo-R0-LB",
    "outputId": "0c59cab3-3436-4fd2-c4f7-8cda8d0ba756"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{0: 10.194733955019199, 1: 13.545918367346939, 2: 82.23451327433628, 3: 63.214285714285715, 4: 2.3115671641791047, 5: 3.048720472440945, 6: 65.21052631578948, 7: 41.39198218262806}\n"
     ]
    }
   ],
   "source": [
    "IPS_example_weights_with_label = {\n",
    "0: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race != 'Black') & (train_df.sex != 'Female')])), # 000: Negative White Male\n",
    "1: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race != 'Black') & (train_df.sex == 'Female')])), # 001: Negative White Female\n",
    "2: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race == 'Black') & (train_df.sex != 'Female')])), # 010: Negative Black Male\n",
    "3: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race == 'Black') & (train_df.sex == 'Female')])), # 011: Negative Black Female\n",
    "4: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race != 'Black') & (train_df.sex != 'Female')])), # 100: Positive White Male\n",
    "5: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race != 'Black') & (train_df.sex == 'Female')])), # 101: Positive White Female\n",
    "6: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race == 'Black') & (train_df.sex != 'Female')])), # 110: Positive Black Male\n",
    "7: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race == 'Black') & (train_df.sex == 'Female')])), # 111: Positive Black Female\n",
    "}\n",
    "  \n",
    "output_file_path = os.path.join(dataset_base_dir,'IPS_example_weights_with_label.json')\n",
    "with open(output_file_path, mode=\"w\") as output_file:\n",
    "    output_file.write(json.dumps(IPS_example_weights_with_label))\n",
    "    output_file.close()\n",
    "\n",
    "print(IPS_example_weights_with_label)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "8SQc7h9HLcSc"
   },
   "source": [
    "### Construct vocabulary.json, and write to directory.\n",
    "\n",
    "vocabulary.json: json dictionary of the format {feature_name:      [feature_vocabulary]}, containing vocabulary for categorical features."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "height": 34
    },
    "colab_type": "code",
    "id": "YIebJG2YfMpv",
    "outputId": "d22853e7-576b-416a-df26-db7faedf1670"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'isPartTime': ['No', 'Yes'], 'sex': ['Female', 'Male'], 'race': ['Black', 'White', 'Other'], 'pass_bar': ['Passed', 'Failed_or_not_attempted']}\n"
     ]
    }
   ],
   "source": [
    "cat_cols = train_df.select_dtypes(include='category').columns\n",
    "vocab_dict = {}\n",
    "for col in cat_cols:\n",
    "  vocab_dict[col] = list(set(train_df[col].cat.categories))\n",
    "  \n",
    "output_file_path = os.path.join(dataset_base_dir,'vocabulary.json')\n",
    "with open(output_file_path, mode=\"w\") as output_file:\n",
    "    output_file.write(json.dumps(vocab_dict))\n",
    "    output_file.close()\n",
    "print(vocab_dict)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "V9cxiG9SLfk6"
   },
   "source": [
    "### Construct mean_std.json, and write to directory\n",
    "\n",
    "mean_std.json: json dictionary of the format feature_name: [mean, std]},\n",
    "containing mean and std for numerical features. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "height": 54
    },
    "colab_type": "code",
    "id": "URN20fCpFcdi",
    "outputId": "d6969466-4c09-4d7a-94b9-4a6b85d5d0b5"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'zfygpa': [0.007156308851224107, 0.956269325542025], 'zgpa': [0.005135324186171643, 0.9203686714713514], 'DOB_yr': [64.9954802259887, 6.374190672837983], 'cluster_tier': [3.7390906645143933, 1.183449020338574], 'family_income': [3.4257734732311005, 0.8794618881913022], 'lsat': [36.57297820823245, 5.629890085895137], 'ugpa': [3.2242292171105733, 0.41846631192390027], 'weighted_lsat_ugpa': [741.9962436595317, 107.69097610619035]}\n"
     ]
    }
   ],
   "source": [
    "temp_dict = train_df.describe().to_dict()\n",
    "mean_std_dict = {}\n",
    "for key, value in temp_dict.items():\n",
    "  mean_std_dict[key] = [value['mean'],value['std']]\n",
    "\n",
    "output_file_path = os.path.join(dataset_base_dir,'mean_std.json')\n",
    "with open(output_file_path, mode=\"w\") as output_file:\n",
    "    output_file.write(json.dumps(mean_std_dict))\n",
    "    output_file.close()\n",
    "print(mean_std_dict)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "CreateLawSchoolDatasetFiles.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
