{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "yAq6aHVh5oCH"
   },
   "source": [
    "Licensed under the Apache License, Version 2.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "VpEpTFph2ysp"
   },
   "outputs": [],
   "source": [
    "from __future__ import division\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import json\n",
    "import os,sys\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.model_selection import train_test_split\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "F-u1ecNmMiX3"
   },
   "source": [
    "## Overview\n",
    "\n",
    "### Pre-processes UCI Adult (Census Income) dataset:\n",
    "\n",
    "The Adult train and test data files can be downloaded from:\n",
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\n",
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\n",
    "and save them in the `./fairness_without_demographics/data/uci_adult` folder.\n",
    "\n",
    "Input: \n",
    "\n",
    "*   ./fairness_without_demographics/data/uci_adult/adult.data \n",
    "*   ./fairness_without_demographics/data/uci_adult/adult.test\n",
    "\n",
    "\n",
    "\n",
    "Outputs: train.csv, test.csv, mean_std.json, vocabulary.json, IPS_example_weights_with_label.json, IPS_example_weights_without_label.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "oyFyRbFk7zox"
   },
   "outputs": [],
   "source": [
    "# Show floats with thousands separators and two decimals when frames are displayed.\n",
    "pd.set_option('display.float_format', '{:,.2f}'.format)\n",
    "# Folder that holds the raw UCI Adult files; the JSON artifacts are written here too.\n",
    "dataset_base_dir = './fairness_without_demographics/data/uci_adult/'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "PgWxzZeyKog3"
   },
   "source": [
    "### Load original dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "hB-PxNRyCRZm"
   },
   "outputs": [],
   "source": [
    "def convert_object_type_to_category(df):\n",
    "  \"\"\"Converts columns of type object to category.\n",
    "\n",
    "  Args:\n",
    "    df: pandas DataFrame; it is not modified.\n",
    "\n",
    "  Returns:\n",
    "    A new DataFrame with the same columns in the same order, where every\n",
    "    object-dtype column has been cast to the pandas `category` dtype and all\n",
    "    other columns keep their dtype.\n",
    "  \"\"\"\n",
    "  # `DataFrame.reindex_axis` was removed in pandas 1.0. Casting through a single\n",
    "  # `astype` mapping keeps the column order, so no reindexing is needed.\n",
    "  object_columns = df.select_dtypes(include=['object']).columns\n",
    "  return df.astype({column: 'category' for column in object_columns})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "chhYs8357xyU"
   },
   "outputs": [],
   "source": [
    "# Paths of the raw UCI Adult files inside the dataset folder.\n",
    "TRAIN_FILE = os.path.join(dataset_base_dir,'adult.data')\n",
    "TEST_FILE = os.path.join(dataset_base_dir,'adult.test')\n",
    "\n",
    "# Column names are passed to read_csv explicitly via `names`.\n",
    "columns = [\n",
    "    \"age\", \"workclass\", \"fnlwgt\", \"education\", \"education-num\",\n",
    "    \"marital-status\", \"occupation\", \"relationship\", \"race\", \"sex\",\n",
    "    \"capital-gain\", \"capital-loss\", \"hours-per-week\", \"native-country\", \"income\"\n",
    "]\n",
    "\n",
    "# Label column and the label value treated as the positive class.\n",
    "target_variable = \"income\"\n",
    "target_value = \">50K\"\n",
    "\n",
    "# The file handles get their own names so that TRAIN_FILE / TEST_FILE remain\n",
    "# usable as paths (they used to be rebound to closed file objects).\n",
    "# NOTE(review): the files as downloaded from UCI are understood to separate\n",
    "# fields with ', '. skipinitialspace=True strips that blank so the exact-match\n",
    "# comparisons against 'Black', 'Female', '>50K' and '?' in later cells work;\n",
    "# it has no effect on files whose fields are already stripped.\n",
    "with open(TRAIN_FILE, \"r\") as train_file:\n",
    "  train_df = pd.read_csv(train_file, sep=',', names=columns, skipinitialspace=True)\n",
    "\n",
    "with open(TEST_FILE, \"r\") as test_file:\n",
    "  test_df = pd.read_csv(test_file, sep=',', names=columns, skipinitialspace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "Wy_GkVwHUsKQ"
   },
   "outputs": [],
   "source": [
    "# Cast every ``object`` column of both splits to ``category``.\n",
    "train_df = train_df.pipe(convert_object_type_to_category)\n",
    "test_df = test_df.pipe(convert_object_type_to_category)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "1VQE85STLL46"
   },
   "source": [
    "### Compute inverse propensity weights for each subgroup, and write them to the dataset directory.\n",
    "\n",
    "IPS_example_weights_with_label.json: json dictionary of the format\n",
    "        {subgroup_id : inverse_propensity_score,...}. Used by IPS_reweighting_model approach."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "height": 34
    },
    "colab_type": "code",
    "id": "2fkieHul02TL",
    "outputId": "cb4c15dc-1979-46ee-c4c1-7ff546d30907"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{0: 1.6102566638642994, 1: 3.5330946180555554, 2: 20.752708731676226, 3: 20.939549839228295}\n"
     ]
    }
   ],
   "source": [
    "# Inverse propensity weight of a subgroup: the total number of training rows\n",
    "# divided by the number of rows that fall in that subgroup.\n",
    "# The masks only test for 'Black' and 'Female', so the complementary groups are\n",
    "# every other race value and every other sex value respectively.\n",
    "is_black = train_df.race == 'Black'\n",
    "is_female = train_df.sex == 'Female'\n",
    "\n",
    "subgroup_masks = {\n",
    "  0: ~is_black & ~is_female,  # 00: non-Black, non-Female\n",
    "  1: ~is_black & is_female,   # 01: non-Black, Female\n",
    "  2: is_black & ~is_female,   # 10: Black, non-Female\n",
    "  3: is_black & is_female,    # 11: Black, Female\n",
    "}\n",
    "# int(...) keeps the weights plain Python floats for json and print.\n",
    "IPS_example_weights_without_label = {\n",
    "  subgroup_id: len(train_df) / int(mask.sum())\n",
    "  for subgroup_id, mask in subgroup_masks.items()\n",
    "}\n",
    "\n",
    "output_file_path = os.path.join(dataset_base_dir,'IPS_example_weights_without_label.json')\n",
    "# The `with` block closes the file; json writes the integer keys as strings.\n",
    "with open(output_file_path, mode='w') as output_file:\n",
    "  json.dump(IPS_example_weights_without_label, output_file)\n",
    "\n",
    "print(IPS_example_weights_without_label)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "height": 34
    },
    "colab_type": "code",
    "id": "Dm15uo-R0-LB",
    "outputId": "15fd3cf8-3feb-48f7-dcbf-228fe02f7dbc"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{0: 2.3499566974595845, 1: 4.00652147163775, 2: 25.59827044025157, 3: 22.2259385665529, 4: 5.115632364493323, 5: 29.899908172635445, 6: 109.63299663299664, 7: 361.7888888888889}\n"
     ]
    }
   ],
   "source": [
    "# Inverse propensity weight of a (label, race, sex) subgroup: the total number\n",
    "# of training rows divided by the number of rows that fall in that subgroup.\n",
    "# The masks only test for target_value, 'Black' and 'Female', so the\n",
    "# complementary groups are every other label, race and sex value respectively.\n",
    "is_positive = train_df[target_variable] == target_value\n",
    "is_black = train_df.race == 'Black'\n",
    "is_female = train_df.sex == 'Female'\n",
    "\n",
    "subgroup_masks_with_label = {\n",
    "  0: ~is_positive & ~is_black & ~is_female,  # 000: negative, non-Black, non-Female\n",
    "  1: ~is_positive & ~is_black & is_female,   # 001: negative, non-Black, Female\n",
    "  2: ~is_positive & is_black & ~is_female,   # 010: negative, Black, non-Female\n",
    "  3: ~is_positive & is_black & is_female,    # 011: negative, Black, Female\n",
    "  4: is_positive & ~is_black & ~is_female,   # 100: positive, non-Black, non-Female\n",
    "  5: is_positive & ~is_black & is_female,    # 101: positive, non-Black, Female\n",
    "  6: is_positive & is_black & ~is_female,    # 110: positive, Black, non-Female\n",
    "  7: is_positive & is_black & is_female,     # 111: positive, Black, Female\n",
    "}\n",
    "# int(...) keeps the weights plain Python floats for json and print.\n",
    "IPS_example_weights_with_label = {\n",
    "  subgroup_id: len(train_df) / int(mask.sum())\n",
    "  for subgroup_id, mask in subgroup_masks_with_label.items()\n",
    "}\n",
    "\n",
    "output_file_path = os.path.join(dataset_base_dir,'IPS_example_weights_with_label.json')\n",
    "# The `with` block closes the file; json writes the integer keys as strings.\n",
    "with open(output_file_path, mode='w') as output_file:\n",
    "  json.dump(IPS_example_weights_with_label, output_file)\n",
    "\n",
    "print(IPS_example_weights_with_label)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "8SQc7h9HLcSc"
   },
   "source": [
    "### Construct vocabulary.json, and write to directory.\n",
    "\n",
    "vocabulary.json: json dictionary of the format {feature_name: [feature_vocabulary]}, containing vocabulary for categorical features."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "height": 54
    },
    "colab_type": "code",
    "id": "YIebJG2YfMpv",
    "outputId": "3c38fa2e-da0b-4958-915f-d3990556c138"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'workclass': ['Self-emp-inc', 'Self-emp-not-inc', 'Federal-gov', 'Never-worked', 'State-gov', 'Without-pay', 'Private', 'Local-gov'], 'education': ['HS-grad', 'Doctorate', 'Masters', 'Assoc-voc', '1st-4th', '11th', '5th-6th', 'Assoc-acdm', 'Some-college', '10th', '7th-8th', '9th', 'Preschool', '12th', 'Bachelors', 'Prof-school'], 'marital-status': ['Divorced', 'Married-AF-spouse', 'Married-civ-spouse', 'Never-married', 'Married-spouse-absent', 'Widowed', 'Separated'], 'occupation': ['Tech-support', 'Farming-fishing', 'Craft-repair', 'Other-service', 'Exec-managerial', 'Sales', 'Handlers-cleaners', 'Priv-house-serv', 'Prof-specialty', 'Adm-clerical', 'Armed-Forces', 'Protective-serv', 'Machine-op-inspct', 'Transport-moving'], 'relationship': ['Other-relative', 'Wife', 'Husband', 'Own-child', 'Not-in-family', 'Unmarried'], 'race': ['Amer-Indian-Eskimo', 'Other', 'White', 'Asian-Pac-Islander', 'Black'], 'sex': ['Female', 'Male'], 'native-country': ['Iran', 'Ireland', 'Japan', 'Germany', 'Portugal', 'Greece', 'Mexico', 'Thailand', 'Outlying-US(Guam-USVI-etc)', 'Columbia', 'Philippines', 'France', 'Scotland', 'Hungary', 'Vietnam', 'India', 'Puerto-Rico', 'Hong', 'Poland', 'Nicaragua', 'Canada', 'China', 'Jamaica', 'Italy', 'Haiti', 'Ecuador', 'United-States', 'Holand-Netherlands', 'Cambodia', 'Peru', 'Honduras', 'Dominican-Republic', 'Trinadad&Tobago', 'Yugoslavia', 'South', 'Taiwan', 'Cuba', 'El-Salvador', 'England', 'Laos', 'Guatemala'], 'income': ['>50K', '<=50K']}\n"
     ]
    }
   ],
   "source": [
    "# Every column of dtype category gets a vocabulary entry.\n",
    "cat_cols = train_df.select_dtypes(include='category').columns\n",
    "vocab_dict = {}\n",
    "for col in cat_cols:\n",
    "  # Walk the categories index rather than a set so the order is the same on\n",
    "  # every run and vocabulary.json is reproducible. The '?' entry is dropped.\n",
    "  vocab_dict[col] = [category for category in train_df[col].cat.categories if category != '?']\n",
    "\n",
    "output_file_path = os.path.join(dataset_base_dir,'vocabulary.json')\n",
    "# The `with` block closes the file, so no explicit close() is needed.\n",
    "with open(output_file_path, mode='w') as output_file:\n",
    "  json.dump(vocab_dict, output_file)\n",
    "print(vocab_dict)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "V9cxiG9SLfk6"
   },
   "source": [
    "### Construct mean_std.json, and write to directory\n",
    "\n",
    "mean_std.json: json dictionary of the format {feature_name: [mean, std]},\n",
    "containing mean and std for numerical features. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "height": 54
    },
    "colab_type": "code",
    "id": "sUWCDXhaQZE_",
    "outputId": "82d48f27-6ced-41ea-ff94-60d41d6561a3"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'age': [38.58164675532078, 13.640432553581341], 'fnlwgt': [189778.36651208502, 105549.97769702224], 'education-num': [10.0806793403151, 2.5727203320673877], 'capital-gain': [1077.6488437087312, 7385.292084840338], 'capital-loss': [87.303829734959, 402.9602186489998], 'hours-per-week': [40.437455852092995, 12.347428681731843]}\n"
     ]
    }
   ],
   "source": [
    "# describe() yields one column of summary statistics per described feature;\n",
    "# only the 'mean' and 'std' entries are kept for each of them.\n",
    "describe_stats = train_df.describe().to_dict()\n",
    "mean_std_dict = {\n",
    "  feature: [stats['mean'], stats['std']]\n",
    "  for feature, stats in describe_stats.items()\n",
    "}\n",
    "\n",
    "output_file_path = os.path.join(dataset_base_dir,'mean_std.json')\n",
    "# Leaving the `with` block closes the file.\n",
    "with open(output_file_path, mode='w') as output_file:\n",
    "  output_file.write(json.dumps(mean_std_dict))\n",
    "print(mean_std_dict)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "CreateUCIAdultDatasetFiles.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
