{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "yAq6aHVh5oCH"
   },
   "source": [
    "Licensed under the Apache License, Version 2.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "VpEpTFph2ysp"
   },
   "outputs": [],
   "source": [
    "from __future__ import division\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import json\n",
    "import os,sys\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.model_selection import train_test_split\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "F-u1ecNmMiX3"
   },
   "source": [
    "## Overview\n",
    "\n",
    "### Pre-processes COMPAS dataset:\n",
    "\n",
    "Download the COMPAS dataset from:\n",
    "https://github.com/propublica/compas-analysis/blob/master/compas-scores-two-years.csv\n",
    "and save it in the `./fairness_without_demographics/data/compas` folder.\n",
    "\n",
    "Input: ./group_agnostic_fairness/data/compas/compas-scores-two-years.csv\n",
    "\n",
    "Outputs: train.csv, test.csv, mean_std.json, vocabulary.json, IPS_exampleweights_with_label.json, IPS_exampleweights_without_label.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "oyFyRbFk7zox"
   },
   "outputs": [],
   "source": [
    "pd.options.display.float_format = '{:,.2f}'.format\n",
    "dataset_base_dir = './fairness_without_demographics/data/compas/'\n",
    "dataset_file_name = 'compas-scores-two-years.csv'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "PgWxzZeyKog3"
   },
   "source": [
    "### Processing original dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "kL3-NykBQhKz"
   },
   "outputs": [],
   "source": [
    "file_path = os.path.join(dataset_base_dir,dataset_file_name)\n",
    "with open(file_path, \"r\") as file_name:\n",
    "  temp_df = pd.read_csv(file_name)\n",
    "\n",
    "# Columns of interest\n",
    "columns = ['juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count',\n",
    "                'age', \n",
    "                'c_charge_degree', \n",
    "                'c_charge_desc',\n",
    "                'age_cat',\n",
    "                'sex', 'race',  'is_recid']\n",
    "target_variable = 'is_recid'\n",
    "target_value = 'Yes'\n",
    "\n",
    "# Drop duplicates\n",
    "temp_df = temp_df[['id']+columns].drop_duplicates()\n",
    "df = temp_df[columns].copy()\n",
    "\n",
    "# Convert columns of type ``object`` to ``category`` \n",
    "df = pd.concat([\n",
    "        df.select_dtypes(include=[], exclude=['object']),\n",
    "        df.select_dtypes(['object']).apply(pd.Series.astype, dtype='category')\n",
    "        ], axis=1).reindex_axis(df.columns, axis=1)\n",
    "\n",
    "# Binarize target_variable\n",
    "df['is_recid'] = df.apply(lambda x: 'Yes' if x['is_recid']==1.0 else 'No', axis=1).astype('category')\n",
    "\n",
    "# Process protected-column values\n",
    "race_dict = {'African-American':'Black','Caucasian':'White'}\n",
    "df['race'] = df.apply(lambda x: race_dict[x['race']] if x['race'] in race_dict.keys() else 'Other', axis=1).astype('category')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "height": 204
    },
    "colab_type": "code",
    "id": "XCClI7O8ZHTi",
    "outputId": "bb0568b5-1338-4286-9182-7f0ff23fc18c"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>juv_fel_count</th>\n",
       "      <th>juv_misd_count</th>\n",
       "      <th>juv_other_count</th>\n",
       "      <th>priors_count</th>\n",
       "      <th>age</th>\n",
       "      <th>c_charge_degree</th>\n",
       "      <th>c_charge_desc</th>\n",
       "      <th>age_cat</th>\n",
       "      <th>sex</th>\n",
       "      <th>race</th>\n",
       "      <th>is_recid</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>69</td>\n",
       "      <td>F</td>\n",
       "      <td>Aggravated Assault w/Firearm</td>\n",
       "      <td>Greater than 45</td>\n",
       "      <td>Male</td>\n",
       "      <td>Other</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>34</td>\n",
       "      <td>F</td>\n",
       "      <td>Felony Battery w/Prior Convict</td>\n",
       "      <td>25 - 45</td>\n",
       "      <td>Male</td>\n",
       "      <td>Black</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>24</td>\n",
       "      <td>F</td>\n",
       "      <td>Possession of Cocaine</td>\n",
       "      <td>Less than 25</td>\n",
       "      <td>Male</td>\n",
       "      <td>Black</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>23</td>\n",
       "      <td>F</td>\n",
       "      <td>Possession of Cannabis</td>\n",
       "      <td>Less than 25</td>\n",
       "      <td>Male</td>\n",
       "      <td>Black</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>43</td>\n",
       "      <td>F</td>\n",
       "      <td>arrest case no charge</td>\n",
       "      <td>25 - 45</td>\n",
       "      <td>Male</td>\n",
       "      <td>Other</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   juv_fel_count  juv_misd_count  juv_other_count  ...   sex   race is_recid\n",
       "0              0               0                0  ...  Male  Other       No\n",
       "1              0               0                0  ...  Male  Black      Yes\n",
       "2              0               0                1  ...  Male  Black      Yes\n",
       "3              0               1                0  ...  Male  Black       No\n",
       "4              0               0                0  ...  Male  Other       No\n",
       "\n",
       "[5 rows x 11 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {
      "tags": []
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "zKNj_ZV2K_09"
   },
   "source": [
    "### Shuffle and Split into Train (70%) and Test set (30%)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "0ZLM1kXLz3PI"
   },
   "outputs": [],
   "source": [
    "train_df, test_df = train_test_split(df, test_size=0.30, random_state=42)\n",
    "\n",
    "output_file_path = os.path.join(dataset_base_dir,'train.csv')\n",
    "with open(output_file_path, mode=\"w\") as output_file:\n",
    "    train_df.to_csv(output_file,index=False,columns=columns,header=False)\n",
    "    output_file.close()\n",
    "\n",
    "output_file_path = os.path.join(dataset_base_dir,'test.csv')\n",
    "with open(output_file_path, mode=\"w\") as output_file:\n",
    "    test_df.to_csv(output_file,index=False,columns=columns,header=False)\n",
    "    output_file.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "1VQE85STLL46"
   },
   "source": [
    "### Computing Invese propensity weights for each subgroup, and writes to directory.\n",
    "\n",
    "IPS_example_weights_with_label.json: json dictionary of the format\n",
    "        {subgroup_id : inverse_propensity_score,...}. Used by IPS_reweighting_model approach."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "height": 34
    },
    "colab_type": "code",
    "id": "2fkieHul02TL",
    "outputId": "9aa901d9-b832-4b89-edab-e3521d5c7217"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{0: 2.595886889460154, 1: 9.709615384615384, 2: 2.3974358974358974, 3: 10.56276150627615}\n"
     ]
    }
   ],
   "source": [
    "IPS_example_weights_without_label = {\n",
    "  0: (len(train_df))/(len(train_df[(train_df.race != 'Black') & (train_df.sex != 'Female')])), # 00: White Male\n",
    "  1: (len(train_df))/(len(train_df[(train_df.race != 'Black') & (train_df.sex == 'Female')])), # 01: White Female\n",
    "  2: (len(train_df))/(len(train_df[(train_df.race == 'Black') & (train_df.sex != 'Female')])), # 10: Black Male\n",
    "  3: (len(train_df))/(len(train_df[(train_df.race == 'Black') & (train_df.sex == 'Female')]))  # 11: Black Female\n",
    "}\n",
    "  \n",
    "output_file_path = os.path.join(dataset_base_dir,'IPS_example_weights_without_label.json')\n",
    "with open(output_file_path, mode=\"w\") as output_file:\n",
    "    output_file.write(json.dumps(IPS_example_weights_without_label))\n",
    "    output_file.close()\n",
    "\n",
    "print(IPS_example_weights_without_label)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "height": 34
    },
    "colab_type": "code",
    "id": "Dm15uo-R0-LB",
    "outputId": "2619a7d0-d079-43c9-cee6-d9eeb3ad4ce4"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{0: 4.598360655737705, 1: 15.162162162162161, 2: 5.744027303754266, 3: 18.03214285714286, 4: 5.961038961038961, 5: 27.0, 6: 4.114914425427873, 7: 25.5}\n"
     ]
    }
   ],
   "source": [
    "IPS_example_weights_with_label = {\n",
    "0: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race != 'Black') & (train_df.sex != 'Female')])), # 000: Negative White Male\n",
    "1: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race != 'Black') & (train_df.sex == 'Female')])), # 001: Negative White Female\n",
    "2: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race == 'Black') & (train_df.sex != 'Female')])), # 010: Negative Black Male\n",
    "3: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race == 'Black') & (train_df.sex == 'Female')])), # 011: Negative Black Female\n",
    "4: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race != 'Black') & (train_df.sex != 'Female')])), # 100: Positive White Male\n",
    "5: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race != 'Black') & (train_df.sex == 'Female')])), # 101: Positive White Female\n",
    "6: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race == 'Black') & (train_df.sex != 'Female')])), # 110: Positive Black Male\n",
    "7: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race == 'Black') & (train_df.sex == 'Female')])), # 111: Positive Black Female\n",
    "}\n",
    "  \n",
    "output_file_path = os.path.join(dataset_base_dir,'IPS_example_weights_with_label.json')\n",
    "with open(output_file_path, mode=\"w\") as output_file:\n",
    "    output_file.write(json.dumps(IPS_example_weights_with_label))\n",
    "    output_file.close()\n",
    "\n",
    "print(IPS_example_weights_with_label)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "8SQc7h9HLcSc"
   },
   "source": [
    "### Construct vocabulary.json, and write to directory.\n",
    "\n",
    "vocabulary.json: json dictionary of the format {feature_name:      [feature_vocabulary]}, containing vocabulary for categorical features."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "height": 54
    },
    "colab_type": "code",
    "id": "YIebJG2YfMpv",
    "outputId": "2a03258b-c31e-47ce-a282-4f4168f8ca65"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'c_charge_degree': ['M', 'F'], 'c_charge_desc': ['Assault', 'DUI/Property Damage/Persnl Inj', 'Poss of Firearm by Convic Felo', 'Cash Item w/Intent to Defraud', 'Trespass Struct/Conveyance', 'Agg Fleeing/Eluding High Speed', 'Poss Counterfeit Payment Inst', 'Del Morphine at/near Park', 'Poss of Vessel w/Altered ID NO', 'Opert With Susp DL 2nd Offens', 'Compulsory Attendance Violation', 'Depriv LEO of Protect/Communic', 'Carrying Concealed Firearm', 'Aggravated Assault w/Firearm', 'Possession Burglary Tools', 'Possession Of Alprazolam', 'Flee/Elude LEO-Agg Flee Unsafe', 'Aggravated Assault', 'Harm Public Servant Or Family', 'Battery', 'Money Launder 100K or More Dols', 'Tampering with a Victim', 'Unauth Poss ID Card or DL', 'Manslaughter W/Weapon/Firearm', 'Sell or Offer for Sale Counterfeit Goods', 'Fail Register Career Offender', 'Felony Petit Theft', 'Possession of Morphine', 'Burglary Unoccupied Dwelling', 'Burglary Dwelling Occupied', 'Simulation of Legal Process', 'Resist Officer w/Violence', 'Exploit Elderly Person 20-100K', 'DUI- Enhanced', 'Imperson Public Officer or Emplyee', 'Burglary Structure Unoccup', 'Poss Unlaw Issue Id', 'Computer Pornography', 'Purchase Cannabis', 'Consume Alcoholic Bev Pub', 'Aggravated Assault W/dead Weap', 'DUI - Enhanced', 'Trespass Property w/Dang Weap', 'Possession Of Heroin', 'Felony DUI - Enhanced', 'Obstruct Fire Equipment', 'D.U.I. Serious Bodily Injury', 'Murder in the First Degree', 'Culpable Negligence', 'Traffick Oxycodone     4g><14g', 'Del of JWH-250 2-Methox 1-Pentyl', 'Crim Attempt/Solicit/Consp', 'Retail Theft $300 1st Offense', 'Deliver 3,4 Methylenediox', 'Solicit Purchase Cocaine', 'Poss Drugs W/O A Prescription', 'Possession Of Phentermine', 'Uttering a Forged Instrument', 'Tamper With Witness/Victim/CI', 'Possession of Codeine', 'False Imprisonment', 'Pos Cannabis For Consideration', 'Trespass Struct/Convey Occupy', 'Deliver Alprazolam', 'Gambling/Gamb Paraphernalia', 'Agg Assault W/int Com Fel Dome', 'Use Of 2 Way Device To Fac Fel', 'Possession of Methadone', 'Deliver Cocaine 1000FT Church', 'Exposes Culpable Negligence', 'Stalking', 'Prostitution', 'Tamper With Witness', 'Sel/Pur/Mfr/Del Control Substa', 'Carjacking with a Firearm', 'Attempted Burg/Convey/Unocc', 'Trans/Harm/Material to a Minor', 'Obstruct Officer W/Violence', 'Fail To Obey Police Officer', 'Stalking (Aggravated)', 'DUI - Property Damage/Personal Injury', 'Burgl Dwel/Struct/Convey Armed', 'Crim Attempt/Solic/Consp', 'Poss/pur/sell/deliver Cocaine', 'arrest case no charge', 'Driving License Suspended', 'Drivg While Lic Suspd/Revk/Can', 'Aggravated Battery On 65/Older', 'Murder in 2nd Degree', 'Grand Theft Firearm', 'Traffick Amphetamine 28g><200g', 'Attempt Armed Burglary Dwell', 'Possession of LSD', 'Defrauding Innkeeper $300/More', 'Possession of Ethylone', 'Possession Of Methamphetamine', 'Poss Of Controlled Substance', 'Deliver Cocaine 1000FT Store', 'Poss/Sell/Deliver Clonazepam', 'Consp Traff Oxycodone  4g><14g', 'DWLS Susp/Cancel Revoked', 'Sex Battery Deft 18+/Vict 11-', 'Prostitution/Lewd Act Assignation', 'Fail Register Vehicle', 'Agg Abuse Elderlly/Disabled Adult', 'Pos Methylenedioxymethcath W/I/D/S', 'Fraudulent Use of Credit Card', 'Lewdness Violation', 'Battery Emergency Care Provide', 'Solicitation On Felony 3 Deg', 'Susp Drivers Lic 1st Offense', 'Use Scanning Device to Defraud', 'Poss Similitude of Drivers Lic', 'Fabricating Physical Evidence', 'Delivery of Heroin', 'Solicit To Deliver Cocaine', 'Harass Witness/Victm/Informnt', 'Lewd/Lasc Exhib Presence <16yr', 'Poss Meth/Diox/Meth/Amp (MDMA)', 'Aggravated Battery / Pregnant', 'Fail Obey Driv Lic Restrictions', 'Throw Deadly Missile Into Veh', 'Extradition/Defendants', 'Shoot In Occupied Dwell', 'Poss Oxycodone W/Int/Sell/Del', 'Dealing in Stolen Property', 'Deliver Cocaine', 'DUI Level 0.15 Or Minor In Veh', 'Manage Busn W/O City Occup Lic', 'DUI Blood Alcohol Above 0.20', 'Offer Agree Secure For Lewd Act', 'Felony/Driving Under Influence', 'Fail To Secure Load', 'Interference with Custody', 'Prostitution/Lewdness/Assign', 'Accessory After the Fact', 'Possession of Benzylpiperazine', 'Issuing a Worthless Draft', 'Cause Anoth Phone Ring Repeat', 'Throw In Occupied Dwell', 'Att Burgl Unoccupied Dwel', 'Sex Offender Fail Comply W/Law', 'Misuse Of 911 Or E911 System', 'Grand Theft of a Fire Extinquisher', 'Possession of Cocaine', 'Viol Injunction Protect Dom Vi', 'Possess w/I/Utter Forged Bills', 'Purchase Of Cocaine', 'Criminal Mischief', 'Poss Pyrrolidinobutiophenone', 'Viol Pretrial Release Dom Viol', 'Tresspass in Structure or Conveyance', 'False 911 Call', 'Hiring with Intent to Defraud', 'Restraining Order Dating Viol', 'DWI w/Inj Susp Lic / Habit Off', 'Possess Weapon On School Prop', 'Poss Wep Conv Felon', 'Poss Pyrrolidinovalerophenone W/I/D/S', 'Solicit Deliver Cocaine', 'Att Burgl Struc/Conv Dwel/Occp', 'Failure To Return Hired Vehicle', 'Poss Of 1,4-Butanediol', 'Burglary Structure Occupied', 'Criminal Mischief>$200<$1000', 'Unauth C/P/S Sounds>1000/Audio', 'Att Tamper w/Physical Evidence', 'Sexual Performance by a Child', 'Attempted Deliv Control Subst', 'Cruelty to Animals', 'Unemployment Compensatn Fraud', 'Disorderly Intoxication', 'Insurance Fraud', 'Trespass On School Grounds', 'Unlaw LicTag/Sticker Attach', 'Aggrav Child Abuse-Causes Harm', 'Grand Theft (Motor Vehicle)', 'Possession of Hydromorphone', 'Sexual Battery / Vict 12 Yrs +', 'Grand Theft In The 3Rd Degree', 'Pos Cannabis W/Intent Sel/Del', 'Use of Anti-Shoplifting Device', 'Battery On Parking Enfor Speci', 'Crlty Twrd Child Urge Oth Act', 'Obtain Control Substance By Fraud', 'Fleeing or Eluding a LEO', 'Felon in Pos of Firearm or Amm', 'Possession Of Anabolic Steroid', 'Poss/Sell/Del Cocaine 1000FT Sch', 'Voyeurism', 'Poss Tetrahydrocannabinols', 'DWLS Canceled Disqul 1st Off', 'Aggravated Battery (Firearm)', 'Deliver Cocaine 1000FT Park', 'Kidnapping / Domestic Violence', 'Refuse to Supply DNA Sample', 'Conspiracy Dealing Stolen Prop', 'Grand Theft (motor Vehicle)', 'Poss of Cocaine W/I/D/S 1000FT Park', 'Attempted Robbery  Weapon', 'Driving Under The Influence', 'Robbery / No Weapon', 'Possession Of Carisoprodol', 'Possession Of Clonazepam', 'Trespass Structure w/Dang Weap', 'Offn Against Intellectual Prop', 'Falsely Impersonating Officer', 'Assault Law Enforcement Officer', 'Strong Armed  Robbery', 'Del 3,4 Methylenedioxymethcath', 'Battery On Fire Fighter', 'Soliciting For Prostitution', 'Aggress/Panhandle/Beg/Solict', 'Opert With Susp DL 2ND Offense', 'Traffic Counterfeit Cred Cards', 'Carry Open/Uncov Bev In Pub', 'Possession of XLR11', 'Possession Of Fentanyl', 'Murder In 2nd Degree W/firearm', 'Possession Of Paraphernalia', 'Disorderly Conduct', 'Fraud Obtain Food or Lodging', 'Purchasing Of Alprazolam', 'Felony Batt(Great Bodily Harm)', 'Unlaw Lic Use/Disply Of Others', 'Introduce Contraband Into Jail', 'Poss of Methylethcathinone', 'Aggrav Stalking After Injunctn', 'Poss Trifluoromethylphenylpipe', 'Defrauding Innkeeper', 'Sell/Man/Del Pos/w/int Heroin', 'Poss Of RX Without RX', 'Discharge Firearm From Vehicle', 'Lewd or Lascivious Molestation', 'Possession of Cannabis', 'Poss Firearm W/Altered ID#', 'Felony DUI (level 3)', 'Possession Of Buprenorphine', 'Felony Battery (Dom Strang)', 'Possession Of Cocaine', 'Grand Theft in the 3rd Degree', 'Possession Of Lorazepam', 'False Bomb Report', 'Refuse Submit Blood/Breath Test', 'Aggravated Battery (Firearm/Actual Possession)', 'Neglect Child / Bodily Harm', 'Threat Public Servant', 'Posses/Disply Susp/Revk/Frd DL', 'Tresspass Struct/Conveyance', 'Felony Battery w/Prior Convict', 'Structuring Transactions', 'Uttering Forged Bills', 'Forging Bank Bills/Promis Note', 'Consp Traff Oxycodone 28g><30k', 'Uttering Forged Credit Card', 'Sel Etc/Pos/w/Int Contrft Schd', 'Crimin Mischief Damage $1000+', 'Reckless Driving', 'Burglary Structure Assault/Batt', 'Solic to Commit Battery', 'Deliver Cannabis 1000FTSch', 'Trespass Private Property', 'Interfere W/Traf Cont Dev RR', 'Possession Of 3,4Methylenediox', 'Traff In Cocaine <400g>150 Kil', 'Burglary Dwelling Armed', 'License Suspended Revoked', 'Uttering Worthless Check +$150', 'Arson II (Vehicle)', 'Armed Trafficking in Cannabis', 'Sale/Del Cannabis At/Near Scho', 'Aggr Child Abuse-Torture,Punish', 'Robbery / Weapon', 'Live on Earnings of Prostitute', 'Exhibition Weapon School Prop', 'Burglary Conveyance Armed', 'Grand Theft of the 2nd Degree', 'Unauthorized Interf w/Railroad', 'Unlawful Conveyance of Fuel', 'Unlaw Use False Name/Identity', 'Att Burgl Conv Occp', 'Agg Battery Grt/Bod/Harm', 'Possess Controlled Substance', 'Neglect Child / No Bodily Harm', 'Corrupt Public Servant', 'False Name By Person Arrest', 'Expired DL More Than 6 Months', 'Felony Committing Prostitution', 'Operating W/O Valid License', 'Grand Theft Dwell Property', 'Petit Theft $100- $300', 'Conspiracy to Deliver Cocaine', 'Possession of Hydrocodone', 'Littering', 'Grand Theft on 65 Yr or Older', 'Oper Motorcycle W/O Valid DL', 'Delivery Of Drug Paraphernalia', 'Carrying A Concealed Weapon', 'Arson in the First Degree', 'Contribute Delinquency Of A Minor', 'Possess Countrfeit Credit Card', 'Theft', 'Unlawful Use Of Police Badges', 'False Ownership Info/Pawn Item', 'PL/Unlaw Use Credit Card', 'Counterfeit Lic Plates/Sticker', 'Violation Of Boater Safety Id', 'Robbery Sudd Snatch No Weapon', 'Deliver Cannabis', 'Fighting/Baiting Animals', 'Aggravated Assault W/o Firearm', 'Burglary Conveyance Assault/Bat', 'Cruelty Toward Child', 'Giving False Crime Report', 'Robbery W/Firearm', 'Criminal Mischief Damage <$200', 'Aggrav Battery w/Deadly Weapon', 'Possession of Oxycodone', 'Trespassing/Construction Site', 'Neglect/Abuse Elderly Person', 'Poss Pyrrolidinovalerophenone', 'Prowling/Loitering', 'Poss3,4 Methylenedioxymethcath', 'Shoot Into Vehicle', 'Poss F/Arm Delinq', 'Burglary Conveyance Occupied', 'Burglary With Assault/battery', 'Aiding Escape', 'Bribery Athletic Contests', 'Throw Missile Into Pub/Priv Dw', 'Use Computer for Child Exploit', 'DOC/Cause Public Danger', 'Battery on a Person Over 65', 'Resist/Obstruct W/O Violence', 'Poss Contr Subst W/o Prescript', 'Poss Unlaw Issue Driver Licenc', 'Deliver Cocaine 1000FT School', 'Possess Cannabis/20 Grams Or Less', 'Principal In The First Degree', 'Aggravated Battery', 'Poss Cocaine/Intent To Del/Sel', 'Purchase/P/W/Int Cannabis', 'Leaving Acc/Unattended Veh', 'Possess Cannabis 1000FTSch', 'Leave Accd/Attend Veh/Less $50', 'Del Cannabis At/Near Park', 'Leave Acc/Attend Veh/More $50', 'Manufacture Cannabis', 'Criminal Attempt 3rd Deg Felon', 'Poss Cntrft Contr Sub w/Intent', 'Aide/Abet Prostitution Lewdness', 'Grand Theft in the 1st Degree', 'Fleeing Or Attmp Eluding A Leo', 'Attempted Robbery  No Weapon', 'Dealing In Stolen Property', 'Attempt Burglary (Struct)', 'Possession of Alcohol Under 21', 'Violation License Restrictions', 'Crim Use Of Personal Id Info', 'Possession Firearm School Prop', 'Trespass Other Struct/Conve', 'Disrupting School Function', 'Viol Injunct Domestic Violence', 'Abuse Without Great Harm', 'Robbery W/Deadly Weapon', 'Poss 3,4 MDMA (Ecstasy)', 'Contradict Statement', 'Unl/Disturb Education/Instui', 'Ride Tri-Rail Without Paying', 'Battery On A Person Over 65', 'Compulsory Sch Attnd Violation', 'Lease For Purpose Trafficking', 'Attempted Robbery Firearm', 'Unlicensed Telemarketing', 'Sell Cannabis', 'Intoxicated/Safety Of Another', 'Possess/Use Weapon 1 Deg Felon', 'Possession Of Amphetamine', 'Possession Child Pornography', 'Possession of Butylone', 'Poss Alprazolam W/int Sell/Del', 'Present Proof of Invalid Insur', 'Burglary Dwelling Assault/Batt', 'Fel Drive License Perm Revoke', 'Sell Conterfeit Cont Substance', 'Crim Use of Personal ID Info', 'Del Cannabis For Consideration', 'Fail Sex Offend Report Bylaw', 'Lewd/Lasc Battery Pers 12+/<16', 'Carjacking w/o Deadly Weapon', 'Traffick Hydrocodone   4g><14g', 'Escape', 'Poss/Sell/Del/Man Amobarbital', 'Battery on Law Enforc Officer', 'Tamper With Victim', 'Theft/To Deprive', 'Possess Tobacco Product Under 18', 'Failure To Pay Taxi Cab Charge', 'Sound Articles Over 100', 'Possession Of Diazepam', 'Sex Batt Faml/Cust Vict 12-17Y', 'Aggrav Child Abuse-Agg Battery', 'False Motor Veh Insurance Card', 'Possess Mot Veh W/Alt Vin #', 'Possess Drug Paraphernalia', 'DUI Property Damage/Injury', 'Felony Battery', 'Poss Anti-Shoplifting Device', 'False Info LEO During Invest', 'Fail To Redeliv Hire/Leas Prop', 'Lewd/Lasciv Molest Elder Persn', 'Lve/Scen/Acc/Veh/Prop/Damage', 'Trespass Structure/Conveyance', 'Video Voyeur-<24Y on Child >16', 'Alcoholic Beverage Violation-FL', 'Burglary Assault/Battery Armed', 'Agg Assault Law Enforc Officer', 'Violation of Injunction Order/Stalking/Cyberstalking', 'Petit Theft', 'Attempted Burg/struct/unocc', 'Burglary Conveyance Unoccup', 'Driving While License Revoked', 'Sale/Del Counterfeit Cont Subs', 'Aggravated Assault W/Dead Weap', 'Lewd Act Presence Child 16-', 'Open Carrying Of Weapon', 'Leaving the Scene of Accident', 'Child Abuse', 'Tampering With Physical Evidence', 'Agg Fleeing and Eluding', 'Fail To Redeliver Hire Prop', 'Delivery of 5-Fluoro PB-22', 'Battery Spouse Or Girlfriend', 'Felony Driving While Lic Suspd', 'Retail Theft $300 2nd Offense', 'Viol Prot Injunc Repeat Viol', 'Offer Agree Secure/Lewd Act'], 'age_cat': ['Greater than 45', 'Less than 25', '25 - 45'], 'sex': ['Male', 'Female'], 'race': ['Other', 'Black', 'White'], 'is_recid': ['No', 'Yes']}\n"
     ]
    }
   ],
   "source": [
    "cat_cols = train_df.select_dtypes(include='category').columns\n",
    "vocab_dict = {}\n",
    "for col in cat_cols:\n",
    "  vocab_dict[col] = list(set(train_df[col].cat.categories))\n",
    "  \n",
    "output_file_path = os.path.join(dataset_base_dir,'vocabulary.json')\n",
    "with open(output_file_path, mode=\"w\") as output_file:\n",
    "    output_file.write(json.dumps(vocab_dict))\n",
    "    output_file.close()\n",
    "print(vocab_dict)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "V9cxiG9SLfk6"
   },
   "source": [
    "### Construct mean_std.json, and write to directory\n",
    "\n",
    "mean_std.json: json dictionary of the format feature_name: [mean, std]},\n",
    "containing mean and std for numerical features. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab": {
     "height": 54
    },
    "colab_type": "code",
    "id": "EjZvIZC6FMFm",
    "outputId": "d9ae1abd-d18f-486b-f067-c657b1420d97"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'juv_fel_count': [0.06971677559912855, 0.5014755752507987], 'juv_misd_count': [0.09368191721132897, 0.5172707491859172], 'juv_other_count': [0.10556545850663497, 0.4686557323312097], 'priors_count': [3.5094078035254506, 4.951584194970699], 'age': [34.93761140819964, 11.925808583868745]}\n"
     ]
    }
   ],
   "source": [
    "temp_dict = train_df.describe().to_dict()\n",
    "mean_std_dict = {}\n",
    "for key, value in temp_dict.items():\n",
    "  mean_std_dict[key] = [value['mean'],value['std']]\n",
    "\n",
    "output_file_path = os.path.join(dataset_base_dir,'mean_std.json')\n",
    "with open(output_file_path, mode=\"w\") as output_file:\n",
    "    output_file.write(json.dumps(mean_std_dict))\n",
    "    output_file.close()\n",
    "print(mean_std_dict)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "CreateCompasDatasetFiles.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
