# Data-cleaning helpers.
# Output: cleaned data sets.
# For each data set 'name.csv' there is a function clean_name that loads it.

import numpy as np
import pandas as pd


def center(X):
    """Subtract each column's mean from that column of X.

    X is modified in place and the same DataFrame object is returned.
    Columns are assumed to be numeric; the mean skips NaN values
    (pandas default).

    Args:
        X: pandas DataFrame whose columns are to be mean-centered.

    Returns:
        The same DataFrame X, with every column centered.
    """
    for col in X.columns:
        # Replace the whole column rather than writing through .loc: the
        # centered values are floats, and an in-place .loc write into an
        # integer column relies on implicit upcasting that newer pandas
        # versions deprecate/reject.
        X[col] = X[col] - X[col].mean()
    return X


def add_intercept(X):
    """Add an all-1's column named 'intercept' to predictor matrix X.

    X is modified in place; it is also returned so that the call can be
    used as ``X = add_intercept(X)``.

    Args:
        X: pandas DataFrame of predictors.

    Returns:
        The same DataFrame X, with the extra 'intercept' column.
    """
    # A scalar is broadcast to every row, so no explicit list is needed.
    X['intercept'] = 1
    return X



def clean_communities():
    """Clean communities & crime data set.

    Reads 'dataset/communities.csv', replaces missing values with 0,
    mean-centers the first 122 columns, and returns two column slices of
    the first 200 rows of the centered matrix.

    Returns:
        (features, problems): columns 50-69 and columns 0-49, respectively,
        of rows 0-199 of the centered matrix.
    """
    # Data import; missing entries are filled with 0 before centering, so
    # they contribute as zeros to the column means.
    df = pd.read_csv('dataset/communities.csv').fillna(0)

    # NOTE: a binary target derived from 'ViolentCrimesPerPop' (1 when above
    # its 70th percentile) and an intercept column were previously computed
    # here; both steps are currently disabled.

    # Copy the slice so that center(), which mutates its argument, works on
    # an independent frame instead of a slice of df (avoids chained
    # assignment / SettingWithCopyWarning).
    X = center(df.iloc[:, 0:122].copy())

    features = X.iloc[0:200, 50:70]
    problems = X.iloc[0:200, 0:50]
    return features, problems


def clean_synthetic():
    """Load the synthetic data set and split it at the 'problem_0' column.

    Reads 'dataset/synthetic.csv'. Every column before 'problem_0' is
    treated as a feature; 'problem_0' and all columns after it are
    treated as problems.

    Returns:
        (features, problems): the two column-wise slices of the data.

    Raises:
        ValueError: if the file has no 'problem_0' column.
    """
    data = pd.read_csv('dataset/synthetic.csv')
    # Position of the first problem column marks the split point.
    split_at = data.columns.tolist().index('problem_0')
    features, problems = data.iloc[:, :split_at], data.iloc[:, split_at:]
    return features, problems

