In [0]:
import numpy as np
import pandas as pd
import itertools
import time
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import preprocessing
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
In [0]:
##Suppress Warning##
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

Subroutines For Jackknife+-after-bootstrap

Generating train/test split , & bootstrap samples:

In [0]:
def data_split(X_full, Y_full, train_size):
    '''
      Sample n=200 data from the original dataset to be used as training data
      X_full: Full X data matrix (for communities/meps/blog)
      Y_full: Full Y data matrix (for communities/meps/blog)
      Return: A list of data matrices of train/test(or validation) pairs
    '''
    n_tot = len(Y_full)
    idx = np.random.choice(n_tot, train_size, replace=False)
    not_idx = [i not in idx for i in range(n_tot)]
    X_train = X_full[idx, :]
    X_test = X_full[not_idx, :]
    Y_train = Y_full[idx, ]
    Y_test = Y_full[not_idx, ]
    return([X_train, X_test, Y_train, Y_test])
In [0]:
def generate_bootstrap_samples(n, m, B):
    '''
      Return: B-by-m matrix, where row b gives the indices for b-th bootstrap sample
    '''
    samples_idx = np.zeros((B, m),dtype=int)
    for b in range(B):
        sample_idx = np.random.choice(n, m)
        samples_idx[b, :] = sample_idx
    return(samples_idx)

Fitting bootstrap estimators:

In [0]:
def fit_bootstrap_models(X_train, Y_train, X_predict, fit_muh_fun, n, m, B):
    '''
      Train B bootstrap estimators and calculate predictions on X_predict
      Return: list of matrices [M,P]
        samples_idx = B-by-m matrix, row b = indices of b-th bootstrap sample
        predictions = B-by-n1 matrix, row b = predictions from b-th bootstrap sample
          (n1=len(X_predict))
    '''
    samples_idx = generate_bootstrap_samples(n, m, B)
    n1 = len(X_predict)
    # P holds the predictions from individual bootstrap estimators
    predictions = np.zeros((B, n1), dtype=float)
    for b in range(B):
        predictions[b] = fit_muh_fun(X_train[samples_idx[b], :],\
                                     Y_train[samples_idx[b], ], X_predict)
    return([samples_idx, predictions])
In [0]:
# Define Baseline Regressor to be used later
def ridge2(X,Y,X1,ridge_mult=0.001):
    # named ridge2 to distinguish it from the ridge regressor in sklearn.
    lam = ridge_mult * np.linalg.svd(X,compute_uv=False).max()**2
    betahat = np.linalg.solve(\
            np.c_[np.ones(X.shape[0]),X].T.dot(np.c_[np.ones(X.shape[0]),X]) \
                              + lam * np.diag(np.r_[0,np.ones(X.shape[1])]),
            np.c_[np.ones(X.shape[0]),X].T.dot(Y))
    return betahat[0] + X1.dot(betahat[1:])
  
def neural_net(X,Y,X1):
    nnet = MLPRegressor(solver='lbfgs',activation='logistic').fit(X,Y)
    return nnet.predict(X1)

def random_forest_nB(X,Y,X1,ntree=20):
    # when bootstrap=False, it means each tree is trained on all rows of X and only
    #      subsamples its columns (which are features).
    rf = RandomForestRegressor(n_estimators=ntree,criterion='mae',bootstrap=False).fit(X,Y)
    return rf.predict(X1)

Compute prediction intervals for Jackknife+-after-bootstrap

Include mean, median, and trimmed mean aggregation

In [0]:
def compute_PIs_jacknife_plus_after_bootstrap(X,Y,X1,alpha,fit_muh_fun,B,m,aggre):
    '''
    Using mean aggregation
    '''
    n=len(X)
    n1 = len(X1)
    [boot_samples_idx,boot_predictions] = \
        fit_bootstrap_models(X, Y, np.vstack((X,X1)), fit_muh_fun, n, m, B)
    in_boot_sample = np.zeros((B,n),dtype=bool)
    for b in range(len(in_boot_sample)):
        in_boot_sample[b,boot_samples_idx[b]] = True
    resids_LOO = np.zeros(n)
    muh_LOO_vals_testpoint = np.zeros((n,n1))
    for i in range(n):
        b_keep = np.argwhere(~(in_boot_sample[:,i])).reshape(-1)
        if(len(b_keep)>0):
          if aggre=='mean':
            resids_LOO[i] = np.abs(Y[i] - boot_predictions[b_keep,i].mean())
            muh_LOO_vals_testpoint[i] = boot_predictions[b_keep,n:].mean(0)
          elif aggre=='median':
            resids_LOO[i] = np.abs(Y[i] - np.median(boot_predictions[b_keep,i]))
            muh_LOO_vals_testpoint[i] = np.median(boot_predictions[b_keep,n:],axis=0)
          elif aggre=='tmean':
            resids_LOO[i] = np.abs(Y[i] - stats.trim_mean(boot_predictions[b_keep,i],0.25))
            muh_LOO_vals_testpoint[i] = stats.trim_mean(boot_predictions[b_keep,n:],0.25,axis=0)
        else: # if aggregating an empty set of models, predict zero everywhere
            resids_LOO[i] = np.abs(Y[i])
            muh_LOO_vals_testpoint[i] = np.zeros(n1)
    ind_q = (np.ceil((1-alpha)*(n+1))).astype(int)
    ###############################
    # construct prediction intervals
    ###############################
        
    return pd.DataFrame(\
            np.c_[np.sort(muh_LOO_vals_testpoint.T - resids_LOO,axis=1).T[-ind_q], \
                np.sort(muh_LOO_vals_testpoint.T + resids_LOO,axis=1).T[ind_q-1]],\
                    columns = ['lower','upper'])

Compute prediction intervals for Naive O(n*B) Jackknife+ (ensembled)

Include mean, median, and trimmed mean aggregation

In [0]:
def compute_PIs_jacknife_plus_naive(X,Y,X1,alpha,fit_muh_fun,B,m,aggre):
    '''
    Using mean aggregation
    '''
    n=len(X)
    n1 = len(X1)
    resids_LOO = np.zeros(n)
    muh_LOO_vals_testpoint = np.zeros((n,n1))
    for i in range(n):
      boot_predictions = fit_bootstrap_models(np.delete(X,i,0), Y, np.vstack((X,X1)), fit_muh_fun, n-1, m, B)[1]
      if aggre=='mean':
        resids_LOO[i] = np.abs(Y[i] - np.mean(boot_predictions[:,i]))
        muh_LOO_vals_testpoint[i] = np.mean(boot_predictions[:,n:],axis=0)
      elif aggre=='median':
        resids_LOO[i] = np.abs(Y[i] - np.median(boot_predictions[:,i]))
        muh_LOO_vals_testpoint[i] = np.median(boot_predictions[:,n:],axis=0)
      elif aggre=='tmean':
        resids_LOO[i] = np.abs(Y[i] - stats.trim_mean(boot_predictions[:,i],0.25))
        muh_LOO_vals_testpoint[i] = stats.trim_mean(boot_predictions[:,n:],0.25,axis=0)
    ind_q = (np.ceil((1-alpha)*(n+1))).astype(int)
    ###############################
    # construct prediction intervals
    ###############################

    return pd.DataFrame(\
            np.c_[np.sort(muh_LOO_vals_testpoint.T - resids_LOO,axis=1).T[-ind_q], \
                np.sort(muh_LOO_vals_testpoint.T + resids_LOO,axis=1).T[ind_q-1]],\
                    columns = ['lower','upper'])

Compute prediction intervals for Jackknife+ (non-ensembled)

In [0]:
def compute_PIs_jacknife_plus(X,Y,X1,alpha,fit_muh_fun):
    n = len(X)
    n1 = len(X1)
    muh_vals = fit_muh_fun(X,Y,np.r_[X,X1])
    resids_naive = np.abs(Y-muh_vals[:n])
    muh_vals_testpoint = muh_vals[n:]
    resids_LOO = np.zeros(n)
    muh_LOO_vals_testpoint = np.zeros((n,n1))
    for i in range(n):
        muh_vals_LOO = fit_muh_fun(np.delete(X,i,0),np.delete(Y,i),\
                                   np.r_[X[i].reshape((1,-1)),X1])
        resids_LOO[i] = np.abs(Y[i] - muh_vals_LOO[0])
        muh_LOO_vals_testpoint[i] = muh_vals_LOO[1:]
    ind_q = (np.ceil((1-alpha)*(n+1))).astype(int)
    ###############################
    # construct prediction intervals
    ###############################
        
    return pd.DataFrame(\
            np.c_[np.sort(muh_LOO_vals_testpoint.T - resids_LOO,axis=1).T[-ind_q], \
                np.sort(muh_LOO_vals_testpoint.T + resids_LOO,axis=1).T[ind_q-1]],\
                    columns = ['lower','upper'])

Run real data experiment

load data

In [0]:
# UCI Communities and Crime Data Set
# http://archive.ics.uci.edu/ml/datasets/communities+and+crime
communities_data = np.loadtxt('communities_data.csv',delimiter=',',dtype=str)
# remove categorical predictors
communities_data = np.delete(communities_data, np.arange(5), 1)
# remove predictors with missing values
communities_data = np.delete(communities_data,            np.argwhere(
    (communities_data == '?').sum(0) > 0).reshape(-1), 1)
communities_data = communities_data.astype(float)
X_communities = communities_data[:, :-1]
Y_communities = communities_data[:, -1]
In [0]:
# UCI BlogFeedback data set
# https://archive.ics.uci.edu/ml/datasets/BlogFeedback
blog_data = np.loadtxt('blogData_train.csv',delimiter=',')
X_blog = blog_data[:, :-1]
Y_blog = np.log(1+blog_data[:, -1])
In [0]:
# MEPS data set
# data downloaded (in .ssp format) from:
# https://meps.ahrq.gov/mepsweb/data_files/pufs/h192ssp.zip
# then run get_meps_data.ipynb script
# to perform feature selection & data cleaning, & store to .txt file
meps_data = np.loadtxt('meps_data.txt')
X_meps = meps_data[:, :-1]
Y_meps = meps_data[:, -1]

run experiment & save results

Main Experiments
In [0]:
# Initialize Parameters
alpha= 0.1
tot_trial = 10
train_size = 40 # training size n
m_vals = np.round(train_size*np.linspace(0.2,1,num=5)).astype(int)
B_ = 20 # number of bootstrap samples in J+aB is drawn as B ~ Binomial(int(B_/(1-1/(n+1))^m),(1-1/(n+1))^m)
In [0]:
# Run Mean
fit_funcs = [ridge2,neural_net,random_forest_nB]
Data_name = ['meps']
results = pd.DataFrame(columns = ['itrial','dataset','muh_fun','method','coverage','width','time'])
for data_name in Data_name:
    X = eval('X_'+data_name)
    Y = eval('Y_'+data_name)
    for ifunc in range(len(fit_funcs)):
        fit_func = fit_funcs[ifunc]
        print('running '+fit_func.__name__+' on '+data_name)
        for itrial in range(tot_trial):
            np.random.seed(98765+itrial)
            [X_train, X_test, Y_train, Y_test] = data_split(X, Y, train_size)
            # Start clock
            start_time=time.time()
            # Run jackknife+
            PIs = compute_PIs_jacknife_plus(X_train,Y_train,X_test,alpha,fit_func)
            coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
            width = (PIs['upper'] - PIs['lower']).mean()
            results.loc[len(results)]=\
                [itrial,data_name,fit_func.__name__,'jack+',coverage,width,time.time()-start_time]
            # Run Naive jackknife+ and jackknife+aB at number sampled = 200 across all m values
            for m in m_vals:
                # Naive J+
                start_time=time.time()
                PIs = compute_PIs_jacknife_plus_naive(\
                            X_train,Y_train,X_test,alpha,fit_func,B_,m,'mean')
                coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
                width = (PIs['upper'] - PIs['lower']).mean()
                results.loc[len(results)]=\
                    [itrial,data_name,fit_func.__name__,'jack+-naive (m='+str(m)+')',\
                            coverage,width,time.time()-start_time]
                # J+aB
                start_time=time.time()
                B = int(np.random.binomial(int(B_/((1-1./(1+train_size))**m*(1-1./train_size)**m)),(1-1./(1+train_size))**m,size=1))
                PIs = compute_PIs_jacknife_plus_after_bootstrap(\
                            X_train,Y_train,X_test,alpha,fit_func,B,m,'mean')
                coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
                width = (PIs['upper'] - PIs['lower']).mean()
                results.loc[len(results)]=\
                    [itrial,data_name,fit_func.__name__,'jack+aB (m='+str(m)+')',\
                            coverage,width,time.time()-start_time]
                results.to_csv('new-jack+aB_realdata_results-mean.csv',index=False)
In [0]:
# Run Median
fit_funcs = [ridge2,neural_net,random_forest_nB]
Data_name = ['meps']
results = pd.DataFrame(columns = ['itrial','dataset','muh_fun','method','coverage','width','time'])
for data_name in Data_name:
    X = eval('X_'+data_name)
    Y = eval('Y_'+data_name)
    for ifunc in range(len(fit_funcs)):
        fit_func = fit_funcs[ifunc]
        print('running '+fit_func.__name__+' on '+data_name)
        for itrial in range(tot_trial):
            np.random.seed(98765+itrial)
            [X_train, X_test, Y_train, Y_test] = data_split(X, Y, train_size)
            # Start clock
            start_time=time.time()
            # Run jackknife+
            PIs = compute_PIs_jacknife_plus(X_train,Y_train,X_test,alpha,fit_func)
            coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
            width = (PIs['upper'] - PIs['lower']).mean()
            results.loc[len(results)]=\
                [itrial,data_name,fit_func.__name__,'jack+',coverage,width,time.time()-start_time]
            # Run Naive jackknife+ and jackknife+aB at number sampled = 200 across all m values
            for m in m_vals:
                # Naive J+
                start_time=time.time()
                PIs = compute_PIs_jacknife_plus_naive(\
                            X_train,Y_train,X_test,alpha,fit_func,B_,m,'median')
                coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
                width = (PIs['upper'] - PIs['lower']).mean()
                results.loc[len(results)]=\
                    [itrial,data_name,fit_func.__name__,'jack+-naive (m='+str(m)+')',\
                            coverage,width,time.time()-start_time]
                # J+aB
                start_time=time.time()
                B = int(np.random.binomial(int(B_/((1-1./(1+train_size))**m*(1-1./train_size)**m)),(1-1./(1+train_size))**m,size=1))
                PIs = compute_PIs_jacknife_plus_after_bootstrap(\
                            X_train,Y_train,X_test,alpha,fit_func,B,m,'median')
                coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
                width = (PIs['upper'] - PIs['lower']).mean()
                results.loc[len(results)]=\
                    [itrial,data_name,fit_func.__name__,'jack+aB (m='+str(m)+')',\
                            coverage,width,time.time()-start_time]
                results.to_csv('new-jack+aB_realdata_results-median.csv',index=False)
In [0]:
# Run Trimmed Mean
fit_funcs = [ridge2,neural_net,random_forest_nB]
Data_name = ['meps']
results = pd.DataFrame(columns = ['itrial','dataset','muh_fun','method','coverage','width','time'])
for data_name in Data_name:
    X = eval('X_'+data_name)
    Y = eval('Y_'+data_name)
    for ifunc in range(len(fit_funcs)):
        fit_func = fit_funcs[ifunc]
        print('running '+fit_func.__name__+' on '+data_name)
        for itrial in range(tot_trial):
            np.random.seed(98765+itrial)
            [X_train, X_test, Y_train, Y_test] = data_split(X, Y, train_size)
            # Start clock
            start_time=time.time()
            # Run jackknife+
            PIs = compute_PIs_jacknife_plus(X_train,Y_train,X_test,alpha,fit_func)
            coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
            width = (PIs['upper'] - PIs['lower']).mean()
            results.loc[len(results)]=\
                [itrial,data_name,fit_func.__name__,'jack+',coverage,width,time.time()-start_time]
            # Run Naive jackknife+ and jackknife+aB at number sampled = 200 across all m values
            for m in m_vals:
                # Naive J+
                start_time=time.time()
                PIs = compute_PIs_jacknife_plus_naive(\
                            X_train,Y_train,X_test,alpha,fit_func,B_,m,'tmean')
                coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
                width = (PIs['upper'] - PIs['lower']).mean()
                results.loc[len(results)]=\
                    [itrial,data_name,fit_func.__name__,'jack+-naive (m='+str(m)+')',\
                            coverage,width,time.time()-start_time]
                # J+aB
                start_time=time.time()
                B = int(np.random.binomial(int(B_/((1-1./(1+train_size))**m*(1-1./train_size)**m)),(1-1./(1+train_size))**m,size=1))
                PIs = compute_PIs_jacknife_plus_after_bootstrap(\
                            X_train,Y_train,X_test,alpha,fit_func,B,m,'tmean')
                coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
                width = (PIs['upper'] - PIs['lower']).mean()
                results.loc[len(results)]=\
                    [itrial,data_name,fit_func.__name__,'jack+aB (m='+str(m)+')',\
                            coverage,width,time.time()-start_time]
                results.to_csv('new-jack+aB_realdata_results-tmean.csv',index=False)
Supplementary Experiments

Compare J+aB with Random $B$ and with Fixed $B'$, making sure $E(B)=B'$

In [0]:
# Initialize Parameters
alpha= 0.1
tot_trial = 10
train_size = 200 # training size n
m_vals = np.round(train_size*np.linspace(0.1,1,num=10)).astype(int)
B_ = 50 # number of bootstrap samples in J+aB is drawn as B ~ Binomial(int(B_/(1-1/(n+1))^m),(1-1/(n+1))^m)
In [0]:
# Run Mean, Random vs. Fixed J+aB
fit_funcs = [ridge2,neural_net,random_forest_nB]
Data_name = ['meps','communities','blog']
results = pd.DataFrame(columns = ['itrial','dataset','muh_fun','method','coverage','width','time'])
for data_name in Data_name:
    X = eval('X_'+data_name)
    Y = eval('Y_'+data_name)
    for ifunc in range(len(fit_funcs)):
        fit_func = fit_funcs[ifunc]
        print('running '+fit_func.__name__+' on '+data_name)
        for itrial in range(tot_trial):
            np.random.seed(98765+itrial)
            [X_train, X_test, Y_train, Y_test] = data_split(X, Y, train_size)
            # Run jackknife+aB, random B & non-random B
            for m in m_vals:
                # J+aB, random B
                start_time=time.time()
                B = int(np.random.binomial(int(B_/(1-1./(1+train_size))**m),(1-1./(1+train_size))**m,size=1))
                PIs = compute_PIs_jacknife_plus_after_bootstrap(\
                            X_train,Y_train,X_test,alpha,fit_func,B,m,'mean')
                coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
                width = (PIs['upper'] - PIs['lower']).mean()
                results.loc[len(results)]=\
                    [itrial,data_name,fit_func.__name__,'random-jack+aB (m='+str(m)+')',\
                            coverage,width,time.time()-start_time]
                results.to_csv('random_fixed-jack+aB_realdata_results-mean.csv',index=False)
                # J+aB, fixed B
                start_time=time.time()
                PIs = compute_PIs_jacknife_plus_after_bootstrap(\
                            X_train,Y_train,X_test,alpha,fit_func,B_,m,'mean')
                coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
                width = (PIs['upper'] - PIs['lower']).mean()
                results.loc[len(results)]=\
                    [itrial,data_name,fit_func.__name__,'fixed-jack+aB (m='+str(m)+')',\
                            coverage,width,time.time()-start_time]
                results.to_csv('random_fixed-jack+aB_realdata_results-mean.csv',index=False)
In [0]:
# Run Median, Random vs. Fixed J+aB
fit_funcs = [ridge2,neural_net,random_forest_nB]
Data_name = ['meps','communities','blog']
results = pd.DataFrame(columns = ['itrial','dataset','muh_fun','method','coverage','width','time'])
for data_name in Data_name:
    X = eval('X_'+data_name)
    Y = eval('Y_'+data_name)
    for ifunc in range(len(fit_funcs)):
        fit_func = fit_funcs[ifunc]
        print('running '+fit_func.__name__+' on '+data_name)
        for itrial in range(tot_trial):
            np.random.seed(98765+itrial)
            [X_train, X_test, Y_train, Y_test] = data_split(X, Y, train_size)
            # Run jackknife+aB, random B & non-random B
            for m in m_vals:
                # J+aB, random B
                start_time=time.time()
                B = int(np.random.binomial(int(B_/(1-1./(1+train_size))**m),(1-1./(1+train_size))**m,size=1))
                PIs = compute_PIs_jacknife_plus_after_bootstrap(\
                            X_train,Y_train,X_test,alpha,fit_func,B,m,'median')
                coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
                width = (PIs['upper'] - PIs['lower']).mean()
                results.loc[len(results)]=\
                    [itrial,data_name,fit_func.__name__,'random-jack+aB (m='+str(m)+')',\
                            coverage,width,time.time()-start_time]
                results.to_csv('random_fixed-jack+aB_realdata_results-median.csv',index=False)
                # J+aB, fixed B
                start_time=time.time()
                PIs = compute_PIs_jacknife_plus_after_bootstrap(\
                            X_train,Y_train,X_test,alpha,fit_func,B_,m,'median')
                coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
                width = (PIs['upper'] - PIs['lower']).mean()
                results.loc[len(results)]=\
                    [itrial,data_name,fit_func.__name__,'fixed-jack+aB (m='+str(m)+')',\
                            coverage,width,time.time()-start_time]
                results.to_csv('random_fixed-jack+aB_realdata_results-median.csv',index=False)
In [0]:
# Run Trimmed Mean, Random vs. Fixed J+aB
fit_funcs = [ridge2,neural_net,random_forest_nB]
Data_name = ['meps','communities','blog']
results = pd.DataFrame(columns = ['itrial','dataset','muh_fun','method','coverage','width','time'])
for data_name in Data_name:
    X = eval('X_'+data_name)
    Y = eval('Y_'+data_name)
    for ifunc in range(len(fit_funcs)):
        fit_func = fit_funcs[ifunc]
        print('running '+fit_func.__name__+' on '+data_name)
        for itrial in range(tot_trial):
            np.random.seed(98765+itrial)
            [X_train, X_test, Y_train, Y_test] = data_split(X, Y, train_size)
            # Run jackknife+aB, random B & non-random B
            for m in m_vals:
                # J+aB, random B
                start_time=time.time()
                B = int(np.random.binomial(int(B_/(1-1./(1+train_size))**m),(1-1./(1+train_size))**m,size=1))
                PIs = compute_PIs_jacknife_plus_after_bootstrap(\
                            X_train,Y_train,X_test,alpha,fit_func,B,m,'tmean')
                coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
                width = (PIs['upper'] - PIs['lower']).mean()
                results.loc[len(results)]=\
                    [itrial,data_name,fit_func.__name__,'random-jack+aB (m='+str(m)+')',\
                            coverage,width,time.time()-start_time]
                results.to_csv('random_fixed-jack+aB_realdata_results-tmean.csv',index=False)
                # J+aB, fixed B
                start_time=time.time()
                PIs = compute_PIs_jacknife_plus_after_bootstrap(\
                            X_train,Y_train,X_test,alpha,fit_func,B_,m,'tmean')
                coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
                width = (PIs['upper'] - PIs['lower']).mean()
                results.loc[len(results)]=\
                    [itrial,data_name,fit_func.__name__,'fixed-jack+aB (m='+str(m)+')',\
                            coverage,width,time.time()-start_time]
                results.to_csv('random_fixed-jack+aB_realdata_results-tmean.csv',index=False)

Plots & Table

Main Plot

Results are average coverage and width plots for meps dataset using random forest that subsamples features only.

In [0]:
results=pd.read_csv('new-jack+aB_realdata_results-mean.csv')
In [0]:
plt.rcParams.update({'font.size': 14})
fig, (ax1, ax2) = plt.subplots(ncols=2)
fig.set_size_inches([36,8])
m_vals=np.round(train_size*np.linspace(0.2,1,num=5)).astype(int)
Data=eval('X_meps')
d=Data.shape[1]
# Left Panel on Coverage
# Fast J+aB
coverage_AB_mean=[]
coverage_AB_SE=[]
for m in m_vals:
  m=int(m)
  coverage_AB_mean.append(results[(results['dataset']=='meps') &\
                                  (results['method']=='jack+aB (m='+str(m)+')') &\
                                  (results['muh_fun']=='random_forest_nB')]['coverage'].mean())
  coverage_AB_SE.append(results[(results['dataset']=='meps') &\
                                  (results['method']=='jack+aB (m='+str(m)+')') &\
                                  (results['muh_fun']=='random_forest_nB')]['coverage'].std()\
                              /np.sqrt(tot_trial))
coverage_AB_mean=np.array(coverage_AB_mean)
coverage_AB_SE=np.array(coverage_AB_SE)  
ax1.plot(m_vals/train_size,coverage_AB_mean,linestyle='-',marker='o',label='j+aB')
ax1.fill_between(m_vals/train_size,coverage_AB_mean-coverage_AB_SE,coverage_AB_mean+coverage_AB_SE,alpha=0.35)
# Naive J+aB
coverage_naive_mean=[]
coverage_naive_SE=[]
for m in m_vals:
  m=int(m)
  coverage_naive_mean.append(results[(results['dataset']=='meps') &\
                                  (results['method']=='jack+-naive (m='+str(m)+')') &\
                                  (results['muh_fun']=='random_forest_nB')]['coverage'].mean())
  coverage_naive_SE.append(results[(results['dataset']=='meps') &\
                                  (results['method']=='jack+-naive (m='+str(m)+')') &\
                                  (results['muh_fun']=='random_forest_nB')]['coverage'].std()\
                              /np.sqrt(tot_trial))
coverage_naive_mean=np.array(coverage_naive_mean)
coverage_naive_SE=np.array(coverage_naive_SE)  

ax1.plot(m_vals/train_size,coverage_naive_mean,linestyle='-',marker='o',label='j+ ensemble')
ax1.fill_between(m_vals/train_size,coverage_naive_mean-coverage_naive_SE,coverage_naive_mean+coverage_naive_SE,alpha=0.35)
# J+
coverage_mean=[]
coverage_SE=[]
for m in m_vals:
  m=int(m)
  coverage_mean.append(results[(results['dataset']=='meps') &\
                                  (results['method']=='jack+') &\
                                  (results['muh_fun']=='random_forest_nB')]['coverage'].mean())
  coverage_SE.append(results[(results['dataset']=='meps') &\
                                (results['method']=='jack+') &\
                                (results['muh_fun']=='random_forest_nB')]['coverage'].std()\
                              /np.sqrt(tot_trial))
coverage_mean=np.array(coverage_mean)
coverage_SE=np.array(coverage_SE)  
ax1.plot(m_vals/train_size,coverage_mean,linestyle='-',marker='o',label='j+ non-ensemble')
ax1.fill_between(m_vals/train_size,coverage_mean-coverage_SE,coverage_mean+coverage_SE,alpha=0.35)
ax1.axhline(0.9,linestyle='-.',color='black')
ax1.set_ylim(0.8,1)
ax1.legend(loc='upper right')

# Right Panel on Width
# J+aB
width_AB_mean=[]
width_AB_SE=[]
for m in m_vals:
  m=int(m)
  width_AB_mean.append(results[(results['dataset']=='meps') &\
                                (results['method']=='jack+aB (m='+str(m)+')') &\
                                (results['muh_fun']=='random_forest_nB')]['width'].mean())
  width_AB_SE.append(results[(results['dataset']=='meps') &\
                              (results['method']=='jack+aB (m='+str(m)+')') &\
                              (results['muh_fun']=='random_forest_nB')]['width'].std()\
                              /np.sqrt(tot_trial))
width_AB_mean=np.array(width_AB_mean)
width_AB_SE=np.array(width_AB_SE)                                                               
ax2.plot(m_vals/train_size,width_AB_mean,linestyle='-',marker='o')
ax2.fill_between(m_vals/train_size,width_AB_mean-width_AB_SE,width_AB_mean+width_AB_SE,alpha=0.35)
# Naive J+aB
width_naive_mean=[]
width_naive_SE=[]
for m in m_vals:
  m=int(m)
  width_naive_mean.append(results[(results['dataset']=='meps') &\
                                (results['method']=='jack+-naive (m='+str(m)+')') &\
                                (results['muh_fun']=='random_forest_nB')]['width'].mean())
  width_naive_SE.append(results[(results['dataset']=='meps') &\
                              (results['method']=='jack+-naive (m='+str(m)+')') &\
                              (results['muh_fun']=='random_forest_nB')]['width'].std()\
                              /np.sqrt(tot_trial))
width_naive_mean=np.array(width_naive_mean)
width_naive_SE=np.array(width_naive_SE)                                                               
ax2.plot(m_vals/train_size,width_naive_mean,linestyle='-',marker='o')
ax2.fill_between(m_vals/train_size,width_naive_mean-width_naive_SE,width_naive_mean+width_naive_SE,alpha=0.35)
# J+
width_mean=[]
width_SE=[]
for m in m_vals:
  m=int(m)
  width_mean.append(results[(results['dataset']=='meps') &\
                                  (results['method']=='jack+') &\
                                  (results['muh_fun']=='random_forest_nB')]['width'].mean())
  width_SE.append(results[(results['dataset']=='meps') &\
                                  (results['method']=='jack+') &\
                                  (results['muh_fun']=='random_forest_nB')]['width'].std()\
                              /np.sqrt(tot_trial))
width_mean=np.array(width_mean)
width_SE=np.array(width_SE)                                                               
ax2.plot(m_vals/train_size,width_mean,linestyle='-',marker='o')
ax2.fill_between(m_vals/train_size,width_mean-width_SE,width_mean+width_SE,alpha=0.35)
plt.tight_layout()
fig.savefig('Fig1_meps_random_forest_mean.pdf',dpi=300)
fig.show()
Table 1
In [0]:
# For J+aB
aggre=['mean','median','tmean']
tot_trial=5
m=int(0.6*train_size)
Data=eval('X_meps')
regressor = ['ridge2','random_forest_nB','neural_net']
for agg in aggre:
  print('Average Coverage under '+agg)
  results=pd.read_csv('new-jack+aB_realdata_results-'+agg+'.csv')
  for regr in regressor:
    cov_mean=results[(results['dataset']==data_name) &\
                                      (results['method']=='jack+aB (m='+str(int(m))+')') &\
                                      (results['muh_fun']==regr)]['coverage'].mean()
    cov_std=results[(results['dataset']==data_name) &\
                                      (results['method']=='jack+aB (m='+str(int(m))+')') &\
                                      (results['muh_fun']==regr)]['coverage'].std()/np.sqrt(tot_trial)                
    width_mean=results[(results['dataset']==data_name) &\
                                      (results['method']=='jack+aB (m='+str(int(m))+')') &\
                                      (results['muh_fun']==regr)]['width'].mean()
    width_std=results[(results['dataset']==data_name) &\
                                      (results['method']=='jack+aB (m='+str(int(m))+')') &\
                                      (results['muh_fun']==regr)]['width'].std()/np.sqrt(tot_trial)                
    print('J+AB with '+regr+' on '+data_name+' has mean coverage '+str(cov_mean))
    print('J+AB with '+regr+' on '+data_name+' has std coverage '+str(cov_std))
Average Width under mean
Fast J+AB with ridge2 on meps has mean coverage 0.885593811618383
Fast J+AB with ridge2 on meps has std coverage 0.007428846817870016
Fast J+AB with random_forest_nB on meps has mean coverage 0.9242833308053997
Fast J+AB with random_forest_nB on meps has std coverage 0.02125193766023532
Fast J+AB with neural_net on meps has mean coverage 0.8846304254019446
Fast J+AB with neural_net on meps has std coverage 0.013074163466836689
Average Width under median
Fast J+AB with ridge2 on meps has mean coverage 0.8985530107689973
Fast J+AB with ridge2 on meps has std coverage 0.0014798916646952807
Fast J+AB with random_forest_nB on meps has mean coverage 0.9287486728348247
Fast J+AB with random_forest_nB on meps has std coverage 0.019362187231175894
Fast J+AB with neural_net on meps has mean coverage 0.9050993477931139
Fast J+AB with neural_net on meps has std coverage 0.014305178427487866
Average Width under tmean
Fast J+AB with ridge2 on meps has mean coverage 0.8919278022144699
Fast J+AB with ridge2 on meps has std coverage 0.006287878276100417
Fast J+AB with random_forest_nB on meps has mean coverage 0.9250538449871076
Fast J+AB with random_forest_nB on meps has std coverage 0.01949682665856566
Fast J+AB with neural_net on meps has mean coverage 0.8932261489458515
Fast J+AB with neural_net on meps has std coverage 0.015732921245692977
Supplementary Table: Computing Time
In [0]:
# J+aB vs. J+ vs. Naive J+aB, with Neural Network on Meps
# All three aggregation methods considered
# By row
aggre=['mean','median','tmean']
regressor = ['ridge2','random_forest_nB','neural_net']
m=int(0.6*train_size)
Data=eval('X_meps')
d=Data.shape[1]
for regr in regressor:
  for agg in aggre:
    print('Computing time under '+regr+'&'+agg)
    results=pd.read_csv('new-jack+aB_realdata_results-'+agg+'.csv')
    # J+aB
    time_across_m_J_plusaB=[]
    time_across_m_J_plusaB.append(results[(results['dataset']=='meps') &\
                                  (results['method']=='jack+aB (m='+str(m)+')') &\
                                  (results['muh_fun']==regr)]['time'].mean())
    print('For J+aB')
    print('')
    print(time_across_m_J_plusaB[0])
    # J+
    time_across_m_J_plus=[]
    time_across_m_J_plus.append(results[(results['dataset']=='meps') &\
                                      (results['method']=='jack+') &\
                                      (results['muh_fun']==regr)]['time'].mean())
    print('For J+ Non-Ensemble')
    print('')
    print(time_across_m_J_plus[0])
    # Naive J+aB
    time_across_m_NaiveJ_plusaB=[]
    for m in m_vals:
      m=int(m)
      time_across_m_NaiveJ_plusaB.append(results[(results['dataset']=='meps') &\
                                    (results['method']=='jack+-naive (m='+str(m)+')') &\
                                    (results['muh_fun']==regr)]['time'].mean())
    print('For J+ Ensemble')
    print('')
    print(time_across_m_NaiveJ_plusaB[0])
Computing time under ridge2&mean
For J+aB

0.21921305656433104
For J+ Non-Ensemble

0.4001453876495361
For J+ Ensemble

1.9264767646789551
Computing time under ridge2&median
For J+aB

0.5860251426696778
For J+ Non-Ensemble

0.4321089267730713
For J+ Ensemble

2.710788536071777
Computing time under ridge2&tmean
For J+aB

0.5742483377456665
For J+ Non-Ensemble

0.44266324043273925
For J+ Ensemble

2.5758665800094604
Computing time under random_forest_nB&mean
For J+aB

6.032822394371033
For J+ Non-Ensemble

5.044797778129578
For J+ Ensemble

43.874479508399965
Computing time under random_forest_nB&median
For J+aB

6.236211562156678
For J+ Non-Ensemble

5.020432925224304
For J+ Ensemble

44.2888649225235
Computing time under random_forest_nB&tmean
For J+aB

6.837657117843628
For J+ Non-Ensemble

4.600383853912353
For J+ Ensemble

39.23002676963806
Computing time under neural_net&mean
For J+aB

16.82468900680542
For J+ Non-Ensemble

14.117470622062683
For J+ Ensemble

160.00048921108245
Computing time under neural_net&median
For J+aB

17.8035507440567
For J+ Non-Ensemble

14.961453938484192
For J+ Ensemble

169.30520374774932
Computing time under neural_net&tmean
For J+aB

17.480592608451843
For J+ Non-Ensemble

14.121620798110962
For J+ Ensemble

162.8417428255081
Supplementary Plots
  1. Baseline Regression & Aggregation Function combination
In [0]:
# For 2. Initialize Parameters
alpha= 0.1
tot_trial = 10
train_size = 40 # training size n
m_vals = np.round(train_size*np.linspace(0.2,1,num=5)).astype(int)
B_ = 20 # number of bootstrap samples in J+aB is drawn as B ~ Binomial(int(B_/(1-1/(n+1))^m),(1-1/(n+1))^m)
In [0]:
# For 2. Coverage
aggre=['mean','median','tmean']
Data_name = ['meps']
regressor = ['ridge2','random_forest_nB','neural_net']
for data_name in Data_name:
  plt.rcParams.update({'font.size': 14})
  fig, ax1 = plt.subplots(3,3,figsize=(12,12)) 
  Data=eval('X_'+data_name)
  d=Data.shape[1]
  j=0
  for agg in aggre:
    results=pd.read_csv('new-jack+aB_realdata_results-'+agg+'.csv')
    i=0
    for regr in regressor:
      # J+AB, coverage
      coverage_AB_mean=[]
      coverage_AB_SE=[]
      for m in m_vals:
        coverage_AB_mean.append(results[(results['dataset']==data_name) &\
                                        (results['method']=='jack+aB (m='+str(m)+')') &\
                                        (results['muh_fun']==regr)]['coverage'].mean())
        coverage_AB_SE.append(results[(results['dataset']==data_name) &\
                                      (results['method']=='jack+aB (m='+str(m)+')') &\
                                      (results['muh_fun']==regr)]['coverage'].std()\
                                    /np.sqrt(tot_trial))
      coverage_AB_mean=np.array(coverage_AB_mean)
      coverage_AB_SE=np.array(coverage_AB_SE)  
      if regr=='ridge2': # For legend
        if agg=='tmean':
          ax1[i,j].plot(m_vals/train_size,coverage_AB_mean,linestyle='-',marker='o',label='j+aB')
        else: 
          ax1[i,j].plot(m_vals/train_size,coverage_AB_mean,linestyle='-',marker='o')
      else:
        ax1[i,j].plot(m_vals/train_size,coverage_AB_mean,linestyle='-',marker='o')
      ax1[i,j].fill_between(m_vals/train_size,coverage_AB_mean-coverage_AB_SE,coverage_AB_mean+coverage_AB_SE,alpha=0.35)
      # Naive J+AB, coverage
      coverage_naive_mean=[]
      coverage_naive_SE=[]
      for m in m_vals:
        coverage_naive_mean.append(results[(results['dataset']==data_name) &\
                                        (results['method']=='jack+-naive (m='+str(m)+')') &\
                                        (results['muh_fun']==regr)]['coverage'].mean())
        coverage_naive_SE.append(results[(results['dataset']==data_name) &\
                                      (results['method']=='jack+-naive (m='+str(m)+')') &\
                                      (results['muh_fun']==regr)]['coverage'].std()\
                                    /np.sqrt(tot_trial))
      coverage_naive_mean=np.array(coverage_naive_mean)
      coverage_naive_SE=np.array(coverage_naive_SE)  
      if regr=='ridge2': # For legend
        if agg=='tmean':
          ax1[i,j].plot(m_vals/train_size,coverage_naive_mean,linestyle='-',marker='o',label='j+ ensemble')
        else: 
          ax1[i,j].plot(m_vals/train_size,coverage_naive_mean,linestyle='-',marker='o')
      else:
        ax1[i,j].plot(m_vals/train_size,coverage_naive_mean,linestyle='-',marker='o')
      ax1[i,j].fill_between(m_vals/train_size,coverage_naive_mean-coverage_naive_SE,coverage_naive_mean+coverage_naive_SE,alpha=0.35)
      # J+, coverage
      coverage_mean=[]
      coverage_SE=[]
      for m in m_vals:
        coverage_mean.append(results[(results['dataset']==data_name) &\
                                        (results['method']=='jack+') &\
                                        (results['muh_fun']==regr)]['coverage'].mean())
        coverage_SE.append(results[(results['dataset']==data_name) &\
                                      (results['method']=='jack+') &\
                                      (results['muh_fun']==regr)]['coverage'].std()\
                                    /np.sqrt(tot_trial))
      coverage_mean=np.array(coverage_mean)
      coverage_SE=np.array(coverage_SE)  
      if regr=='ridge2': # For legend
        if agg=='tmean':
          ax1[i,j].plot(m_vals/train_size,coverage_mean,linestyle='-',marker='o',label='j+ non-ensemble')
          ax1[i,j].legend(loc='upper right',fontsize='small')
        else: 
          ax1[i,j].plot(m_vals/train_size,coverage_mean,linestyle='-',marker='o')
      else:
        ax1[i,j].plot(m_vals/train_size,coverage_mean,linestyle='-',marker='o')
      ax1[i,j].fill_between(m_vals/train_size,coverage_mean-coverage_SE,coverage_mean+coverage_SE,alpha=0.35)
      # Other Features
      ax1[i,j].axhline(0.9,linestyle='-.',color='black')
      ax1[i,j].set_ylim(0.8,1)
      if agg=='mean': # For axis label
        if regr=='neural_net':
          ax1[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
          pass
        else:
          ax1[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
          ax1[i,j].xaxis.set_visible(False)
      else:
        if regr=='neural_net':
          ax1[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
          ax1[i,j].yaxis.set_visible(False)
        else: 
          ax1[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
          ax1[i,j].xaxis.set_visible(False)
          ax1[i,j].yaxis.set_visible(False)
      if i==0 and j==0: # Set Title and Label
        ax1[i,j].set_title('Mean',fontsize=14)
        ax1[i,j].set_ylabel('Ridge',fontsize=14)
      if i==1 and j==0:
        ax1[i,j].set_ylabel('RF',fontsize=14)
      if i==2 and j==0:
        ax1[i,j].set_ylabel('NN',fontsize=14)
      if i==0 and j==1:
        ax1[i,j].set_title('Median',fontsize=14)
      if i==0 and j==2:
        ax1[i,j].set_title('Trimmed Mean',fontsize=14)
      fig.tight_layout()
      i+=1
    j+=1
  fig.savefig('Supp2_Meps_Cov.pdf',dpi=300)
  fig.show()
In [0]:
# For 2. Width
aggre=['mean','median','tmean']
Data_name = ['meps']
regressor = ['ridge2','random_forest_nB','neural_net']
for data_name in Data_name:
  plt.rcParams.update({'font.size': 14})
  fig, ax2 = plt.subplots(3,3,figsize=(12,12)) 
  Data=eval('X_'+data_name)
  d=Data.shape[1]
  j=0
  for agg in aggre:
    results=pd.read_csv('new-jack+aB_realdata_results-'+agg+'.csv')
    i=0
    for regr in regressor:
      # Fast J+AB, width
      width_AB_mean=[]
      width_AB_SE=[]
      for m in m_vals:
        m=int(m)
        width_AB_mean.append(results[(results['dataset']==data_name) &\
                                        (results['method']=='jack+aB (m='+str(m)+')') &\
                                        (results['muh_fun']==regr)]['width'].mean())
        width_AB_SE.append(results[(results['dataset']==data_name) &\
                                      (results['method']=='jack+aB (m='+str(m)+')') &\
                                      (results['muh_fun']==regr)]['width'].std()\
                                    /np.sqrt(tot_trial))
      width_AB_mean=np.array(width_AB_mean)
      width_AB_SE=np.array(width_AB_SE)                                                               
      if regr=='ridge2': # For legend
        if agg == 'tmean':
          ax2[i,j].plot(m_vals/train_size,width_AB_mean,linestyle='-',marker='o',label='j+aB')
        else: 
          ax2[i,j].plot(m_vals/train_size,width_AB_mean,linestyle='-',marker='o')
      else:
        ax2[i,j].plot(m_vals/train_size,width_AB_mean,linestyle='-',marker='o')
      ax2[i,j].fill_between(m_vals/train_size,width_AB_mean-width_AB_SE,width_AB_mean+width_AB_SE,alpha=0.35)
      # Naive J+AB, width
      width_naive_mean=[]
      width_naive_SE=[]
      for m in m_vals:
        m=int(m)
        width_naive_mean.append(results[(results['dataset']==data_name) &\
                                        (results['method']=='jack+-naive (m='+str(m)+')') &\
                                        (results['muh_fun']==regr)]['width'].mean())
        width_naive_SE.append(results[(results['dataset']==data_name) &\
                                      (results['method']=='jack+-naive (m='+str(m)+')') &\
                                      (results['muh_fun']==regr)]['width'].std()\
                                    /np.sqrt(tot_trial))
      width_naive_mean=np.array(width_naive_mean)
      width_naive_SE=np.array(width_naive_SE)                                                               
      if regr=='ridge2': # For legend
        if agg == 'tmean':
          ax2[i,j].plot(m_vals/train_size,width_naive_mean,linestyle='-',marker='o',label='j+ ensemble')
        else: 
          ax2[i,j].plot(m_vals/train_size,width_naive_mean,linestyle='-',marker='o')
      else:
        ax2[i,j].plot(m_vals/train_size,width_naive_mean,linestyle='-',marker='o')
      ax2[i,j].fill_between(m_vals/train_size,width_naive_mean-width_naive_SE,width_naive_mean+width_naive_SE,alpha=0.35)
      # J+,width
      width_mean=[]
      width_SE=[]
      for m in m_vals:
        m=int(m)
        width_mean.append(results[(results['dataset']==data_name) &\
                                        (results['method']=='jack+') &\
                                        (results['muh_fun']==regr)]['width'].mean())
        width_SE.append(results[(results['dataset']==data_name) &\
                                      (results['method']=='jack+') &\
                                      (results['muh_fun']==regr)]['width'].std()\
                                    /np.sqrt(tot_trial))
      width_mean=np.array(width_mean)
      width_SE=np.array(width_SE)                                                               
      if regr=='ridge2': # For legend
        if agg == 'tmean':
          ax2[i,j].plot(m_vals/train_size,width_mean,linestyle='-',marker='o',label='j+ non-ensemble')
          ax2[i,j].legend(loc='upper right',fontsize='small')
        else: 
          ax2[i,j].plot(m_vals/train_size,width_mean,linestyle='-',marker='o')
      else:
        ax2[i,j].plot(m_vals/train_size,width_mean,linestyle='-',marker='o')
      ax2[i,j].fill_between(m_vals/train_size,width_mean-width_SE,width_mean+width_SE,alpha=0.35)
      # Maintain appropriate spacing
      if regr=='neural_net':
        ax2[i,j].set_ylim(np.min(0.5*(width_mean+width_AB_mean)-2*(width_SE+width_AB_SE)),
                    np.max(0.5*(width_mean+width_AB_mean)+2*(width_SE+width_AB_SE)))
        ax2[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
      else: 
        if regr=='ridge2':
          ax2[i,j].set_ylim(np.min(0.5*(width_mean+width_AB_mean)-2*(width_SE+width_AB_SE)),
                    np.max(0.5*(width_mean+width_AB_mean)+2*(width_SE+width_AB_SE)))
        ax2[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))    
        ax2[i,j].xaxis.set_visible(False)
      # Other Features
      if i==0 and j==0: # Set Title and Label
        ax2[i,j].set_title('Mean',fontsize=14)
        ax2[i,j].set_ylabel('Ridge',fontsize=14)
      if i==1 and j==0:
        ax2[i,j].set_ylabel('RF',fontsize=14)
      if i==2 and j==0:
        ax2[i,j].set_ylabel('NN',fontsize=14)
      if i==0 and j==1:
        ax2[i,j].set_title('Median',fontsize=14)
      if i==0 and j==2:
        ax2[i,j].set_title('Trimmed Mean',fontsize=14)
      fig.tight_layout()
      i+=1
    j+=1
  fig.savefig('Supp2_Meps_Width.pdf',dpi=300)
  fig.show()
Supplementary Plots
  1. Random vs. Fixed $B$ for J+aB
In [0]:
# For 1. Initialize Parameters
alpha= 0.1
tot_trial = 10
train_size = 200 # training size n
m_vals = np.round(train_size*np.linspace(0.1,1,num=10)).astype(int)
B_ = 50 # number of bootstrap samples in J+aB is drawn as B ~ Binomial(int(B_/(1-1/(n+1))^m),(1-1/(n+1))^m)
In [0]:
# For 1. Coverage
aggre=['mean','median','tmean']
Data_name = ['meps','communities','blog']
regressor = ['ridge2','random_forest_nB','neural_net']
for data_name in Data_name:
  plt.rcParams.update({'font.size': 14})
  fig, ax1 = plt.subplots(3,3,figsize=(12,12)) 
  m_vals = np.round(train_size*np.linspace(0.1,1,num=10)).astype(int)
  Data=eval('X_'+data_name)
  d=Data.shape[1]
  j=0
  for agg in aggre:
    results=pd.read_csv('random_fixed-jack+aB_realdata_results-'+agg+'.csv')
    i=0
    for regr in regressor:
      # Random J+AB, coverage
      coverage_AB_mean=[]
      coverage_AB_SE=[]
      for m in m_vals:
        coverage_AB_mean.append(results[(results['dataset']==data_name) &\
                                        (results['method']=='random-jack+aB (m='+str(m)+')') &\
                                        (results['muh_fun']==regr)]['coverage'].mean())
        coverage_AB_SE.append(results[(results['dataset']==data_name) &\
                                      (results['method']=='random-jack+aB (m='+str(m)+')') &\
                                      (results['muh_fun']==regr)]['coverage'].std()\
                                    /np.sqrt(tot_trial))
      coverage_AB_mean=np.array(coverage_AB_mean)
      coverage_AB_SE=np.array(coverage_AB_SE)  
      if regr=='ridge2': # For legend
        if agg=='tmean':
          ax1[i,j].plot(m_vals/train_size,coverage_AB_mean,linestyle='-',marker='o',label='j+aB random B')
        else: 
          ax1[i,j].plot(m_vals/train_size,coverage_AB_mean,linestyle='-',marker='o')
      else:
        ax1[i,j].plot(m_vals/train_size,coverage_AB_mean,linestyle='-',marker='o')
      ax1[i,j].fill_between(m_vals/train_size,coverage_AB_mean-coverage_AB_SE,coverage_AB_mean+coverage_AB_SE,alpha=0.35)
      # Fixed J+AB, coverage
      coverage_AB_mean=[]
      coverage_AB_SE=[]
      for m in m_vals:
        coverage_AB_mean.append(results[(results['dataset']==data_name) &\
                                        (results['method']=='fixed-jack+aB (m='+str(m)+')') &\
                                        (results['muh_fun']==regr)]['coverage'].mean())
        coverage_AB_SE.append(results[(results['dataset']==data_name) &\
                                      (results['method']=='fixed-jack+aB (m='+str(m)+')') &\
                                      (results['muh_fun']==regr)]['coverage'].std()\
                                    /np.sqrt(tot_trial))
      coverage_AB_mean=np.array(coverage_AB_mean)
      coverage_AB_SE=np.array(coverage_AB_SE)  
      if regr=='ridge2': # For legend
        if agg=='tmean':
          ax1[i,j].plot(m_vals/train_size,coverage_AB_mean,linestyle='-',marker='o',label='j+aB fixed B')
          ax1[i,j].legend(loc='upper right',fontsize='small')
        else: 
          ax1[i,j].plot(m_vals/train_size,coverage_AB_mean,linestyle='-',marker='o')
      else:
        ax1[i,j].plot(m_vals/train_size,coverage_AB_mean,linestyle='-',marker='o')
      ax1[i,j].fill_between(m_vals/train_size,coverage_AB_mean-coverage_AB_SE,coverage_AB_mean+coverage_AB_SE,alpha=0.35)
      # Other Features
      ax1[i,j].axhline(0.9,linestyle='-.',color='black')
      ax1[i,j].set_ylim(0.8,1)
      if agg=='mean': # For axis label
        if regr=='neural_net':
          ax1[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
          pass
        else:
          ax1[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
          ax1[i,j].xaxis.set_visible(False)
      else:
        if regr=='neural_net':
          ax1[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
          ax1[i,j].yaxis.set_visible(False)
        else: 
          ax1[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
          ax1[i,j].xaxis.set_visible(False)
          ax1[i,j].yaxis.set_visible(False)
      if i==0 and j==0: # Set Title and Label
        ax1[i,j].set_title('Mean',fontsize=14)
        ax1[i,j].set_ylabel('Ridge',fontsize=14)
      if i==1 and j==0:
        ax1[i,j].set_ylabel('RF',fontsize=14)
      if i==2 and j==0:
        ax1[i,j].set_ylabel('NN',fontsize=14)
      if i==0 and j==1:
        ax1[i,j].set_title('Median',fontsize=14)
      if i==0 and j==2:
        ax1[i,j].set_title('Trimmed Mean',fontsize=14)
      fig.tight_layout()
      i+=1
    j+=1
  fig.savefig('Supp1_'+data_name+'_Cov.pdf',dpi=300)
  fig.show()
In [0]:
# For 1. Width
aggre=['mean','median','tmean']
Data_name = ['meps','communities','blog']
regressor = ['ridge2','random_forest_nB','neural_net']
for data_name in Data_name:
  plt.rcParams.update({'font.size': 14})
  fig, ax2 = plt.subplots(3,3,figsize=(12,12)) 
  Data=eval('X_'+data_name)
  d=Data.shape[1]
  j=0
  for agg in aggre:
    results=pd.read_csv('random_fixed-jack+aB_realdata_results-'+agg+'.csv')
    i=0
    for regr in regressor:
      # Random J+AB, width
      width_AB_mean=[]
      width_AB_SE=[]
      for m in m_vals:
        m=int(m)
        width_AB_mean.append(results[(results['dataset']==data_name) &\
                                        (results['method']=='random-jack+aB (m='+str(m)+')') &\
                                        (results['muh_fun']==regr)]['width'].mean())
        width_AB_SE.append(results[(results['dataset']==data_name) &\
                                      (results['method']=='random-jack+aB (m='+str(m)+')') &\
                                      (results['muh_fun']==regr)]['width'].std()\
                                    /np.sqrt(tot_trial))
      width_AB_mean=np.array(width_AB_mean)
      width_AB_SE=np.array(width_AB_SE)                                                               
      if regr=='ridge2': # For legend
        if agg == 'tmean':
          ax2[i,j].plot(m_vals/train_size,width_AB_mean,linestyle='-',marker='o',label='j+aB random B')
        else: 
          ax2[i,j].plot(m_vals/train_size,width_AB_mean,linestyle='-',marker='o')
      else:
        ax2[i,j].plot(m_vals/train_size,width_AB_mean,linestyle='-',marker='o')
      ax2[i,j].fill_between(m_vals/train_size,width_AB_mean-width_AB_SE,width_AB_mean+width_AB_SE,alpha=0.35)
      # Fixed J+AB, width
      width_AB_mean=[]
      width_AB_SE=[]
      for m in m_vals:
        m=int(m)
        width_AB_mean.append(results[(results['dataset']==data_name) &\
                                        (results['method']=='fixed-jack+aB (m='+str(m)+')') &\
                                        (results['muh_fun']==regr)]['width'].mean())
        width_AB_SE.append(results[(results['dataset']==data_name) &\
                                      (results['method']=='fixed-jack+aB (m='+str(m)+')') &\
                                      (results['muh_fun']==regr)]['width'].std()\
                                    /np.sqrt(tot_trial))
      width_AB_mean=np.array(width_AB_mean)
      width_AB_SE=np.array(width_AB_SE)                                                               
      if regr=='ridge2': # For legend
        if agg == 'tmean':
          ax2[i,j].plot(m_vals/train_size,width_AB_mean,linestyle='-',marker='o',label='j+aB fixed B')
          ax2[i,j].legend(loc='upper right',fontsize='small')
        else: 
          ax2[i,j].plot(m_vals/train_size,width_AB_mean,linestyle='-',marker='o')
      else:
        ax2[i,j].plot(m_vals/train_size,width_AB_mean,linestyle='-',marker='o')
      ax2[i,j].fill_between(m_vals/train_size,width_AB_mean-width_AB_SE,width_AB_mean+width_AB_SE,alpha=0.35)
      # Maintain appropriate spacing
      if regr=='neural_net':
        ax2[i,j].set_ylim(np.min(width_AB_mean-3*width_AB_SE),
                    np.max(width_AB_mean+3*width_AB_SE))
        ax2[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
      else: 
        ax2[i,j].set_ylim(np.min(width_AB_mean-3*width_AB_SE),
                  np.max(width_AB_mean+3*width_AB_SE))
        ax2[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))    
        ax2[i,j].xaxis.set_visible(False)
      # Other Features
      if i==0 and j==0: # Set Title and Label
        ax2[i,j].set_title('Mean',fontsize=14)
        ax2[i,j].set_ylabel('Ridge',fontsize=14)
      if i==1 and j==0:
        ax2[i,j].set_ylabel('RF',fontsize=14)
      if i==2 and j==0:
        ax2[i,j].set_ylabel('NN',fontsize=14)
      if i==0 and j==1:
        ax2[i,j].set_title('Median',fontsize=14)
      if i==0 and j==2:
        ax2[i,j].set_title('Trimmed Mean',fontsize=14)
      fig.tight_layout()
      i+=1
    j+=1
  fig.savefig('Supp1_'+data_name+'_Width.pdf',dpi=300)
  fig.show()