############################################################################################
# app_witness_demo.R
#
# A demonstration on how to use WPP. This demo can be used as a template in other 
# applications. RECALL THAT THE CURRENT CODE SUPPORTS ONLY SMALL DIMENSIONAL PROBLEMS 
# (about a dozen variables or slightly more).
#
# The application is the influenza data discussed at several papers, including
#
# ** Hirano, Imbens, Rubin and Zhou (2000). "Assessing the effect of an influenza
#    vaccine in an encouragement design". Biostatistics, 1, 1, pp. 69-88.
#
# We do not provide the data here. Please contact us for more information.
#
# Our goal here is to estimate bounds on the ACE 
# P(Y = 1 | do(X = 1)) - P(Y = 1 | do(X = 0)) where Y = 1 indicates the patient
# has been hospitalized with flu symptoms, and X = 1 that the patient vaccinated
# against flu (hence, negative ACE values indicate that the vaccination appears to be
# effective).
#
# Code by
#
#  - Ricardo Silva (ricardo@stats.ucl.ac.uk)
#  - Robin Evans (robin.evans@stats.ox.ac.uk)
#
# Current version: 15/08/2014
# First version: 20/05/2014

source("witness.R")

library(ggplot2)

run_WPP <- FALSE # A flag that if FALSE doesn't re-run the WPP fitting procedure, instead
                 # loads model from a snapshot file

app_data <- read.table("dat/flu_data_tsr.dat", header = T, row.names = NULL)
cat("Data loaded, number of variables =", ncol(app_data), "\n")
var_names <- colnames(app_data)
app_data <- as.matrix(app_data)
app_data[which(app_data[, 4]  < 60), 4] <- 0 # "Age" is in column 4, we dichotomize it
app_data[which(app_data[, 4] >= 60), 4] <- 1


######################################################################################################
# Do analysis with Witness Protection Program

problem <- list(data = app_data, X_idx = 2, Y_idx = 3, counts = dtoc(app_data))
num_monte_carlo <- 5000
epsilons <- c(0.2, 0.2, 0.2, 0.2, 0.9, 1.1)  

if (run_WPP) {
    
  v <- witness_bound_search(problem, epsilons, num_monte_carlo, verbose = TRUE) 
  s <- witness_summarize_bounds(v, problem, epsilons, verbose = TRUE, taboo_vars = 1)
  s_with_GRP <- witness_summarize_bounds(v, problem, epsilons, verbose = TRUE)
  
  save(v, s, s_with_GRP, file = "influenza.RData")
  
} else {
  
  cat("Skipping model fitting, loading bounds from file\n")
  load("influenza.RData")
  
}

cat("WPP bounds:", s$highest_bound, "\n")

######################################################################################################
# Backdoor-analysis

bd_ace <- rep(0, length(v$w_list$witness))
bd_odds <- rep(0, length(v$w_list$witness))
for (i in 1:length(v$w_list$witness)) {
  effects <- bindag_causal_effect_backdoor_S(problem, v$w_list$Z[[i]])
  bd_ace[i] <- effects$ACE
  bd_odds[i] <- effects$odds
}
bd_ace <- sort(bd_ace)
bd_odds <- sort(bd_odds)
hist(bd_ace, main = "Back-door results")
cat("Back-door results:")
for (i in 1:length(bd_odds)) {
  cat(paste("[", i, "]: ", round(bd_ace[i] * 1000) / 1000, "::", round(bd_odds[i] * 1000) / 1000, "\n", sep = ""))
}

######################################################################################################
# Intention-to-treat analysis

ITT_m <- matrix(rep(0, 4), ncol = 2)
naive_m <- matrix(rep(0, 4), ncol = 2)
for (i in 0:1)
  for (j in 0:1) {
    ITT_m[i + 1, j + 1] <- mean((app_data[, 1] == i) * (app_data[, 3] == j))
    naive_m[i + 1, j + 1] <- mean((app_data[, 2] == i) * (app_data[, 3] == j))
  }
ITT <- ITT_m[2, 2] / (ITT_m[2, 1] + ITT_m[2, 2]) - ITT_m[1, 2] / (ITT_m[1, 1] + ITT_m[1, 2])
ITT_perc <- round(ITT / (ITT_m[1, 2] / (ITT_m[1, 1] + ITT_m[1, 2])) * 100) / 100
naive_1 <- naive_m[2, 2] / (naive_m[2, 1] + naive_m[2, 2]) - naive_m[1, 2] / (naive_m[1, 1] + naive_m[1, 2])
naive_1_perc <- naive_1 / (naive_m[1, 2] / (naive_m[1, 1] + naive_m[1, 2]))
cat("ITT ACE: ", ITT, " [odds ratio: ", ITT_perc, "]\n", sep = "")
cat("Naive ACE (1): ", naive_1, " [odds ratio: ", naive_1_perc, "]\n", sep = "")

######################################################################################################
# Do analysis assuming standard IV model {W -> X, X -> Y, X <- U -> Y}

P_ZETA <- rep(0, 8)
for (i in 0:1)
  for (j in 0:1)
    for (w in 0:1)
      P_ZETA[4 * w + i * 2 + j + 1] <- 
            sum((app_data[, 3] == i) * (app_data[, 2] == j) * (app_data[, 1] == w)) / sum(app_data[, 1] == w)
ACE_analytical_iv <- analytical_iv(P_ZETA)
cat("Standard IV bounds:", ACE_analytical_iv$bottom, ACE_analytical_iv$upper, "\n")

#######################################################################################################
# Plot results

bounds_posterior <- witness_generate_WZ_posterior(problem, epsilons, s$chosen_w, s$chosen_Z, num_monte_carlo, verbose = TRUE, numerical_method = TRUE)
bounds_dat <- data.frame(lower = bounds_posterior[, 1], upper = bounds_posterior[, 2])
cat("corr bounds:", cor(bounds_dat[which(!is.na(rowSums(bounds_dat))), ]), "\n")

ggplot(bounds_dat, aes(x = lower, y = upper)) + geom_point(shape = 1) +
  ggtitle("Posterior distribution") + xlab("Lower bound") + ylab("Upper bound") +
  theme(title = element_text(size = 20), axis.text.x = element_text(size = 18)) 

lower_bound <- data.frame(effect = bounds_posterior[, 1])
upper_bound <- data.frame(effect = bounds_posterior[, 2])
lower_bound$type <- 'lower'
upper_bound$type <- 'upper'
causal_effects <- rbind(lower_bound, upper_bound)
x11()
ggplot(causal_effects, aes(effect, fill = type)) + geom_density(alpha = 0.2) + xlim(-0.3, 0.3) +
  ggtitle(sprintf("Marginal Posterior Distribution (means: [%1.2f, %1.2f])", mean(bounds_posterior[ ,1], na.rm = T), mean(bounds_posterior[ ,2], na.rm = T))) +
  theme(legend.position = "none")

#######################################################################################################
# Analysis of plausible epsilons.

m_prior <- c(0.2, 0.2, 0.95)
v_prior <- c(Inf, Inf, Inf) #c(0.1, 0.1, 0.05) as an alternative
M <- 10000

an_eps <- witness_infer_eps(problem, s$chosen_w, s$chosen_Z, v$w_list$Z, m_prior, v_prior, M, taboo_vars = 1)
  
eps_w <- mean(an_eps$eps[1, ])  
eps_xy <- mean(an_eps$eps[2, ])  
eps_beta <- mean(an_eps$eps[3, ])  
epsilons_an <- c(eps_w, eps_xy, eps_xy, eps_xy, eps_beta, 1 / eps_beta)
bounds <- witness_simple_bound(problem, epsilons_an, s$chosen_w, s$chosen_Z)

eps_dat <- data.frame(iter = 1:M, w = an_eps$eps[1,], xy = an_eps$eps[2,], beta = an_eps$eps[3,])
eps_ACEs <- data.frame(ACEs = rnorm(10000000, mean = mean(an_eps$ACEs), sd = sd(an_eps$ACEs)))

ggplot(eps_dat, aes(x = iter, y = w)) + geom_line(colour = "red") + 
       geom_line(data = eps_dat, aes(y = xy), colour = "blue") +
       geom_line(data = eps_dat, aes(y = beta), colour = "black") +
       ggtitle("Relaxation parameters") + xlab("Iteration") + ylab("Samples") +
       theme(title = element_text(size = 20), axis.text.x = element_text(size = 18)) 

relax_param1 <- data.frame(param = eps_dat$w);    relax_param1$type <- 'W'
relax_param2 <- data.frame(param = eps_dat$xy);   relax_param2$type <- 'XY'
relax_param3 <- data.frame(param = eps_dat$beta); relax_param3$type <- 'beta'
joint_relax_params <- rbind(relax_param1, relax_param2, relax_param3)

x11()
ggplot(joint_relax_params, aes(param, fill = type)) + geom_density(alpha = 0.2) + xlim(0, 1) +
  ggtitle(sprintf("Posterior Distributions, Relaxation Parameters"))

x11()
ggplot(eps_ACEs, aes(ACEs)) + geom_density(alpha = 0.2, fill = "red") +
  theme(title = element_text(size = 20), axis.text.x = element_text(size = 18)) + 
  ggtitle("Density of Candidate ACEs")

epsilons_adjusted <- c(eps_w, eps_xy, eps_xy, eps_xy, eps_beta, 1 / eps_beta)
bounds_posterior_adjusted <- witness_generate_WZ_posterior(problem, epsilons_adjusted, s$chosen_w, s$chosen_Z, num_monte_carlo, verbose = TRUE, numerical_method = TRUE)
cat("Bounds adjusted: [", mean(bounds_posterior_adjusted), ", ", mean(bounds_posterior_adjusted[, 2]), "]\n", sep = "")

