% optimize_additive_model:
%
% The simplest setup for combining observational and experimental data.
% Given experimental data {A, Y_exp} (design-assignment matrix and observed
% outcomes), fit a GP using marginal maximum likelihood.
%
% Input:
%
% - A: matrix mapping experimental units to design points
% - Y_exp: interventional data
% - X_space: design space
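% - do_params_X_space: observational fit (struct with fields K and mu_do,
%                      evaluated on X_space). If empty, ignore observational
%                      prior.
%                      NOTE: the empty case is not handled by the current
%                      implementation, which reads K and mu_do unconditionally.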
%
% Output:
%
% - mean_pred, cov_pred: mean and covariance of the predictive distribution,
%                        evaluated at the design space
% - theta_hat: corresponding fitted hyperparameters

function [mean_pred, cov_pred, theta_hat] = optimize_additive_model(A, Y_exp, X_space, do_params_X_space)
% Fits three log-scale hyperparameters with fminunc and returns the resulting
% Gaussian posterior (mean and covariance) over the design space.
%
%   A                  - matrix multiplying design-space vectors to give one
%                        value per experimental unit (used as A * mu_prior).
%   Y_exp              - experimental outcomes; assumed to be a column vector
%                        with one entry per row of A -- TODO confirm.
%   X_space            - design points; the indexing below assumes a column
%                        vector -- TODO confirm against callers.
%   do_params_X_space  - struct with fields K (prior covariance on X_space)
%                        and mu_do (prior mean on X_space).
%
%   mean_pred, cov_pred - posterior mean and covariance at the design points.
%   theta_hat           - fitted [log(sf2); log(ell); log(v_y)].
%
% NOTE(review): the file header says an empty do_params_X_space should disable
% the observational prior; that case is not implemented here (the fields K and
% mu_do are read unconditionally).

num_X_space = length(X_space);
K_prior = do_params_X_space.K + get_noise_matrix(num_X_space);
mu_prior = do_params_X_space.mu_do;
YY = (Y_exp - A * mu_prior) * (Y_exp - A * mu_prior)';

%% Prepare prior information

% Prior standard deviations rescaled so the largest equals 1. The rescaling
% is skipped when the largest is exactly 0, which would otherwise give 0/0 = NaN.
sd = sqrt(diag(do_params_X_space.K));
sd_max = max(sd);
if sd_max ~= 0
    sd = sd / sd_max;
end
SD2 = sd * sd';

% Design points shifted to start at 0 and rescaled to [0, 1]. A zero range
% (single or identical design points) is left at 0 instead of producing NaN.
X1 = X_space - min(X_space);
X1_max = max(X1);
if X1_max ~= 0
    X1 = X1 / X1_max;
end

% Prior mean treated the same way. A constant prior mean has zero range and
% then contributes no distance between design points.
Y1 = do_params_X_space.mu_do - min(do_params_X_space.mu_do);
Y1_max = max(Y1);
if Y1_max ~= 0
    Y1 = Y1 / Y1_max;
end

% Pairwise squared distances between design points in the normalized
% (X, prior mean) plane; column replication builds the difference matrices.
core = (X1(:, ones(num_X_space, 1)) - X1(:, ones(num_X_space, 1))').^2 + ...
       (Y1(:, ones(num_X_space, 1)) - Y1(:, ones(num_X_space, 1))').^2 ;

%% Optimize

% Start at sf2 = 1, ell = 1 and v_y equal to the sample variance of the
% residuals about the prior mean (all parameters are on the log scale).
theta_0 = [0; 0; log(var(Y_exp - A * mu_prior))];
options = optimset('Display', 'none', 'LargeScale', 'Off');
theta_hat = fminunc(@(theta)optimize_additive_model_f(theta, YY, K_prior, SD2, core, A), theta_0, options);

sf2 = exp(theta_hat(1));
ell = exp(theta_hat(2));
v_y = exp(theta_hat(3));

% Prior covariance plus an sf2-scaled squared-exponential term weighted
% elementwise by SD2.
K_f = K_prior + sf2 * SD2 .* exp(-0.5 * core / ell);

%% Posterior over the design space

% Precision-form update: the A'*A / v_y term comes from the experimental data
% and inv(K_f) from the prior.
prior_meancov_f_obs = K_f \ mu_prior;
inv_cov_f = (A' * A) / v_y + inv(K_f);
mean_pred = inv_cov_f \ (A' * Y_exp / v_y + prior_meancov_f_obs);
cov_pred = inv(inv_cov_f);
