% observational_learning:
%
% This function learns the mapping between a treatment x and an outcome y,
% adjusting for covariates Z. This boils down to learning the
% hyperparameters of a Gaussian process.
%
% Input:
%
% - dat: the data matrix;
% - x, y: indices of treatment and outcome;
% - Z: array of indices of adjustment covariates;
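% - num_iter: number of iterations for hyperparameter fitting;
% - verbose: (optional, default false) if true, the optimizer's output is
%   displayed; otherwise it is suppressed.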
%
% Output:
%
% - prior_info: information necessary to build a Gaussian process prior out
%   of the given observational data;
% - x_values: treatment (dose) values at which the effect is assessed;
% - y_hat: corresponding expected outcomes.

function [prior_info, x_values, y_hat] = observational_learning(dat, x, y, Z, num_iter, verbose) %#ok<*INUSD>
% Fits a Gaussian process regression of dat(:, y) on dat(:, [x Z]) and
% packages the quantities needed to build a prior from the fit.
%
% Inputs:
%   dat      - data matrix, one row per observation;
%   x, y     - column indices of treatment and outcome in dat;
%   Z        - vector (row or column) of column indices of the adjustment
%              covariates;
%   num_iter - iteration budget passed (negated) to the optimizer;
%   verbose  - optional, default false; when false the optimizer's console
%              output is captured and discarded.
%
% Outputs:
%   prior_info - struct with fields chol_K, K_w, X_hyp.lik, XZ_hyp,
%                XZ_covfunc, XZ_likfunc, mean_a, mean_b;
%   x_values, y_hat - as returned by build_effect.

if nargin < 6 || isempty(verbose), verbose = false; end

% Force Z into a row so that [x Z] concatenates regardless of whether the
% caller supplied a row or a column vector of indices.
Z = Z(:).';
p = numel(Z);

% Slice the regression inputs and targets once; they are reused for the
% hyperparameter fit and for the kernel matrix below.
inputs = dat(:, [x Z]);
targets = dat(:, y);
n = size(inputs, 1);

% Fit a GP model with ARD kernel

likfunc = @likGauss;
XZ_covfunc = {@covMaternard, 3};
% One length-scale per input column (p + 1 of them) plus one signal scale.
hyp.cov = zeros(p + 2, 1);
hyp.lik = log(0.1);
if verbose
  hyp2 = minimize(hyp, @gp, -num_iter, @infExact, [], XZ_covfunc, likfunc, inputs, targets);
else
  % evalc captures everything the optimizer prints; the string is evaluated
  % in this function's workspace, so the local variables above are visible.
  [~, hyp2] = evalc('minimize(hyp, @gp, -num_iter, @infExact, [], XZ_covfunc, likfunc, inputs, targets)');
end

% Generate statistics relevant for experimental learning

% Kernel matrix of the training inputs plus the noise term on the diagonal
% (hyp2.lik is exponentiated and squared to obtain that term).
K = feval(XZ_covfunc{:}, hyp2.cov, inputs) + eye(n) * exp(hyp2.lik)^2;

% Factor K once (upper-triangular R with R' * R = K) and reuse the factor
% for the solve K_w = K \ targets instead of factorizing K a second time.
chol_K = chol(K);
K_w = chol_K \ (chol_K' \ targets);

[x_values, y_hat] = build_effect(XZ_covfunc, hyp2, dat, x, y, Z);

% Return information

prior_info.chol_K = chol_K;
prior_info.K_w = K_w;

% Variance of the residuals between observed outcomes and y_hat.
% NOTE(review): this requires y_hat to be size-compatible with dat(:, y);
% confirm against build_effect.
prior_info.X_hyp.lik = var(targets - y_hat);
prior_info.XZ_hyp = hyp2;
prior_info.XZ_covfunc = XZ_covfunc;
prior_info.XZ_likfunc = likfunc;
% NOTE(review): mean_a / mean_b look like default slope and intercept of a
% mean transformation used downstream -- confirm with the consumer.
prior_info.mean_a = 1;
prior_info.mean_b = 0;
