Task
The task in this assessment is to solve the same learning problem as in the previous assessment, but this time using a quasi-Newton method with the BFGS update formula (Algorithm 6.1 in the book) together with a line search algorithm. For background on the problem, see Assessment 4.
The template contains most of the necessary functionality to solve the problem. Your task is to complete the backtracking line search implementation (Algorithm 3.1 in the book) and implement the missing parts of Algorithm 6.1 (the BFGS update formula). The places where your input is needed are marked with TODO.
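For reference, the missing pieces correspond to the following formulas from Nocedal & Wright, written here in MATLAB-style notation. Algorithm 6.1 updates the inverse Hessian approximation Hk as
Hk_next = (I - rhok*sk*yk') * Hk * (I - rhok*yk*sk') + rhok*(sk*sk'),
with sk the step taken, yk the change in gradient, and rhok = 1/(yk'*sk). The backtracking line search of Algorithm 3.1 starts from a full step alpha = 1 and shrinks alpha by the factor rho until the sufficient decrease (Armijo) condition
E(theta_k + alpha*pk) <= E(theta_k) + c*alpha*(dEdtheta'*pk)
is satisfied.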
Solution
Given this original template script:
x = 0:0.2:2*pi;
rng('default');
y = .1*(x + cos(x) + 1) + .1*rand(size(x));
N = size(x,2); % Number of data points
n_theta = 9;
% NN parameters
alpha = .2;
n_iter = 200;
% Allocate storage
ES = zeros(1,n_iter+1);
theta = zeros(n_theta,n_iter+1);
% initial value for parameters
theta(:,1) = zeros(n_theta,1);
% Initial value of E (objective function)
ES(1) = objfun(theta(:,1),x,y);
% function handle for objective function (for line search)
E = @(theta) objfun(theta,x,y);
% Initial BFGS matrix
I = eye(n_theta);
Hk = I;
% Initial gradient
dEdtheta = objfun_gradient(theta(:,1),x,y);
% Tolerance for convergence
tol = 1e-10;
k = 0; % Iteration counter
exit = 0; % Termination flag
while not(exit),
k = k+1;
% BFGS search direction
pk = - Hk*dEdtheta;
% Linesearch update of theta -- TODO: You should update the linesearch function (see below)
alpha = backtrackingLS(E,ES(k),theta(:,k),dEdtheta,pk);
theta(:,k+1) = theta(:,k) + alpha*pk;
% Calculate next Hk -- TODO: Update according to BFGS formula
dEdtheta_new = objfun_gradient(theta(:,k+1),x,y);
sk = ...
yk = ...
rhok = ...
if rhok > 0, % Do not update Hk if curvature condition is not fulfilled
Hk = ...
end
% The reason for the if-clause is that we use a simple linesearch that
% does not ensure that the curvature condition is fulfilled.
% Next objective function value
ES(k+1) = objfun(theta(:,k+1),x,y);
% Assign for next iteration
dEdtheta = dEdtheta_new;
% Termination criteria
exit = (abs(ES(k+1)-ES(k))<tol) | (k>=n_iter);
end
theta_opt = theta(:,k+1);
figure(1)
plot(1:k+1,ES(1:k+1));
xlabel('Iteration'); ylabel('E');
figure(2);
plot(x,y); hold on;
plot(x,f_NN(x,theta(:,k+1)),'c--'); hold off
xlabel('x'); ylabel('y');
function alpha = backtrackingLS(f,f_val,xk,df,pk)
%
% backtracking line search (Algorithm 3.1 in N&W)
%
% f handle for objective function
% f_val objective function at xk
% xk current iterate
% df current gradient
% pk search direction
alpha = 1; % Initial guess at step length
rho = .9; % < 1 reduction factor of alpha
c = 1e-4; % Sufficient decrease constant
% TODO: fill in clause for while loop.
while ...,
alpha = rho*alpha;
if alpha < 1e-8,
error('alpha too small -- something is wrong');
end
end
end
function E = objfun(theta,x,y)
N = size(x,2);
E = 0;
for i = 1:N,
E = E + .5* (y(i) - f_NN(x(i),theta) )^2;
end
end
function dEdtheta = objfun_gradient(theta,x,y)
N = size(x,2);
% Find gradient
dEdtheta = 0;
for i = 1:N,
[dydthetai, hyi] = gradient_NN(x(i),theta);
dEdtheta = dEdtheta - (y(i) - hyi)*dydthetai;
end
end
function y = f_NN(x,theta)
% Do a forward pass of a 1-2-1 Neural Network.
%
% x input
% theta parameters theta = (w1, b1, w2, b2, w3, b3, w41, w42, b4)
%
% y output
% Activation function ("logistic")
sigma = @(r) 1./(1+exp(-r));
w1 = theta(1);
b1 = theta(2);
w2 = theta(3);
b2 = theta(4);
w3 = theta(5);
b3 = theta(6);
w41 = theta(7);
w42 = theta(8);
b4 = theta(9);
y1 = sigma(w1*x + b1);
y2 = sigma(w2*y1 + b2);
y3 = sigma(w3*y1 + b3);
y = sigma(w41*y2 + w42*y3 + b4);
end
function [dydtheta, y] = gradient_NN(x,theta)
% Calculate the gradient of the 1-2-1 NN wrt parameters.
% Note: This is an inefficient implementation, using "forward" mode (chain-rule)
% differentiation. Reverse-mode automatic differentiation, a.k.a.
% back propagation, would be much more efficient, but the difference does
% not matter for this small example.
%
% x input
% theta parameters theta = (w1, b1, w2, b2, w3, b3, w41, w42, b4)
%
% y output
% dydtheta gradient of y wrt theta
% Activation function ("logistic")
sigma = @(r) 1./(1+exp(-r));
% Derivative of activation function (expressed in terms of the output)
dsigmadr = @(sigma) sigma*(1-sigma);
w1 = theta(1);
b1 = theta(2);
w2 = theta(3);
b2 = theta(4);
w3 = theta(5);
b3 = theta(6);
w41 = theta(7);
w42 = theta(8);
b4 = theta(9);
y1 = sigma(w1*x + b1);
y2 = sigma(w2*y1 + b2);
y3 = sigma(w3*y1 + b3);
y = sigma(w41*y2 + w42*y3 + b4);
dydtheta = [
dsigmadr(y)*w41*dsigmadr(y2)*w2*dsigmadr(y1)*x + dsigmadr(y)*w42*dsigmadr(y3)*w3*dsigmadr(y1)*x; % dydw1
dsigmadr(y)*w41*dsigmadr(y2)*w2*dsigmadr(y1) + dsigmadr(y)*w42*dsigmadr(y3)*w3*dsigmadr(y1); % dydb1
dsigmadr(y)*w41*dsigmadr(y2)*y1; % dydw2
dsigmadr(y)*w41*dsigmadr(y2); % dydb2
dsigmadr(y)*w42*dsigmadr(y3)*y1; % dydw3
dsigmadr(y)*w42*dsigmadr(y3); % dydb3
dsigmadr(y)*y2; % dydw41
dsigmadr(y)*y3; % dydw42
dsigmadr(y); % dydb4
];
end
Completing the BFGS quantities (sk, yk, rhok and the update of Hk) and the sufficient decrease test in backtrackingLS as shown below yielded the correct result:
x = 0:0.2:2*pi;
rng('default');
y = .1*(x + cos(x) + 1) + .1*rand(size(x));
N = size(x,2); % Number of data points
n_theta = 9;
% NN parameters
alpha = .2;
n_iter = 200;
% Allocate storage
ES = zeros(1,n_iter+1);
theta = zeros(n_theta,n_iter+1);
% initial value for parameters
theta(:,1) = zeros(n_theta,1);
% Initial value of E (objective function)
ES(1) = objfun(theta(:,1),x,y);
% function handle for objective function (for line search)
E = @(theta) objfun(theta,x,y);
% Initial BFGS matrix
I = eye(n_theta);
Hk = I;
% Initial gradient
dEdtheta = objfun_gradient(theta(:,1),x,y);
% Tolerance for convergence
tol = 1e-10;
k = 0; % Iteration counter
exit = 0; % Termination flag
while not(exit),
k = k+1;
% BFGS search direction
pk = - Hk*dEdtheta;
% Line search update of theta (backtrackingLS is completed below)
alpha = backtrackingLS(E,ES(k),theta(:,k),dEdtheta,pk);
theta(:,k+1) = theta(:,k) + alpha*pk;
% Calculate next Hk using the BFGS update formula (Algorithm 6.1)
dEdtheta_new = objfun_gradient(theta(:,k+1),x,y);
sk = theta(:,k+1) - theta(:,k); % Step taken
yk = dEdtheta_new - dEdtheta; % Change in gradient
rhok = 1 / (yk' * sk); % Curvature scalar; rhok > 0 iff the curvature condition sk'*yk > 0 holds
if rhok > 0, % Do not update Hk if curvature condition is not fulfilled
Hk = (I - rhok * (sk * yk')) * Hk * (I - rhok * (yk * sk')) + rhok * (sk * sk');
end
% The reason for the if-clause is that we use a simple linesearch that
% does not ensure that the curvature condition is fulfilled.
% Next objective function value
ES(k+1) = objfun(theta(:,k+1),x,y);
% Assign for next iteration
dEdtheta = dEdtheta_new;
% Termination criteria
exit = (abs(ES(k+1)-ES(k))<tol) | (k>=n_iter);
end
theta_opt = theta(:,k+1);
figure(1)
plot(1:k+1,ES(1:k+1));
xlabel('Iteration'); ylabel('E');
figure(2);
plot(x,y); hold on;
plot(x,f_NN(x,theta(:,k+1)),'c--'); hold off
xlabel('x'); ylabel('y');
function alpha = backtrackingLS(f,f_val,xk,df,pk)
%
% backtracking line search (Algorithm 3.1 in N&W)
%
% f handle for objective function
% f_val objective function at xk
% xk current iterate
% df current gradient
% pk search direction
alpha = 1; % Initial guess at step length
rho = .9; % < 1 reduction factor of alpha
c = 1e-4; % Sufficient decrease constant
% Backtrack until the sufficient decrease (Armijo) condition is satisfied
while f(xk + alpha * pk) > f_val + c * alpha * (df' * pk)
alpha = rho*alpha;
if alpha < 1e-8,
error('alpha too small -- something is wrong');
end
end
end
function E = objfun(theta,x,y)
N = size(x,2);
E = 0;
for i = 1:N,
E = E + .5* (y(i) - f_NN(x(i),theta) )^2;
end
end
function dEdtheta = objfun_gradient(theta,x,y)
N = size(x,2);
% Find gradient
dEdtheta = 0;
for i = 1:N,
[dydthetai, hyi] = gradient_NN(x(i),theta);
dEdtheta = dEdtheta - (y(i) - hyi)*dydthetai;
end
end
function y = f_NN(x,theta)
% Do a forward pass of a 1-2-1 Neural Network.
%
% x input
% theta parameters theta = (w1, b1, w2, b2, w3, b3, w41, w42, b4)
%
% y output
% Activation function ("logistic")
sigma = @(r) 1./(1+exp(-r));
w1 = theta(1);
b1 = theta(2);
w2 = theta(3);
b2 = theta(4);
w3 = theta(5);
b3 = theta(6);
w41 = theta(7);
w42 = theta(8);
b4 = theta(9);
y1 = sigma(w1*x + b1);
y2 = sigma(w2*y1 + b2);
y3 = sigma(w3*y1 + b3);
y = sigma(w41*y2 + w42*y3 + b4);
end
function [dydtheta, y] = gradient_NN(x,theta)
% Calculate the gradient of the 1-2-1 NN wrt parameters.
% Note: This is an inefficient implementation, using "forward" mode (chain-rule)
% differentiation. Reverse-mode automatic differentiation, a.k.a.
% back propagation, would be much more efficient, but the difference does
% not matter for this small example.
%
% x input
% theta parameters theta = (w1, b1, w2, b2, w3, b3, w41, w42, b4)
%
% y output
% dydtheta gradient of y wrt theta
% Activation function ("logistic")
sigma = @(r) 1./(1+exp(-r));
% Derivative of activation function (expressed in terms of the output)
dsigmadr = @(sigma) sigma*(1-sigma);
w1 = theta(1);
b1 = theta(2);
w2 = theta(3);
b2 = theta(4);
w3 = theta(5);
b3 = theta(6);
w41 = theta(7);
w42 = theta(8);
b4 = theta(9);
y1 = sigma(w1*x + b1);
y2 = sigma(w2*y1 + b2);
y3 = sigma(w3*y1 + b3);
y = sigma(w41*y2 + w42*y3 + b4);
dydtheta = [
dsigmadr(y)*w41*dsigmadr(y2)*w2*dsigmadr(y1)*x + dsigmadr(y)*w42*dsigmadr(y3)*w3*dsigmadr(y1)*x; % dydw1
dsigmadr(y)*w41*dsigmadr(y2)*w2*dsigmadr(y1) + dsigmadr(y)*w42*dsigmadr(y3)*w3*dsigmadr(y1); % dydb1
dsigmadr(y)*w41*dsigmadr(y2)*y1; % dydw2
dsigmadr(y)*w41*dsigmadr(y2); % dydb2
dsigmadr(y)*w42*dsigmadr(y3)*y1; % dydw3
dsigmadr(y)*w42*dsigmadr(y3); % dydb3
dsigmadr(y)*y2; % dydw41
dsigmadr(y)*y3; % dydw42
dsigmadr(y); % dydb4
];
end
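As an optional sanity check, the analytic gradient can be compared against a central finite-difference approximation of the objective. The snippet below is only a sketch: it assumes it is pasted into the script body (before the local function definitions, which MATLAB requires to appear last in a script file), and theta_test and eps_fd are arbitrary choices that are not part of the assessment.
% Optional gradient check -- paste into the script body, not after the local functions
theta_test = 0.1*randn(n_theta,1); % arbitrary test point
g_analytic = objfun_gradient(theta_test,x,y);
g_fd = zeros(n_theta,1);
eps_fd = 1e-6; % finite-difference step (arbitrary)
for j = 1:n_theta,
e_j = zeros(n_theta,1); e_j(j) = 1; % unit vector along parameter j
g_fd(j) = (objfun(theta_test + eps_fd*e_j,x,y) - objfun(theta_test - eps_fd*e_j,x,y)) / (2*eps_fd);
end
disp(max(abs(g_analytic - g_fd))); % should be very small, typically below 1e-6
If objfun_gradient is correct, the displayed difference is very small. Separately, because the Hk update is skipped whenever rhok <= 0, Hk stays positive definite and pk = -Hk*dEdtheta remains a descent direction, so the backtracking loop can always find an acceptable alpha.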