What is a good implementation for calculating the gradient in an n-layered neural network?
Weight layers:
- First layer weights: (n_inputs+1, n_units_layer)-matrix
- Hidden layer weights: (n_units_layer+1, n_units_layer)-matrix
- Last layer weights: (n_units_layer+1, n_outputs)-matrix
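For concreteness, a minimal sketch of how a list of weight matrices with the shapes above could be allocated (the names and the uniform initialisation are my own placeholders, not from any particular library):

import numpy as np

def init_weight_layers(n_inputs, n_units_layer, n_outputs, n_hidden_layers):
    # one matrix per weight layer; the +1 rows hold the bias weights
    shapes = [(n_inputs + 1, n_units_layer)]                                 # first layer
    shapes += [(n_units_layer + 1, n_units_layer)] * (n_hidden_layers - 1)   # extra hidden layers
    shapes += [(n_units_layer + 1, n_outputs)]                               # last layer
    return [np.random.uniform(-0.1, 0.1, size=s) for s in shapes]
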
Notes:
- If there is only one hidden layer, the net is represented by just two (weight) layers:
inputs --first_layer-> network_unit --second_layer-> output
- For an n-layer network with more than one hidden layer, that hidden-layer step has to be repeated once for every additional hidden layer.
This question contains running code for a network with 1 hidden layer; I haven't been able to generalize it to an arbitrary number of hidden layers.
A bit vague pseudocode:
weight_layers = [ layer1, layer2 ] # a list of layers as described above
input_values = [ [1,1], [0,0], [1,0], [0,1] ] # our test set (corresponds to XOR)
target_output = [ 0, 0, 1, 1 ] # what we want to train our net to output
output_layers = [] # output for the corresponding layers
for layer in weight_layers:
output <-- calculate the output # calculate the output from the current layer
output_layers <-- output # store the output from each layer
n_samples = input_values.shape[0]
n_outputs = target_output.shape[1]
error = ( output-target_output )/( n_samples*n_outputs )
""" calculate the gradient here """
My Python code (derived from Chuck Anderson's snippet):
import numpy as np
from ann import NPNeuralNet as nn
import math
from copy import copy
import sys
from math import sqrt, ceil
FLOAT_PRECISION = sys.float_info.epsilon
net = nn(2,1,2,1)  # 2 inputs, 1 output, 2 hidden units, 1 hidden layer
input_values = np.array([ [0,1], [1,0], [0,0], [1,1] ]).reshape(4,2)
goal_output = np.array([1,1,0,0]).reshape(4,1)
def addOnes(A):
return np.hstack((np.ones((A.shape[0],1)),A))
def errorFunction( weight_list ):
weight_layers = net.unpack( weight_list )
output = input_values
for layer in weight_layers[:-1]:
output = np.tanh( addOnes(output).dot( layer ))
output = addOnes(output).dot( weight_layers[-1] )
return 0.5 * np.mean((output - goal_output)**2)
"""
Here comes the gradient function
"""
def errorGradient( weight_list ):
weight_layers = net.unpack( weight_list )
output, output_layers = input_values, [input_values]
for layer in weight_layers[:-1]:
output = np.tanh( addOnes(output).dot( layer ))
output_layers.append( output )
output = addOnes(output).dot( weight_layers[-1] )
n_samples = input_values.shape[0]
n_outputs = goal_output.shape[1]
error = ( output-goal_output )/( n_samples*n_outputs )
layers = list()
for index,layer in enumerate(weight_layers[1:]):
layers.append( addOnes(output_layers[index]).T.dot( error.dot(layer[1:,:].T)*(1-output_layers[index+1]**2) ).flat)
layers.append( addOnes(output_layers[-1]).T.dot( error ).flat)
return np.hstack(layers)
#end
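# Optional sanity check (not part of the original snippet): compare errorGradient
# against a central finite-difference approximation of errorFunction before
# handing both functions to the optimizer below.
def gradientCheck(weight_list, epsilon=1e-6):
    analytic = errorGradient(weight_list)
    numeric = np.zeros_like(analytic)
    for i in xrange(len(analytic)):
        w_plus = np.array(weight_list, dtype=float)
        w_minus = np.array(weight_list, dtype=float)
        w_plus[i] += epsilon
        w_minus[i] -= epsilon
        numeric[i] = (errorFunction(w_plus) - errorFunction(w_minus)) / (2.0*epsilon)
    return np.max(np.abs(analytic - numeric))   # should be tiny if the gradient is right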
def scg(x, f,gradf, *fargs, **params):
"""
All credits to Chuck Anderson
"""
evalFunc = params.pop("evalFunc",lambda x: "Eval "+str(x))
nIterations = params.pop("nIterations",1000)
xPrecision = params.pop("xPrecision", 1.e-8) # 1.e-8 is default value
fPrecision = params.pop("fPrecision", 1.e-8)
xtracep = params.pop("xtracep",False)
ftracep = params.pop("ftracep",False)
    xtracep = True   # tracing forced on: the return dicts below index xtrace and ftrace
    ftracep = True
### from Nabney's netlab matlab library
nvars = len(x)
sigma0 = 1.0e-4
fold = f(x, *fargs)
fnow = fold
gradnew = gradf(x, *fargs)
gradold = copy(gradnew)
d = -gradnew # Initial search direction.
success = True # Force calculation of directional derivs.
nsuccess = 0 # nsuccess counts number of successes.
beta = 1.0e-6 # Initial scale parameter.
betamin = 1.0e-15 # Lower bound on scale.
betamax = 1.0e100 # Upper bound on scale.
j = 1 # j counts number of iterations.
if xtracep:
xtrace = np.zeros((nIterations+1,len(x)))
xtrace[0,:] = x
else:
xtrace = None
if ftracep:
ftrace = np.zeros(nIterations+1)
ftrace[0] = fold
else:
ftrace = None
### Main optimization loop.
while j <= nIterations:
# Calculate first and second directional derivatives.
if success:
mu = np.dot(d, gradnew)
            if np.isnan(mu): print "mu is NaN"
if mu >= 0:
d = -gradnew
mu = np.dot(d, gradnew)
kappa = np.dot(d, d)
if kappa < FLOAT_PRECISION:
return {'x':x, 'f':fnow, 'nIterations':j, 'xtrace':xtrace[:j,:], 'ftrace':ftrace[:j],
'reason':"limit on machine precision"}
sigma = sigma0/sqrt(kappa)
xplus = x + sigma * d
gplus = gradf(xplus, *fargs)
theta = np.dot(d, gplus - gradnew)/sigma
## Increase effective curvature and evaluate step size alpha.
delta = theta + beta * kappa
        if np.isnan(delta): print "delta is NaN"
if delta <= 0:
delta = beta * kappa
beta = beta - theta/kappa
alpha = -mu/delta
## Calculate the comparison ratio.
xnew = x + alpha * d
fnew = f(xnew, *fargs)
Delta = 2 * (fnew - fold) / (alpha*mu)
        if not np.isnan(Delta) and Delta >= 0:
success = True
nsuccess += 1
x = xnew
fnow = fnew
if xtracep:
xtrace[j,:] = x
if ftracep:
ftrace[j] = fnew
else:
success = False
fnow = fold
if j % ceil(nIterations/10) == 0:
print "SCG: Iteration",j,"fValue",evalFunc(fnow),"Scale",beta
if success:
## Test for termination
##print(c(max(abs(alpha*d)),max(abs(fnew-fold))))
if max(abs(alpha*d)) < xPrecision:
return {'x':x, 'f':fnow, 'nIterations':j, 'xtrace':xtrace[:j,:], 'ftrace':ftrace[:j],
'reason':"limit on x Precision"}
elif abs(fnew-fold) < fPrecision:
return {'x':x, 'f':fnow, 'nIterations':j, 'xtrace':xtrace[:j,:], 'ftrace':ftrace[:j],
'reason':"limit on f Precision"}
else:
## Update variables for new position
fold = fnew
gradold = gradnew
gradnew = gradf(x, *fargs)
#print "gradold",gradold
#print "gradnew",gradnew
## If the gradient is zero then we are done.
if np.dot(gradnew, gradnew) == 0:
return {'x':x, 'f':fnow, 'nIterations':j, 'xtrace':xtrace[:j,:], 'ftrace':ftrace[:j],
'reason':"zero gradient"}
## Adjust beta according to comparison ratio.
        if np.isnan(Delta) or Delta < 0.25:
beta = min(4.0*beta, betamax)
elif Delta > 0.75:
beta = max(0.5*beta, betamin)
## Update search direction using Polak-Ribiere formula, or re-start
## in direction of negative gradient after nparams steps.
if nsuccess == nvars:
d = -gradnew
nsuccess = 0
elif success:
gamma = np.dot(gradold - gradnew, gradnew/mu)
#print "gamma",gamma
d = gamma * d - gradnew
#print "end d",d
j += 1
## If we get here, then we haven't terminated in the given number of
## iterations.
##print("Did not converge.")
return {'x':x, 'f':fnow, 'nIterations':j, 'xtrace':xtrace[:j,:], 'ftrace':ftrace[:j],
'reason':"did not converge"}
# end scg
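# Optional: a tiny standalone check of the scg interface (not in the original code).
# Minimize f(x) = sum(x**2), whose gradient is 2*x; the returned dict has the keys
# 'x', 'f', 'nIterations', 'xtrace', 'ftrace' and 'reason'.
toy = scg(np.array([3.0, -2.0]),
          lambda x: np.sum(x**2),
          lambda x: 2.0*x,
          nIterations = 100)
print toy['x'], toy['reason']   # x should end up very close to [ 0.  0.]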
result = scg( net.get_weights(), errorFunction, errorGradient,
xPrecision = 10**-15,
fPrecision = 10**-12,
nIterations = 1000,
ftracep = True)
print 'Termination reason:', result['reason']
errorTrace = result['ftrace']
print 'Ran for',len(errorTrace),'iterations'
# print the error corresponding to the input
weight_layers = net.unpack( result['x'] )
output = input_values
for layer in weight_layers[:-1]:
output = np.tanh( addOnes(output).dot( layer ))
print addOnes(output).dot( weight_layers[-1] )
The NPNeuralNet class
import numpy as np
import math
import random
class NPNeuralNet:
def __init__(self,n_inputs, n_outputs, n_hiddens, n_hidden_layers):
self.n_inputs = n_inputs
self.n_hiddens = n_hiddens
self.n_outputs = n_outputs
self.n_hidden_layers = n_hidden_layers
if n_hidden_layers>0:
self.n_weights = (n_inputs+1)*n_hiddens+(n_hiddens**2+n_hiddens)*(n_hidden_layers-1)+n_hiddens*n_outputs+n_outputs
else:
self.n_weights = ((n_inputs+1)*n_outputs)
# init weights
self.set_weights( list(np.random.uniform(size=(1,self.n_weights)).flat) )
def unpack(self, weight_list ):
if self.n_hidden_layers>0:
weight_layers = [ np.array(weight_list[:(self.n_inputs+1)*self.n_hiddens]).reshape(self.n_inputs+1,self.n_hiddens) ]
weight_layers += [ np.array(weight_list[(self.n_inputs+1)*self.n_hiddens+(i*(self.n_hiddens**2+self.n_hiddens)):(self.n_inputs+1)*self.n_hiddens+((i+1)*(self.n_hiddens**2+self.n_hiddens))]).reshape(self.n_hiddens+1,self.n_hiddens) for i in xrange(self.n_hidden_layers-1) ]
weight_layers += [ np.array(weight_list[(self.n_inputs+1)*self.n_hiddens+((self.n_hidden_layers-1)*(self.n_hiddens**2+self.n_hiddens)):]).reshape(self.n_hiddens+1,self.n_outputs) ]
else:
            weight_layers = [ np.array(weight_list[:(self.n_inputs+1)*self.n_outputs]).reshape(self.n_inputs+1,self.n_outputs) ]
return weight_layers
def set_weights(self, weight_list ):
self.weights = self.unpack( weight_list )
def get_weights(self, ):
return self.pack(self.weights)
def pack(self, weight_layers ):
        return np.hstack([l.flat for l in weight_layers])
def get_n_weights(self, ):
return self.n_weights
def update(self, input_values ):
assert type(input_values) == np.ndarray, "Wanted: NumPy array, got: "+str(type(input_values))
assert input_values.shape == (1,self.n_inputs), "Input not a NumPy array of correct size"+str((1,self.n_inputs))+"!="+str(input_values.shape)
def addOnes(A):
return np.hstack((np.ones((A.shape[0],1)),A))
#end addOnes
output = input_values
for layer in self.weights[:-1]:
output = np.tanh( addOnes(output).dot( layer ))
output = addOnes(output).dot( self.weights[-1] )
return output[0]
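To see how the flat weight vector maps onto the matrix shapes listed at the top, a short usage example (my own illustration, not part of the class):

demo = NPNeuralNet(2, 1, 2, 2)            # 2 inputs, 1 output, 2 hidden units, 2 hidden layers
for layer in demo.unpack(demo.get_weights()):
    print layer.shape                     # prints (3, 2), (3, 2), (3, 1)
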
My code snippet requires the following network structure and the tanh activation function:
(Neural network structure graphic)
The final implementation
def forward( layer_input, weight_layer, activation_function ):
output = layer_input.dot(weight_layer)
if activation_function:
return activation_function(output)
return output
#end forward
def backprop( dEdY, weight_layer, dadY, output_values ):
    # dadY is the activation derivative expressed in terms of the layer's output,
    # e.g. lambda Y: 1 - Y**2 for tanh; the bias row of weight_layer is skipped.
    if dadY:
return np.dot( dEdY, weight_layer[1:,:].T ) * dadY(output_values)
else:
return np.dot( dEdY, weight_layer[1:,:].T )
# end backprop
def errorFunction( weight_list ):
weight_layers = NETWORK.unpack( weight_list )
output = input_values
for layer in weight_layers[:-1]:
output = forward( addOnes(output), layer, np.tanh )
output = forward( addOnes(output), weight_layers[-1], None )
return 0.5 * np.mean((output - goal_output)**2)
def errorGradient( weight_list ):
weight_layers = NETWORK.unpack( weight_list )
output, output_layers = input_values, [input_values]
for layer in weight_layers[:-1]:
output = forward( addOnes(output), layer, np.tanh )
output_layers.append( output )
output = forward( addOnes(output), weight_layers[-1], None )
    n_samples = input_values.shape[0]
n_outputs = goal_output.shape[1]
error = ( output-goal_output )/( n_samples*n_outputs )
layers = list()
    for index in xrange(-1,-(len(weight_layers)+1),-1):   # last weight layer first
layers.append( np.dot( addOnes(output_layers[index]).T, error ).flat )
error = backprop( error, weight_layers[index], dadY, output_layers[index] )
    return np.hstack(list(reversed(layers)))   # first layer's gradient comes first
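For completeness, a minimal driver for the final implementation (my own sketch: NETWORK, dadY, input_values and goal_output are the names the snippet above assumes; addOnes, scg and NPNeuralNet are as defined earlier, and dadY is taken to be the tanh derivative 1 - Y**2, matching the tanh used in forward):

NETWORK = NPNeuralNet(2, 1, 2, 1)            # 2 inputs, 1 output, 2 hidden units, 1 hidden layer
input_values = np.array([ [0,1], [1,0], [0,0], [1,1] ])
goal_output = np.array([1, 1, 0, 0]).reshape(4, 1)
dadY = lambda Y: 1.0 - Y**2                  # derivative of tanh in terms of its output

result = scg(NETWORK.get_weights(), errorFunction, errorGradient,
             xPrecision = 10**-15, fPrecision = 10**-12,
             nIterations = 1000, ftracep = True)
NETWORK.set_weights(list(result['x']))
print 'Termination reason:', result['reason']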