I implemented the ANN algorithm in C# after submitting Machine Learning exercise 5 in Octave/MATLAB, and tested it with a four-layer network of 2 X 100 X 800 X 1; see the demo below:
The ANN class implements IBinaryClassifier so that the AdaBoost algorithm can use it as a weak classifier. The code of the NeuralNetwork class is below; InternalTrainAsync(), CostFunction(), and ComputeThetaGradients() are the most important parts of the ANN algorithm. A short usage sketch follows the class.
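For reference, these three methods implement the regularized logistic (cross-entropy) cost and the batch gradient descent update, written here in the same plain notation used in the code comments:

J(Theta) = -(1/m) * sum over samples i and output nodes k of [ y[i][k]*log(h(x[i])[k]) + (1 - y[i][k])*log(1 - h(x[i])[k]) ] + lambda/(2m) * sum of all squared thetas (bias weights excluded)

theta := theta - Alpha * dJ/dtheta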
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
namespace MachineLearning.ANN
{
[Serializable]
public partial class NeuralNetwork : IBinaryClassifier
{
private double[][][] _thetas;
public NeuralNetwork(params int[] nodeCountInEachLayer)
{
this.NodeCounts = nodeCountInEachLayer;
this.Alpha = 0.1f;
this.Lambda = 1f;
this.LoopCount = 1000;
this.CostThreshold = 0.3f;
}
#region properties
public int[] NodeCounts
{
get { return _nodeCounts; }
private set
{
if (value == null)
throw new ArgumentNullException(nameof(value), "NodeCounts cannot be null.");
if (value.Length < 2)
throw new ArgumentOutOfRangeException(nameof(value), "Neural network cannot have fewer than 2 layers.");
for (int layer = 0; layer < value.Length; layer++)
{
if (value[layer] <= 0)
throw new ArgumentException("Neural network must have at least one node in a layer.");
}
_nodeCounts = value;
//allocate thetas for all layers
_thetas = AllocateAllLayerThetas(true);
}
}
private int[] _nodeCounts;
public int Depth
{
get { return _nodeCounts?.Length ?? 0; }
}
public int FeatureCount
{
get { return NodeCountInLayer(0); }
}
public int OutputCount
{
get { return NodeCountInLayer(Depth - 1); }
}
/// <summary>
/// Node count in a layer, not including the bias node.
/// </summary>
/// <param name="layer"></param>
/// <returns></returns>
public int NodeCountInLayer(int layer)
{
return _nodeCounts?[layer] ?? 0;
}
/// <summary>
/// Learning rate.
/// </summary>
public float Alpha { get; set; }
/// <summary>
/// Regularization parameter.
/// </summary>
public float Lambda { get; set; }
/// <summary>
/// Maximum number of learning loops before training terminates.
/// </summary>
public int LoopCount { get; set; }
/// <summary>
/// Cost function threshold for training termination.
/// </summary>
public float CostThreshold { get; set; }
/// <summary>
/// The NN is a binary classifier when there is only one output node.
/// </summary>
public bool IsBinaryClassifier
{
get { return OutputCount == 1; }
}
#endregion
#region IBinaryClassifier features
public event EventHandler<ClassifierTrainingEventArgs> TrainingEvent;
public void CopySettingsFrom(IBinaryClassifier template)
{
var copy = template as NeuralNetwork;
if (copy == null)
throw new ArgumentException("The template must be a NeuralNetwork.", nameof(template));
this.NodeCounts = copy.NodeCounts;
this.Alpha = copy.Alpha;
this.Lambda = copy.Lambda;
this.LoopCount = copy.LoopCount;
this.CostThreshold = copy.CostThreshold;
}
public bool Train(float[][] samples, bool[] responses)
{
using (var cts = new CancellationTokenSource())
{
return TrainAsync(samples, responses, cts.Token).Result;
}
}
public bool Train(float[][] samples, bool[] responses, float[] weights)
{
using (var cts = new CancellationTokenSource())
{
return TrainAsync(samples, responses, weights, cts.Token).Result;
}
}
public async Task<bool> TrainAsync(float[][] samples, bool[] responses, CancellationToken ct)
{
float[] weights = new float[samples.Length];
for (int i = 0; i < weights.Length; i++)
{
weights[i] = 1.0f;
}
return await TrainAsync(samples, responses, weights, ct);
}
public async Task<bool> TrainAsync(float[][] samples, bool[] responses, float[] weights, CancellationToken ct)
{
if (!this.IsBinaryClassifier)
throw new Exception("Neural network as binary classifier cannot have more than 1 output node.");
if (samples.Length == 0)
throw new ArgumentException("The number of samples is zero.");
if (samples[0].Length != this.FeatureCount)
throw new ArgumentException("The number of features in a sample is not the same as the number of input nodes.");
if (samples.Length != responses.Length)
throw new ArgumentException("The number of samples must be the same as the number of responses.");
if (samples.Length != weights.Length)
throw new ArgumentException("The number of samples must be the same as the number of weights.");
if (weights.Any(x => x < 0))
throw new ArgumentException("There exists negative weight of sample.");
//resample by keeping the top 80% of samples by weight
var selectedIndices = new List<int>(samples.Length);
//select all samples weighted 1 or more
selectedIndices.AddRange(Enumerable.Range(0, weights.Length).Where(i => weights[i] >= 1));
//select the top 80% of the remaining samples by weight
selectedIndices.AddRange(Enumerable.Range(0, weights.Length)
.Select(i => new { i, weight = weights[i] })
.Where(p => p.weight < 1)
.OrderByDescending(p => p.weight)
.Take((int)(weights.Length * 0.8))
.Where(p => p.weight > 0)
.Select(p => p.i));
if (selectedIndices.Count == 0)
return false;
float[][] X = selectedIndices.Select(i => samples[i]).ToArray();
int[][] Y = selectedIndices.Select(i => new int[1] { responses[i] ? 1 : 0 }).ToArray();
return await InternalTrainAsync(X, Y, ct);
}
public bool Predict(float[] sample)
{
if (!this.IsBinaryClassifier)
throw new Exception("Neural network as binary classifier cannot have more than 1 output node.");
var h = ComputeInputLayerActivations(sample);
for (int l = 1; l < Depth; l++)
{
h = ComputeNonInputLayerActivations(l, h);
}
return h[1] > 0.5; //ignore a0
}
#endregion
#region Multiple category classifier
public bool Train(float[][] samples, int[] responses)
{
using (var cts = new CancellationTokenSource())
{
return TrainAsync(samples, responses, cts.Token).Result;
}
}
/// <summary>
/// Train NN with samples.
/// </summary>
/// <param name="samples">Samples</param>
/// <param name="responses">The index of categories or labels, starting with 0</param>
/// <param name="ct">Cancellation token</param>
/// <returns></returns>
public async Task<bool> TrainAsync(float[][] samples, int[] responses, CancellationToken ct)
{
if (samples.Length == 0)
throw new ArgumentException("The number of samples is zero.");
if (samples[0].Length != this.FeatureCount)
throw new ArgumentException("The number of features in a sample is not the same as the number of input nodes.");
if (samples.Length != responses.Length)
throw new ArgumentException("The number of samples must be the same as the number of responses.");
//recode the labels as vectors containing only values 0 or 1
int[][] labels = new int[samples.Length][];
for (int i = 0; i < labels.Length; i++)
{
labels[i] = new int[this.OutputCount];
labels[i][responses[i]] = 1;
}
return await InternalTrainAsync(samples, labels, ct);
}
public int PredictMultiCategory(float[] sample)
{
var h = ComputeInputLayerActivations(sample);
for (int l = 1; l < Depth; l++)
{
h = ComputeNonInputLayerActivations(l, h);
}
double max = h[1]; //ignore a0
int index = 1;
for (int i = 2; i < h.Length; i++)
{
if (h[i] > max)
{
index = i;
max = h[i];
}
}
return index - 1;
}
#endregion
public override string ToString()
{
if (_nodeCounts == null)
return null;
var sb = new StringBuilder();
sb.Append(_nodeCounts[0].ToString());
for (int i = 1; i < _nodeCounts.Length; i++)
{
sb.Append(',');
sb.Append(_nodeCounts[i].ToString());
}
return sb.ToString();
}
/// <summary>
/// g(z) = 1/(1+exp(-z))
/// </summary>
/// <param name="z"></param>
/// <returns></returns>
public static double Sigmoid(double z)
{
return 1 / (1 + Math.Exp(-z));
}
#region Private methods
private async Task<bool> InternalTrainAsync(float[][] X, int[][] Y, CancellationToken ct)
{
var eventArgs = new ClassifierTrainingEventArgs();
var stopWatch = new Stopwatch();
stopWatch.Start();
//train the NN iteratively with the gradient descent algorithm
int t = 0;
double cost = 0.0;
for (; t < LoopCount; t++)
{
if (ct.IsCancellationRequested)
{
Debug.WriteLine("Training was cancelled.");
if (TrainingEvent != null)
{
eventArgs.Loop = t;
eventArgs.Cost = cost;
eventArgs.Message = "Training was cancelled.";
TrainingEvent(this, eventArgs);
}
return false;
}
await Task.Run(() =>
{
//calculate weight gradients
var thetaGradients = ComputeThetaGradients(X, Y, Lambda);
//descent weights
Parallel.For(0, _thetas.Length, (l) => //for (int l = 0; l < _thetas.Length; l++)
{
var thetas = _thetas[l];
var thetaGrads = thetaGradients[l];
for (int j = 0; j < thetas.Length; j++)
{
var thetaVector = thetas[j];
var thetaGradVector = thetaGrads[j];
for (int i = 0; i < thetaVector.Length; i++)
{
thetaVector[i] -= Alpha * thetaGradVector[i];
}
}
});
//check cost function
cost = this.CostFunction(X, Y, Lambda);
Debug.WriteLine(string.Format("{0} - cost: {1}", t, cost));
if (TrainingEvent != null)
{
eventArgs.Loop = t;
eventArgs.Cost = cost;
eventArgs.Message = string.Format("{0} - cost: {1}", t, cost);
TrainingEvent(this, eventArgs);
}
});
if (cost <= CostThreshold)
break;
}
stopWatch.Stop();
if (TrainingEvent != null)
{
eventArgs.Loop = t;
eventArgs.Cost = cost;
eventArgs.Message = "Training succeeded in " + stopWatch.ElapsedMilliseconds + "ms";
TrainingEvent(this, eventArgs);
}
return true;
}
private double CostFunction(float[][] X, int[][] Y, float lambda)
{
int m = X.Length;
double cost = 0;
for (int i = 0; i < X.Length; i++)
{
var x = X[i];
var y = Y[i];
//compute hypothesis of sample x: h(x)
var h = ComputeInputLayerActivations(x);
for (int l = 1; l < Depth; l++)
{
h = ComputeNonInputLayerActivations(l, h);
}
for (int k = 0; k < y.Length; k++)
{
cost += y[k] * Math.Log(h[k + 1]) + (1 - y[k]) * Math.Log(1 - h[k + 1]); //ignore h0
}
}
cost = -cost / m;
//regularization, exclude first column in Theta1, Theta2, ...
double punish = 0;
foreach (var thetasOfLayer in _thetas)
{
foreach (var inputThetas in thetasOfLayer)
{
for (int i = 1; i < inputThetas.Length; i++) //exclude inputThetas[j,0]
{
punish += inputThetas[i] * inputThetas[i];
}
}
}
cost += lambda / (m + m) * punish;
return cost;
}
/// <summary>
/// Compute theta/weight gradients
/// </summary>
/// <param name="X">Samples</param>
/// <param name="Y">Labels of samples X. Y[i][j]=1 when sample X[i] belongs to the j-th category, Y[i][j]=0 otherwise.</param>
/// <param name="lambda">regulalization parameter</param>
/// <returns></returns>
private double[][][] ComputeThetaGradients(float[][] X, int[][] Y, float lambda)
{
int m = X.Length; //the number of samples
// Implement the backpropagation algorithm to compute the gradients of thetas.
// They are the partial derivatives of the cost function with respect to each theta in the arrays _thetas[0], _thetas[1], ...
double[][][] thetaGrads = AllocateAllLayerThetas(false);
double[][] nodeOutputs = new double[Depth][];
for (int s = 0; s < X.Length; s++)
{
var x = X[s];
//Feedforward to compute node outputs: a1, a2, a3, ...
Debug.WriteLine("Computer layer 0 activation");
nodeOutputs[0] = ComputeInputLayerActivations(x);
for (int l = 1; l < Depth; l++)
{
Debug.WriteLine("Computer layer {0} activation", l);
nodeOutputs[l] = ComputeNonInputLayerActivations(l, nodeOutputs[l - 1]);
}
//Compute errors of the output layer nodes: delta[j] = a[j] - y[j-1], ignoring delta 0
Debug.WriteLine("Compute layer {0} errors", Depth - 1);
double[] deltas = nodeOutputs[Depth - 1]; //reuse the space of last layer's outputs to store its errors
for (int j = 1; j < deltas.Length; j++)
{
deltas[j] -= Y[s][j - 1];
}
//back-propagate errors to each hidden layer
for (int l = Depth - 2; l >= 0; l--)
{
//Compute theta grad between layer l and layer l+1
Debug.WriteLine("Update theta gradients between layer {0} and layer {1}", l + 1, l);
var thetaGrad = thetaGrads[l];
UpdateThetaGradients(thetaGrad, nodeOutputs[l], deltas);
if (l > 0)
{
//back-propagate errors to hidden layer l
Debug.WriteLine("Compute layer {0} errors", l);
deltas = ComputeHiddenLayerErrors(l, nodeOutputs[l], deltas);
}
}
}
//average weight gradients
double coef = lambda / m;
//for (int l = 0; l < thetaGrads.Length; l++)
Parallel.For(0, thetaGrads.Length, (l) =>
{
var thetas = _thetas[l];
var grads = thetaGrads[l];
for (int j = 0; j < grads.Length; j++)
{
var g = grads[j];
for (int i = 0; i < g.Length; i++)
{
g[i] /= m;
}
// Implement regularization with gradients. Exclude first column in Theta1, Theta2, ...
var t = thetas[j];
for (int i = 1; i < g.Length; i++)
{
g[i] += coef * t[i];
}
}
});
return thetaGrads;
}
private double[] ComputeInputLayerActivations(float[] inputs)
{
var a = new double[NodeCountInLayer(0) + 1];
a[0] = 1; //add bias node
inputs.CopyTo(a, 1);
return a;
}
private double[] ComputeNonInputLayerActivations(int layerIndex, double[] inputs)
{
var a = new double[NodeCountInLayer(layerIndex) + 1];
a[0] = 1; //add bias node, it is unused in the output layer
Parallel.For(1, a.Length, (j) => //for (int j = 1; j < a.Length; j++)
{
a[j] = ComputeNonInputNodeActivation(layerIndex, j, inputs);
});
return a;
}
private double ComputeNonInputNodeActivation(int layerIndex, int nodeIndex, double[] inputs)
{
var inputThetas = _thetas[layerIndex - 1][nodeIndex - 1]; //The input weights of layer, nodeIndex-th node in layerIndex-th layer
Debug.Assert(inputThetas.Length == inputs.Length);
double totalWeightedInput = 0.0;
for (int i = 0; i < inputs.Length; i++)
{
totalWeightedInput += inputs[i] * inputThetas[i];
}
return Sigmoid(totalWeightedInput);
}
/// <summary>
/// Compute error of each node in the hidden layer, reuse hiddenLayerNodesOutputs as space to store node errors.
/// </summary>
/// <param name="hiddenLayer">Index of hidden layer (start with 0)</param>
/// <param name="hiddenLayerNodesOutputs">As input, it stores activations/outputs of hidden layer nodes. As output, it stores errors of the same nodes.</param>
/// <param name="higherLayerDeltas">The errors of nodes in next layer.</param>
/// <returns></returns>
private double[] ComputeHiddenLayerErrors(int hiddenLayer, double[] hiddenLayerNodesOutputs, double[] higherLayerDeltas)
{
var thetas = _thetas[hiddenLayer];
double[] deltas = hiddenLayerNodesOutputs;
//for (int i = 1; i < hiddenLayerNodesOutputs.Length; i++) //ignore first hidden node
Parallel.For(1, hiddenLayerNodesOutputs.Length, (i) =>
{
double totalWeightedErrorsFromHigherLayer = 0.0;
for (int j = 1; j < higherLayerDeltas.Length; j++) //ignore delta 0
{
totalWeightedErrorsFromHigherLayer += thetas[j - 1][i] * higherLayerDeltas[j];
}
//Let a = sigmoid(x); its derivative is sigmoid'(x) = a*(1-a)
double a = hiddenLayerNodesOutputs[i];
deltas[i] = totalWeightedErrorsFromHigherLayer * a * (1 - a);
});
return deltas;
}
private void UpdateThetaGradients(double[][] thetaGrad, double[] inputNodesActivations, double[] outputNodesDeltas)
{
//for (int j = 1; j < outputNodesDeltas.Length; j++) //ignore delta 0
Parallel.For(1, outputNodesDeltas.Length, (j) =>
{
var outputNodeDelta = outputNodesDeltas[j];
var g = thetaGrad[j - 1];
for (int i = 0; i < inputNodesActivations.Length; i++)
{
g[i] += inputNodesActivations[i] * outputNodeDelta;
}
});
}
private double[][][] AllocateAllLayerThetas(bool initialize)
{
var layers = new double[Depth - 1][][];
for (int l = 0; l < layers.Length; l++)
{
int currentLayerNodeCount = _nodeCounts[l];
int nextLayerNodeCount = _nodeCounts[l + 1];
var layer = layers[l] = new double[nextLayerNodeCount][];
for (int j = 0; j < layer.Length; j++)
{
layer[j] = new double[currentLayerNodeCount + 1];
}
}
if (initialize)
{
//init thetas with small random numbers in [-epsilon, epsilon], epsilon=sqrt(6)/[sqrt(s(layer))+sqrt(s(layer+1))]
const double epsilonInit = 0.12;
var random = new Random();
foreach (var layer in layers)
{
foreach (var inputThetas in layer)
{
for (int i = 0; i < inputThetas.Length; i++)
{
inputThetas[i] = random.NextDouble() * 2 * epsilonInit - epsilonInit;
}
}
}
}
return layers;
}
#endregion
}
}
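For completeness, here is a minimal usage sketch of the class above. The sample data is made up purely for illustration; the 2 X 100 X 800 X 1 layout is the configuration mentioned at the top of the post.
using System;
using MachineLearning.ANN;
class Demo
{
    static void Main()
    {
        //create a 2 X 100 X 800 X 1 network, the configuration used in the demo above
        var nn = new NeuralNetwork(2, 100, 800, 1)
        {
            Alpha = 0.1f,        //learning rate
            Lambda = 1f,         //regularization parameter
            LoopCount = 1000,    //maximum gradient descent iterations
            CostThreshold = 0.3f //stop once the cost drops below this value
        };
        nn.TrainingEvent += (sender, e) => Console.WriteLine(e.Message);
        //hypothetical training data: two features per sample, one boolean label per sample
        float[][] samples = { new[] { 0.1f, 0.9f }, new[] { 0.8f, 0.2f } };
        bool[] responses = { true, false };
        if (nn.Train(samples, responses))
        {
            bool prediction = nn.Predict(new[] { 0.15f, 0.85f });
            Console.WriteLine("Prediction: " + prediction);
        }
    }
}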
The second part of the NeuralNetwork class below is the implementation of the CheckGradients() method, which is useful to verify that the cost function J() and the partial derivatives of the thetas are correct.
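Gradient checking compares the analytic gradients produced by backpropagation against a numerical approximation of each partial derivative. ComputeNumericalGradients() below uses the central difference

dJ/dTheta[i] ~ (J(Theta[i] + e) - J(Theta[i] - e)) / (2e),  with e = 1e-4.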
using System;
using System.Diagnostics;
namespace MachineLearning.ANN
{
public partial class NeuralNetwork
{
#if DEBUG
private double[][][] ComputeNumericalGradients(float[][] X, int[][] Y)
{
const double e = 1e-4;
double[][][] thetaGrads = new double[_thetas.Length][][];
for (int l = 0; l < _thetas.Length; l++)
{
var theta = _thetas[l];
var thetaGrad = thetaGrads[l] = new double[theta.Length][];
for (int j = 0; j < theta.Length; j++)
{
var t = theta[j];
var g = thetaGrad[j] = new double[t.Length];
for (int i = 0; i < t.Length; i++)
{
var oldTheta = t[i];
t[i] -= e;
double loss1 = CostFunction(X, Y, 0);
t[i] = oldTheta + e;
double loss2 = CostFunction(X, Y, 0);
g[i] = (loss2 - loss1) / (2 * e);
t[i] = oldTheta;
}
}
}
return thetaGrads;
}
public static bool CheckGradients()
{
var nn = new NeuralNetwork(3, 5, 3);
//update thetas using "sin"
var thetas = nn._thetas;
for (int l = 0; l < thetas.Length; l++)
{
var a = thetas[l];
for (int x = 1, i = 0; i < a[0].Length; i++)
{
for (int j = 0; j < a.Length; j++)
{
a[j][i] = Math.Sin(x++)/10;
}
}
}
//create samples
int m = 5; //sample count
var X = new float[m][];
var Y = new int[m][];
for (int i = 0; i < X.Length; i++)
{
X[i] = new float[3];
}
for (int x = 1, j = 0; j < 3; j++)
{
for (int i = 0; i < X.Length; i++)
{
X[i][j] = (float)Math.Sin(x++) / 10;
}
}
for (int i = 0; i < Y.Length; i++)
{
Y[i] = new int[3];
Y[i][(i+1) % 3] = 1;
}
var grad = nn.ComputeThetaGradients(X, Y, 0);
var numGrad = nn.ComputeNumericalGradients(X, Y);
for (int l = 0; l < grad.Length; l++)
{
var a = grad[l];
var b = numGrad[l];
for (int j = 0; j < a.Length; j++)
{
var g = a[j];
var n = b[j];
for (int i = 0; i < g.Length; i++)
{
Debug.WriteLine("{0}, {1}", n[i], g[i]);
}
}
}
double diff = numGrad.Minus(grad).Norm() / numGrad.Add(grad).Norm();
Debug.WriteLine("Relative defference should be less than 1e-9: {0}", diff);
return diff < 1e-9;
}
#endif
}
}
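In a debug build, the check can be run with a single call, for example from a test harness. Note that the Minus(), Add(), and Norm() helpers used at the end are array extension methods defined elsewhere in the project and are not shown in this post.
#if DEBUG
bool gradientsOk = NeuralNetwork.CheckGradients();
Debug.WriteLine("Gradient check passed: {0}", gradientsOk);
#endif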
In addition, the IBinaryClassifier code is posted here; classes that implement it can be plugged into the AdaBoost algorithm as weak classifiers.
using System;
using System.Threading;
using System.Threading.Tasks;
public interface IBinaryClassifier
{
event EventHandler<ClassifierTrainingEventArgs> TrainingEvent;
void CopySettingsFrom(IBinaryClassifier template);
bool Train(float[][] samples, bool[] responses);
bool Train(float[][] samples, bool[] responses, float[] weights);
Task<bool> TrainAsync(float[][] samples, bool[] responses, CancellationToken ct);
Task<bool> TrainAsync(float[][] samples, bool[] responses, float[] weights, CancellationToken ct);
bool Predict(float[] sample);
}
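To illustrate how the weighted Train() overload is meant to be used, here is a rough sketch of a single AdaBoost boosting round driving an IBinaryClassifier. The actual AdaBoost class is not part of this post, so this is only an assumption about how the pieces fit together:
using System;
public static class AdaBoostRoundSketch
{
    //Illustrative only: trains one weak classifier on weighted samples, returns its voting
    //weight (alpha), and re-weights the samples so the next round focuses on the mistakes.
    public static double TrainOneRound(IBinaryClassifier weak, float[][] samples, bool[] responses, float[] weights)
    {
        if (!weak.Train(samples, responses, weights))
            return 0;
        //weighted training error of the weak classifier
        double error = 0, total = 0;
        bool[] correct = new bool[samples.Length];
        for (int i = 0; i < samples.Length; i++)
        {
            correct[i] = weak.Predict(samples[i]) == responses[i];
            total += weights[i];
            if (!correct[i])
                error += weights[i];
        }
        error /= total;
        //voting weight of this weak classifier in the final strong classifier
        double alpha = 0.5 * Math.Log((1 - error) / error);
        //increase weights of misclassified samples, decrease weights of correct ones
        for (int i = 0; i < samples.Length; i++)
        {
            weights[i] *= (float)Math.Exp(correct[i] ? -alpha : alpha);
        }
        return alpha;
    }
}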