Wednesday, September 28, 2016

Adam Algorithm - Probably the Best Adaptive Learning Rate Method for Deep Learning

The name "Adam" derives from the phrase "adaptive moment estimation".

I implemented it in C# by inheriting from the basic NeuralNetwork class I had created earlier for the Stanford "Machine Learning" online course, and tested it in my Machine Learning Demo application. The cost function dropped surprisingly fast.

Here is the comparison:
Basic SGD: Cost drops from 0.812 to 0.797 after 3000 iterations
Adam: Cost drops from 0.812 to 0.375 after 3000 iterations

The cost function trend charts are shown in the screenshots.



The Adam algorithm was first published in the paper below:

Diederik P. Kingma and Jimmy Ba, "Adam: A Method for Stochastic Optimization", http://arxiv.org/pdf/1412.6980v8.pdf

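For reference, here is the per-parameter update rule that the code below implements, written in LaTeX using the same symbols as the code (s and r are the first and second moment estimates, Ro1/Ro2 are the decay rates ρ1/ρ2, α is the step size Alpha, g is the gradient of the cost for one theta, t is the 1-based time step, and ε is a small constant):

s_t = \rho_1 s_{t-1} + (1 - \rho_1) g_t
r_t = \rho_2 r_{t-1} + (1 - \rho_2) g_t^2
\hat{s}_t = \frac{s_t}{1 - \rho_1^t}, \quad \hat{r}_t = \frac{r_t}{1 - \rho_2^t}
\theta_t = \theta_{t-1} - \frac{\alpha \, \hat{s}_t}{\sqrt{\hat{r}_t} + \epsilon}

The bias correction matters early in training, when the zero-initialized moment estimates would otherwise be too small.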
My source code:

using System;
using System.Threading.Tasks;

namespace MachineLearning.ANN
{
    /// <summary>
    /// Adaptive learning rate neural network based on Adam algorithm: http://arxiv.org/pdf/1412.6980v8.pdf
    /// </summary>
    public class AdamAlgorithmNeuralNetwork : NeuralNetwork
    {
        private double[][][] _s;    //1st moment variables in Adam algorithm
        private double[][][] _r;    //2nd moment variables in Adam algorithm

        public AdamAlgorithmNeuralNetwork(params int[] nodeCountInEachLayer):base(nodeCountInEachLayer)
        {
            this.Alpha = 0.001f;    //Step size; default suggested in the Adam paper
            this.Ro1 = 0.9f;        //Decay rate for 1st moment estimates
            this.Ro2 = 0.999f;      //Decay rate for 2nd moment estimates
        }

        /// <summary>
        /// Exponential decay rate for the first moment estimates
        /// </summary>
        public float Ro1 { get; set; }
        /// <summary>
        /// Exponential decay rate for the second moment estimates
        /// </summary>
        public float Ro2 { get; set; }

        protected override void BeforeTraining()
        {
            base.BeforeTraining();
            _s = AllocateAllLayerThetas(false);    //1st moment accumulators, same shape as the theta arrays
            _r = AllocateAllLayerThetas(false);    //2nd moment accumulators, same shape as the theta arrays
        }

        protected override void AfterTraining()
        {
            base.AfterTraining();
            _s = null;
            _r = null;
        }

        /// <summary>
        /// Descend thetas by Adam algorithm
        /// </summary>
        /// <param name="timeStep">time step "t"</param>
        protected override void DescendThetas(int timeStep)
        {
            int t = timeStep + 1;
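            // The paper's time step t starts at 1, so the bias-correction
            // denominators (1 - Ro1^t) and (1 - Ro2^t) below are never zero.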
            Parallel.For(0, _thetas.Length, (l) => //for (int l = 0; l < _thetas.Length; l++)
            {
                var thetas = _thetas[l];
                var thetaGrads = _thetaGradients[l];
                var sl = _s[l];
                var rl = _r[l];
                for (int j = 0; j < thetas.Length; j++)
                {
                    var thetaVector = thetas[j];
                    var thetaGradVector = thetaGrads[j];
                    var s = sl[j];
                    var r = rl[j];
                    for (int i = 0; i < thetaVector.Length; i++)
                    {
                        var g = thetaGradVector[i];
                        var sc =
                            (s[i] = Ro1*s[i] + (1 - Ro1)*g) //Update biased first moment estimate
                            / (1 - Math.Pow(Ro1, t));       //Correct bias in first moment
                        var rc =
                            (r[i] = Ro2*r[i] + (1 - Ro2)*g*g)   //Update biased second moment estimate
                            / (1 - Math.Pow(Ro2, t));           //Correct bias in second moment

                        thetaVector[i] -= Alpha*sc/(Math.Sqrt(rc) + 1e-8);  //Small epsilon avoids division by zero
                    }
                }
            });
        }
    }
}
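
For anyone who wants to try it, a minimal usage sketch is below. The layer sizes are hypothetical, and the commented-out Train call only stands in for whatever training method the base NeuralNetwork class actually exposes:

var network = new AdamAlgorithmNeuralNetwork(400, 25, 10)   //Hypothetical layer sizes: 400 inputs, 25 hidden, 10 outputs
{
    Ro1 = 0.9f,     //Decay rate for 1st moment estimates
    Ro2 = 0.999f    //Decay rate for 2nd moment estimates
};

// Hypothetical training call; the real method and signature come from the base NeuralNetwork class.
// network.Train(trainingInputs, trainingLabels, iterations: 3000);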
