Added learning rate schedulers and decay for neural nets.

2024-11-12 10:15:01 +01:00 · 2022-01-30 01:04:23 -08:00 · 2022-01-30 01:04:23 -08:00 · a13e0e344b
commit a13e0e344b
parent e1e8c251e4
6 changed files with 63 additions and 26 deletions
--- a/MLPP/ANN/ANN.cpp
+++ b/MLPP/ANN/ANN.cpp
@ -16,7 +16,7 @@

 namespace MLPP {
    ANN::ANN(std::vector<std::vector<double>> inputSet, std::vector<double> outputSet)
-    : inputSet(inputSet), outputSet(outputSet), n(inputSet.size()), k(inputSet[0].size())
+    : inputSet(inputSet), outputSet(outputSet), n(inputSet.size()), k(inputSet[0].size()), lrScheduler("None"), decayConstant(0)
    {
        // Body intentionally empty. The initializer list reads inputSet[0], so inputSet must be non-empty.
    }
@ -66,6 +66,7 @@ namespace MLPP {

        alg.printMatrix(network[network.size() - 1].weights);
        while(true){
+            learning_rate = applyLearningRateScheduler(learning_rate, decayConstant, epoch);
            cost_prev = Cost(y_hat, outputSet);

            auto [cumulativeHiddenLayerWGrad, outputWGrad] = computeGradients(y_hat, outputSet);
@ -96,6 +97,7 @@ namespace MLPP {
        // always do forward pass only ONCE at end.
        auto [inputMiniBatches, outputMiniBatches] = Utilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
        while(true){
+            learning_rate = applyLearningRateScheduler(learning_rate, decayConstant, epoch);
            for(int i = 0; i < n_mini_batch; i++){
                std::vector<double> y_hat = modelSetTest(inputMiniBatches[i]);
                cost_prev = Cost(y_hat, outputMiniBatches[i]);
@ -133,6 +135,7 @@ namespace MLPP {
        
        std::vector<double> v_output;
        while(true){
+            learning_rate = applyLearningRateScheduler(learning_rate, decayConstant, epoch);
            for(int i = 0; i < n_mini_batch; i++){
                std::vector<double> y_hat = modelSetTest(inputMiniBatches[i]);
                cost_prev = Cost(y_hat, outputMiniBatches[i]);
@ -184,6 +187,7 @@ namespace MLPP {
        
        std::vector<double> v_output;
        while(true){
+            learning_rate = applyLearningRateScheduler(learning_rate, decayConstant, epoch);
            for(int i = 0; i < n_mini_batch; i++){
                std::vector<double> y_hat = modelSetTest(inputMiniBatches[i]);
                cost_prev = Cost(y_hat, outputMiniBatches[i]);
@ -234,6 +238,7 @@ namespace MLPP {
        
        std::vector<double> v_output;
        while(true){
+            learning_rate = applyLearningRateScheduler(learning_rate, decayConstant, epoch);
            for(int i = 0; i < n_mini_batch; i++){
                std::vector<double> y_hat = modelSetTest(inputMiniBatches[i]);
                cost_prev = Cost(y_hat, outputMiniBatches[i]);
@ -286,6 +291,7 @@ void ANN::Adam(double learning_rate, int max_epoch, int mini_batch_size, double
        std::vector<double> m_output;
        std::vector<double> v_output;
        while(true){
+            learning_rate = applyLearningRateScheduler(learning_rate, decayConstant, epoch);
            for(int i = 0; i < n_mini_batch; i++){
                std::vector<double> y_hat = modelSetTest(inputMiniBatches[i]);
                cost_prev = Cost(y_hat, outputMiniBatches[i]);
@ -348,6 +354,7 @@ void ANN::Adam(double learning_rate, int max_epoch, int mini_batch_size, double
        std::vector<double> m_output;
        std::vector<double> u_output;
        while(true){
+            learning_rate = applyLearningRateScheduler(learning_rate, decayConstant, epoch);
            for(int i = 0; i < n_mini_batch; i++){
                std::vector<double> y_hat = modelSetTest(inputMiniBatches[i]);
                cost_prev = Cost(y_hat, outputMiniBatches[i]);
@ -409,6 +416,7 @@ void ANN::Adam(double learning_rate, int max_epoch, int mini_batch_size, double
        std::vector<double> m_output;
        std::vector<double> v_output;
        while(true){
+            learning_rate = applyLearningRateScheduler(learning_rate, decayConstant, epoch);
            for(int i = 0; i < n_mini_batch; i++){
                std::vector<double> y_hat = modelSetTest(inputMiniBatches[i]);
                cost_prev = Cost(y_hat, outputMiniBatches[i]);
@ -478,6 +486,7 @@ void ANN::Adam(double learning_rate, int max_epoch, int mini_batch_size, double

        std::vector<double> v_output_hat;
        while(true){
+            learning_rate = applyLearningRateScheduler(learning_rate, decayConstant, epoch);
            for(int i = 0; i < n_mini_batch; i++){
                std::vector<double> y_hat = modelSetTest(inputMiniBatches[i]);
                cost_prev = Cost(y_hat, outputMiniBatches[i]);
@ -540,6 +549,25 @@ void ANN::Adam(double learning_rate, int max_epoch, int mini_batch_size, double
        }
     }

+     // Selects the learning-rate schedule that applyLearningRateScheduler consults.
+     //   type          - scheduler name; the names handled there are "Time",
+     //                   "Exponential" and "Epoch".
+     //   decayConstant - value stored in the member of the same name.
+     void ANN::setLearningRateScheduler(std::string type, double decayConstant){
+        lrScheduler = type;
+        // The parameter hides the member of the same name, so reach it through this->.
+        this->decayConstant = decayConstant;
+     }
+
+    // https://en.wikipedia.org/wiki/Learning_rate
+    // Learning Rate Decay (C2W2L09) - Andrew Ng - Deep Learning Specialization
+    //
+    // Returns learningRate scaled by the scheduler chosen via setLearningRateScheduler:
+    //   "Time"        -> learningRate / (1 + decayConstant * epoch)
+    //   "Exponential" -> learningRate * exp(-decayConstant * epoch)
+    //   "Epoch"       -> learningRate * (decayConstant / sqrt(epoch))
+    // Any other scheduler name, including the constructor default "None",
+    // returns learningRate unchanged.
+    //
+    // NOTE(review): the training loops in this file assign the result back to
+    // learning_rate on every epoch, so the decay is applied to an already
+    // decayed rate. Pass the initial rate instead if the textbook schedule is
+    // what is intended - confirm against the callers.
+     double ANN::applyLearningRateScheduler(double learningRate, double decayConstant, double epoch){
+         if(lrScheduler == "Time"){
+             return learningRate / (1 + decayConstant * epoch);
+         }
+         else if(lrScheduler == "Exponential"){
+             return learningRate * std::exp(-decayConstant * epoch);
+         }
+         else if(lrScheduler == "Epoch"){
+             // epoch == 0 divides by zero here (non-finite result); callers must pass epoch > 0.
+             return learningRate * (decayConstant / std::sqrt(epoch));
+         }
+         // No scheduler matched: leave the rate as is. Without this return, control
+         // would flow off the end of a value-returning function (undefined behaviour).
+         return learningRate;
+     }
+
    void ANN::addLayer(int n_hidden, std::string activation, std::string weightInit, std::string reg, double lambda, double alpha){
        if(network.empty()){
            network.push_back(HiddenLayer(n_hidden, activation, inputSet, weightInit, reg, lambda, alpha));
@ -612,8 +640,7 @@ void ANN::Adam(double learning_rate, int max_epoch, int mini_batch_size, double
    }
    
    std::tuple<std::vector<std::vector<std::vector<double>>>, std::vector<double>> ANN::computeGradients(std::vector<double> y_hat, std::vector<double> outputSet){
-        std::cout << "BEGIN" << std::endl;
-        std::cout << k << std::endl;
+       // std::cout << "BEGIN" << std::endl;
        class Cost cost; 
        Activation avn;
        LinAlg alg;
--- a/MLPP/ANN/ANN.hpp
+++ b/MLPP/ANN/ANN.hpp
@ -34,6 +34,9 @@ class ANN{
        double score(); 
        void save(std::string fileName);

+        void setLearningRateScheduler(std::string type, double k);
+        double applyLearningRateScheduler(double learningRate, double decayConstant, double epoch);
+
        void addLayer(int n_hidden, std::string activation, std::string weightInit = "Default", std::string reg = "None", double lambda = 0.5, double alpha = 0.5); 
        void addOutputLayer(std::string activation, std::string loss, std::string weightInit = "Default", std::string reg = "None", double lambda = 0.5, double alpha = 0.5); 
        
@ -56,6 +59,9 @@ class ANN{

            int n;
            int k;
+
+            std::string lrScheduler;
+            double decayConstant;
    };
 }

--- a/MLPP/GAN/GAN.hpp
+++ b/MLPP/GAN/GAN.hpp
@ -21,7 +21,6 @@ class GAN{
        GAN(double k, std::vector<std::vector<double>> outputSet);
        ~GAN();
        std::vector<std::vector<double>> generateExample(int n);
-        double modelTest(std::vector<double> x);
        void gradientDescent(double learning_rate, int max_epoch, bool UI = 1);
        double score(); 
        void save(std::string fileName);
--- a/README.md
+++ b/README.md
@ -131,6 +131,10 @@ The result will be the model's predictions for the entire dataset.
        - He Uniform
        - LeCun Normal
        - LeCun Uniform
+    6. Possible Learning Rate Schedulers
+        - Time Based 
+        - Exponential 
+        - Epoch Based
 3. ***Prebuilt Neural Networks***
    1. Multilayer Perceptron
    2. Autoencoder
--- a/a.out
+++ b/a.out
--- a/main.cpp
+++ b/main.cpp
@ -363,30 +363,31 @@ int main() {
    // Possible Weight Init Methods: Default, Uniform, HeNormal, HeUniform, XavierNormal, XavierUniform
    // Possible Activations: Linear, Sigmoid, Swish, Softplus, Softsign, CLogLog, Ar{Sinh, Cosh, Tanh, Csch, Sech, Coth},  GaussianCDF, GELU, UnitStep
    // Possible Loss Functions: MSE, RMSE, MBE, LogLoss, CrossEntropy, HingeLoss
-    // std::vector<std::vector<double>> inputSet = {{0,0,1,1}, {0,1,0,1}};
-    // std::vector<double> outputSet = {0,1,1,0};
-    // ANN ann(alg.transpose(inputSet), outputSet);
-    // //ann.addLayer(10, "Sigmoid");
-    // ann.addLayer(10, "Sigmoid");
-    // ann.addOutputLayer("Sigmoid", "LogLoss");
-    // //ann.AMSGrad(0.1, 10000, 1, 0.9, 0.999, 0.000001, 1);
-    // //ann.Adadelta(1, 1000, 2, 0.9, 0.000001, 1);
-    // ann.Momentum(0.1, 8000, 2, 0.9, true, 1);
-    // //ann.MBGD(0.1, 1000, 2, 1);
-    // alg.printVector(ann.modelSetTest(alg.transpose(inputSet)));
-    // std::cout << "ACCURACY: " << 100 * ann.score() << "%" << std::endl;
+    std::vector<std::vector<double>> inputSet = {{0,0,1,1}, {0,1,0,1}};
+    std::vector<double> outputSet = {0,1,1,0};
+    ANN ann(alg.transpose(inputSet), outputSet);
+    ann.addLayer(2, "Sigmoid");
+    ann.addLayer(2, "Sigmoid");
+    ann.addOutputLayer("Sigmoid", "LogLoss");
+    //ann.AMSGrad(0.1, 10000, 1, 0.9, 0.999, 0.000001, 1);
+    //ann.Adadelta(1, 1000, 2, 0.9, 0.000001, 1);
+    //ann.Momentum(0.1, 8000, 2, 0.9, true, 1);
+    ann.setLearningRateScheduler("Time", 0.000000000001);
+    ann.gradientDescent(0.1, 20000, 1);
+    alg.printVector(ann.modelSetTest(alg.transpose(inputSet)));
+    std::cout << "ACCURACY: " << 100 * ann.score() << "%" << std::endl;

-    std::vector<std::vector<double>> outputSet = {{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20}, 
-                                                {2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40}};
+    //std::vector<std::vector<double>> outputSet = {{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20}, 
+    //                                            {2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40}};
    //Vector outputSet = {0,1,1,0};
-    GAN gan(2, alg.transpose(outputSet));
-    gan.addLayer(5, "Sigmoid");
-    gan.addLayer(2, "RELU");
-    gan.addLayer(5, "Sigmoid");
-    gan.addOutputLayer("Sigmoid", "LogLoss");
-    gan.gradientDescent(0.1, 25000, 0);
-    std::cout << "GENERATED INPUT: (Gaussian-sampled noise):" << std::endl;
-    alg.printMatrix(gan.generateExample(5));
+    // GAN gan(2, alg.transpose(outputSet));
+    // gan.addLayer(5, "Sigmoid");
+    // gan.addLayer(2, "RELU");
+    // gan.addLayer(5, "Sigmoid");
+    // gan.addOutputLayer("Sigmoid", "LogLoss");
+    // gan.gradientDescent(0.1, 25000, 0);
+    // std::cout << "GENERATED INPUT: (Gaussian-sampled noise):" << std::endl;
+    // alg.printMatrix(gan.generateExample(100));


    // typedef std::vector<std::vector<double>> Matrix;