diff --git a/MLPP/ANN/ANN.cpp b/MLPP/ANN/ANN.cpp
index d924e2f..753b3d9 100644
--- a/MLPP/ANN/ANN.cpp
+++ b/MLPP/ANN/ANN.cpp
@@ -58,64 +58,133 @@ namespace MLPP {
     void ANN::gradientDescent(double learning_rate, int max_epoch, bool UI){
         class Cost cost;
-        Activation avn;
         LinAlg alg;
-        Reg regularization;
-
         double cost_prev = 0;
         int epoch = 1;
         forwardPass();
+        alg.printMatrix(network[network.size() - 1].weights);
         while(true){
             cost_prev = Cost(y_hat, outputSet);
-
-            auto costDeriv = outputLayer->costDeriv_map[outputLayer->cost];
-            auto outputAvn = outputLayer->activation_map[outputLayer->activation];
-            outputLayer->delta = alg.hadamard_product((cost.*costDeriv)(y_hat, outputSet), (avn.*outputAvn)(outputLayer->z, 1));
-            std::vector<double> outputWGrad = alg.mat_vec_mult(alg.transpose(outputLayer->input), outputLayer->delta);
-            outputLayer->weights = alg.subtraction(outputLayer->weights, alg.scalarMultiply(learning_rate/n, outputWGrad));
-            outputLayer->weights = regularization.regWeights(outputLayer->weights, outputLayer->lambda, outputLayer->alpha, outputLayer->reg);
-            outputLayer->bias -= learning_rate * alg.sum_elements(outputLayer->delta) / n;
+            auto [cumulativeHiddenLayerWGrad, outputWGrad] = computeGradients(y_hat, outputSet);
+            cumulativeHiddenLayerWGrad = alg.scalarMultiply(learning_rate/n, cumulativeHiddenLayerWGrad);
+            outputWGrad = alg.scalarMultiply(learning_rate/n, outputWGrad);
-            if(!network.empty()){
-                auto hiddenLayerAvn = network[network.size() - 1].activation_map[network[network.size() - 1].activation];
-                network[network.size() - 1].delta = alg.hadamard_product(alg.outerProduct(outputLayer->delta, outputLayer->weights), (avn.*hiddenLayerAvn)(network[network.size() - 1].z, 1));
-                std::vector<std::vector<double>> hiddenLayerWGrad = alg.matmult(alg.transpose(network[network.size() - 1].input), network[network.size() - 1].delta);
-
-                network[network.size() - 1].weights = alg.subtraction(network[network.size() - 1].weights, alg.scalarMultiply(learning_rate/n, hiddenLayerWGrad));
-                network[network.size() - 1].weights = regularization.regWeights(network[network.size() - 1].weights, network[network.size() - 1].lambda, network[network.size() - 1].alpha, network[network.size() - 1].reg);
-                network[network.size() - 1].bias = alg.subtractMatrixRows(network[network.size() - 1].bias, alg.scalarMultiply(learning_rate/n, network[network.size() - 1].delta));
+            updateParameters(cumulativeHiddenLayerWGrad, outputWGrad, learning_rate); // subject to change. may want bias to have this matrix too.
-            for(int i = network.size() - 2; i >= 0; i--){
-                auto hiddenLayerAvn = network[i].activation_map[network[i].activation];
-                network[i].delta = alg.hadamard_product(alg.matmult(network[i + 1].delta, network[i + 1].weights), (avn.*hiddenLayerAvn)(network[i].z, 1));
-                std::vector<std::vector<double>> hiddenLayerWGrad = alg.matmult(alg.transpose(network[i].input), network[i].delta);
-                network[i].weights = alg.subtraction(network[i].weights, alg.scalarMultiply(learning_rate/n, hiddenLayerWGrad));
-                network[i].weights = regularization.regWeights(network[i].weights, network[i].lambda, network[i].alpha, network[i].reg);
-                network[i].bias = alg.subtractMatrixRows(network[i].bias, alg.scalarMultiply(learning_rate/n, network[i].delta));
-            }
-        }
-
            forwardPass();
-            if(UI) {
-                Utilities::CostInfo(epoch, cost_prev, Cost(y_hat, outputSet));
-                std::cout << "Layer " << network.size() + 1 << ": " << std::endl;
-                Utilities::UI(outputLayer->weights, outputLayer->bias);
-                if(!network.empty()){
-                    for(int i = network.size() - 1; i >= 0; i--){
-                        std::cout << "Layer " << i + 1 << ": " << std::endl;
-                        Utilities::UI(network[i].weights, network[i].bias);
-                    }
-                }
-            }
+            if(UI) { ANN::UI(epoch, cost_prev, y_hat, outputSet); }
            epoch++;
            if(epoch > max_epoch) { break; }
        }
    }
+
+    void ANN::MBGD(double learning_rate, int max_epoch, int mini_batch_size, bool UI){
+        class Cost cost;
+        LinAlg alg;
+
+        double cost_prev = 0;
+        int epoch = 1;
+
+        // Creating the mini-batches
+        int n_mini_batch = n/mini_batch_size;
+        // always evaluate the result
+        // always do forward pass only ONCE at end.
+        auto [inputMiniBatches, outputMiniBatches] = Utilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
+        while(true){
+            for(int i = 0; i < n_mini_batch; i++){
+                std::vector<double> y_hat = modelSetTest(inputMiniBatches[i]);
+                cost_prev = Cost(y_hat, outputMiniBatches[i]);
+
+                auto [cumulativeHiddenLayerWGrad, outputWGrad] = computeGradients(y_hat, outputMiniBatches[i]);
+                cumulativeHiddenLayerWGrad = alg.scalarMultiply(learning_rate/n, cumulativeHiddenLayerWGrad);
+                outputWGrad = alg.scalarMultiply(learning_rate/n, outputWGrad);
+
+                updateParameters(cumulativeHiddenLayerWGrad, outputWGrad, learning_rate); // subject to change. may want bias to have this matrix too.
+                y_hat = modelSetTest(inputMiniBatches[i]);
+
+                if(UI) { ANN::UI(epoch, cost_prev, y_hat, outputMiniBatches[i]); }
+            }
+            epoch++;
+            if(epoch > max_epoch) { break; }
+        }
+        forwardPass();
+    }
+
+    void ANN::Adam(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI){
+        class Cost cost;
+        LinAlg alg;
+
+        double cost_prev = 0;
+        int epoch = 1;
+
+        // Creating the mini-batches
+        int n_mini_batch = n/mini_batch_size;
+        // always evaluate the result
+        // always do forward pass only ONCE at end.
+        auto [inputMiniBatches, outputMiniBatches] = Utilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
+
+        // Initializing necessary components for Adam.
+        std::vector<std::vector<std::vector<double>>> m_hidden;
+        std::vector<std::vector<std::vector<double>>> v_hidden;
+
+        std::vector<double> m_output;
+        std::vector<double> v_output;
+        while(true){
+            for(int i = 0; i < n_mini_batch; i++){
+                std::vector<double> y_hat = modelSetTest(inputMiniBatches[i]);
+                cost_prev = Cost(y_hat, outputMiniBatches[i]);
+
+                auto [cumulativeHiddenLayerWGrad, outputWGrad] = computeGradients(y_hat, outputMiniBatches[i]);
+
+                if(!network.empty() && m_hidden.empty() && v_hidden.empty()){ // Initing our tensor
+                    m_hidden.resize(cumulativeHiddenLayerWGrad.size());
+                    v_hidden.resize(cumulativeHiddenLayerWGrad.size());
+                    for(int i = 0; i < cumulativeHiddenLayerWGrad.size(); i++){
+                        m_hidden[i].resize(cumulativeHiddenLayerWGrad[i].size());
+                        v_hidden[i].resize(cumulativeHiddenLayerWGrad[i].size());
+                        for(int j = 0; j < cumulativeHiddenLayerWGrad[i].size(); j++){
+                            m_hidden[i][j].resize(cumulativeHiddenLayerWGrad[i][j].size());
+                            v_hidden[i][j].resize(cumulativeHiddenLayerWGrad[i][j].size());
+                        }
+                    }
+                }
+
+                if(m_output.empty() && v_output.empty()){
+                    m_output.resize(outputWGrad.size());
+                    v_output.resize(outputWGrad.size());
+                }
+
+                m_hidden = alg.addition(alg.scalarMultiply(b1, m_hidden), alg.scalarMultiply(1 - b1, cumulativeHiddenLayerWGrad));
+                v_hidden = alg.addition(alg.scalarMultiply(b2, v_hidden), alg.scalarMultiply(1 - b2, alg.exponentiate(cumulativeHiddenLayerWGrad, 2)));
+
+                m_output = alg.addition(alg.scalarMultiply(b1, m_output), alg.scalarMultiply(1 - b1, outputWGrad));
+                v_output = alg.addition(alg.scalarMultiply(b2, v_output), alg.scalarMultiply(1 - b2, alg.exponentiate(outputWGrad, 2)));
+
+                std::vector<std::vector<std::vector<double>>> m_hidden_hat = alg.scalarMultiply(1/(1 - pow(b1, epoch)), m_hidden);
+                std::vector<std::vector<std::vector<double>>> v_hidden_hat = alg.scalarMultiply(1/(1 - pow(b2, epoch)), v_hidden);
+
+                std::vector<double> m_output_hat = alg.scalarMultiply(1/(1 - pow(b1, epoch)), m_output);
+                std::vector<double> v_output_hat = alg.scalarMultiply(1/(1 - pow(b2, epoch)), v_output);
+
+                std::vector<std::vector<std::vector<double>>> hiddenLayerUpdations = alg.scalarMultiply(learning_rate/n, alg.elementWiseDivision(m_hidden_hat, alg.scalarAdd(e, alg.sqrt(v_hidden_hat))));
+                std::vector<double> outputLayerUpdation = alg.scalarMultiply(learning_rate/n, alg.elementWiseDivision(m_output_hat, alg.scalarAdd(e, alg.sqrt(v_output_hat))));
+
+
+                updateParameters(hiddenLayerUpdations, outputLayerUpdation, learning_rate); // subject to change. may want bias to have this matrix too.
+                y_hat = modelSetTest(inputMiniBatches[i]);
+
+                if(UI) { ANN::UI(epoch, cost_prev, y_hat, outputMiniBatches[i]); }
+            }
+            epoch++;
+            if(epoch > max_epoch) { break; }
+        }
+        forwardPass();
+    }
+
    double ANN::score(){
        Utilities util;
        forwardPass();
@@ -148,6 +217,7 @@ namespace MLPP {
    }
    void ANN::addOutputLayer(std::string activation, std::string loss, std::string weightInit, std::string reg, double lambda, double alpha){
+        LinAlg alg;
        if(!network.empty()){
            outputLayer = new OutputLayer(network[0].n_hidden, activation, loss, network[network.size() - 1].a, weightInit, reg, lambda, alpha);
        }
@@ -187,4 +257,67 @@ namespace MLPP {
        outputLayer->forwardPass();
        y_hat = outputLayer->a;
    }
+
+    void ANN::updateParameters(std::vector<std::vector<std::vector<double>>> hiddenLayerUpdations, std::vector<double> outputLayerUpdation, double learning_rate){
+        LinAlg alg;
+
+        outputLayer->weights = alg.subtraction(outputLayer->weights, outputLayerUpdation);
+        outputLayer->bias -= learning_rate * alg.sum_elements(outputLayer->delta) / n;
+
+        if(!network.empty()){
+
+            network[network.size() - 1].weights = alg.subtraction(network[network.size() - 1].weights, hiddenLayerUpdations[0]);
+            network[network.size() - 1].bias = alg.subtractMatrixRows(network[network.size() - 1].bias, alg.scalarMultiply(learning_rate/n, network[network.size() - 1].delta));
+
+            for(int i = network.size() - 2; i >= 0; i--){
+                network[i].weights = alg.subtraction(network[i].weights, hiddenLayerUpdations[(network.size() - 2) - i + 1]);
+                network[i].bias = alg.subtractMatrixRows(network[i].bias, alg.scalarMultiply(learning_rate/n, network[i].delta));
+            }
+        }
+    }
+
+    std::tuple<std::vector<std::vector<std::vector<double>>>, std::vector<double>> ANN::computeGradients(std::vector<double> y_hat, std::vector<double> outputSet){
+        class Cost cost;
+        Activation avn;
+        LinAlg alg;
+        Reg regularization;
+
+        std::vector<std::vector<std::vector<double>>> cumulativeHiddenLayerWGrad; // Tensor containing ALL hidden grads.
+
+        auto costDeriv = outputLayer->costDeriv_map[outputLayer->cost];
+        auto outputAvn = outputLayer->activation_map[outputLayer->activation];
+        outputLayer->delta = alg.hadamard_product((cost.*costDeriv)(y_hat, outputSet), (avn.*outputAvn)(outputLayer->z, 1));
+        std::vector<double> outputWGrad = alg.mat_vec_mult(alg.transpose(outputLayer->input), outputLayer->delta);
+        outputWGrad = alg.addition(outputWGrad, regularization.regDerivTerm(outputLayer->weights, outputLayer->lambda, outputLayer->alpha, outputLayer->reg));
+
+        if(!network.empty()){
+            auto hiddenLayerAvn = network[network.size() - 1].activation_map[network[network.size() - 1].activation];
+            network[network.size() - 1].delta = alg.hadamard_product(alg.outerProduct(outputLayer->delta, outputLayer->weights), (avn.*hiddenLayerAvn)(network[network.size() - 1].z, 1));
+            std::vector<std::vector<double>> hiddenLayerWGrad = alg.matmult(alg.transpose(network[network.size() - 1].input), network[network.size() - 1].delta);
+
+            cumulativeHiddenLayerWGrad.push_back(alg.addition(hiddenLayerWGrad, regularization.regDerivTerm(network[network.size() - 1].weights, network[network.size() - 1].lambda, network[network.size() - 1].alpha, network[network.size() - 1].reg))); // Adding to our cumulative hidden layer grads. Maintain reg terms as well.
+
+            for(int i = network.size() - 2; i >= 0; i--){
+                auto hiddenLayerAvn = network[i].activation_map[network[i].activation];
+                network[i].delta = alg.hadamard_product(alg.matmult(network[i + 1].delta, network[i + 1].weights), (avn.*hiddenLayerAvn)(network[i].z, 1));
+                std::vector<std::vector<double>> hiddenLayerWGrad = alg.matmult(alg.transpose(network[i].input), network[i].delta);
+
+                cumulativeHiddenLayerWGrad.push_back(alg.addition(hiddenLayerWGrad, regularization.regDerivTerm(network[i].weights, network[i].lambda, network[i].alpha, network[i].reg))); // Adding to our cumulative hidden layer grads. Maintain reg terms as well.
+
+            }
+        }
+        return {cumulativeHiddenLayerWGrad, outputWGrad};
+    }
+
+    void ANN::UI(int epoch, double cost_prev, std::vector<double> y_hat, std::vector<double> outputSet){
+        Utilities::CostInfo(epoch, cost_prev, Cost(y_hat, outputSet));
+        std::cout << "Layer " << network.size() + 1 << ": " << std::endl;
+        Utilities::UI(outputLayer->weights, outputLayer->bias);
+        if(!network.empty()){
+            for(int i = network.size() - 1; i >= 0; i--){
+                std::cout << "Layer " << i + 1 << ": " << std::endl;
+                Utilities::UI(network[i].weights, network[i].bias);
+            }
+        }
+    }
}
\ No newline at end of file
diff --git a/MLPP/ANN/ANN.hpp b/MLPP/ANN/ANN.hpp
index 4964acf..64b1e75 100644
--- a/MLPP/ANN/ANN.hpp
+++ b/MLPP/ANN/ANN.hpp
@@ -11,6 +11,7 @@
 #include "OutputLayer/OutputLayer.hpp"
 
 #include <vector>
+#include <tuple>
 #include <string>
 
 namespace MLPP{
@@ -22,6 +23,8 @@ class ANN{
        std::vector<double> modelSetTest(std::vector<std::vector<double>> X);
        double modelTest(std::vector<double> x);
        void gradientDescent(double learning_rate, int max_epoch, bool UI = 1);
+        void MBGD(double learning_rate, int max_epoch, int mini_batch_size, bool UI = 1);
+        void Adam(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI = 1);
        double score();
        void save(std::string fileName);
@@ -30,7 +33,13 @@ class ANN{
    private:
        double Cost(std::vector<double> y_hat, std::vector<double> y);
+
        void forwardPass();
+        void updateParameters(std::vector<std::vector<std::vector<double>>> hiddenLayerUpdations, std::vector<double> outputLayerUpdation, double learning_rate);
+        std::tuple<std::vector<std::vector<std::vector<double>>>, std::vector<double>> computeGradients(std::vector<double> y_hat, std::vector<double> outputSet);
+
+        void UI(int epoch, double cost_prev, std::vector<double> y_hat, std::vector<double> outputSet);
+
        std::vector<std::vector<double>> inputSet;
        std::vector<double> outputSet;
diff --git a/MLPP/LinAlg/LinAlg.cpp b/MLPP/LinAlg/LinAlg.cpp
index 66a514b..1fec7ac 100644
--- a/MLPP/LinAlg/LinAlg.cpp
+++ b/MLPP/LinAlg/LinAlg.cpp
@@ -1059,6 +1059,34 @@ namespace MLPP{
        return c;
    }
+
+    std::vector<std::vector<std::vector<double>>> LinAlg::addition(std::vector<std::vector<std::vector<double>>> A, std::vector<std::vector<std::vector<double>>> B){
+        for(int i = 0; i < A.size(); i++){
+            A[i] = addition(A[i], B[i]);
+        }
+        return A;
+    }
+
+    std::vector<std::vector<std::vector<double>>> LinAlg::elementWiseDivision(std::vector<std::vector<std::vector<double>>> A, std::vector<std::vector<std::vector<double>>> B){
+        for(int i = 0; i < A.size(); i++){
+            A[i] = elementWiseDivision(A[i], B[i]);
+        }
+        return A;
+    }
+
+    std::vector<std::vector<std::vector<double>>> LinAlg::sqrt(std::vector<std::vector<std::vector<double>>> A){
+        for(int i = 0; i < A.size(); i++){
+            A[i] = sqrt(A[i]);
+        }
+        return A;
+    }
+
+    std::vector<std::vector<std::vector<double>>> LinAlg::exponentiate(std::vector<std::vector<std::vector<double>>> A, double p){
+        for(int i = 0; i < A.size(); i++){
+            A[i] = exponentiate(A[i], p);
+        }
+        return A;
+    }
+
    std::vector<std::vector<double>> LinAlg::tensor_vec_mult(std::vector<std::vector<std::vector<double>>> A, std::vector<double> b){
        std::vector<std::vector<double>> C;
        C.resize(A.size());
@@ -1088,4 +1116,18 @@ namespace MLPP{
            if(i != A.size() - 1) { std::cout << std::endl; }
        }
    }
+
+    std::vector<std::vector<std::vector<double>>> LinAlg::scalarMultiply(double scalar, std::vector<std::vector<std::vector<double>>> A){
+        for(int i = 0; i < A.size(); i++){
+            A[i] = scalarMultiply(scalar, A[i]);
+        }
+        return A;
+    }
+
+    std::vector<std::vector<std::vector<double>>> LinAlg::scalarAdd(double scalar, std::vector<std::vector<std::vector<double>>> A){
+        for(int i = 0; i < A.size(); i++){
+            A[i] = scalarAdd(scalar, A[i]);
+        }
+        return A;
+    }
}
\ No newline at end of file
diff --git a/MLPP/LinAlg/LinAlg.hpp b/MLPP/LinAlg/LinAlg.hpp
index 75922a3..5f9402a 100644
--- a/MLPP/LinAlg/LinAlg.hpp
+++ b/MLPP/LinAlg/LinAlg.hpp
@@ -190,12 +190,26 @@ namespace MLPP{
        std::vector<double> mat_vec_mult(std::vector<std::vector<double>> A, std::vector<double> b);
 
        // TENSOR FUNCTIONS
+        std::vector<std::vector<std::vector<double>>> addition(std::vector<std::vector<std::vector<double>>> A, std::vector<std::vector<std::vector<double>>> B);
+
+        std::vector<std::vector<std::vector<double>>> elementWiseDivision(std::vector<std::vector<std::vector<double>>> A, std::vector<std::vector<std::vector<double>>> B);
+
+        std::vector<std::vector<std::vector<double>>> sqrt(std::vector<std::vector<std::vector<double>>> A);
+
+        std::vector<std::vector<std::vector<double>>> exponentiate(std::vector<std::vector<std::vector<double>>> A, double p);
+
        std::vector<std::vector<double>> tensor_vec_mult(std::vector<std::vector<std::vector<double>>> A, std::vector<double> b);
 
        std::vector<double> flatten(std::vector<std::vector<std::vector<double>>> A);
 
        void printTensor(std::vector<std::vector<std::vector<double>>> A);
 
+        std::vector<std::vector<std::vector<double>>> scalarMultiply(double scalar, std::vector<std::vector<std::vector<double>>> A);
+
+        std::vector<std::vector<std::vector<double>>> scalarAdd(double scalar, std::vector<std::vector<std::vector<double>>> A);
+
+        std::vector<std::vector<std::vector<double>>> resize(std::vector<std::vector<std::vector<double>>> A, std::vector<std::vector<std::vector<double>>> B);
+
    private:
    };
diff --git a/MLPP/LinReg/LinReg.hpp b/MLPP/LinReg/LinReg.hpp
index bcc7a92..dbf484a 100644
--- a/MLPP/LinReg/LinReg.hpp
+++ b/MLPP/LinReg/LinReg.hpp
@@ -20,14 +20,14 @@ namespace MLPP{
        void NewtonRaphson(double learning_rate, int max_epoch, bool UI);
        void gradientDescent(double learning_rate, int max_epoch, bool UI = 1);
        void SGD(double learning_rate, int max_epoch, bool UI = 1);
-        void MBGD(double learning_rate, int max_epoch, int mini_batch_size, bool UI = 1);
-        void Momentum(double learning_rate, int max_epoch, int mini_batch_size, double gamma, bool UI = 1);
-        void NAG(double learning_rate, int max_epoch, int mini_batch_size, double gamma, bool UI = 1);
-        void Adagrad(double learning_rate, int max_epoch, int mini_batch_size, double e, bool UI = 1);
-        void Adadelta(double learning_rate, int max_epoch, int mini_batch_size, double b1, double e, bool UI = 1);
-        void Adam(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI = 1);
-        void Adamax(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI = 1);
-        void Nadam(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI = 1);
+        // void MBGD(double learning_rate, int max_epoch, int mini_batch_size, bool UI = 1);
+        // void Momentum(double learning_rate, int max_epoch, int mini_batch_size, double gamma, bool UI = 1);
+        // void NAG(double learning_rate, int max_epoch, int mini_batch_size, double gamma, bool UI = 1);
+        // void Adagrad(double learning_rate, int max_epoch, int mini_batch_size, double e, bool UI = 1);
+        // void Adadelta(double learning_rate, int max_epoch, int mini_batch_size, double b1, double e, bool UI = 1);
+        // void Adam(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI = 1);
+        // void Adamax(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI = 1);
+        // void Nadam(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI = 1);
        void normalEquation();
        double score();
        void save(std::string fileName);
diff --git a/a.out b/a.out
index 2521504..81eff6f 100755
Binary files a/a.out and b/a.out differ
diff --git a/main.cpp b/main.cpp
index 939a8f4..8e63a10 100644
--- a/main.cpp
+++ b/main.cpp
@@ -212,10 +212,10 @@ int main() {
    // alg.printVector(model.modelSetTest(inputSet));
 
    // // MULIVARIATE LINEAR REGRESSION
-    std::vector<std::vector<double>> inputSet = {{1,2,3,4,5,6,7,8,9,10}, {3,5,9,12,15,18,21,24,27,30}};
-    std::vector<double> outputSet = {2,4,6,8,10,12,14,16,18,20};
+    // std::vector<std::vector<double>> inputSet = {{1,2,3,4,5,6,7,8,9,10}, {3,5,9,12,15,18,21,24,27,30}};
+    // std::vector<double> outputSet = {2,4,6,8,10,12,14,16,18,20};
 
-    LinReg model(alg.transpose(inputSet), outputSet); // Can use Lasso, Ridge, ElasticNet Reg
+    //LinReg model(alg.transpose(inputSet), outputSet); // Can use Lasso, Ridge, ElasticNet Reg
 
    //model.gradientDescent(0.001, 30, 0);
    //model.SGD(0.001, 30000, 1);
@@ -224,10 +224,10 @@ int main() {
 
-    LinReg adamModel(alg.transpose(inputSet), outputSet);
-    adamModel.Nadam(0.1, 5, 1, 0.9, 0.999, 1e-8, 0); // Change batch size = sgd, bgd
-    alg.printVector(adamModel.modelSetTest(alg.transpose(inputSet)));
-    std::cout << "ACCURACY: " << 100 * adamModel.score() << "%" << std::endl;
+    // LinReg adamModel(alg.transpose(inputSet), outputSet);
+    // adamModel.Nadam(0.1, 5, 1, 0.9, 0.999, 1e-8, 0); // Change batch size = sgd, bgd
+    // alg.printVector(adamModel.modelSetTest(alg.transpose(inputSet)));
+    // std::cout << "ACCURACY: " << 100 * adamModel.score() << "%" << std::endl;
 
    // const int TRIAL_NUM = 1000;
@@ -361,15 +361,16 @@ int main() {
    // Possible Weight Init Methods: Default, Uniform, HeNormal, HeUniform, XavierNormal, XavierUniform
    // Possible Activations: Linear, Sigmoid, Swish, Softplus, Softsign, CLogLog, Ar{Sinh, Cosh, Tanh, Csch, Sech, Coth}, GaussianCDF, GELU, UnitStep
    // Possible Loss Functions: MSE, RMSE, MBE, LogLoss, CrossEntropy, HingeLoss
-    // std::vector<std::vector<double>> inputSet = {{0,0,1,1}, {0,1,0,1}};
-    // std::vector<double> outputSet = {0,1,1,0};
-    // ANN ann(alg.transpose(inputSet), outputSet);
-    // ann.addLayer(10, "RELU", "Default", "Ridge", 0.0001);
-    // ann.addLayer(10, "Sigmoid", "Default");
-    // ann.addOutputLayer("Sigmoid", "LogLoss", "XavierNormal");
-    // ann.gradientDescent(0.1, 80000, 0);
-    // alg.printVector(ann.modelSetTest(alg.transpose(inputSet)));
-    // std::cout << "ACCURACY: " << 100 * ann.score() << "%" << std::endl;
+    std::vector<std::vector<double>> inputSet = {{0,0,1,1}, {0,1,0,1}};
+    std::vector<double> outputSet = {0,1,1,0};
+    ANN ann(alg.transpose(inputSet), outputSet);
+    //ann.addLayer(10, "RELU", "Default", "Ridge", 0.0001);
+    ann.addLayer(10, "RELU", "Default", "XavierNormal");
+    ann.addOutputLayer("Sigmoid", "LogLoss");
+    ann.Adam(0.1, 800, 2, 0.9, 0.999, 1e-8, 1);
+    //ann.MBGD(0.1, 1000, 2, 1);
+    alg.printVector(ann.modelSetTest(alg.transpose(inputSet)));
+    std::cout << "ACCURACY: " << 100 * ann.score() << "%" << std::endl;
 
    // typedef std::vector<std::vector<double>> Matrix;
    // typedef std::vector<double> Vector;
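
Note on the Adam routine added in this patch: per weight it keeps a first-moment estimate m and a second-moment estimate v of the gradient, bias-corrects them by 1/(1 - b1^t) and 1/(1 - b2^t), and steps by learning_rate * m_hat / (sqrt(v_hat) + e); the tensor helpers added to LinAlg exist only to apply that rule across all hidden-layer weight matrices at once. The snippet below is a minimal, self-contained scalar sketch of that same update, not MLPP code (the file name and variable names are made up for illustration), included only to make the tensor-level calls above easier to follow.

// adam_sketch.cpp -- standalone illustration of the Adam update applied by ANN::Adam above.
// Minimizes f(w) = (w - 3)^2 for a single scalar parameter w. Hypothetical example, not MLPP API.
#include <cmath>
#include <iostream>

int main() {
    double w = 0.0;                        // parameter being optimized
    double m = 0.0, v = 0.0;               // first / second moment estimates
    const double lr = 0.1, b1 = 0.9, b2 = 0.999, e = 1e-8;

    for (int t = 1; t <= 200; t++) {
        double grad = 2.0 * (w - 3.0);             // gradient of (w - 3)^2

        m = b1 * m + (1 - b1) * grad;              // running mean of gradients
        v = b2 * v + (1 - b2) * grad * grad;       // running mean of squared gradients

        double m_hat = m / (1 - std::pow(b1, t));  // bias correction
        double v_hat = v / (1 - std::pow(b2, t));

        w -= lr * m_hat / (std::sqrt(v_hat) + e);  // the Adam step
    }
    std::cout << "w converges toward 3: " << w << std::endl;
    return 0;
}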