diff --git a/mlpp/ann/ann.cpp b/mlpp/ann/ann.cpp
index 3635afd..f949b28 100644
--- a/mlpp/ann/ann.cpp
+++ b/mlpp/ann/ann.cpp
@@ -8,6 +8,7 @@
 
 #include "../activation/activation.h"
 #include "../cost/cost.h"
+#include "../lin_alg/lin_alg.h"
 #include "../regularization/reg.h"
 #include "../utilities/utilities.h"
 #include "core/log/logger.h"
@@ -62,6 +63,7 @@ real_t MLPPANN::model_test(const Ref<MLPPVector> &x) {
 
 void MLPPANN::gradient_descent(real_t learning_rate, int max_epoch, bool ui) {
 	MLPPCost mlpp_cost;
+	MLPPLinAlg alg;
 
 	real_t cost_prev = 0;
 	int epoch = 1;
@@ -75,8 +77,8 @@ void MLPPANN::gradient_descent(real_t learning_rate, int max_epoch, bool ui) {
 		cost_prev = cost(_y_hat, _output_set);
 
 		ComputeGradientsResult grads = compute_gradients(_y_hat, _output_set);
 
-		grads.cumulative_hidden_layer_w_grad->scalar_multiply(learning_rate / _n);
+		grads.cumulative_hidden_layer_w_grad = alg.scalar_multiplynvt(learning_rate / _n, grads.cumulative_hidden_layer_w_grad);
 		grads.output_w_grad->scalar_multiply(learning_rate / _n);
 
 		update_parameters(grads.cumulative_hidden_layer_w_grad, grads.output_w_grad, learning_rate); // subject to change. may want bias to have this matrix too.
@@ -97,6 +99,7 @@ void MLPPANN::gradient_descent(real_t learning_rate, int max_epoch, bool ui) {
 
 void MLPPANN::sgd(real_t learning_rate, int max_epoch, bool ui) {
 	MLPPCost mlpp_cost;
+	MLPPLinAlg alg;
 
 	real_t cost_prev = 0;
 	int epoch = 1;
@@ -134,7 +137,7 @@ void MLPPANN::sgd(real_t learning_rate, int max_epoch, bool ui) {
 
 		ComputeGradientsResult grads = compute_gradients(y_hat_row_tmp, output_set_row_tmp);
 
-		grads.cumulative_hidden_layer_w_grad->scalar_multiply(learning_rate / _n);
+		grads.cumulative_hidden_layer_w_grad = alg.scalar_multiplynvt(learning_rate / _n, grads.cumulative_hidden_layer_w_grad);
 		grads.output_w_grad->scalar_multiply(learning_rate / _n);
 
 		update_parameters(grads.cumulative_hidden_layer_w_grad, grads.output_w_grad, learning_rate); // subject to change. may want bias to have this matrix too.
@@ -156,6 +159,7 @@ void MLPPANN::sgd(real_t learning_rate, int max_epoch, bool ui) {
 
 void MLPPANN::mbgd(real_t learning_rate, int max_epoch, int mini_batch_size, bool ui) {
 	MLPPCost mlpp_cost;
+	MLPPLinAlg alg;
 
 	real_t cost_prev = 0;
 	int epoch = 1;
@@ -180,7 +184,7 @@ void MLPPANN::mbgd(real_t learning_rate, int max_epoch, int mini_batch_size, boo
 
 		ComputeGradientsResult grads = compute_gradients(y_hat, current_output_batch);
 
-		grads.cumulative_hidden_layer_w_grad->scalar_multiply(learning_rate / _n);
+		grads.cumulative_hidden_layer_w_grad = alg.scalar_multiplynvt(learning_rate / _n, grads.cumulative_hidden_layer_w_grad);
 		grads.output_w_grad->scalar_multiply(learning_rate / _n);
 
 		update_parameters(grads.cumulative_hidden_layer_w_grad, grads.output_w_grad, learning_rate); // subject to change. may want bias to have this matrix too.
@@ -203,6 +207,7 @@ void MLPPANN::mbgd(real_t learning_rate, int max_epoch, int mini_batch_size, boo
 
 void MLPPANN::momentum(real_t learning_rate, int max_epoch, int mini_batch_size, real_t gamma, bool nag, bool ui) {
 	class MLPPCost mlpp_cost;
+	MLPPLinAlg alg;
 
 	real_t cost_prev = 0;
 	int epoch = 1;
@@ -216,8 +221,7 @@ void MLPPANN::momentum(real_t learning_rate, int max_epoch, int mini_batch_size,
 	MLPPUtilities::CreateMiniBatchMVBatch batches = MLPPUtilities::create_mini_batchesmv(_input_set, _output_set, n_mini_batch);
 
 	// Initializing necessary components for Adam.
-	Ref<MLPPTensor3> v_hidden;
-	v_hidden.instance();
+	Vector<Ref<MLPPMatrix>> v_hidden;
 
 	Ref<MLPPVector> v_output;
 	v_output.instance();
@@ -234,8 +238,8 @@ void MLPPANN::momentum(real_t learning_rate, int max_epoch, int mini_batch_size,
 
 		ComputeGradientsResult grads = compute_gradients(y_hat, current_output_batch);
 
-		if (!_network.empty() && v_hidden->size() == Size3i()) { // Initing our tensor
-			v_hidden->resize(grads.cumulative_hidden_layer_w_grad->size());
+		if (!_network.empty() && v_hidden.empty()) { // Initing our tensor
+			alg.resizevt(v_hidden, grads.cumulative_hidden_layer_w_grad);
 		}
 
 		if (v_output->size() == 0) {
@@ -246,7 +250,7 @@ void MLPPANN::momentum(real_t learning_rate, int max_epoch, int mini_batch_size,
 			update_parameters(v_hidden, v_output, 0); // DON'T update bias.
 		}
 
-		v_hidden = v_hidden->scalar_multiplyn(gamma)->addn(grads.cumulative_hidden_layer_w_grad->scalar_multiplyn(learning_rate / _n));
+		v_hidden = alg.additionnvt(alg.scalar_multiplynvt(gamma, v_hidden), alg.scalar_multiplynvt(learning_rate / _n, grads.cumulative_hidden_layer_w_grad));
 		v_output = v_output->scalar_multiplyn(gamma)->addn(grads.output_w_grad->scalar_multiplyn(learning_rate / _n));
 
 		update_parameters(v_hidden, v_output, learning_rate); // subject to change. may want bias to have this matrix too.
@@ -269,6 +273,7 @@ void MLPPANN::momentum(real_t learning_rate, int max_epoch, int mini_batch_size,
 
 void MLPPANN::adagrad(real_t learning_rate, int max_epoch, int mini_batch_size, real_t e, bool ui) {
 	MLPPCost mlpp_cost;
+	MLPPLinAlg alg;
 
 	real_t cost_prev = 0;
 	int epoch = 1;
@@ -282,8 +287,7 @@ void MLPPANN::adagrad(real_t learning_rate, int max_epoch, int mini_batch_size,
 	MLPPUtilities::CreateMiniBatchMVBatch batches = MLPPUtilities::create_mini_batchesmv(_input_set, _output_set, n_mini_batch);
 
 	// Initializing necessary components for Adam.
-	Ref<MLPPTensor3> v_hidden;
-	v_hidden.instance();
+	Vector<Ref<MLPPMatrix>> v_hidden;
 
 	Ref<MLPPVector> v_output;
 	v_output.instance();
@@ -300,18 +304,19 @@ void MLPPANN::adagrad(real_t learning_rate, int max_epoch, int mini_batch_size,
 
 		ComputeGradientsResult grads = compute_gradients(y_hat, current_output_batch);
 
-		if (!_network.empty() && v_hidden->size() == Size3i()) { // Initing our tensor
-			v_hidden->resize(grads.cumulative_hidden_layer_w_grad->size());
+		if (!_network.empty() && v_hidden.empty()) { // Initing our tensor
+			alg.resizevt(v_hidden, grads.cumulative_hidden_layer_w_grad);
 		}
 
 		if (v_output->size() == 0) {
 			v_output->resize(grads.output_w_grad->size());
 		}
 
-		v_hidden->add(grads.cumulative_hidden_layer_w_grad->exponentiaten(2));
+		v_hidden = alg.additionnvt(v_hidden, alg.exponentiatenvt(grads.cumulative_hidden_layer_w_grad, 2));
 		v_output->add(grads.output_w_grad->exponentiaten(2));
 
-		Ref<MLPPTensor3> hidden_layer_updations = grads.cumulative_hidden_layer_w_grad->division_element_wisen(v_hidden->sqrtn()->scalar_addn(e))->scalar_multiplyn(learning_rate / _n);
+		Vector<Ref<MLPPMatrix>> hidden_layer_updations = alg.scalar_multiplynvt(learning_rate / _n, alg.division_element_wisenvnvt(grads.cumulative_hidden_layer_w_grad, alg.scalar_addnvt(e, alg.sqrtnvt(v_hidden))));
+
 		Ref<MLPPVector> output_layer_updation = grads.output_w_grad->division_element_wisen(v_output->sqrtn()->scalar_addn(e))->scalar_multiplyn(learning_rate / _n);
 
 		update_parameters(hidden_layer_updations, output_layer_updation, learning_rate); // subject to change. may want bias to have this matrix too.
@@ -334,6 +339,7 @@ void MLPPANN::adagrad(real_t learning_rate, int max_epoch, int mini_batch_size,
 
 void MLPPANN::adadelta(real_t learning_rate, int max_epoch, int mini_batch_size, real_t b1, real_t e, bool ui) {
 	MLPPCost mlpp_cost;
+	MLPPLinAlg alg;
 
 	real_t cost_prev = 0;
 	int epoch = 1;
@@ -347,8 +353,7 @@ void MLPPANN::adadelta(real_t learning_rate, int max_epoch, int mini_batch_size,
 	MLPPUtilities::CreateMiniBatchMVBatch batches = MLPPUtilities::create_mini_batchesmv(_input_set, _output_set, n_mini_batch);
 
 	// Initializing necessary components for Adam.
-	Ref<MLPPTensor3> v_hidden;
-	v_hidden.instance();
+	Vector<Ref<MLPPMatrix>> v_hidden;
 
 	Ref<MLPPVector> v_output;
 	v_output.instance();
@@ -364,18 +369,18 @@ void MLPPANN::adadelta(real_t learning_rate, int max_epoch, int mini_batch_size,
 
 		ComputeGradientsResult grads = compute_gradients(y_hat, current_output_batch);
 
-		if (!_network.empty() && v_hidden->size() == Size3i()) { // Initing our tensor
-			v_hidden->resize(grads.cumulative_hidden_layer_w_grad->size());
+		if (!_network.empty() && v_hidden.empty()) { // Initing our tensor
+			alg.resizevt(v_hidden, grads.cumulative_hidden_layer_w_grad);
 		}
 
 		if (v_output->size() == 0) {
 			v_output->resize(grads.output_w_grad->size());
 		}
 
-		v_hidden = v_hidden->scalar_multiplyn(1 - b1)->addn(grads.cumulative_hidden_layer_w_grad->exponentiaten(2)->scalar_multiplyn(b1));
+		v_hidden = alg.additionnvt(alg.scalar_multiplynvt(1 - b1, v_hidden), alg.scalar_multiplynvt(b1, alg.exponentiatenvt(grads.cumulative_hidden_layer_w_grad, 2)));
 		v_output->add(grads.output_w_grad->exponentiaten(2));
 
-		Ref<MLPPTensor3> hidden_layer_updations = grads.cumulative_hidden_layer_w_grad->division_element_wisen(v_hidden->sqrtn()->scalar_addn(e))->scalar_multiplyn(learning_rate / _n);
+		Vector<Ref<MLPPMatrix>> hidden_layer_updations = alg.scalar_multiplynvt(learning_rate / _n, alg.division_element_wisenvnvt(grads.cumulative_hidden_layer_w_grad, alg.scalar_addnvt(e, alg.sqrtnvt(v_hidden))));
 		Ref<MLPPVector> output_layer_updation = grads.output_w_grad->division_element_wisen(v_output->sqrtn()->scalar_addn(e))->scalar_multiplyn(learning_rate / _n);
 
 		update_parameters(hidden_layer_updations, output_layer_updation, learning_rate); // subject to change. may want bias to have this matrix too.
@@ -398,6 +403,7 @@ void MLPPANN::adadelta(real_t learning_rate, int max_epoch, int mini_batch_size,
 
 void MLPPANN::adam(real_t learning_rate, int max_epoch, int mini_batch_size, real_t b1, real_t b2, real_t e, bool ui) {
 	MLPPCost mlpp_cost;
+	MLPPLinAlg alg;
 
 	real_t cost_prev = 0;
 	int epoch = 1;
@@ -411,10 +417,8 @@ void MLPPANN::adam(real_t learning_rate, int max_epoch, int mini_batch_size, rea
 	MLPPUtilities::CreateMiniBatchMVBatch batches = MLPPUtilities::create_mini_batchesmv(_input_set, _output_set, n_mini_batch);
 
 	// Initializing necessary components for Adam.
-	Ref<MLPPTensor3> m_hidden;
-	Ref<MLPPTensor3> v_hidden;
-	m_hidden.instance();
-	v_hidden.instance();
+	Vector<Ref<MLPPMatrix>> m_hidden;
+	Vector<Ref<MLPPMatrix>> v_hidden;
 
 	Ref<MLPPVector> m_output;
 	Ref<MLPPVector> v_output;
@@ -432,9 +436,9 @@ void MLPPANN::adam(real_t learning_rate, int max_epoch, int mini_batch_size, rea
 
 		ComputeGradientsResult grads = compute_gradients(y_hat, current_output_batch);
 
-		if (!_network.empty() && m_hidden->size() == Size3i() && v_hidden->size() == Size3i()) { // Initing our tensor
-			m_hidden->resize(grads.cumulative_hidden_layer_w_grad->size());
-			v_hidden->resize(grads.cumulative_hidden_layer_w_grad->size());
+		if (!_network.empty() && m_hidden.empty() && v_hidden.empty()) { // Initing our tensor
+			alg.resizevt(m_hidden, grads.cumulative_hidden_layer_w_grad);
+			alg.resizevt(v_hidden, grads.cumulative_hidden_layer_w_grad);
 		}
 
 		if (m_output->size() == 0 && v_output->size()) {
@@ -442,19 +446,19 @@ void MLPPANN::adam(real_t learning_rate, int max_epoch, int mini_batch_size, rea
 			v_output->resize(grads.output_w_grad->size());
 		}
 
-		m_hidden = m_hidden->scalar_multiplyn(b1)->addn(grads.cumulative_hidden_layer_w_grad->scalar_multiplyn(1 - b1));
-		v_hidden = v_hidden->scalar_multiplyn(b2)->addn(grads.cumulative_hidden_layer_w_grad->exponentiaten(2)->scalar_multiplyn(1 - b2));
+		m_hidden = alg.additionnvt(alg.scalar_multiplynvt(b1, m_hidden), alg.scalar_multiplynvt(1 - b1, grads.cumulative_hidden_layer_w_grad));
+		v_hidden = alg.additionnvt(alg.scalar_multiplynvt(b2, v_hidden), alg.scalar_multiplynvt(1 - b2, alg.exponentiatenvt(grads.cumulative_hidden_layer_w_grad, 2)));
 
 		m_output = m_output->scalar_multiplyn(b1)->addn(grads.output_w_grad->scalar_multiplyn(1 - b1));
 		v_output = v_output->scalar_multiplyn(b2)->addn(grads.output_w_grad->exponentiaten(2)->scalar_multiplyn(1 - b2));
 
-		Ref<MLPPTensor3> m_hidden_hat = m_hidden->scalar_multiplyn(1 / (1 - Math::pow(b1, epoch)));
-		Ref<MLPPTensor3> v_hidden_hat = v_hidden->scalar_multiplyn(1 / (1 - Math::pow(b2, epoch)));
+		Vector<Ref<MLPPMatrix>> m_hidden_hat = alg.scalar_multiplynvt(1 / (1 - Math::pow(b1, epoch)), m_hidden);
+		Vector<Ref<MLPPMatrix>> v_hidden_hat = alg.scalar_multiplynvt(1 / (1 - Math::pow(b2, epoch)), v_hidden);
 
 		Ref<MLPPVector> m_output_hat = m_output->scalar_multiplyn(1 / (1 - Math::pow(b1, epoch)));
 		Ref<MLPPVector> v_output_hat = v_output->scalar_multiplyn(1 / (1 - Math::pow(b2, epoch)));
 
-		Ref<MLPPTensor3> hidden_layer_updations = m_hidden_hat->division_element_wisen(v_hidden_hat->sqrtn()->scalar_addn(e))->scalar_multiplyn(learning_rate / _n);
+		Vector<Ref<MLPPMatrix>> hidden_layer_updations = alg.scalar_multiplynvt(learning_rate / _n, alg.division_element_wisenvnvt(m_hidden_hat, alg.scalar_addnvt(e, alg.sqrtnvt(v_hidden_hat))));
 		Ref<MLPPVector> output_layer_updation = m_output_hat->division_element_wisen(v_output_hat->sqrtn()->scalar_addn(e))->scalar_multiplyn(learning_rate / _n);
 
 		update_parameters(hidden_layer_updations, output_layer_updation, learning_rate); // subject to change. may want bias to have this matrix too.
@@ -476,6 +480,7 @@ void MLPPANN::adam(real_t learning_rate, int max_epoch, int mini_batch_size, rea
 
 void MLPPANN::adamax(real_t learning_rate, int max_epoch, int mini_batch_size, real_t b1, real_t b2, real_t e, bool ui) {
 	MLPPCost mlpp_cost;
+	MLPPLinAlg alg;
 
 	real_t cost_prev = 0;
 	int epoch = 1;
@@ -489,10 +494,8 @@ void MLPPANN::adamax(real_t learning_rate, int max_epoch, int mini_batch_size, r
 	MLPPUtilities::CreateMiniBatchMVBatch batches = MLPPUtilities::create_mini_batchesmv(_input_set, _output_set, n_mini_batch);
 
 	// Initializing necessary components for Adam.
-	Ref<MLPPTensor3> m_hidden;
-	Ref<MLPPTensor3> u_hidden;
-	m_hidden.instance();
-	u_hidden.instance();
+	Vector<Ref<MLPPMatrix>> m_hidden;
+	Vector<Ref<MLPPMatrix>> u_hidden;
 
 	Ref<MLPPVector> m_output;
 	Ref<MLPPVector> u_output;
@@ -511,9 +514,9 @@ void MLPPANN::adamax(real_t learning_rate, int max_epoch, int mini_batch_size, r
 
 		ComputeGradientsResult grads = compute_gradients(y_hat, current_output_batch);
 
-		if (!_network.empty() && m_hidden->size() == Size3i() && u_hidden->size() == Size3i()) { // Initing our tensor
-			m_hidden->resize(grads.cumulative_hidden_layer_w_grad->size());
-			u_hidden->resize(grads.cumulative_hidden_layer_w_grad->size());
+		if (!_network.empty() && m_hidden.empty() && u_hidden.empty()) { // Initing our tensor
+			alg.resizevt(m_hidden, grads.cumulative_hidden_layer_w_grad);
+			alg.resizevt(u_hidden, grads.cumulative_hidden_layer_w_grad);
 		}
 
 		if (m_output->size() == 0 && u_output->size() == 0) {
@@ -521,16 +524,16 @@ void MLPPANN::adamax(real_t learning_rate, int max_epoch, int mini_batch_size, r
 			u_output->resize(grads.output_w_grad->size());
 		}
 
-		m_hidden->addb(m_hidden->scalar_multiplyn(b1), grads.cumulative_hidden_layer_w_grad->scalar_multiplyn(1 - b1));
-		u_hidden->maxb(u_hidden->scalar_multiplyn(b2), grads.cumulative_hidden_layer_w_grad->absn());
+		m_hidden = alg.additionnvt(alg.scalar_multiplynvt(b1, m_hidden), alg.scalar_multiplynvt(1 - b1, grads.cumulative_hidden_layer_w_grad));
+		u_hidden = alg.maxnvt(alg.scalar_multiplynvt(b2, u_hidden), alg.absnvt(grads.cumulative_hidden_layer_w_grad));
 
 		m_output->addb(m_output->scalar_multiplyn(b1), grads.output_w_grad->scalar_multiplyn(1 - b1));
 		u_output->maxb(u_output->scalar_multiplyn(b2), grads.output_w_grad->absn());
 
-		Ref<MLPPTensor3> m_hidden_hat = m_hidden->scalar_multiplyn(1 / (1 - Math::pow(b1, epoch)));
+		Vector<Ref<MLPPMatrix>> m_hidden_hat = alg.scalar_multiplynvt(1 / (1 - Math::pow(b1, epoch)), m_hidden);
 		Ref<MLPPVector> m_output_hat = m_output->scalar_multiplyn(1 / (1 - Math::pow(b1, epoch)));
 
-		Ref<MLPPTensor3> hidden_layer_updations = m_hidden_hat->division_element_wisen(u_hidden->scalar_addn(e))->scalar_multiplyn(learning_rate / _n);
+		Vector<Ref<MLPPMatrix>> hidden_layer_updations = alg.scalar_multiplynvt(learning_rate / _n, alg.division_element_wisenvnvt(m_hidden_hat, alg.scalar_addnvt(e, u_hidden)));
 		Ref<MLPPVector> output_layer_updation = m_output_hat->division_element_wisen(u_output->scalar_addn(e))->scalar_multiplyn(learning_rate / _n);
 
 		update_parameters(hidden_layer_updations, output_layer_updation, learning_rate); // subject to change. may want bias to have this matrix too.
@@ -552,6 +555,7 @@ void MLPPANN::adamax(real_t learning_rate, int max_epoch, int mini_batch_size, r
 
 void MLPPANN::nadam(real_t learning_rate, int max_epoch, int mini_batch_size, real_t b1, real_t b2, real_t e, bool ui) {
 	MLPPCost mlpp_cost;
+	MLPPLinAlg alg;
 
 	real_t cost_prev = 0;
 	int epoch = 1;
@@ -565,10 +569,8 @@ void MLPPANN::nadam(real_t learning_rate, int max_epoch, int mini_batch_size, re
 	MLPPUtilities::CreateMiniBatchMVBatch batches = MLPPUtilities::create_mini_batchesmv(_input_set, _output_set, n_mini_batch);
 
 	// Initializing necessary components for Adam.
-	Ref<MLPPTensor3> m_hidden;
-	Ref<MLPPTensor3> v_hidden;
-	m_hidden.instance();
-	v_hidden.instance();
+	Vector<Ref<MLPPMatrix>> m_hidden;
+	Vector<Ref<MLPPMatrix>> v_hidden;
 
 	Ref<MLPPVector> m_output;
 	Ref<MLPPVector> v_output;
@@ -587,9 +589,9 @@ void MLPPANN::nadam(real_t learning_rate, int max_epoch, int mini_batch_size, re
 
 		ComputeGradientsResult grads = compute_gradients(y_hat, current_output_batch);
 
-		if (!_network.empty() && m_hidden->size() == Size3i() && v_hidden->size() == Size3i()) { // Initing our tensor
-			m_hidden->resize(grads.cumulative_hidden_layer_w_grad->size());
-			v_hidden->resize(grads.cumulative_hidden_layer_w_grad->size());
+		if (!_network.empty() && m_hidden.empty() && v_hidden.empty()) { // Initing our tensor
+			alg.resizevt(m_hidden, grads.cumulative_hidden_layer_w_grad);
+			alg.resizevt(v_hidden, grads.cumulative_hidden_layer_w_grad);
 		}
 
 		if (m_output->size() == 0 && v_output->size() == 0) {
@@ -597,21 +599,21 @@ void MLPPANN::nadam(real_t learning_rate, int max_epoch, int mini_batch_size, re
 			v_output->resize(grads.output_w_grad->size());
 		}
 
-		m_hidden->addb(m_hidden->scalar_multiplyn(b1), grads.cumulative_hidden_layer_w_grad->scalar_multiplyn(1 - b1));
-		v_hidden->addb(v_hidden->scalar_multiplyn(b2), grads.cumulative_hidden_layer_w_grad->exponentiaten(2)->scalar_multiplyn(1 - b2));
+		m_hidden = alg.additionnvt(alg.scalar_multiplynvt(b1, m_hidden), alg.scalar_multiplynvt(1 - b1, grads.cumulative_hidden_layer_w_grad));
+		v_hidden = alg.additionnvt(alg.scalar_multiplynvt(b2, v_hidden), alg.scalar_multiplynvt(1 - b2, alg.exponentiatenvt(grads.cumulative_hidden_layer_w_grad, 2)));
 
 		m_output->addb(m_output->scalar_multiplyn(b1), grads.output_w_grad->scalar_multiplyn(1 - b1));
 		v_output->addb(v_output->scalar_multiplyn(b2), grads.output_w_grad->exponentiaten(2)->scalar_multiplyn(1 - b2));
 
-		Ref<MLPPTensor3> m_hidden_hat = m_hidden->scalar_multiplyn(1 / (1.0 - Math::pow(b1, epoch)));
-		Ref<MLPPTensor3> v_hidden_hat = v_hidden->scalar_multiplyn(1 / (1.0 - Math::pow(b2, epoch)));
-		Ref<MLPPTensor3> m_hidden_final = m_hidden_hat->scalar_multiplyn(b1)->addn(grads.cumulative_hidden_layer_w_grad->scalar_multiplyn((1 - b1) / (1 - Math::pow(b1, epoch))));
+		Vector<Ref<MLPPMatrix>> m_hidden_hat = alg.scalar_multiplynvt(1 / (1 - Math::pow(b1, epoch)), m_hidden);
+		Vector<Ref<MLPPMatrix>> v_hidden_hat = alg.scalar_multiplynvt(1 / (1 - Math::pow(b2, epoch)), v_hidden);
+		Vector<Ref<MLPPMatrix>> m_hidden_final = alg.additionnvt(alg.scalar_multiplynvt(b1, m_hidden_hat), alg.scalar_multiplynvt((1 - b1) / (1 - Math::pow(b1, epoch)), grads.cumulative_hidden_layer_w_grad));
 
 		Ref<MLPPVector> m_output_hat = m_output->scalar_multiplyn(1 / (1.0 - Math::pow(b1, epoch)));
 		Ref<MLPPVector> v_output_hat = v_output->scalar_multiplyn(1 / (1.0 - Math::pow(b2, epoch)));
 		Ref<MLPPVector> m_output_final = m_output_hat->scalar_multiplyn(b1)->addn(grads.output_w_grad->scalar_multiplyn((1 - b1) / (1.0 - Math::pow(b1, epoch))));
 
-		Ref<MLPPTensor3> hidden_layer_updations = m_hidden_final->division_element_wisen(v_hidden_hat->sqrtn()->scalar_multiplyn(e))->scalar_multiplyn(learning_rate / _n);
+		Vector<Ref<MLPPMatrix>> hidden_layer_updations = alg.scalar_multiplynvt(learning_rate / _n, alg.division_element_wisenvnvt(m_hidden_final, alg.scalar_addnvt(e, alg.sqrtnvt(v_hidden_hat))));
 		Ref<MLPPVector> output_layer_updation = m_output_final->division_element_wisen(v_output_hat->sqrtn()->scalar_addn(e))->scalar_multiplyn(learning_rate / _n);
 
 		update_parameters(hidden_layer_updations, output_layer_updation, learning_rate); // subject to change. may want bias to have this matrix too.
@@ -635,6 +637,7 @@ void MLPPANN::nadam(real_t learning_rate, int max_epoch, int mini_batch_size, re
 
 void MLPPANN::amsgrad(real_t learning_rate, int max_epoch, int mini_batch_size, real_t b1, real_t b2, real_t e, bool ui) {
 	MLPPCost mlpp_cost;
+	MLPPLinAlg alg;
 
 	real_t cost_prev = 0;
 	int epoch = 1;
@@ -648,13 +651,10 @@ void MLPPANN::amsgrad(real_t learning_rate, int max_epoch, int mini_batch_size,
 	MLPPUtilities::CreateMiniBatchMVBatch batches = MLPPUtilities::create_mini_batchesmv(_input_set, _output_set, n_mini_batch);
 
 	// Initializing necessary components for Adam.
-	Ref<MLPPTensor3> m_hidden;
-	Ref<MLPPTensor3> v_hidden;
-	m_hidden.instance();
-	v_hidden.instance();
+	Vector<Ref<MLPPMatrix>> m_hidden;
+	Vector<Ref<MLPPMatrix>> v_hidden;
 
-	Ref<MLPPTensor3> v_hidden_hat;
-	v_hidden_hat.instance();
+	Vector<Ref<MLPPMatrix>> v_hidden_hat;
 
 	Ref<MLPPVector> m_output;
 	Ref<MLPPVector> v_output;
@@ -676,11 +676,10 @@ void MLPPANN::amsgrad(real_t learning_rate, int max_epoch, int mini_batch_size,
 
 		ComputeGradientsResult grads = compute_gradients(y_hat, current_output_batch);
 
-		if (!_network.empty() && m_hidden->size() == Size3i() && v_hidden->size() == Size3i()) { // Initing our tensor
-
-			m_hidden->resize(grads.cumulative_hidden_layer_w_grad->size());
-			v_hidden->resize(grads.cumulative_hidden_layer_w_grad->size());
-			v_hidden_hat->resize(grads.cumulative_hidden_layer_w_grad->size());
+		if (!_network.empty() && m_hidden.size() == 0 && v_hidden.size() == 0) { // Initing our tensor
+			alg.resizevt(m_hidden, grads.cumulative_hidden_layer_w_grad);
+			alg.resizevt(v_hidden, grads.cumulative_hidden_layer_w_grad);
+			alg.resizevt(v_hidden_hat, grads.cumulative_hidden_layer_w_grad);
 		}
 
 		if (m_output->size() == 0 && v_output->size() == 0) {
@@ -689,16 +688,16 @@ void MLPPANN::amsgrad(real_t learning_rate, int max_epoch, int mini_batch_size,
 			v_output_hat->resize(grads.output_w_grad->size());
 		}
 
-		m_hidden->addb(m_hidden->scalar_multiplyn(b1), grads.cumulative_hidden_layer_w_grad->scalar_multiplyn(1 - b1));
-		v_hidden->addb(v_hidden->scalar_multiplyn(b2), grads.cumulative_hidden_layer_w_grad->exponentiaten(2)->scalar_multiplyn(1 - b2));
+		m_hidden = alg.additionnvt(alg.scalar_multiplynvt(b1, m_hidden), alg.scalar_multiplynvt(1 - b1, grads.cumulative_hidden_layer_w_grad));
+		v_hidden = alg.additionnvt(alg.scalar_multiplynvt(b2, v_hidden), alg.scalar_multiplynvt(1 - b2, alg.exponentiatenvt(grads.cumulative_hidden_layer_w_grad, 2)));
 
 		m_output->addb(m_output->scalar_multiplyn(b1), grads.output_w_grad->scalar_multiplyn(1 - b1));
 		v_output->addb(v_output->scalar_multiplyn(b2), grads.output_w_grad->exponentiaten(2)->scalar_multiplyn(1 - b2));
 
-		v_hidden_hat->max(v_hidden);
+		v_hidden_hat = alg.maxnvt(v_hidden_hat, v_hidden);
 		v_output_hat->max(v_output);
 
-		Ref<MLPPTensor3> hidden_layer_updations = m_hidden->division_element_wisen(v_hidden_hat->sqrtn()->scalar_addn(e))->scalar_multiplyn(learning_rate / _n);
+		Vector<Ref<MLPPMatrix>> hidden_layer_updations = alg.scalar_multiplynvt(learning_rate / _n, alg.division_element_wisenvnvt(m_hidden, alg.scalar_addnvt(e, alg.sqrtnvt(v_hidden_hat))));
 		Ref<MLPPVector> output_layer_updation = m_output->division_element_wisen(v_output_hat->sqrtn()->scalar_addn(e))->scalar_multiplyn(learning_rate / _n);
 
 		update_parameters(hidden_layer_updations, output_layer_updation, learning_rate); // subject to change. may want bias to have this matrix too.
@@ -847,17 +846,16 @@ void MLPPANN::forward_pass() {
 	_y_hat = _output_layer->get_a();
 }
 
-void MLPPANN::update_parameters(const Ref<MLPPTensor3> &hidden_layer_updations, const Ref<MLPPVector> &output_layer_updation, real_t learning_rate) {
+void MLPPANN::update_parameters(const Vector<Ref<MLPPMatrix>> &hidden_layer_updations, const Ref<MLPPVector> &output_layer_updation, real_t learning_rate) {
 	_output_layer->set_weights(_output_layer->get_weights()->subn(output_layer_updation));
 	_output_layer->set_bias(_output_layer->get_bias() - learning_rate * _output_layer->get_delta()->sum_elements() / _n);
 
 	Ref<MLPPMatrix> slice;
-	slice.instance();
 
 	if (!_network.empty()) {
 		Ref<MLPPHiddenLayer> layer = _network[_network.size() - 1];
 
-		hidden_layer_updations->z_slice_get_into_mlpp_matrix(0, slice);
+		slice = hidden_layer_updations[0];
 
 		layer->set_weights(layer->get_weights()->subn(slice));
 		layer->set_bias(layer->get_bias()->subtract_matrix_rowsn(layer->get_delta()->scalar_multiplyn(learning_rate / _n)));
@@ -865,7 +863,7 @@ void MLPPANN::update_parameters(const Ref<MLPPTensor3> &hidden_layer_updations,
 		for (int i = _network.size() - 2; i >= 0; i--) {
 			layer = _network[i];
 
-			hidden_layer_updations->z_slice_get_into_mlpp_matrix((_network.size() - 2) - i + 1, slice);
+			slice = hidden_layer_updations[(_network.size() - 2) - i + 1];
 
 			layer->set_weights(layer->get_weights()->subn(slice));
 			layer->set_bias(layer->get_bias()->subtract_matrix_rowsn(layer->get_delta()->scalar_multiplyn(learning_rate / _n)));
@@ -893,11 +891,8 @@ MLPPANN::ComputeGradientsResult MLPPANN::compute_gradients(const Ref<MLPPVector>
 
 	Ref<MLPPMatrix> hidden_layer_w_grad = layer->get_input()->transposen()->multn(layer->get_delta());
 
-	// TODO Are these always uniform in size?
-	res.cumulative_hidden_layer_w_grad->resize(Size3i(hidden_layer_w_grad->size().x, hidden_layer_w_grad->size().y, 0));
-
 	// Adding to our cumulative hidden layer grads. Maintain reg terms as well.
-	res.cumulative_hidden_layer_w_grad->z_slice_add_mlpp_matrix(hidden_layer_w_grad->addn(regularization.reg_deriv_termm(layer->get_weights(), layer->get_lambda(), layer->get_alpha(), layer->get_reg())));
+	res.cumulative_hidden_layer_w_grad.push_back(hidden_layer_w_grad->addn(regularization.reg_deriv_termm(layer->get_weights(), layer->get_lambda(), layer->get_alpha(), layer->get_reg())));
 
 	for (int i = _network.size() - 2; i >= 0; i--) {
 		layer = _network[i];
@@ -905,7 +900,7 @@ MLPPANN::ComputeGradientsResult MLPPANN::compute_gradients(const Ref<MLPPVector>
 		layer->set_delta(next_layer->get_delta()->multn(next_layer->get_weights()->transposen())->hadamard_productn(avn.run_activation_deriv_matrix(layer->get_activation(), layer->get_z())));
 
 		hidden_layer_w_grad = layer->get_input()->transposen()->multn(layer->get_delta());
-		res.cumulative_hidden_layer_w_grad->z_slice_add_mlpp_matrix(hidden_layer_w_grad->addn(regularization.reg_deriv_termm(layer->get_weights(), layer->get_lambda(), layer->get_alpha(), layer->get_reg()))); // Adding to our cumulative hidden layer grads. Maintain reg terms as well.
+		res.cumulative_hidden_layer_w_grad.push_back(hidden_layer_w_grad->addn(regularization.reg_deriv_termm(layer->get_weights(), layer->get_lambda(), layer->get_alpha(), layer->get_reg()))); // Adding to our cumulative hidden layer grads. Maintain reg terms as well.
 	}
 }
 
diff --git a/mlpp/ann/ann.h b/mlpp/ann/ann.h
index bea0330..8fe524a 100644
--- a/mlpp/ann/ann.h
+++ b/mlpp/ann/ann.h
@@ -70,14 +70,13 @@ protected:
 	real_t cost(const Ref<MLPPVector> &y_hat, const Ref<MLPPVector> &y);
 
 	void forward_pass();
-	void update_parameters(const Ref<MLPPTensor3> &hidden_layer_updations, const Ref<MLPPVector> &output_layer_updation, real_t learning_rate);
+	void update_parameters(const Vector<Ref<MLPPMatrix>> &hidden_layer_updations, const Ref<MLPPVector> &output_layer_updation, real_t learning_rate);
 
 	struct ComputeGradientsResult {
-		Ref<MLPPTensor3> cumulative_hidden_layer_w_grad;
+		Vector<Ref<MLPPMatrix>> cumulative_hidden_layer_w_grad;
 		Ref<MLPPVector> output_w_grad;
 
 		ComputeGradientsResult() {
-			cumulative_hidden_layer_w_grad.instance();
 			output_w_grad.instance();
 		}
 	};
diff --git a/mlpp/lin_alg/lin_alg.cpp b/mlpp/lin_alg/lin_alg.cpp
index 9190857..fa1d5aa 100644
--- a/mlpp/lin_alg/lin_alg.cpp
+++ b/mlpp/lin_alg/lin_alg.cpp
@@ -1768,6 +1768,14 @@ Vector<Ref<MLPPMatrix>> MLPPLinAlg::additionnvt(const Vector<Ref<MLPPMatrix>> &A
 	return res;
 }
 
+void MLPPLinAlg::division_element_wisevt(const Vector<Ref<MLPPMatrix>> &A, const Vector<Ref<MLPPMatrix>> &B) {
+	for (int i = 0; i < A.size(); i++) {
+		Ref<MLPPMatrix> m = A[i];
+
+		m->division_element_wise(B[i]);
+	}
+}
+
 Vector<Ref<MLPPMatrix>> MLPPLinAlg::division_element_wisenvnvt(const Vector<Ref<MLPPMatrix>> &A, const Vector<Ref<MLPPMatrix>> &B) {
 	Vector<Ref<MLPPMatrix>> res;
 	res.resize(A.size());
@@ -1841,14 +1849,26 @@ Vector<Ref<MLPPMatrix>> MLPPLinAlg::scalar_addnvt(real_t scalar, Vector<Ref<MLPP
 	return res;
 }
 
-Vector<Ref<MLPPMatrix>> MLPPLinAlg::resizenvt(const Vector<Ref<MLPPMatrix>> &A, const Vector<Ref<MLPPMatrix>> &B) {
+void MLPPLinAlg::resizevt(Vector<Ref<MLPPMatrix>> &r_target, const Vector<Ref<MLPPMatrix>> &A) {
+	r_target.resize(A.size());
+
+	for (int i = 0; i < r_target.size(); i++) {
+		Ref<MLPPMatrix> m;
+		m.instance();
+		m->resize(A[i]->size());
+
+		r_target.write[i] = m;
+	}
+}
+
+Vector<Ref<MLPPMatrix>> MLPPLinAlg::resizencvt(const Vector<Ref<MLPPMatrix>> &A) {
 	Vector<Ref<MLPPMatrix>> res;
-	res.resize(B.size());
+	res.resize(A.size());
 
 	for (int i = 0; i < res.size(); i++) {
 		Ref<MLPPMatrix> m;
 		m.instance();
-		m->resize(B[i]->size());
+		m->resize(A[i]->size());
 
 		res.write[i] = m;
 	}
diff --git a/mlpp/lin_alg/lin_alg.h b/mlpp/lin_alg/lin_alg.h
index 9d76d89..e71308d 100644
--- a/mlpp/lin_alg/lin_alg.h
+++ b/mlpp/lin_alg/lin_alg.h
@@ -219,6 +219,7 @@ public:
 	// TENSOR FUNCTIONS
 	Vector<Ref<MLPPMatrix>> additionnvt(const Vector<Ref<MLPPMatrix>> &A, const Vector<Ref<MLPPMatrix>> &B);
 
+	void division_element_wisevt(const Vector<Ref<MLPPMatrix>> &A, const Vector<Ref<MLPPMatrix>> &B);
 	Vector<Ref<MLPPMatrix>> division_element_wisenvnvt(const Vector<Ref<MLPPMatrix>> &A, const Vector<Ref<MLPPMatrix>> &B);
 
 	Vector<Ref<MLPPMatrix>> sqrtnvt(const Vector<Ref<MLPPMatrix>> &A);
@@ -232,7 +233,8 @@ public:
 	Vector<Ref<MLPPMatrix>> scalar_multiplynvt(real_t scalar, Vector<Ref<MLPPMatrix>> A);
 	Vector<Ref<MLPPMatrix>> scalar_addnvt(real_t scalar, Vector<Ref<MLPPMatrix>> A);
 
-	Vector<Ref<MLPPMatrix>> resizenvt(const Vector<Ref<MLPPMatrix>> &A, const Vector<Ref<MLPPMatrix>> &B);
+	void resizevt(Vector<Ref<MLPPMatrix>> &r_target, const Vector<Ref<MLPPMatrix>> &A);
+	Vector<Ref<MLPPMatrix>> resizencvt(const Vector<Ref<MLPPMatrix>> &A);
 
 	//std::vector<std::vector<std::vector<real_t>>> hadamard_product(std::vector<std::vector<std::vector<real_t>>> A, std::vector<std::vector<std::vector<real_t>>> B);
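
Note on the pattern this patch repeats in every optimizer: the per-layer state (m_hidden, v_hidden, u_hidden, v_hidden_hat) is now a Vector<Ref<MLPPMatrix>> holding one matrix per hidden layer instead of a Ref<MLPPTensor3>, lazily sized with resizevt() on the first minibatch and then rebuilt each step through the functional *nvt helpers in MLPPLinAlg. The sketch below shows how those helpers compose for a single Adam step over the hidden-layer gradients; adam_step_hidden() is an illustrative wrapper written for this note, not a function added by the patch.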
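// Illustrative sketch only (assumes the *nvt helpers introduced/used by this
// patch); adam_step_hidden() itself is hypothetical and not part of the diff.
Vector<Ref<MLPPMatrix>> adam_step_hidden(MLPPLinAlg &alg,
		Vector<Ref<MLPPMatrix>> &m_hidden, Vector<Ref<MLPPMatrix>> &v_hidden,
		const Vector<Ref<MLPPMatrix>> &grad, // grads.cumulative_hidden_layer_w_grad
		real_t b1, real_t b2, real_t e, real_t learning_rate, real_t n, int epoch) {
	// Lazy init: state vectors start empty and are sized from the gradient,
	// one matrix per hidden layer, exactly as the optimizers above do.
	if (m_hidden.empty()) {
		alg.resizevt(m_hidden, grad);
		alg.resizevt(v_hidden, grad);
	}

	// m = b1 * m + (1 - b1) * g and v = b2 * v + (1 - b2) * g^2, element-wise per layer.
	m_hidden = alg.additionnvt(alg.scalar_multiplynvt(b1, m_hidden), alg.scalar_multiplynvt(1 - b1, grad));
	v_hidden = alg.additionnvt(alg.scalar_multiplynvt(b2, v_hidden), alg.scalar_multiplynvt(1 - b2, alg.exponentiatenvt(grad, 2)));

	// Bias-corrected moment estimates.
	Vector<Ref<MLPPMatrix>> m_hat = alg.scalar_multiplynvt(1 / (1 - Math::pow(b1, epoch)), m_hidden);
	Vector<Ref<MLPPMatrix>> v_hat = alg.scalar_multiplynvt(1 / (1 - Math::pow(b2, epoch)), v_hidden);

	// (learning_rate / n) * m_hat / (sqrt(v_hat) + e), ready for update_parameters().
	return alg.scalar_multiplynvt(learning_rate / n, alg.division_element_wisenvnvt(m_hat, alg.scalar_addnvt(e, alg.sqrtnvt(v_hat))));
}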