From c79bd2050d06d0e492b1d4b47089005fcc547495 Mon Sep 17 00:00:00 2001
From: Relintai <relintai@protonmail.com>
Date: Wed, 25 Jan 2023 23:43:21 +0100
Subject: [PATCH] Restore some old optimization methods to MLPPLinReg.

---
 mlpp/lin_reg/lin_reg.cpp | 375 ++++++++++++++++++++++++++++++++++++---
 mlpp/lin_reg/lin_reg.h   |  11 +-
 2 files changed, 364 insertions(+), 22 deletions(-)

diff --git a/mlpp/lin_reg/lin_reg.cpp b/mlpp/lin_reg/lin_reg.cpp
index c5d26db..9baec0b 100644
--- a/mlpp/lin_reg/lin_reg.cpp
+++ b/mlpp/lin_reg/lin_reg.cpp
@@ -15,8 +15,6 @@
 #include <iostream>
 #include <random>
 
-
-
 MLPPLinReg::MLPPLinReg(std::vector<std::vector<double>> inputSet, std::vector<double> outputSet, std::string reg, double lambda, double alpha) :
 		inputSet(inputSet), outputSet(outputSet), n(inputSet.size()), k(inputSet[0].size()), reg(reg), lambda(lambda), alpha(alpha) {
 	y_hat.resize(n);
@@ -173,9 +171,344 @@ void MLPPLinReg::MBGD(double learning_rate, int max_epoch, int mini_batch_size,
 	forwardPass();
 }
 
+void MLPPLinReg::Momentum(double learning_rate, int max_epoch, int mini_batch_size, double gamma, bool UI) {
+	MLPPLinAlg alg;
+	MLPPReg regularization;
+	double cost_prev = 0;
+	int epoch = 1;
+
+	// Creating the mini-batches
+	int n_mini_batch = n / mini_batch_size;
+	auto [inputMiniBatches, outputMiniBatches] = MLPPUtilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
+
+	// Initializing necessary components for Momentum.
+	std::vector<double> v = alg.zerovec(weights.size());
+	while (true) {
+		for (int i = 0; i < n_mini_batch; i++) {
+			std::vector<double> y_hat = Evaluate(inputMiniBatches[i]);
+			cost_prev = Cost(y_hat, outputMiniBatches[i]);
+
+			std::vector<double> error = alg.subtraction(y_hat, outputMiniBatches[i]);
+
+			// Calculating the weight gradients
+			std::vector<double> gradient = alg.scalarMultiply(1.0 / outputMiniBatches[i].size(), alg.mat_vec_mult(alg.transpose(inputMiniBatches[i]), error));
+			std::vector<double> RegDerivTerm = regularization.regDerivTerm(weights, lambda, alpha, reg);
+			std::vector<double> weight_grad = alg.addition(gradient, RegDerivTerm); // Weight_grad_final
+
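+			// Momentum update: v <- gamma * v + learning_rate * grad, then step the weights by -v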
+			v = alg.addition(alg.scalarMultiply(gamma, v), alg.scalarMultiply(learning_rate, weight_grad));
+
+			weights = alg.subtraction(weights, v);
+
+			// Calculating the bias gradients
+			bias -= learning_rate * alg.sum_elements(error) / outputMiniBatches[i].size(); // As normal
+			y_hat = Evaluate(inputMiniBatches[i]);
+
+			if (UI) {
+				MLPPUtilities::CostInfo(epoch, cost_prev, Cost(y_hat, outputMiniBatches[i]));
+				MLPPUtilities::UI(weights, bias);
+			}
+		}
+		epoch++;
+		if (epoch > max_epoch) {
+			break;
+		}
+	}
+	forwardPass();
+}
+
+void MLPPLinReg::NAG(double learning_rate, int max_epoch, int mini_batch_size, double gamma, bool UI) {
+	MLPPLinAlg alg;
+	MLPPReg regularization;
+	double cost_prev = 0;
+	int epoch = 1;
+
+	// Creating the mini-batches
+	int n_mini_batch = n / mini_batch_size;
+	auto [inputMiniBatches, outputMiniBatches] = MLPPUtilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
+
+	// Initializing necessary components for NAG (Nesterov accelerated gradient).
+	std::vector<double> v = alg.zerovec(weights.size());
+	while (true) {
+		for (int i = 0; i < n_mini_batch; i++) {
+			weights = alg.subtraction(weights, alg.scalarMultiply(gamma, v)); // Nesterov look-ahead: evaluate the gradient at the anticipated position
+
+			std::vector<double> y_hat = Evaluate(inputMiniBatches[i]);
+			cost_prev = Cost(y_hat, outputMiniBatches[i]);
+
+			std::vector<double> error = alg.subtraction(y_hat, outputMiniBatches[i]);
+
+			// Calculating the weight gradients
+			std::vector<double> gradient = alg.scalarMultiply(1.0 / outputMiniBatches[i].size(), alg.mat_vec_mult(alg.transpose(inputMiniBatches[i]), error));
+			std::vector<double> RegDerivTerm = regularization.regDerivTerm(weights, lambda, alpha, reg);
+			std::vector<double> weight_grad = alg.addition(gradient, RegDerivTerm); // Weight_grad_final
+
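+			// NAG update: v <- gamma * v + learning_rate * grad (gradient taken at the look-ahead position), then step by -v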
+			v = alg.addition(alg.scalarMultiply(gamma, v), alg.scalarMultiply(learning_rate, weight_grad));
+
+			weights = alg.subtraction(weights, v);
+
+			// Calculating the bias gradients
+			bias -= learning_rate * alg.sum_elements(error) / outputMiniBatches[i].size(); // As normal
+			y_hat = Evaluate(inputMiniBatches[i]);
+
+			if (UI) {
+				MLPPUtilities::CostInfo(epoch, cost_prev, Cost(y_hat, outputMiniBatches[i]));
+				MLPPUtilities::UI(weights, bias);
+			}
+		}
+		epoch++;
+		if (epoch > max_epoch) {
+			break;
+		}
+	}
+	forwardPass();
+}
+
+void MLPPLinReg::Adagrad(double learning_rate, int max_epoch, int mini_batch_size, double e, bool UI) {
+	MLPPLinAlg alg;
+	MLPPReg regularization;
+	double cost_prev = 0;
+	int epoch = 1;
+
+	// Creating the mini-batches
+	int n_mini_batch = n / mini_batch_size;
+	auto [inputMiniBatches, outputMiniBatches] = MLPPUtilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
+
+	// Initializing necessary components for Adagrad.
+	std::vector<double> v = alg.zerovec(weights.size());
+	while (true) {
+		for (int i = 0; i < n_mini_batch; i++) {
+			std::vector<double> y_hat = Evaluate(inputMiniBatches[i]);
+			cost_prev = Cost(y_hat, outputMiniBatches[i]);
+
+			std::vector<double> error = alg.subtraction(y_hat, outputMiniBatches[i]);
+
+			// Calculating the weight gradients
+			std::vector<double> gradient = alg.scalarMultiply(1.0 / outputMiniBatches[i].size(), alg.mat_vec_mult(alg.transpose(inputMiniBatches[i]), error));
+			std::vector<double> RegDerivTerm = regularization.regDerivTerm(weights, lambda, alpha, reg);
+			std::vector<double> weight_grad = alg.addition(gradient, RegDerivTerm); // Weight_grad_final
+
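+			// Adagrad: scale each weight's step by 1 / sqrt(v + e), where v holds the squared gradients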
+			v = alg.addition(v, alg.hadamard_product(weight_grad, weight_grad));
+
+			weights = alg.subtraction(weights, alg.scalarMultiply(learning_rate, alg.elementWiseDivision(weight_grad, alg.sqrt(alg.scalarAdd(e, v)))));
+
+			// Calculating the bias gradients
+			bias -= learning_rate * alg.sum_elements(error) / outputMiniBatches[i].size(); // As normal
+			y_hat = Evaluate(inputMiniBatches[i]);
+
+			if (UI) {
+				MLPPUtilities::CostInfo(epoch, cost_prev, Cost(y_hat, outputMiniBatches[i]));
+				MLPPUtilities::UI(weights, bias);
+			}
+		}
+		epoch++;
+		if (epoch > max_epoch) {
+			break;
+		}
+	}
+	forwardPass();
+}
+
+void MLPPLinReg::Adadelta(double learning_rate, int max_epoch, int mini_batch_size, double b1, double e, bool UI) {
+	// Upgrade on Adagrad: replaces the raw squared-gradient accumulator with an exponentially decaying average.
+	MLPPLinAlg alg;
+	MLPPReg regularization;
+	double cost_prev = 0;
+	int epoch = 1;
+
+	// Creating the mini-batches
+	int n_mini_batch = n / mini_batch_size;
+	auto [inputMiniBatches, outputMiniBatches] = MLPPUtilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
+
+	// Initializing necessary components for Adadelta.
+	std::vector<double> v = alg.zerovec(weights.size());
+	while (true) {
+		for (int i = 0; i < n_mini_batch; i++) {
+			std::vector<double> y_hat = Evaluate(inputMiniBatches[i]);
+			cost_prev = Cost(y_hat, outputMiniBatches[i]);
+
+			std::vector<double> error = alg.subtraction(y_hat, outputMiniBatches[i]);
+
+			// Calculating the weight gradients
+			std::vector<double> gradient = alg.scalarMultiply(1.0 / outputMiniBatches[i].size(), alg.mat_vec_mult(alg.transpose(inputMiniBatches[i]), error));
+			std::vector<double> RegDerivTerm = regularization.regDerivTerm(weights, lambda, alpha, reg);
+			std::vector<double> weight_grad = alg.addition(gradient, RegDerivTerm); // Weight_grad_final
+
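+			// Exponentially decaying average of squared gradients: v <- b1 * v + (1 - b1) * grad^2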
+			v = alg.addition(alg.scalarMultiply(b1, v), alg.scalarMultiply(1 - b1, alg.hadamard_product(weight_grad, weight_grad)));
+
+			weights = alg.subtraction(weights, alg.scalarMultiply(learning_rate, alg.elementWiseDivision(weight_grad, alg.sqrt(alg.scalarAdd(e, v)))));
+
+			// Calculating the bias gradients
+			bias -= learning_rate * alg.sum_elements(error) / outputMiniBatches[i].size(); // As normal
+			y_hat = Evaluate(inputMiniBatches[i]);
+
+			if (UI) {
+				MLPPUtilities::CostInfo(epoch, cost_prev, Cost(y_hat, outputMiniBatches[i]));
+				MLPPUtilities::UI(weights, bias);
+			}
+		}
+		epoch++;
+		if (epoch > max_epoch) {
+			break;
+		}
+	}
+	forwardPass();
+}
+
+void MLPPLinReg::Adam(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI) {
+	MLPPLinAlg alg;
+	MLPPReg regularization;
+	double cost_prev = 0;
+	int epoch = 1;
+
+	// Creating the mini-batches
+	int n_mini_batch = n / mini_batch_size;
+	auto [inputMiniBatches, outputMiniBatches] = MLPPUtilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
+
+	// Initializing necessary components for Adam.
+	std::vector<double> m = alg.zerovec(weights.size());
+
+	std::vector<double> v = alg.zerovec(weights.size());
+	while (true) {
+		for (int i = 0; i < n_mini_batch; i++) {
+			std::vector<double> y_hat = Evaluate(inputMiniBatches[i]);
+			cost_prev = Cost(y_hat, outputMiniBatches[i]);
+
+			std::vector<double> error = alg.subtraction(y_hat, outputMiniBatches[i]);
+
+			// Calculating the weight gradients
+			std::vector<double> gradient = alg.scalarMultiply(1.0 / outputMiniBatches[i].size(), alg.mat_vec_mult(alg.transpose(inputMiniBatches[i]), error));
+			std::vector<double> RegDerivTerm = regularization.regDerivTerm(weights, lambda, alpha, reg);
+			std::vector<double> weight_grad = alg.addition(gradient, RegDerivTerm); // Weight_grad_final
+
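+			// Adam moment estimates: m tracks the mean of the gradients, v the mean of their squares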
+			m = alg.addition(alg.scalarMultiply(b1, m), alg.scalarMultiply(1 - b1, weight_grad));
+			v = alg.addition(alg.scalarMultiply(b2, v), alg.scalarMultiply(1 - b2, alg.exponentiate(weight_grad, 2)));
+
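+			// Bias correction for the moment estimates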
+			std::vector<double> m_hat = alg.scalarMultiply(1 / (1 - pow(b1, epoch)), m);
+			std::vector<double> v_hat = alg.scalarMultiply(1 / (1 - pow(b2, epoch)), v);
+
+			weights = alg.subtraction(weights, alg.scalarMultiply(learning_rate, alg.elementWiseDivision(m_hat, alg.scalarAdd(e, alg.sqrt(v_hat)))));
+
+			// Calculating the bias gradients
+			bias -= learning_rate * alg.sum_elements(error) / outputMiniBatches[i].size(); // As normal
+			y_hat = Evaluate(inputMiniBatches[i]);
+
+			if (UI) {
+				MLPPUtilities::CostInfo(epoch, cost_prev, Cost(y_hat, outputMiniBatches[i]));
+				MLPPUtilities::UI(weights, bias);
+			}
+		}
+		epoch++;
+		if (epoch > max_epoch) {
+			break;
+		}
+	}
+	forwardPass();
+}
+
+void MLPPLinReg::Adamax(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI) {
+	MLPPLinAlg alg;
+	MLPPReg regularization;
+	double cost_prev = 0;
+	int epoch = 1;
+
+	// Creating the mini-batches
+	int n_mini_batch = n / mini_batch_size;
+	auto [inputMiniBatches, outputMiniBatches] = MLPPUtilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
+
+	std::vector<double> m = alg.zerovec(weights.size());
+
+	std::vector<double> u = alg.zerovec(weights.size());
+	while (true) {
+		for (int i = 0; i < n_mini_batch; i++) {
+			std::vector<double> y_hat = Evaluate(inputMiniBatches[i]);
+			cost_prev = Cost(y_hat, outputMiniBatches[i]);
+
+			std::vector<double> error = alg.subtraction(y_hat, outputMiniBatches[i]);
+
+			// Calculating the weight gradients
+			std::vector<double> gradient = alg.scalarMultiply(1.0 / outputMiniBatches[i].size(), alg.mat_vec_mult(alg.transpose(inputMiniBatches[i]), error));
+			std::vector<double> RegDerivTerm = regularization.regDerivTerm(weights, lambda, alpha, reg);
+			std::vector<double> weight_grad = alg.addition(gradient, RegDerivTerm); // Weight_grad_final
+
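+			// Adamax: u tracks the elementwise max of |grad| (infinity norm) in place of Adam's second moment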
+			m = alg.addition(alg.scalarMultiply(b1, m), alg.scalarMultiply(1 - b1, weight_grad));
+			u = alg.max(alg.scalarMultiply(b2, u), alg.abs(weight_grad));
+
+			std::vector<double> m_hat = alg.scalarMultiply(1 / (1 - pow(b1, epoch)), m);
+
+			weights = alg.subtraction(weights, alg.scalarMultiply(learning_rate, alg.elementWiseDivision(m_hat, u)));
+
+			// Calculating the bias gradients
+			bias -= learning_rate * alg.sum_elements(error) / outputMiniBatches[i].size(); // As normal
+			y_hat = Evaluate(inputMiniBatches[i]);
+
+			if (UI) {
+				MLPPUtilities::CostInfo(epoch, cost_prev, Cost(y_hat, outputMiniBatches[i]));
+				MLPPUtilities::UI(weights, bias);
+			}
+		}
+		epoch++;
+		if (epoch > max_epoch) {
+			break;
+		}
+	}
+	forwardPass();
+}
+
+void MLPPLinReg::Nadam(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI) {
+	MLPPLinAlg alg;
+	MLPPReg regularization;
+	double cost_prev = 0;
+	int epoch = 1;
+
+	// Creating the mini-batches
+	int n_mini_batch = n / mini_batch_size;
+	auto [inputMiniBatches, outputMiniBatches] = MLPPUtilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
+
+	// Initializing necessary components for Nadam.
+	std::vector<double> m = alg.zerovec(weights.size());
+	std::vector<double> v = alg.zerovec(weights.size());
+	std::vector<double> m_final = alg.zerovec(weights.size());
+	while (true) {
+		for (int i = 0; i < n_mini_batch; i++) {
+			std::vector<double> y_hat = Evaluate(inputMiniBatches[i]);
+			cost_prev = Cost(y_hat, outputMiniBatches[i]);
+
+			std::vector<double> error = alg.subtraction(y_hat, outputMiniBatches[i]);
+
+			// Calculating the weight gradients
+			std::vector<double> gradient = alg.scalarMultiply(1.0 / outputMiniBatches[i].size(), alg.mat_vec_mult(alg.transpose(inputMiniBatches[i]), error));
+			std::vector<double> RegDerivTerm = regularization.regDerivTerm(weights, lambda, alpha, reg);
+			std::vector<double> weight_grad = alg.addition(gradient, RegDerivTerm); // Weight_grad_final
+
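+			// Nadam: Adam-style moments with a Nesterov correction folded into m_final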
+			m = alg.addition(alg.scalarMultiply(b1, m), alg.scalarMultiply(1 - b1, weight_grad));
+			v = alg.addition(alg.scalarMultiply(b2, v), alg.scalarMultiply(1 - b2, alg.exponentiate(weight_grad, 2)));
+			m_final = alg.addition(alg.scalarMultiply(b1, m), alg.scalarMultiply((1 - b1) / (1 - pow(b1, epoch)), weight_grad));
+
+			std::vector<double> m_hat = alg.scalarMultiply(1 / (1 - pow(b1, epoch)), m);
+			std::vector<double> v_hat = alg.scalarMultiply(1 / (1 - pow(b2, epoch)), v);
+
+			weights = alg.subtraction(weights, alg.scalarMultiply(learning_rate, alg.elementWiseDivision(m_final, alg.scalarAdd(e, alg.sqrt(v_hat)))));
+
+			// Calculating the bias gradients
+			bias -= learning_rate * alg.sum_elements(error) / outputMiniBatches[i].size(); // As normal
+			y_hat = Evaluate(inputMiniBatches[i]);
+
+			if (UI) {
+				MLPPUtilities::CostInfo(epoch, cost_prev, Cost(y_hat, outputMiniBatches[i]));
+				MLPPUtilities::UI(weights, bias);
+			}
+		}
+		epoch++;
+		if (epoch > max_epoch) {
+			break;
+		}
+	}
+	forwardPass();
+}
+
 void MLPPLinReg::normalEquation() {
 	MLPPLinAlg alg;
-	MLPPStat  stat;
+	MLPPStat stat;
 	std::vector<double> x_means;
 	std::vector<std::vector<double>> inputSetT = alg.transpose(inputSet);
 
@@ -185,35 +518,37 @@ void MLPPLinReg::normalEquation() {
 	}
 
 	//try {
-		std::vector<double> temp;
-		temp.resize(k);
-		temp = alg.mat_vec_mult(alg.inverse(alg.matmult(alg.transpose(inputSet), inputSet)), alg.mat_vec_mult(alg.transpose(inputSet), outputSet));
-		if (std::isnan(temp[0])) {
-			//throw 99;
-			//TODO ERR_FAIL_COND
+	std::vector<double> temp;
+	temp.resize(k);
+	temp = alg.mat_vec_mult(alg.inverse(alg.matmult(alg.transpose(inputSet), inputSet)), alg.mat_vec_mult(alg.transpose(inputSet), outputSet));
+	if (std::isnan(temp[0])) {
+		//throw 99;
+		//TODO ERR_FAIL_COND
+		std::cout << "ERR: Resulting matrix was noninvertible/degenerate, and so the normal equation could not be performed. Try utilizing gradient descent." << std::endl;
+		return;
+	} else {
+		if (reg == "Ridge") {
+			weights = alg.mat_vec_mult(alg.inverse(alg.addition(alg.matmult(alg.transpose(inputSet), inputSet), alg.scalarMultiply(lambda, alg.identity(k)))), alg.mat_vec_mult(alg.transpose(inputSet), outputSet));
 		} else {
-			if (reg == "Ridge") {
-				weights = alg.mat_vec_mult(alg.inverse(alg.addition(alg.matmult(alg.transpose(inputSet), inputSet), alg.scalarMultiply(lambda, alg.identity(k)))), alg.mat_vec_mult(alg.transpose(inputSet), outputSet));
-			} else {
-				weights = alg.mat_vec_mult(alg.inverse(alg.matmult(alg.transpose(inputSet), inputSet)), alg.mat_vec_mult(alg.transpose(inputSet), outputSet));
-			}
-
-			bias = stat.mean(outputSet) - alg.dot(weights, x_means);
-
-			forwardPass();
+			weights = alg.mat_vec_mult(alg.inverse(alg.matmult(alg.transpose(inputSet), inputSet)), alg.mat_vec_mult(alg.transpose(inputSet), outputSet));
 		}
+
+		bias = stat.mean(outputSet) - alg.dot(weights, x_means);
+
+		forwardPass();
+	}
 	//} catch (int err_num) {
 	//	std::cout << "ERR " << err_num << ": Resulting matrix was noninvertible/degenerate, and so the normal equation could not be performed. Try utilizing gradient descent." << std::endl;
 	//}
 }
 
 double MLPPLinReg::score() {
-	MLPPUtilities   util;
+	MLPPUtilities util;
 	return util.performance(y_hat, outputSet);
 }
 
 void MLPPLinReg::save(std::string fileName) {
-	MLPPUtilities   util;
+	MLPPUtilities util;
 	util.saveParameters(fileName, weights, bias);
 }
 
diff --git a/mlpp/lin_reg/lin_reg.h b/mlpp/lin_reg/lin_reg.h
index af45fe4..2882fe7 100644
--- a/mlpp/lin_reg/lin_reg.h
+++ b/mlpp/lin_reg/lin_reg.h
@@ -11,7 +11,6 @@
 #include <string>
 #include <vector>
 
-
 class MLPPLinReg {
 public:
 	MLPPLinReg(std::vector<std::vector<double>> inputSet, std::vector<double> outputSet, std::string reg = "None", double lambda = 0.5, double alpha = 0.5);
@@ -20,6 +19,15 @@ public:
 	void NewtonRaphson(double learning_rate, int max_epoch, bool UI);
 	void gradientDescent(double learning_rate, int max_epoch, bool UI = 1);
 	void SGD(double learning_rate, int max_epoch, bool UI = 1);
+
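+	// gamma: momentum decay; b1, b2: exponential decay rates for the moment estimates; e: numerical-stability epsilon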
+	void Momentum(double learning_rate, int max_epoch, int mini_batch_size, double gamma, bool UI = 1);
+	void NAG(double learning_rate, int max_epoch, int mini_batch_size, double gamma, bool UI = 1);
+	void Adagrad(double learning_rate, int max_epoch, int mini_batch_size, double e, bool UI = 1);
+	void Adadelta(double learning_rate, int max_epoch, int mini_batch_size, double b1, double e, bool UI = 1);
+	void Adam(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI = 1);
+	void Adamax(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI = 1);
+	void Nadam(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI = 1);
+
 	void MBGD(double learning_rate, int max_epoch, int mini_batch_size, bool UI = 1);
 	void normalEquation();
 	double score();
@@ -47,5 +55,4 @@ private:
 	int alpha; /* This is the controlling param for Elastic Net*/
 };
 
-
 #endif /* LinReg_hpp */