From 074af18c64750f82ed6dc365f1d2c5ae4fa0a3c1 Mon Sep 17 00:00:00 2001
From: Relintai
Date: Thu, 28 Dec 2023 21:06:16 +0100
Subject: [PATCH] Reworked more methods.

---
 mlpp/data/data.cpp      | 153 +++++++++++++++++++++-------------------
 mlpp/data/data.h        |   9 +--
 mlpp/pca/pca_old.cpp    |   4 +-
 test/mlpp_tests.cpp     |  22 +++---
 test/mlpp_tests_old.cpp |   1 -
 5 files changed, 95 insertions(+), 94 deletions(-)

diff --git a/mlpp/data/data.cpp b/mlpp/data/data.cpp
index 1705f5c..a3cbbcd 100644
--- a/mlpp/data/data.cpp
+++ b/mlpp/data/data.cpp
@@ -15,6 +15,7 @@
 #include "../lin_alg/lin_alg_old.h"
 #include "../softmax_net/softmax_net.h"
 #include "../stat/stat_old.h"
+#include "data_old.h"
 
 #include 
 #include 
@@ -430,9 +431,10 @@ std::tuple<std::vector<std::vector<real_t>>, std::vector<std::vector<real_t>>> M
 	const int ONE_HOT_NUM = 3;
 	std::vector<std::vector<real_t>> inputSet;
 	std::vector<real_t> tempOutputSet;
+	MLPPDataOld d;
 
 	setData(IRIS_SIZE, "/Users/marcmelikyan/Desktop/Data/Iris.csv", inputSet, tempOutputSet);
-	std::vector<std::vector<real_t>> outputSet = oneHotRep(tempOutputSet, ONE_HOT_NUM);
+	std::vector<std::vector<real_t>> outputSet = d.oneHotRep(tempOutputSet, ONE_HOT_NUM);
 
 	return { inputSet, outputSet };
 }
@@ -441,9 +443,10 @@ std::tuple<std::vector<std::vector<real_t>>, std::vector<std::vector<real_t>>> M
 	const int ONE_HOT_NUM = 3;
 	std::vector<std::vector<real_t>> inputSet;
 	std::vector<real_t> tempOutputSet;
+	MLPPDataOld d;
 
 	setData(WINE_SIZE, "MLPP/Data/Datasets/Iris.csv", inputSet, tempOutputSet);
-	std::vector<std::vector<real_t>> outputSet = oneHotRep(tempOutputSet, ONE_HOT_NUM);
+	std::vector<std::vector<real_t>> outputSet = d.oneHotRep(tempOutputSet, ONE_HOT_NUM);
 
 	return { inputSet, outputSet };
 }
@@ -452,9 +455,10 @@ std::tuple<std::vector<std::vector<real_t>>, std::vector<std::vector<real_t>>> M
 	const int ONE_HOT_NUM = 10;
 	std::vector<std::vector<real_t>> inputSet;
 	std::vector<real_t> tempOutputSet;
+	MLPPDataOld d;
 
 	setData(MNIST_SIZE, "MLPP/Data/Datasets/MnistTrain.csv", inputSet, tempOutputSet);
-	std::vector<std::vector<real_t>> outputSet = oneHotRep(tempOutputSet, ONE_HOT_NUM);
+	std::vector<std::vector<real_t>> outputSet = d.oneHotRep(tempOutputSet, ONE_HOT_NUM);
 
 	return { inputSet, outputSet };
 }
@@ -463,9 +467,10 @@ std::tuple<std::vector<std::vector<real_t>>, std::vector<std::vector<real_t>>> M
 	const int ONE_HOT_NUM = 10;
 	std::vector<std::vector<real_t>> inputSet;
 	std::vector<real_t> tempOutputSet;
+	MLPPDataOld d;
 
 	setData(MNIST_SIZE, "MLPP/Data/Datasets/MnistTest.csv", inputSet, tempOutputSet);
-	std::vector<std::vector<real_t>> outputSet = oneHotRep(tempOutputSet, ONE_HOT_NUM);
+	std::vector<std::vector<real_t>> outputSet = d.oneHotRep(tempOutputSet, ONE_HOT_NUM);
 
 	return { inputSet, outputSet };
 }
@@ -1117,80 +1122,40 @@ void MLPPData::setInputNames(std::string fileName, std::vector<std::string> &inp
 	dataFile.close();
 }
 
-std::vector<std::vector<real_t>> MLPPData::featureScaling(std::vector<std::vector<real_t>> X) {
-	MLPPLinAlgOld alg;
-	X = alg.transpose(X);
-	std::vector<real_t> max_elements, min_elements;
-	max_elements.resize(X.size());
-	min_elements.resize(X.size());
+Ref<MLPPMatrix> MLPPData::feature_scaling(const Ref<MLPPMatrix> &p_X) {
+	Ref<MLPPMatrix> X = p_X->transposen();
 
-	for (uint32_t i = 0; i < X.size(); i++) {
-		max_elements[i] = alg.max(X[i]);
-		min_elements[i] = alg.min(X[i]);
+	Size2i x_size = X->size();
+
+	LocalVector<real_t> max_elements;
+	LocalVector<real_t> min_elements;
+
+	max_elements.resize(x_size.y);
+	min_elements.resize(x_size.y);
+
+	Ref<MLPPVector> row_tmp;
+	row_tmp.instance();
+	row_tmp->resize(x_size.x);
+
+	for (int i = 0; i < x_size.y; ++i) {
+		X->row_get_into_mlpp_vector(i, row_tmp);
+
+		max_elements[i] = row_tmp->max_element();
+		min_elements[i] = row_tmp->min_element();
 	}
 
-	for (uint32_t i = 0; i < X.size(); i++) {
-		for (uint32_t j = 0; j < X[i].size(); j++) {
-			X[i][j] = (X[i][j] - min_elements[i]) / (max_elements[i] - min_elements[i]);
+	for (int i = 0; i < x_size.y; i++) {
+		real_t maxe = max_elements[i];
+		real_t mine = min_elements[i];
+
+		for (int j = 0; j < x_size.x; j++) {
+			real_t xij = X->element_get(i, j);
+
+			X->element_set(i, j, (xij - mine) / (maxe - mine));
 		}
 	}
-	return alg.transpose(X);
-}
 
-std::vector<std::vector<real_t>> MLPPData::meanNormalization(std::vector<std::vector<real_t>> X) {
-	MLPPLinAlgOld alg;
-	MLPPStatOld stat;
-	// (X_j - mu_j) / std_j, for every j
-
-	X = meanCentering(X);
-	for (uint32_t i = 0; i < X.size(); i++) {
-		X[i] = alg.scalarMultiply(1 / stat.standardDeviation(X[i]), X[i]);
-	}
-	return X;
-}
-
-std::vector<std::vector<real_t>> MLPPData::meanCentering(std::vector<std::vector<real_t>> X) {
-	MLPPStatOld stat;
-	for (uint32_t i = 0; i < X.size(); i++) {
-		real_t mean_i = stat.mean(X[i]);
-		for (uint32_t j = 0; j < X[i].size(); j++) {
-			X[i][j] -= mean_i;
-		}
-	}
-	return X;
-}
-
-std::vector<std::vector<real_t>> MLPPData::oneHotRep(std::vector<real_t> tempOutputSet, int n_class) {
-	std::vector<std::vector<real_t>> outputSet;
-	outputSet.resize(tempOutputSet.size());
-	for (uint32_t i = 0; i < tempOutputSet.size(); i++) {
-		for (int j = 0; j <= n_class - 1; j++) {
-			if (tempOutputSet[i] == j) {
-				outputSet[i].push_back(1);
-			} else {
-				outputSet[i].push_back(0);
-			}
-		}
-	}
-	return outputSet;
-}
-
-std::vector<real_t> MLPPData::reverseOneHot(std::vector<std::vector<real_t>> tempOutputSet) {
-	std::vector<real_t> outputSet;
-	//uint32_t n_class = tempOutputSet[0].size();
-	for (uint32_t i = 0; i < tempOutputSet.size(); i++) {
-		int current_class = 1;
-		for (uint32_t j = 0; j < tempOutputSet[i].size(); j++) {
-			if (tempOutputSet[i][j] == 1) {
-				break;
-			} else {
-				current_class++;
-			}
-		}
-		outputSet.push_back(current_class);
-	}
-
-	return outputSet;
+	return X->transposen();
 }
 
 Ref<MLPPMatrix> MLPPData::mean_centering(const Ref<MLPPMatrix> &p_X) {
@@ -1207,7 +1172,7 @@ Ref<MLPPMatrix> MLPPData::mean_centering(const Ref<MLPPMatrix> &p_X) {
 	x_row_tmp->resize(x_size.x);
 
 	for (int i = 0; i < x_size.y; ++i) {
-		X->row_get_into_mlpp_vector(i, x_row_tmp);
+		p_X->row_get_into_mlpp_vector(i, x_row_tmp);
 
 		real_t mean_i = stat.meanv(x_row_tmp);
 
@@ -1219,6 +1184,30 @@ Ref<MLPPMatrix> MLPPData::mean_centering(const Ref<MLPPMatrix> &p_X) {
 	return X;
 }
 
+Ref<MLPPMatrix> MLPPData::mean_normalization(const Ref<MLPPMatrix> &p_X) {
+	MLPPLinAlg alg;
+	MLPPStat stat;
+
+	// (X_j - mu_j) / std_j, for every j
+
+	Ref<MLPPMatrix> X = mean_centering(p_X);
+	Size2i x_size = X->size();
+
+	Ref<MLPPVector> x_row_tmp;
+	x_row_tmp.instance();
+	x_row_tmp->resize(x_size.x);
+
+	for (int i = 0; i < x_size.y; i++) {
+		X->row_get_into_mlpp_vector(i, x_row_tmp);
+
+		x_row_tmp->scalar_multiply((real_t)1 / stat.standard_deviationv(x_row_tmp));
+
+		X->row_set_mlpp_vector(i, x_row_tmp);
+	}
+
+	return X;
+}
+
 Ref<MLPPMatrix> MLPPData::one_hot_rep(const Ref<MLPPVector> &temp_output_set, int n_class) {
 	ERR_FAIL_COND_V(!temp_output_set.is_valid(), Ref<MLPPMatrix>());
 
@@ -1243,6 +1232,24 @@ Ref<MLPPMatrix> MLPPData::one_hot_rep(const Ref<MLPPVector> &temp_output_set, in
 	return output_set;
 }
 
+std::vector<real_t> MLPPData::reverseOneHot(std::vector<std::vector<real_t>> tempOutputSet) {
+	std::vector<real_t> outputSet;
+	//uint32_t n_class = tempOutputSet[0].size();
+	for (uint32_t i = 0; i < tempOutputSet.size(); i++) {
+		int current_class = 1;
+		for (uint32_t j = 0; j < tempOutputSet[i].size(); j++) {
+			if (tempOutputSet[i][j] == 1) {
+				break;
+			} else {
+				current_class++;
+			}
+		}
+		outputSet.push_back(current_class);
+	}
+
+	return outputSet;
+}
+
 void MLPPData::load_default_suffixes() {
 	// Our list of suffixes which we use to compare against
 	suffixes = String("eer er ion ity ment ness or sion ship th able ible al ant ary ful ic ious ous ive less y ed en ing ize ise ly ward wise").split_spaces();
diff --git a/mlpp/data/data.h b/mlpp/data/data.h
index bb8b7e1..565eef1 100644
--- a/mlpp/data/data.h
+++ b/mlpp/data/data.h
@@ -176,14 +176,11 @@ public:
 
 	// Extra
 	void setInputNames(std::string fileName, std::vector<std::string> &inputNames);
-	std::vector<std::vector<real_t>> featureScaling(std::vector<std::vector<real_t>> X);
-	std::vector<std::vector<real_t>> meanNormalization(std::vector<std::vector<real_t>> X);
-	std::vector<std::vector<real_t>> meanCentering(std::vector<std::vector<real_t>> X);
-	std::vector<std::vector<real_t>> oneHotRep(std::vector<real_t> tempOutputSet, int n_class);
-	std::vector<real_t> reverseOneHot(std::vector<std::vector<real_t>> tempOutputSet);
-
+	Ref<MLPPMatrix> feature_scaling(const Ref<MLPPMatrix> &X);
 	Ref<MLPPMatrix> mean_centering(const Ref<MLPPMatrix> &X);
+	Ref<MLPPMatrix> mean_normalization(const Ref<MLPPMatrix> &X);
 	Ref<MLPPMatrix> one_hot_rep(const Ref<MLPPVector> &temp_output_set, int n_class);
+	std::vector<real_t> reverseOneHot(std::vector<std::vector<real_t>> tempOutputSet);
 
 	template <class T>
 	std::vector<T> vecToSet(std::vector<T> inputSet) {
diff --git a/mlpp/pca/pca_old.cpp b/mlpp/pca/pca_old.cpp
index 2f916cf..1cd3b1b 100644
--- a/mlpp/pca/pca_old.cpp
+++ b/mlpp/pca/pca_old.cpp
@@ -5,7 +5,7 @@
 //
 
 #include "pca_old.h"
-#include "../data/data.h"
+#include "../data/data_old.h"
 #include "../lin_alg/lin_alg_old.h"
 
 #include 
@@ -19,7 +19,7 @@ MLPPPCAOld::MLPPPCAOld(std::vector<std::vector<real_t>> inputSet, int k) :
 
 std::vector<std::vector<real_t>> MLPPPCAOld::principalComponents() {
 	MLPPLinAlgOld alg;
-	MLPPData data;
+	MLPPDataOld data;
 
 	MLPPLinAlgOld::SVDResultOld svr_res = alg.SVD(alg.cov(inputSet));
 	X_normalized = data.meanCentering(inputSet);
diff --git a/test/mlpp_tests.cpp b/test/mlpp_tests.cpp
index 2c815d4..f8c00f9 100644
--- a/test/mlpp_tests.cpp
+++ b/test/mlpp_tests.cpp
@@ -978,20 +978,18 @@ void MLPPTests::test_nlp_and_data(bool ui) {
 	PLOG_MSG("LSA:");
 	PLOG_MSG(data.lsa(text_archive2, 2)->to_string());
 
-	/*
-	std::vector<std::vector<real_t>> inputSet = { { 1, 2 }, { 2, 3 }, { 3, 4 }, { 4, 5 }, { 5, 6 } };
-	std::cout << "Feature Scaling Example:" << std::endl;
-	alg.printMatrix(data.featureScaling(inputSet));
-	std::cout << std::endl;
+	std::vector<std::vector<real_t>> input_set_vec = { { 1, 2 }, { 2, 3 }, { 3, 4 }, { 4, 5 }, { 5, 6 } };
 
-	std::cout << "Mean Centering Example:" << std::endl;
-	alg.printMatrix(data.meanCentering(inputSet));
-	std::cout << std::endl;
+	Ref<MLPPMatrix> input_set = Ref<MLPPMatrix>(memnew(MLPPMatrix(input_set_vec)));
 
-	std::cout << "Mean Normalization Example:" << std::endl;
-	alg.printMatrix(data.meanNormalization(inputSet));
-	std::cout << std::endl;
-	*/
+	PLOG_MSG("Feature Scaling Example:");
+	PLOG_MSG(data.feature_scaling(input_set)->to_string());
+
+	PLOG_MSG("Mean Centering Example:");
+	PLOG_MSG(data.mean_centering(input_set)->to_string());
+
+	PLOG_MSG("Mean Normalization Example:");
+	PLOG_MSG(data.mean_normalization(input_set)->to_string());
 }
 void MLPPTests::test_outlier_finder(bool ui) {
 	MLPPLinAlg alg;
diff --git a/test/mlpp_tests_old.cpp b/test/mlpp_tests_old.cpp
index 8d1c23e..57173c4 100644
--- a/test/mlpp_tests_old.cpp
+++ b/test/mlpp_tests_old.cpp
@@ -400,7 +400,6 @@ void MLPPTestsOld::test_outlier_finder(bool ui) {
 	MLPPLinAlgOld alg;
 
 	// Outlier Finder
-	//std::vector<real_t> inputSet = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 23554332523523 };
 	std::vector<real_t> inputSet = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 23554332 };
 	MLPPOutlierFinderOld outlierFinderOld(2); // Any datapoint outside of 2 stds from the mean is marked as an outlier.
 	alg.printVector(outlierFinderOld.modelTest(inputSet));