Ported all tests.

Relintai 2023-01-26 14:52:49 +01:00
parent b398337558
commit 2dabbb42be
6 changed files with 779 additions and 469 deletions

View File

@ -200,6 +200,53 @@ void MLPPData::set_data_simple(const String &file_name, std::vector<double> &inp
memdelete(file);
}
MLPPData::SplitComplexData MLPPData::train_test_split(const Ref<MLPPDataComplex> &data, double test_size) {
SplitComplexData res;
res.train.instance();
res.test.instance();
ERR_FAIL_COND_V(!data.is_valid(), res);
int is = MIN(data->input.size(), data->output.size());
Array indices;
indices.resize(is);
for (int i = 0; i < is; ++i) {
indices[i] = i;
}
indices.shuffle();
int test_input_number = test_size * is; // implicit floor via int truncation
for (int i = 0; i < test_input_number; ++i) {
int index = indices[i]; // use the shuffled index (not the loop counter) so the split is randomized
res.test->input.push_back(data->input[index]);
res.test->output.push_back(data->output[index]);
}
for (int i = test_input_number; i < is; ++i) {
int index = indices[i];
res.train->input.push_back(data->input[index]);
res.train->output.push_back(data->output[index]);
}
return res;
}
Array MLPPData::train_test_split_bind(const Ref<MLPPDataComplex> &data, double test_size) {
SplitComplexData res = train_test_split(data, test_size);
Array arr;
arr.push_back(res.train);
arr.push_back(res.test);
return arr;
}
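// Editor's usage sketch (hypothetical; not part of this commit): splitting a
// populated dataset 80/20 with the new API. `dataset` and `data_util` are
// illustrative names.
void example_train_test_split(const Ref<MLPPDataComplex> &dataset) {
MLPPData data_util;
MLPPData::SplitComplexData split = data_util.train_test_split(dataset, 0.2);
// split.train now holds roughly 80% of the rows and split.test the remaining
// ~20%, selected through the shuffled index array above.
}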
// Loading Datasets
std::tuple<std::vector<std::vector<double>>, std::vector<double>> MLPPData::loadBreastCancer() {
const int BREAST_CANCER_SIZE = 30; // k = 30
@ -280,6 +327,9 @@ std::tuple<std::vector<double>, std::vector<double>> MLPPData::loadFiresAndCrime
return { inputSet, outputSet };
}
// Note: inputs and outputs should (technically) be kept together as pairs, but this
// implementation separates them when shuffling. (The ported implementation above keeps them tied together.)
// It is not yet clear whether this is intentional (or something like a compiler-specific difference).
std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>, std::vector<std::vector<double>>, std::vector<std::vector<double>>> MLPPData::trainTestSplit(std::vector<std::vector<double>> inputSet, std::vector<std::vector<double>> outputSet, double testSize) {
std::random_device rd;
std::default_random_engine generator(rd());
@ -817,6 +867,73 @@ std::tuple<std::vector<std::vector<double>>, std::vector<std::string>> MLPPData:
return { wordEmbeddings, wordList };
}
MLPPData::WordsToVecResult MLPPData::word_to_vec(std::vector<std::string> sentences, std::string type, int windowSize, int dimension, double learning_rate, int max_epoch) {
WordsToVecResult res;
res.word_list = removeNullByte(removeStopWords(createWordList(sentences)));
std::vector<std::vector<std::string>> segmented_sentences;
segmented_sentences.resize(sentences.size());
for (int i = 0; i < sentences.size(); i++) {
segmented_sentences[i] = removeStopWords(sentences[i]);
}
std::vector<std::string> inputStrings;
std::vector<std::string> outputStrings;
for (int i = 0; i < segmented_sentences.size(); i++) {
for (int j = 0; j < segmented_sentences[i].size(); j++) {
for (int k = windowSize; k > 0; k--) {
if (j - k >= 0) {
inputStrings.push_back(segmented_sentences[i][j]);
outputStrings.push_back(segmented_sentences[i][j - k]);
}
if (j + k <= segmented_sentences[i].size() - 1) {
inputStrings.push_back(segmented_sentences[i][j]);
outputStrings.push_back(segmented_sentences[i][j + k]);
}
}
}
}
int inputSize = inputStrings.size();
inputStrings.insert(inputStrings.end(), outputStrings.begin(), outputStrings.end());
std::vector<std::vector<double>> BOW = MLPPData::BOW(inputStrings, "Binary");
std::vector<std::vector<double>> inputSet;
std::vector<std::vector<double>> outputSet;
for (int i = 0; i < inputSize; i++) {
inputSet.push_back(BOW[i]);
}
for (int i = inputSize; i < BOW.size(); i++) {
outputSet.push_back(BOW[i]);
}
MLPPLinAlg alg;
MLPPSoftmaxNet *model;
if (type == "Skipgram") {
model = new MLPPSoftmaxNet(outputSet, inputSet, dimension);
} else { // CBOW; we keep it as the default.
model = new MLPPSoftmaxNet(inputSet, outputSet, dimension);
}
model->gradientDescent(learning_rate, max_epoch, false);
res.word_embeddings = model->getEmbeddings();
delete model;
return res;
}
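// Editor's usage sketch (hypothetical; not part of this commit): training CBOW
// embeddings on a toy corpus. The corpus and hyperparameters are illustrative,
// not tuned.
void example_word_to_vec() {
MLPPData data;
std::vector<std::string> corpus = {
"the cat sat on the mat",
"the dog sat on the rug",
};
// window size 2, embedding dimension 2, learning rate 0.1, 50 epochs
MLPPData::WordsToVecResult res = data.word_to_vec(corpus, "CBOW", 2, 2, 0.1, 50);
// res.word_list names the vocabulary entries paired with the rows of
// res.word_embeddings.
}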
std::vector<std::vector<double>> MLPPData::LSA(std::vector<std::string> sentences, int dim) {
MLPPLinAlg alg;
std::vector<std::vector<double>> docWordData = BOW(sentences, "Binary");
@ -946,4 +1063,6 @@ void MLPPData::_bind_methods() {
ClassDB::bind_method(D_METHOD("load_mnist_test", "path"), &MLPPData::load_mnist_test);
ClassDB::bind_method(D_METHOD("load_california_housing", "path"), &MLPPData::load_california_housing);
ClassDB::bind_method(D_METHOD("load_fires_and_crime", "path"), &MLPPData::load_fires_and_crime);
ClassDB::bind_method(D_METHOD("train_test_split", "data", "test_size"), &MLPPData::train_test_split_bind);
}

View File

@ -10,6 +10,7 @@
//
#include "core/string/ustring.h"
#include "core/variant/array.h"
#include "core/object/reference.h" #include "core/object/reference.h"
@ -68,6 +69,14 @@ public:
void set_data_unsupervised(int k, const String &file_name, std::vector<std::vector<double>> &inputSet);
void set_data_simple(const String &file_name, std::vector<double> &inputSet, std::vector<double> &outputSet);
struct SplitComplexData {
Ref<MLPPDataComplex> train;
Ref<MLPPDataComplex> test;
};
SplitComplexData train_test_split(const Ref<MLPPDataComplex> &data, double test_size);
Array train_test_split_bind(const Ref<MLPPDataComplex> &data, double test_size);
// Load Datasets
std::tuple<std::vector<std::vector<double>>, std::vector<double>> loadBreastCancer();
std::tuple<std::vector<std::vector<double>>, std::vector<double>> loadBreastCancerSVC();
@ -114,7 +123,16 @@ public:
std::vector<std::vector<double>> BOW(std::vector<std::string> sentences, std::string = "Default");
std::vector<std::vector<double>> TFIDF(std::vector<std::string> sentences);
std::tuple<std::vector<std::vector<double>>, std::vector<std::string>> word2Vec(std::vector<std::string> sentences, std::string type, int windowSize, int dimension, double learning_rate, int max_epoch);
struct WordsToVecResult {
std::vector<std::vector<double>> word_embeddings;
std::vector<std::string> word_list;
};
WordsToVecResult word_to_vec(std::vector<std::string> sentences, std::string type, int windowSize, int dimension, double learning_rate, int max_epoch);
std::vector<std::vector<double>> LSA(std::vector<std::string> sentences, int dim);
std::vector<std::string> createWordList(std::vector<std::string> sentences);

View File

@ -11,8 +11,6 @@
#include <map>
#include <random>
std::vector<std::vector<double>> MLPPLinAlg::gramMatrix(std::vector<std::vector<double>> A) {
return matmult(transpose(A), A); // AtA
}
@ -507,7 +505,7 @@ std::vector<std::vector<double>> MLPPLinAlg::identity(double d) {
}
std::vector<std::vector<double>> MLPPLinAlg::cov(std::vector<std::vector<double>> A) {
MLPPStat stat;
std::vector<std::vector<double>> covMat;
covMat.resize(A.size());
for (int i = 0; i < covMat.size(); i++) {
@ -641,6 +639,131 @@ std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>> M
return { eigenvectors, a_new };
}
MLPPLinAlg::EigenResult MLPPLinAlg::eigen(std::vector<std::vector<double>> A) {
/*
A (the entered parameter) will in most use cases be X'X, XX', etc., and must be symmetric.
That simply means that 1) A' = A and 2) A is a square matrix. This function computes the
eigenvalues of a matrix using Jacobi's method.
*/
bool diagonal = true; // Iterate the Jacobi rotations until A becomes a diagonal matrix, whose entries are the eigenvalues.
std::map<int, int> val_to_vec;
std::vector<std::vector<double>> a_new;
std::vector<std::vector<double>> eigenvectors = identity(A.size());
do {
double a_ij = A[0][1];
int sub_i = 0;
int sub_j = 1;
for (int i = 0; i < A.size(); i++) {
for (int j = 0; j < A[i].size(); j++) {
if (i != j && std::abs(A[i][j]) > std::abs(a_ij)) { // track the off-diagonal pivot with the largest absolute value
a_ij = A[i][j];
sub_i = i;
sub_j = j;
} else if (i != j && std::abs(A[i][j]) == std::abs(a_ij)) {
if (i < sub_i) {
a_ij = A[i][j];
sub_i = i;
sub_j = j;
}
}
}
}
double a_ii = A[sub_i][sub_i];
double a_jj = A[sub_j][sub_j];
double a_ji = A[sub_j][sub_i];
double theta;
if (a_ii == a_jj) {
theta = M_PI / 4;
} else {
theta = 0.5 * atan(2 * a_ij / (a_ii - a_jj));
}
std::vector<std::vector<double>> P = identity(A.size());
P[sub_i][sub_j] = -std::sin(theta);
P[sub_i][sub_i] = std::cos(theta);
P[sub_j][sub_j] = std::cos(theta);
P[sub_j][sub_i] = std::sin(theta);
a_new = matmult(matmult(inverse(P), A), P);
for (int i = 0; i < a_new.size(); i++) {
for (int j = 0; j < a_new[i].size(); j++) {
if (i != j && std::round(a_new[i][j]) == 0) {
a_new[i][j] = 0;
}
}
}
bool non_zero = false;
for (int i = 0; i < a_new.size(); i++) {
for (int j = 0; j < a_new[i].size(); j++) {
if (i != j && std::round(a_new[i][j]) != 0) {
non_zero = true;
}
}
}
if (non_zero) {
diagonal = false;
} else {
diagonal = true;
}
if (a_new == A) {
diagonal = true;
for (int i = 0; i < a_new.size(); i++) {
for (int j = 0; j < a_new[i].size(); j++) {
if (i != j) {
a_new[i][j] = 0;
}
}
}
}
eigenvectors = matmult(eigenvectors, P);
A = a_new;
} while (!diagonal);
std::vector<std::vector<double>> a_new_prior = a_new;
// Bubble Sort. Should change this later.
for (int i = 0; i < a_new.size() - 1; i++) {
for (int j = 0; j < a_new.size() - 1 - i; j++) {
if (a_new[j][j] < a_new[j + 1][j + 1]) {
double temp = a_new[j + 1][j + 1];
a_new[j + 1][j + 1] = a_new[j][j];
a_new[j][j] = temp;
}
}
}
for (int i = 0; i < a_new.size(); i++) {
for (int j = 0; j < a_new.size(); j++) {
if (a_new[i][i] == a_new_prior[j][j]) {
val_to_vec[i] = j;
}
}
}
std::vector<std::vector<double>> eigen_temp = eigenvectors;
for (int i = 0; i < eigenvectors.size(); i++) {
for (int j = 0; j < eigenvectors[i].size(); j++) {
eigenvectors[i][j] = eigen_temp[i][val_to_vec[j]];
}
}
EigenResult res;
res.eigen_vectors = eigenvectors;
res.eigen_values = a_new;
return res;
}
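// Editor's sanity check (hypothetical; not part of this commit): Jacobi
// iteration on a small symmetric matrix. {{2, 1}, {1, 2}} has eigenvalues
// 3 and 1, so eigen_values should converge to diag(3, 1) (descending order
// per the sort above).
void example_eigen() {
MLPPLinAlg alg;
MLPPLinAlg::EigenResult r = alg.eigen({ { 2, 1 }, { 1, 2 } });
// r.eigen_vectors should hold the eigenvectors (1, 1) and (1, -1), up to
// normalization and sign.
}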
std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>, std::vector<std::vector<double>>> MLPPLinAlg::SVD(std::vector<std::vector<double>> A) {
auto [left_eigenvecs, eigenvals] = eig(matmult(A, transpose(A)));
auto [right_eigenvecs, right_eigenvals] = eig(matmult(transpose(A), A));
@ -655,6 +778,26 @@ std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>, s
return { left_eigenvecs, sigma, right_eigenvecs };
}
MLPPLinAlg::SDVResult MLPPLinAlg::svd(std::vector<std::vector<double>> A) {
EigenResult left_eigen = eigen(matmult(A, transpose(A)));
EigenResult right_eigen = eigen(matmult(transpose(A), A));
std::vector<std::vector<double>> singularvals = sqrt(left_eigen.eigen_values);
std::vector<std::vector<double>> sigma = zeromat(A.size(), A[0].size());
for (int i = 0; i < singularvals.size(); i++) {
for (int j = 0; j < singularvals[i].size() && j < sigma[i].size(); j++) { // guard j so non-square inputs cannot write out of bounds
sigma[i][j] = singularvals[i][j];
}
}
SDVResult res;
res.U = left_eigen.eigen_vectors;
res.S = sigma;
res.Vt = right_eigen.eigen_vectors;
return res;
}
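// Editor's usage sketch (hypothetical; not part of this commit). For
// A = {{3, 0}, {4, 5}}, A * A^T has eigenvalues 45 and 5, so the diagonal of
// S should approach sqrt(45) and sqrt(5).
void example_svd() {
MLPPLinAlg alg;
MLPPLinAlg::SDVResult r = alg.svd({ { 3, 0 }, { 4, 5 } });
// r.U: left singular vectors; r.S: singular values; r.Vt: right singular
// vectors (filled directly from the right eigenvector matrix, so its
// orientation follows eigen()'s column convention).
}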
std::vector<double> MLPPLinAlg::vectorProjection(std::vector<double> a, std::vector<double> b) {
double product = dot(a, b) / dot(a, a);
return scalarMultiply(product, a); // Projection of vector b onto a. Denoted proj_a(b).
@ -686,6 +829,15 @@ std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>> M
return { Q, R };
}
MLPPLinAlg::QRDResult MLPPLinAlg::qrd(std::vector<std::vector<double>> A) {
QRDResult res;
res.Q = gramSchmidtProcess(A);
res.R = matmult(transpose(res.Q), A);
return res;
}
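// Editor's sanity check (hypothetical; not part of this commit): since
// R = Q^T * A by construction, Q * R should reproduce a full-rank square A
// when Q is orthonormal.
void example_qrd() {
MLPPLinAlg alg;
MLPPLinAlg::QRDResult r = alg.qrd({ { 1, 1 }, { 0, 1 } });
std::vector<std::vector<double>> reconstructed = alg.matmult(r.Q, r.R);
// reconstructed should match the input up to floating-point error.
}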
std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>> MLPPLinAlg::chol(std::vector<std::vector<double>> A) {
std::vector<std::vector<double>> L = zeromat(A.size(), A[0].size());
for (int j = 0; j < L.size(); j++) { // Matrices entered must be square. No problem here.
@ -708,6 +860,33 @@ std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>> M
return { L, transpose(L) }; // Indeed, L.T is our upper triangular matrix.
}
MLPPLinAlg::CholeskyResult MLPPLinAlg::cholesky(std::vector<std::vector<double>> A) {
std::vector<std::vector<double>> L = zeromat(A.size(), A[0].size());
for (int j = 0; j < L.size(); j++) { // Matrices entered must be square. No problem here.
for (int i = j; i < L.size(); i++) {
if (i == j) {
double sum = 0;
for (int k = 0; k < j; k++) {
sum += L[i][k] * L[i][k];
}
L[i][j] = std::sqrt(A[i][j] - sum);
} else { // That is, i!=j
double sum = 0;
for (int k = 0; k < j; k++) {
sum += L[i][k] * L[j][k];
}
L[i][j] = (A[i][j] - sum) / L[j][j];
}
}
}
CholeskyResult res;
res.L = L;
res.Lt = transpose(L); // Indeed, L.T is our upper triangular matrix.
return res;
}
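// Editor's sanity check (hypothetical; not part of this commit): for the
// symmetric positive-definite matrix {{4, 2}, {2, 3}}, L works out to
// {{2, 0}, {1, sqrt(2)}}, and L * Lt reproduces the input.
void example_cholesky() {
MLPPLinAlg alg;
MLPPLinAlg::CholeskyResult r = alg.cholesky({ { 4, 2 }, { 2, 3 } });
std::vector<std::vector<double>> reconstructed = alg.matmult(r.L, r.Lt);
// reconstructed should equal {{4, 2}, {2, 3}} up to floating-point error.
}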
double MLPPLinAlg::sum_elements(std::vector<std::vector<double>> A) {
double sum = 0;
for (int i = 0; i < A.size(); i++) {

View File

@ -11,7 +11,6 @@
#include <tuple>
#include <vector>
class MLPPLinAlg {
public:
// MATRIX FUNCTIONS
@ -98,16 +97,45 @@ public:
std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>> eig(std::vector<std::vector<double>> A);
struct EigenResult {
std::vector<std::vector<double>> eigen_vectors;
std::vector<std::vector<double>> eigen_values;
};
EigenResult eigen(std::vector<std::vector<double>> A);
std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>, std::vector<std::vector<double>>> SVD(std::vector<std::vector<double>> A);
struct SDVResult {
std::vector<std::vector<double>> U;
std::vector<std::vector<double>> S;
std::vector<std::vector<double>> Vt;
};
SDVResult svd(std::vector<std::vector<double>> A);
std::vector<double> vectorProjection(std::vector<double> a, std::vector<double> b);
std::vector<std::vector<double>> gramSchmidtProcess(std::vector<std::vector<double>> A);
std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>> QRD(std::vector<std::vector<double>> A);
struct QRDResult {
std::vector<std::vector<double>> Q;
std::vector<std::vector<double>> R;
};
QRDResult qrd(std::vector<std::vector<double>> A);
std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>> chol(std::vector<std::vector<double>> A);
struct CholeskyResult {
std::vector<std::vector<double>> L;
std::vector<std::vector<double>> Lt;
};
CholeskyResult cholesky(std::vector<std::vector<double>> A);
double sum_elements(std::vector<std::vector<double>> A);
std::vector<double> flatten(std::vector<std::vector<double>> A);
@ -231,6 +259,4 @@ public:
private:
};
#endif /* LinAlg_hpp */

File diff suppressed because it is too large.

View File

@ -44,7 +44,7 @@ public:
void test_dynamically_sized_mann(bool ui = false);
void test_train_test_split_mann(bool ui = false);
void test_naive_bayes();
void test_k_means(bool ui = false);
void test_knn(bool ui = false);