Ported all tests.

This commit is contained in:
Relintai 2023-01-26 14:52:49 +01:00
parent b398337558
commit 2dabbb42be
6 changed files with 779 additions and 469 deletions

View File

@ -200,6 +200,53 @@ void MLPPData::set_data_simple(const String &file_name, std::vector<double> &inp
memdelete(file);
}
// Randomly partitions `data` into a training and a test subset.
//
// Only the first min(input.size(), output.size()) samples are considered, so every
// sample that is copied has both an input and an output. The sample indices are
// shuffled, the first floor(test_size * count) shuffled indices go to `test`, and
// the remaining ones go to `train`. Each input stays paired with its output.
//
// `test_size` is the fraction of samples to put into the test set; the resulting
// count is clamped to [0, count] so out-of-range fractions cannot index past the data.
//
// Returns freshly instanced (empty) train / test sets if `data` is not valid.
MLPPData::SplitComplexData MLPPData::train_test_split(const Ref<MLPPDataComplex> &data, double test_size) {
	SplitComplexData res;

	res.train.instance();
	res.test.instance();

	ERR_FAIL_COND_V(!data.is_valid(), res);

	int is = MIN(data->input.size(), data->output.size());

	Array indices;
	indices.resize(is);

	for (int i = 0; i < is; ++i) {
		indices[i] = i;
	}

	indices.shuffle();

	int test_input_number = test_size * is; // implicit usage of floor

	// Keep the split point inside the index range even for test_size < 0 or > 1.
	if (test_input_number < 0) {
		test_input_number = 0;
	} else if (test_input_number > is) {
		test_input_number = is;
	}

	for (int i = 0; i < test_input_number; ++i) {
		// Look the sample up through the shuffled index, not the loop counter.
		int index = indices[i];

		res.test->input.push_back(data->input[index]);
		res.test->output.push_back(data->output[index]);
	}

	for (int i = test_input_number; i < is; ++i) {
		int index = indices[i];

		res.train->input.push_back(data->input[index]);
		res.train->output.push_back(data->output[index]);
	}

	return res;
}
// Wrapper around train_test_split() that packs the result into an Array:
// element 0 is the training set, element 1 is the test set.
Array MLPPData::train_test_split_bind(const Ref<MLPPDataComplex> &data, double test_size) {
	SplitComplexData split = train_test_split(data, test_size);

	Array packed;

	// Order matters: callers read [train, test].
	packed.push_back(split.train);
	packed.push_back(split.test);

	return packed;
}
// Loading Datasets
std::tuple<std::vector<std::vector<double>>, std::vector<double>> MLPPData::loadBreastCancer() {
const int BREAST_CANCER_SIZE = 30; // k = 30
@ -280,6 +327,9 @@ std::tuple<std::vector<double>, std::vector<double>> MLPPData::loadFiresAndCrime
return { inputSet, outputSet };
}
// Note that each input and its output should (technically) stay paired, but this
// implementation separates them. (My implementation keeps them tied together.)
// It is not yet clear whether this is intentional or something like a compiler-specific difference.
std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>, std::vector<std::vector<double>>, std::vector<std::vector<double>>> MLPPData::trainTestSplit(std::vector<std::vector<double>> inputSet, std::vector<std::vector<double>> outputSet, double testSize) {
std::random_device rd;
std::default_random_engine generator(rd());
@ -817,6 +867,73 @@ std::tuple<std::vector<std::vector<double>>, std::vector<std::string>> MLPPData:
return { wordEmbeddings, wordList };
}
// Pair of word embeddings and the word list they belong to.
// NOTE(review): this file-scope struct has the same name and fields as the
// WordsToVecResult declared inside MLPPData in the header, and word_to_vec()
// names the nested one as its return type — confirm whether this copy is needed.
struct WordsToVecResult {
// Embedding vectors.
std::vector<std::vector<double>> word_embeddings;
// Words.
std::vector<std::string> word_list;
};
// Trains word embeddings from `sentences` with a softmax network.
//
// sentences     - raw sentences; stop words are removed before pairing.
// type          - "Skipgram" trains context -> word; any other value is treated as CBOW.
// windowSize    - how many neighbours on each side of a word form (word, neighbour) pairs.
// dimension     - third argument passed to MLPPSoftmaxNet (presumably the embedding size).
// learning_rate - learning rate passed to gradientDescent().
// max_epoch     - epoch limit passed to gradientDescent().
//
// Returns the embeddings reported by the trained model together with the cleaned word list.
MLPPData::WordsToVecResult MLPPData::word_to_vec(std::vector<std::string> sentences, std::string type, int windowSize, int dimension, double learning_rate, int max_epoch) {
	WordsToVecResult res;

	res.word_list = removeNullByte(removeStopWords(createWordList(sentences)));

	std::vector<std::vector<std::string>> segmented_sentences;
	segmented_sentences.resize(sentences.size());

	for (size_t i = 0; i < sentences.size(); i++) {
		segmented_sentences[i] = removeStopWords(sentences[i]);
	}

	// Build (word, neighbour) pairs. For every word the window is walked from the
	// farthest neighbour to the nearest, left neighbour before right neighbour.
	std::vector<std::string> inputStrings;
	std::vector<std::string> outputStrings;

	for (size_t i = 0; i < segmented_sentences.size(); i++) {
		const std::vector<std::string> &words = segmented_sentences[i];
		const int word_count = static_cast<int>(words.size());

		for (int j = 0; j < word_count; j++) {
			for (int k = windowSize; k > 0; k--) {
				if (j - k >= 0) {
					inputStrings.push_back(words[j]);
					outputStrings.push_back(words[j - k]);
				}

				if (j + k < word_count) {
					inputStrings.push_back(words[j]);
					outputStrings.push_back(words[j + k]);
				}
			}
		}
	}

	// Encode inputs and outputs in one BOW call so both halves share one vocabulary,
	// then split the rows back apart at inputSize.
	const size_t inputSize = inputStrings.size();

	inputStrings.insert(inputStrings.end(), outputStrings.begin(), outputStrings.end());

	std::vector<std::vector<double>> bow = MLPPData::BOW(inputStrings, "Binary");

	std::vector<std::vector<double>> inputSet;
	std::vector<std::vector<double>> outputSet;
	inputSet.reserve(inputSize);

	for (size_t i = 0; i < inputSize; i++) {
		inputSet.push_back(bow[i]);
	}

	for (size_t i = inputSize; i < bow.size(); i++) {
		outputSet.push_back(bow[i]);
	}

	// Skipgram swaps the roles of the two sets; everything else is CBOW (the default).
	// The model lives on the stack so it is released on every exit path.
	const bool skipgram = (type == "Skipgram");
	MLPPSoftmaxNet model(skipgram ? outputSet : inputSet, skipgram ? inputSet : outputSet, dimension);

	model.gradientDescent(learning_rate, max_epoch, false);

	res.word_embeddings = model.getEmbeddings();

	return res;
}
std::vector<std::vector<double>> MLPPData::LSA(std::vector<std::string> sentences, int dim) {
MLPPLinAlg alg;
std::vector<std::vector<double>> docWordData = BOW(sentences, "Binary");
@ -946,4 +1063,6 @@ void MLPPData::_bind_methods() {
ClassDB::bind_method(D_METHOD("load_mnist_test", "path"), &MLPPData::load_mnist_test);
ClassDB::bind_method(D_METHOD("load_california_housing", "path"), &MLPPData::load_california_housing);
ClassDB::bind_method(D_METHOD("load_fires_and_crime", "path"), &MLPPData::load_fires_and_crime);
ClassDB::bind_method(D_METHOD("train_test_split", "data", "test_size"), &MLPPData::train_test_split_bind);
}

View File

@ -10,6 +10,7 @@
//
#include "core/string/ustring.h"
#include "core/variant/array.h"
#include "core/object/reference.h"
@ -68,6 +69,14 @@ public:
void set_data_unsupervised(int k, const String &file_name, std::vector<std::vector<double>> &inputSet);
void set_data_simple(const String &file_name, std::vector<double> &inputSet, std::vector<double> &outputSet);
// Result of train_test_split(): the two subsets produced from one data set.
struct SplitComplexData {
// Samples selected for training.
Ref<MLPPDataComplex> train;
// Samples selected for testing.
Ref<MLPPDataComplex> test;
};
// Splits `data` into train and test subsets. The test subset receives
// floor(test_size * count) samples, where count is the smaller of the input
// and output sizes; the rest go to the train subset.
SplitComplexData train_test_split(const Ref<MLPPDataComplex> &data, double test_size);
// Variant of train_test_split() registered with ClassDB under the name
// "train_test_split"; returns an Array holding [train, test].
Array train_test_split_bind(const Ref<MLPPDataComplex> &data, double test_size);
// Load Datasets
std::tuple<std::vector<std::vector<double>>, std::vector<double>> loadBreastCancer();
std::tuple<std::vector<std::vector<double>>, std::vector<double>> loadBreastCancerSVC();
@ -114,7 +123,16 @@ public:
std::vector<std::vector<double>> BOW(std::vector<std::string> sentences, std::string = "Default");
std::vector<std::vector<double>> TFIDF(std::vector<std::string> sentences);
std::tuple<std::vector<std::vector<double>>, std::vector<std::string>> word2Vec(std::vector<std::string> sentences, std::string type, int windowSize, int dimension, double learning_rate, int max_epoch);
// Result of word_to_vec().
struct WordsToVecResult {
// Embeddings reported by the trained softmax network.
std::vector<std::vector<double>> word_embeddings;
// Word list built from the sentences, with stop words and null bytes removed.
std::vector<std::string> word_list;
};
// Trains word embeddings and returns them in a struct. Takes the same parameter
// list as word2Vec(). A `type` of "Skipgram" swaps the network's input and
// output sets; any other value is handled as CBOW.
WordsToVecResult word_to_vec(std::vector<std::string> sentences, std::string type, int windowSize, int dimension, double learning_rate, int max_epoch);
std::vector<std::vector<double>> LSA(std::vector<std::string> sentences, int dim);
std::vector<std::string> createWordList(std::vector<std::string> sentences);

View File

@ -11,8 +11,6 @@
#include <map>
#include <random>
std::vector<std::vector<double>> MLPPLinAlg::gramMatrix(std::vector<std::vector<double>> A) {
return matmult(transpose(A), A); // AtA
}
@ -641,6 +639,131 @@ std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>> M
return { eigenvectors, a_new };
}
// Eigen-decomposition of a symmetric matrix using Jacobi's rotation method.
//
// A is expected to be symmetric (and therefore square); in most use cases it is
// X'X or XX'. Each iteration rotates away the off-diagonal entry with the largest
// magnitude until the matrix is (approximately) diagonal.
//
// Returns:
//   eigen_values  - matrix with the eigenvalues on its diagonal, sorted in descending order.
//   eigen_vectors - accumulated rotations, with column j belonging to eigen_values[j][j].
//
// Matrices with fewer than two rows have nothing to rotate and are returned as-is
// together with an identity matrix of the same size.
//
// NOTE(review): convergence is decided with std::round(), so off-diagonal entries
// with magnitude below 0.5 are treated as zero. That is coarse for matrices with
// small entries — consider a relative tolerance.
MLPPLinAlg::EigenResult MLPPLinAlg::eigen(std::vector<std::vector<double>> A) {
	const int n = static_cast<int>(A.size());

	if (n < 2) {
		// A[0][1] below would be out of bounds.
		EigenResult trivial;
		trivial.eigen_vectors = identity(n);
		trivial.eigen_values = A;
		return trivial;
	}

	bool diagonal = true; // Perform the iterative Jacobi algorithm unless and until we reach a diagonal matrix which yields us the eigenvals.

	std::vector<std::vector<double>> a_new;
	std::vector<std::vector<double>> eigenvectors = identity(n);

	do {
		// Pivot: the first off-diagonal entry (row-major) with the largest magnitude.
		// Magnitudes are compared on both sides; comparing against the signed value
		// lets a negative seed be displaced by a zero entry, which yields an identity
		// rotation and stops the iteration early.
		double a_ij = A[0][1];
		int sub_i = 0;
		int sub_j = 1;

		for (int i = 0; i < n; i++) {
			const int columns = static_cast<int>(A[i].size());

			for (int j = 0; j < columns; j++) {
				if (i != j && std::abs(A[i][j]) > std::abs(a_ij)) {
					a_ij = A[i][j];
					sub_i = i;
					sub_j = j;
				}
			}
		}

		const double a_ii = A[sub_i][sub_i];
		const double a_jj = A[sub_j][sub_j];

		// Rotation angle that zeroes the pivot entry.
		double theta;

		if (a_ii == a_jj) {
			theta = M_PI / 4;
		} else {
			theta = 0.5 * atan(2 * a_ij / (a_ii - a_jj));
		}

		// Rotation in the (sub_i, sub_j) plane.
		std::vector<std::vector<double>> P = identity(n);
		P[sub_i][sub_j] = -std::sin(theta);
		P[sub_i][sub_i] = std::cos(theta);
		P[sub_j][sub_j] = std::cos(theta);
		P[sub_j][sub_i] = std::sin(theta);

		a_new = matmult(matmult(inverse(P), A), P);

		// Snap off-diagonal entries that round to zero, and remember whether any
		// entry is still large enough to need another rotation.
		bool non_zero = false;

		for (size_t i = 0; i < a_new.size(); i++) {
			for (size_t j = 0; j < a_new[i].size(); j++) {
				if (i == j) {
					continue;
				}

				if (std::round(a_new[i][j]) == 0) {
					a_new[i][j] = 0;
				} else {
					non_zero = true;
				}
			}
		}

		diagonal = !non_zero;

		// No progress was made: stop and force the result to be diagonal.
		if (a_new == A) {
			diagonal = true;

			for (size_t i = 0; i < a_new.size(); i++) {
				for (size_t j = 0; j < a_new[i].size(); j++) {
					if (i != j) {
						a_new[i][j] = 0;
					}
				}
			}
		}

		eigenvectors = matmult(eigenvectors, P);
		A = a_new;
	} while (!diagonal);

	// Sort the eigenvalues in descending order and track where each one came from,
	// so repeated eigenvalues still map to distinct eigenvector columns.
	// Bubble Sort. Should change this later.
	std::vector<int> order(n);

	for (int i = 0; i < n; i++) {
		order[i] = i;
	}

	for (int i = 0; i < n - 1; i++) {
		for (int j = 0; j < n - 1 - i; j++) {
			if (a_new[j][j] < a_new[j + 1][j + 1]) {
				const double value = a_new[j + 1][j + 1];
				a_new[j + 1][j + 1] = a_new[j][j];
				a_new[j][j] = value;

				const int source = order[j + 1];
				order[j + 1] = order[j];
				order[j] = source;
			}
		}
	}

	// Reorder the eigenvector columns to follow the sorted eigenvalues.
	const std::vector<std::vector<double>> eigen_temp = eigenvectors;

	for (size_t i = 0; i < eigenvectors.size(); i++) {
		for (size_t j = 0; j < eigenvectors[i].size(); j++) {
			eigenvectors[i][j] = eigen_temp[i][order[j]];
		}
	}

	EigenResult res;
	res.eigen_vectors = eigenvectors;
	res.eigen_values = a_new;

	return res;
}
std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>, std::vector<std::vector<double>>> MLPPLinAlg::SVD(std::vector<std::vector<double>> A) {
auto [left_eigenvecs, eigenvals] = eig(matmult(A, transpose(A)));
auto [right_eigenvecs, right_eigenvals] = eig(matmult(transpose(A), A));
@ -655,6 +778,26 @@ std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>, s
return { left_eigenvecs, sigma, right_eigenvecs };
}
// Singular value decomposition of A built from two eigen-decompositions.
//
// Returns:
//   U  - eigenvectors of A * A'.
//   S  - matrix shaped like A holding the square roots of the eigenvalues of A * A'.
//   Vt - eigenvectors of A' * A, stored as returned by eigen() (no extra transpose is applied here).
//
// An empty A (no rows or no columns) yields an empty result.
MLPPLinAlg::SDVResult MLPPLinAlg::svd(std::vector<std::vector<double>> A) {
	SDVResult res;

	if (A.empty() || A[0].empty()) {
		return res;
	}

	EigenResult left_eigen = eigen(matmult(A, transpose(A)));
	EigenResult right_eigen = eigen(matmult(transpose(A), A));

	std::vector<std::vector<double>> singularvals = sqrt(left_eigen.eigen_values);
	std::vector<std::vector<double>> sigma = zeromat(A.size(), A[0].size());

	// singularvals is rows x rows while sigma is rows x columns, so both indices
	// are bounded by sigma as well; otherwise a tall A writes past each row.
	for (size_t i = 0; i < singularvals.size() && i < sigma.size(); i++) {
		for (size_t j = 0; j < singularvals[i].size() && j < sigma[i].size(); j++) {
			sigma[i][j] = singularvals[i][j];
		}
	}

	res.U = left_eigen.eigen_vectors;
	res.S = sigma;
	res.Vt = right_eigen.eigen_vectors;

	return res;
}
// Projection of vector b onto vector a, denoted proj_a(b):
// a scaled by (a . b) / (a . a).
std::vector<double> MLPPLinAlg::vectorProjection(std::vector<double> a, std::vector<double> b) {
	const double scale = dot(a, b) / dot(a, a);

	std::vector<double> projected = scalarMultiply(scale, a);
	return projected;
}
@ -686,6 +829,15 @@ std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>> M
return { Q, R };
}
// QR decomposition of A.
// Q comes from gramSchmidtProcess(A); R is the transpose of Q multiplied by A.
MLPPLinAlg::QRDResult MLPPLinAlg::qrd(std::vector<std::vector<double>> A) {
	std::vector<std::vector<double>> q_factor = gramSchmidtProcess(A);
	std::vector<std::vector<double>> r_factor = matmult(transpose(q_factor), A);

	QRDResult decomposition;

	// Hand the buffers over without copying them.
	decomposition.Q.swap(q_factor);
	decomposition.R.swap(r_factor);

	return decomposition;
}
std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>> MLPPLinAlg::chol(std::vector<std::vector<double>> A) {
std::vector<std::vector<double>> L = zeromat(A.size(), A[0].size());
for (int j = 0; j < L.size(); j++) { // Matrices entered must be square. No problem here.
@ -708,6 +860,33 @@ std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>> M
return { L, transpose(L) }; // Indeed, L.T is our upper triangular matrix.
}
// Cholesky decomposition of a square matrix A.
//
// Returns:
//   L  - lower triangular factor.
//   Lt - transpose of L (the upper triangular factor).
//
// An empty A yields an empty result. No check is made that A is positive
// definite; a negative value under the square root propagates as NaN.
MLPPLinAlg::CholeskyResult MLPPLinAlg::cholesky(std::vector<std::vector<double>> A) {
	CholeskyResult res;

	if (A.empty()) {
		return res;
	}

	std::vector<std::vector<double>> L = zeromat(A.size(), A[0].size());
	const int n = static_cast<int>(L.size());

	for (int j = 0; j < n; j++) { // Matrices entered must be square. No problem here.
		// Diagonal entry of column j.
		double diagonal_sum = 0;

		for (int k = 0; k < j; k++) {
			diagonal_sum += L[j][k] * L[j][k];
		}

		L[j][j] = std::sqrt(A[j][j] - diagonal_sum);

		// Entries below the diagonal in column j.
		for (int i = j + 1; i < n; i++) {
			double sum = 0;

			for (int k = 0; k < j; k++) {
				sum += L[i][k] * L[j][k];
			}

			L[i][j] = (A[i][j] - sum) / L[j][j];
		}
	}

	res.L = L;
	res.Lt = transpose(L); // Indeed, L.T is our upper triangular matrix.

	return res;
}
double MLPPLinAlg::sum_elements(std::vector<std::vector<double>> A) {
double sum = 0;
for (int i = 0; i < A.size(); i++) {

View File

@ -11,7 +11,6 @@
#include <tuple>
#include <vector>
class MLPPLinAlg {
public:
// MATRIX FUNCTIONS
@ -98,16 +97,45 @@ public:
std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>> eig(std::vector<std::vector<double>> A);
// Result of eigen().
struct EigenResult {
// Eigenvectors, with columns reordered to follow the sorted eigenvalues.
std::vector<std::vector<double>> eigen_vectors;
// Matrix with the eigenvalues on its diagonal, sorted in descending order.
std::vector<std::vector<double>> eigen_values;
};
// Eigen-decomposition using Jacobi's method; A must be symmetric.
// Returns the result in a struct.
EigenResult eigen(std::vector<std::vector<double>> A);
std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>, std::vector<std::vector<double>>> SVD(std::vector<std::vector<double>> A);
// Result of svd().
// NOTE(review): the name looks like a transposition of "SVD" — confirm before renaming, callers depend on it.
struct SDVResult {
// Eigenvectors of A * A'.
std::vector<std::vector<double>> U;
// Matrix shaped like A holding the singular values.
std::vector<std::vector<double>> S;
// Eigenvectors of A' * A as returned by eigen(); svd() applies no transpose
// of its own — confirm this matches the "t" in the name.
std::vector<std::vector<double>> Vt;
};
// Singular value decomposition of A, computed from the eigen-decompositions
// of A * A' and A' * A. Returns the result in a struct.
SDVResult svd(std::vector<std::vector<double>> A);
std::vector<double> vectorProjection(std::vector<double> a, std::vector<double> b);
std::vector<std::vector<double>> gramSchmidtProcess(std::vector<std::vector<double>> A);
std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>> QRD(std::vector<std::vector<double>> A);
// Result of qrd().
struct QRDResult {
// Output of gramSchmidtProcess(A).
std::vector<std::vector<double>> Q;
// Transpose of Q multiplied by A.
std::vector<std::vector<double>> R;
};
// QR decomposition of A based on the Gram-Schmidt process.
// Returns the result in a struct.
QRDResult qrd(std::vector<std::vector<double>> A);
std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>> chol(std::vector<std::vector<double>> A);
// Result of cholesky().
struct CholeskyResult {
// Lower triangular factor.
std::vector<std::vector<double>> L;
// Transpose of L (the upper triangular factor).
std::vector<std::vector<double>> Lt;
};
// Cholesky decomposition of a square matrix A.
// Returns the result in a struct.
CholeskyResult cholesky(std::vector<std::vector<double>> A);
double sum_elements(std::vector<std::vector<double>> A);
std::vector<double> flatten(std::vector<std::vector<double>> A);
@ -231,6 +259,4 @@ public:
private:
};
#endif /* LinAlg_hpp */

File diff suppressed because it is too large Load Diff

View File

@ -44,7 +44,7 @@ public:
void test_dynamically_sized_mann(bool ui = false);
void test_train_test_split_mann(bool ui = false);
void test_naive_bayes(bool ui = false);
void test_naive_bayes();
void test_k_means(bool ui = false);
void test_knn(bool ui = false);