mirror of
https://github.com/Relintai/pmlpp.git
synced 2025-01-18 15:07:16 +01:00
Ported all tests.
This commit is contained in:
parent
b398337558
commit
2dabbb42be
@ -200,6 +200,53 @@ void MLPPData::set_data_simple(const String &file_name, std::vector<double> &inp
|
||||
memdelete(file);
|
||||
}
|
||||
|
||||
// Randomly partitions `data` into a train set and a test set.
//
// Only the first min(input.size(), output.size()) samples are considered, so
// every selected input is stored together with the output at the same position.
// The sample positions are shuffled; the first floor(test_size * count)
// shuffled samples go to the test set and the remaining ones to the train set.
//
// data:      source data set. When it is not valid an error is reported and
//            the (instanced, but empty) train / test sets are returned.
// test_size: fraction of the samples that should end up in the test set.
//            Values <= 0 (or NaN) select nothing, values >= 1 select everything.
//
// Returns freshly instanced train and test sets.
MLPPData::SplitComplexData MLPPData::train_test_split(const Ref<MLPPDataComplex> &data, double test_size) {
	SplitComplexData res;

	res.train.instance();
	res.test.instance();

	ERR_FAIL_COND_V(!data.is_valid(), res);

	// Inputs and outputs are paired by position, so only the common prefix is usable.
	int sample_count = MIN(data->input.size(), data->output.size());

	// Build the identity permutation and shuffle it.
	Array indices;
	indices.resize(sample_count);

	for (int i = 0; i < sample_count; ++i) {
		indices[i] = i;
	}

	indices.shuffle();

	// Number of test samples: floor(test_size * sample_count), kept inside
	// [0, sample_count] so an out-of-range fraction cannot index past the data.
	int test_input_number = 0;

	if (test_size >= 1.0) {
		test_input_number = sample_count;
	} else if (test_size > 0.0) {
		test_input_number = static_cast<int>(test_size * sample_count); // truncation == floor for positive values
	}

	for (int i = 0; i < test_input_number; ++i) {
		// Use the shuffled position; indexing with the loop counter would make
		// the split deterministic and ignore the shuffle entirely.
		int index = indices[i];

		res.test->input.push_back(data->input[index]);
		res.test->output.push_back(data->output[index]);
	}

	for (int i = test_input_number; i < sample_count; ++i) {
		int index = indices[i];

		res.train->input.push_back(data->input[index]);
		res.train->output.push_back(data->output[index]);
	}

	return res;
}
|
||||
// Binding-friendly wrapper around train_test_split().
//
// Forwards both arguments unchanged and repacks the result struct into an
// Array: element 0 is the train set, element 1 is the test set.
Array MLPPData::train_test_split_bind(const Ref<MLPPDataComplex> &data, double test_size) {
	SplitComplexData split = train_test_split(data, test_size);

	Array packed;
	packed.push_back(split.train); // [0] -> train
	packed.push_back(split.test); // [1] -> test

	return packed;
}
|
||||
|
||||
// Loading Datasets
|
||||
std::tuple<std::vector<std::vector<double>>, std::vector<double>> MLPPData::loadBreastCancer() {
|
||||
const int BREAST_CANCER_SIZE = 30; // k = 30
|
||||
@ -280,6 +327,9 @@ std::tuple<std::vector<double>, std::vector<double>> MLPPData::loadFiresAndCrime
|
||||
return { inputSet, outputSet };
|
||||
}
|
||||
|
||||
// Note that inputs and outputs should be pairs (technically), but this
|
||||
// implementation will separate them. (My implementation keeps them tied together.)
|
||||
// Not yet sure whether this is intentional or not (or it's something like a compiler specific difference)
|
||||
std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>, std::vector<std::vector<double>>, std::vector<std::vector<double>>> MLPPData::trainTestSplit(std::vector<std::vector<double>> inputSet, std::vector<std::vector<double>> outputSet, double testSize) {
|
||||
std::random_device rd;
|
||||
std::default_random_engine generator(rd());
|
||||
@ -817,6 +867,73 @@ std::tuple<std::vector<std::vector<double>>, std::vector<std::string>> MLPPData:
|
||||
return { wordEmbeddings, wordList };
|
||||
}
|
||||
|
||||
// Plain result bundle: word embeddings plus the list of words.
// NOTE(review): MLPPData declares a nested struct with the same name and
// fields in its header; this file-scope copy looks redundant -- confirm
// whether it can be dropped.
struct WordsToVecResult {
	// Embedding values, stored as a matrix of doubles.
	std::vector<std::vector<double>> word_embeddings;
	// Words, stored as strings.
	std::vector<std::string> word_list;
};
|
||||
|
||||
// Trains word embeddings from `sentences` with a softmax network.
//
// sentences:     raw input sentences.
// type:          "Skipgram" swaps the roles of the two training sets; any other
//                value is treated as CBOW (the default).
// windowSize:    how many neighbours on each side of a word are paired with it.
// dimension:     passed to the MLPPSoftmaxNet constructor as its third argument.
// learning_rate: passed to MLPPSoftmaxNet::gradientDescent().
// max_epoch:     passed to MLPPSoftmaxNet::gradientDescent().
//
// Returns the embeddings reported by the trained model together with the word
// list built from the sentences (stop words and null bytes removed).
MLPPData::WordsToVecResult MLPPData::word_to_vec(std::vector<std::string> sentences, std::string type, int windowSize, int dimension, double learning_rate, int max_epoch) {
	WordsToVecResult res;

	res.word_list = removeNullByte(removeStopWords(createWordList(sentences)));

	// Split every sentence into its words, with stop words removed.
	std::vector<std::vector<std::string>> segmented_sentences;
	segmented_sentences.resize(sentences.size());

	for (size_t i = 0; i < sentences.size(); i++) {
		segmented_sentences[i] = removeStopWords(sentences[i]);
	}

	// Build (word, neighbour) pairs: inputStrings[n] is paired with outputStrings[n].
	std::vector<std::string> inputStrings;
	std::vector<std::string> outputStrings;

	for (size_t i = 0; i < segmented_sentences.size(); i++) {
		const std::vector<std::string> &words = segmented_sentences[i];
		const int word_count = static_cast<int>(words.size());

		for (int j = 0; j < word_count; j++) {
			for (int k = windowSize; k > 0; k--) {
				// Neighbour k positions to the left, if it exists.
				if (j - k >= 0) {
					inputStrings.push_back(words[j]);
					outputStrings.push_back(words[j - k]);
				}

				// Neighbour k positions to the right, if it exists.
				if (j + k < word_count) {
					inputStrings.push_back(words[j]);
					outputStrings.push_back(words[j + k]);
				}
			}
		}
	}

	const size_t inputSize = inputStrings.size();

	// Encode inputs and outputs in one BOW call so both halves come from the
	// same call; the result is split back into the two halves afterwards.
	inputStrings.insert(inputStrings.end(), outputStrings.begin(), outputStrings.end());

	std::vector<std::vector<double>> bag_of_words = BOW(inputStrings, "Binary");

	std::vector<std::vector<double>> inputSet;
	std::vector<std::vector<double>> outputSet;

	// Rows [0, inputSize) belong to the inputs, the rest to the outputs.
	// Iterating over the rows that actually exist avoids reading past the end
	// should BOW ever return fewer rows than strings.
	for (size_t i = 0; i < bag_of_words.size(); i++) {
		if (i < inputSize) {
			inputSet.push_back(bag_of_words[i]);
		} else {
			outputSet.push_back(bag_of_words[i]);
		}
	}

	// Skipgram trains with the two sets swapped; everything else is CBOW.
	// The model lives on the stack so it is released on every exit path
	// without a manual new / delete pair.
	const bool skipgram = (type == "Skipgram");
	MLPPSoftmaxNet model(skipgram ? outputSet : inputSet, skipgram ? inputSet : outputSet, dimension);

	model.gradientDescent(learning_rate, max_epoch, false);

	res.word_embeddings = model.getEmbeddings();

	return res;
}
|
||||
|
||||
std::vector<std::vector<double>> MLPPData::LSA(std::vector<std::string> sentences, int dim) {
|
||||
MLPPLinAlg alg;
|
||||
std::vector<std::vector<double>> docWordData = BOW(sentences, "Binary");
|
||||
@ -946,4 +1063,6 @@ void MLPPData::_bind_methods() {
|
||||
ClassDB::bind_method(D_METHOD("load_mnist_test", "path"), &MLPPData::load_mnist_test);
|
||||
ClassDB::bind_method(D_METHOD("load_california_housing", "path"), &MLPPData::load_california_housing);
|
||||
ClassDB::bind_method(D_METHOD("load_fires_and_crime", "path"), &MLPPData::load_fires_and_crime);
|
||||
|
||||
ClassDB::bind_method(D_METHOD("train_test_split", "data", "test_size"), &MLPPData::train_test_split_bind);
|
||||
}
|
||||
|
@ -10,6 +10,7 @@
|
||||
//
|
||||
|
||||
#include "core/string/ustring.h"
|
||||
#include "core/variant/array.h"
|
||||
|
||||
#include "core/object/reference.h"
|
||||
|
||||
@ -68,6 +69,14 @@ public:
|
||||
void set_data_unsupervised(int k, const String &file_name, std::vector<std::vector<double>> &inputSet);
|
||||
void set_data_simple(const String &file_name, std::vector<double> &inputSet, std::vector<double> &outputSet);
|
||||
|
||||
struct SplitComplexData {
|
||||
Ref<MLPPDataComplex> train;
|
||||
Ref<MLPPDataComplex> test;
|
||||
};
|
||||
|
||||
SplitComplexData train_test_split(const Ref<MLPPDataComplex> &data, double test_size);
|
||||
Array train_test_split_bind(const Ref<MLPPDataComplex> &data, double test_size);
|
||||
|
||||
// Load Datasets
|
||||
std::tuple<std::vector<std::vector<double>>, std::vector<double>> loadBreastCancer();
|
||||
std::tuple<std::vector<std::vector<double>>, std::vector<double>> loadBreastCancerSVC();
|
||||
@ -114,7 +123,16 @@ public:
|
||||
|
||||
std::vector<std::vector<double>> BOW(std::vector<std::string> sentences, std::string = "Default");
|
||||
std::vector<std::vector<double>> TFIDF(std::vector<std::string> sentences);
|
||||
|
||||
std::tuple<std::vector<std::vector<double>>, std::vector<std::string>> word2Vec(std::vector<std::string> sentences, std::string type, int windowSize, int dimension, double learning_rate, int max_epoch);
|
||||
|
||||
// Result of word_to_vec().
struct WordsToVecResult {
	// Embeddings reported by the trained model.
	std::vector<std::vector<double>> word_embeddings;
	// Word list built from the input sentences.
	std::vector<std::string> word_list;
};
|
||||
|
||||
WordsToVecResult word_to_vec(std::vector<std::string> sentences, std::string type, int windowSize, int dimension, double learning_rate, int max_epoch);
|
||||
|
||||
std::vector<std::vector<double>> LSA(std::vector<std::string> sentences, int dim);
|
||||
|
||||
std::vector<std::string> createWordList(std::vector<std::string> sentences);
|
||||
|
@ -11,8 +11,6 @@
|
||||
#include <map>
|
||||
#include <random>
|
||||
|
||||
|
||||
|
||||
std::vector<std::vector<double>> MLPPLinAlg::gramMatrix(std::vector<std::vector<double>> A) {
|
||||
return matmult(transpose(A), A); // AtA
|
||||
}
|
||||
@ -641,6 +639,131 @@ std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>> M
|
||||
return { eigenvectors, a_new };
|
||||
}
|
||||
|
||||
// Computes eigenvalues and eigenvectors of A with an iterative Jacobi-style
// rotation scheme.
//
// A (the entered parameter) in most use cases will be X'X, XX', etc. and must
// be symmetric, i.e. A' = A and A is square.
//
// Returns:
//   eigen_values:  the final matrix of the iteration with its diagonal sorted
//                  in descending order (the eigenvalues are on the diagonal).
//   eigen_vectors: the accumulated rotations, with columns reordered to follow
//                  the sorted eigenvalues.
//
// Matrices with fewer than two rows have no off-diagonal element to eliminate;
// they are returned as-is together with an identity matrix of the same size.
MLPPLinAlg::EigenResult MLPPLinAlg::eigen(std::vector<std::vector<double>> A) {
	EigenResult res;

	// Guard: the iteration below reads A[0][1] and the sort uses size() - 1,
	// both of which are invalid for 0x0 and 1x1 input.
	if (A.size() < 2) {
		res.eigen_vectors = identity(A.size());
		res.eigen_values = A;

		return res;
	}

	bool diagonal = true; // Perform the iterative Jacobi algorithm unless and until we reach a diagonal matrix which yields us the eigenvals.

	std::map<int, int> val_to_vec;
	std::vector<std::vector<double>> a_new;
	std::vector<std::vector<double>> eigenvectors = identity(A.size());

	do {
		// Select the pivot (the off-diagonal element to eliminate).
		// NOTE(review): |A[i][j]| is compared against the signed value stored
		// in a_ij, not against its magnitude, so this is not strictly a
		// "largest magnitude" search once a negative pivot is picked. Kept
		// as-is to stay in step with the existing eig() implementation --
		// confirm whether that is intended.
		double a_ij = A[0][1];
		int sub_i = 0;
		int sub_j = 1;

		for (int i = 0; i < static_cast<int>(A.size()); i++) {
			for (int j = 0; j < static_cast<int>(A[i].size()); j++) {
				if (i == j) {
					continue;
				}

				const double magnitude = std::abs(A[i][j]);

				// Ties are resolved in favour of the smaller row index.
				if (magnitude > a_ij || (magnitude == a_ij && i < sub_i)) {
					a_ij = A[i][j];
					sub_i = i;
					sub_j = j;
				}
			}
		}

		const double a_ii = A[sub_i][sub_i];
		const double a_jj = A[sub_j][sub_j];

		// Rotation angle; equal diagonal entries would divide by zero, so that
		// case uses pi / 4 directly.
		double theta;

		if (a_ii == a_jj) {
			theta = M_PI / 4;
		} else {
			theta = 0.5 * atan(2 * a_ij / (a_ii - a_jj));
		}

		// Rotation matrix: identity except for the (sub_i, sub_j) plane.
		std::vector<std::vector<double>> P = identity(A.size());
		P[sub_i][sub_j] = -std::sin(theta);
		P[sub_i][sub_i] = std::cos(theta);
		P[sub_j][sub_j] = std::cos(theta);
		P[sub_j][sub_i] = std::sin(theta);

		a_new = matmult(matmult(inverse(P), A), P);

		// Snap off-diagonal entries that round to zero to exactly zero, and
		// remember whether any off-diagonal entry is still significant.
		bool non_zero = false;

		for (int i = 0; i < static_cast<int>(a_new.size()); i++) {
			for (int j = 0; j < static_cast<int>(a_new[i].size()); j++) {
				if (i == j) {
					continue;
				}

				if (std::round(a_new[i][j]) == 0) {
					a_new[i][j] = 0;
				} else {
					non_zero = true;
				}
			}
		}

		diagonal = !non_zero;

		// No progress between two iterations: stop and force a diagonal result.
		if (a_new == A) {
			diagonal = true;

			for (int i = 0; i < static_cast<int>(a_new.size()); i++) {
				for (int j = 0; j < static_cast<int>(a_new[i].size()); j++) {
					if (i != j) {
						a_new[i][j] = 0;
					}
				}
			}
		}

		eigenvectors = matmult(eigenvectors, P);
		A = a_new;

	} while (!diagonal);

	// Keep the unsorted diagonal so sorted values can be traced back to columns.
	std::vector<std::vector<double>> a_new_prior = a_new;

	const int n = static_cast<int>(a_new.size());

	// Bubble Sort (descending on the diagonal). Should change this later.
	for (int i = 0; i < n - 1; i++) {
		for (int j = 0; j < n - 1 - i; j++) {
			if (a_new[j][j] < a_new[j + 1][j + 1]) {
				double temp = a_new[j + 1][j + 1];
				a_new[j + 1][j + 1] = a_new[j][j];
				a_new[j][j] = temp;
			}
		}
	}

	// Map every sorted position to the original column holding the same value.
	// NOTE(review): the last matching column wins, so repeated eigenvalues all
	// map to the same eigenvector column -- confirm whether that is acceptable.
	for (int i = 0; i < n; i++) {
		for (int j = 0; j < n; j++) {
			if (a_new[i][i] == a_new_prior[j][j]) {
				val_to_vec[i] = j;
			}
		}
	}

	// Reorder the eigenvector columns to follow the sorted eigenvalues.
	std::vector<std::vector<double>> eigen_temp = eigenvectors;

	for (int i = 0; i < static_cast<int>(eigenvectors.size()); i++) {
		for (int j = 0; j < static_cast<int>(eigenvectors[i].size()); j++) {
			eigenvectors[i][j] = eigen_temp[i][val_to_vec[j]];
		}
	}

	res.eigen_vectors = eigenvectors;
	res.eigen_values = a_new;

	return res;
}
|
||||
|
||||
std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>, std::vector<std::vector<double>>> MLPPLinAlg::SVD(std::vector<std::vector<double>> A) {
|
||||
auto [left_eigenvecs, eigenvals] = eig(matmult(A, transpose(A)));
|
||||
auto [right_eigenvecs, right_eigenvals] = eig(matmult(transpose(A), A));
|
||||
@ -655,6 +778,26 @@ std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>, s
|
||||
return { left_eigenvecs, sigma, right_eigenvecs };
|
||||
}
|
||||
|
||||
// Singular value decomposition built on top of eigen().
//
// Returns:
//   U:  eigenvectors of A * transpose(A).
//   S:  a matrix with the shape of A that holds the element-wise square root
//       of the eigenvalue matrix of A * transpose(A).
//   Vt: eigenvectors of transpose(A) * A.
//       NOTE(review): stored exactly as returned by eigen(); no transpose is
//       applied here despite the field name -- confirm against the callers.
//
// An empty matrix (no rows, or no columns) yields an empty result.
MLPPLinAlg::SDVResult MLPPLinAlg::svd(std::vector<std::vector<double>> A) {
	SDVResult res;

	// Guard: A[0] is read below.
	if (A.empty() || A[0].empty()) {
		return res;
	}

	EigenResult left_eigen = eigen(matmult(A, transpose(A)));
	EigenResult right_eigen = eigen(matmult(transpose(A), A));

	std::vector<std::vector<double>> singularvals = sqrt(left_eigen.eigen_values);
	std::vector<std::vector<double>> sigma = zeromat(A.size(), A[0].size());

	// singularvals is rows x rows while sigma is rows x cols. Bound the copy by
	// both shapes so a matrix with more rows than columns cannot write past the
	// end of a sigma row.
	for (size_t i = 0; i < singularvals.size() && i < sigma.size(); i++) {
		for (size_t j = 0; j < singularvals[i].size() && j < sigma[i].size(); j++) {
			sigma[i][j] = singularvals[i][j];
		}
	}

	res.U = left_eigen.eigen_vectors;
	res.S = sigma;
	res.Vt = right_eigen.eigen_vectors;

	return res;
}
|
||||
|
||||
std::vector<double> MLPPLinAlg::vectorProjection(std::vector<double> a, std::vector<double> b) {
|
||||
double product = dot(a, b) / dot(a, a);
|
||||
return scalarMultiply(product, a); // Projection of vector a onto b. Denotated as proj_a(b).
|
||||
@ -686,6 +829,15 @@ std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>> M
|
||||
return { Q, R };
|
||||
}
|
||||
|
||||
// QR decomposition of A.
//
// Q is the result of gramSchmidtProcess(A); R is transpose(Q) * A.
MLPPLinAlg::QRDResult MLPPLinAlg::qrd(std::vector<std::vector<double>> A) {
	std::vector<std::vector<double>> q = gramSchmidtProcess(A);
	std::vector<std::vector<double>> r = matmult(transpose(q), A);

	QRDResult res;
	res.Q = q;
	res.R = r;

	return res;
}
|
||||
|
||||
std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>> MLPPLinAlg::chol(std::vector<std::vector<double>> A) {
|
||||
std::vector<std::vector<double>> L = zeromat(A.size(), A[0].size());
|
||||
for (int j = 0; j < L.size(); j++) { // Matrices entered must be square. No problem here.
|
||||
@ -708,6 +860,33 @@ std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>> M
|
||||
return { L, transpose(L) }; // Indeed, L.T is our upper triangular matrix.
|
||||
}
|
||||
|
||||
// Cholesky decomposition of A.
//
// Fills a lower triangular matrix L column by column and returns it together
// with its transpose. Matrices entered must be square.
MLPPLinAlg::CholeskyResult MLPPLinAlg::cholesky(std::vector<std::vector<double>> A) {
	std::vector<std::vector<double>> lower = zeromat(A.size(), A[0].size());

	const int n = static_cast<int>(lower.size());

	for (int col = 0; col < n; col++) {
		for (int row = col; row < n; row++) {
			// Dot product of the already computed parts of rows `row` and `col`.
			// On the diagonal (row == col) this is the sum of squares of that row.
			double acc = 0;

			for (int k = 0; k < col; k++) {
				acc += lower[row][k] * lower[col][k];
			}

			if (row == col) {
				// Diagonal entry.
				lower[row][col] = std::sqrt(A[row][col] - acc);
			} else {
				// Entry below the diagonal, scaled by the diagonal entry of this column.
				lower[row][col] = (A[row][col] - acc) / lower[col][col];
			}
		}
	}

	CholeskyResult res;
	res.L = lower;
	res.Lt = transpose(lower); // Indeed, L.T is our upper triangular matrix.

	return res;
}
|
||||
|
||||
double MLPPLinAlg::sum_elements(std::vector<std::vector<double>> A) {
|
||||
double sum = 0;
|
||||
for (int i = 0; i < A.size(); i++) {
|
||||
|
@ -11,7 +11,6 @@
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
|
||||
class MLPPLinAlg {
|
||||
public:
|
||||
// MATRIX FUNCTIONS
|
||||
@ -98,16 +97,45 @@ public:
|
||||
|
||||
std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>> eig(std::vector<std::vector<double>> A);
|
||||
|
||||
// Result of eigen().
struct EigenResult {
	// Eigenvectors, stored as a matrix.
	std::vector<std::vector<double>> eigen_vectors;
	// Eigenvalues, stored as a matrix (eigen() places them on the diagonal).
	std::vector<std::vector<double>> eigen_values;
};
|
||||
|
||||
EigenResult eigen(std::vector<std::vector<double>> A);
|
||||
|
||||
std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>, std::vector<std::vector<double>>> SVD(std::vector<std::vector<double>> A);
|
||||
|
||||
// Result of svd(): the three factor matrices.
struct SDVResult {
	// Left factor.
	std::vector<std::vector<double>> U;
	// Middle factor holding the singular values.
	std::vector<std::vector<double>> S;
	// Right factor.
	std::vector<std::vector<double>> Vt;
};
|
||||
|
||||
SDVResult svd(std::vector<std::vector<double>> A);
|
||||
|
||||
std::vector<double> vectorProjection(std::vector<double> a, std::vector<double> b);
|
||||
|
||||
std::vector<std::vector<double>> gramSchmidtProcess(std::vector<std::vector<double>> A);
|
||||
|
||||
std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>> QRD(std::vector<std::vector<double>> A);
|
||||
|
||||
// Result of qrd(): the two factor matrices.
struct QRDResult {
	// Q factor.
	std::vector<std::vector<double>> Q;
	// R factor.
	std::vector<std::vector<double>> R;
};
|
||||
|
||||
QRDResult qrd(std::vector<std::vector<double>> A);
|
||||
|
||||
std::tuple<std::vector<std::vector<double>>, std::vector<std::vector<double>>> chol(std::vector<std::vector<double>> A);
|
||||
|
||||
// Result of cholesky(): a matrix and its transpose.
struct CholeskyResult {
	// Lower triangular factor.
	std::vector<std::vector<double>> L;
	// Transpose of L.
	std::vector<std::vector<double>> Lt;
};
|
||||
|
||||
CholeskyResult cholesky(std::vector<std::vector<double>> A);
|
||||
|
||||
double sum_elements(std::vector<std::vector<double>> A);
|
||||
|
||||
std::vector<double> flatten(std::vector<std::vector<double>> A);
|
||||
@ -231,6 +259,4 @@ public:
|
||||
private:
|
||||
};
|
||||
|
||||
|
||||
|
||||
#endif /* LinAlg_hpp */
|
File diff suppressed because it is too large
Load Diff
@ -44,7 +44,7 @@ public:
|
||||
void test_dynamically_sized_mann(bool ui = false);
|
||||
void test_train_test_split_mann(bool ui = false);
|
||||
|
||||
void test_naive_bayes(bool ui = false);
|
||||
void test_naive_bayes();
|
||||
void test_k_means(bool ui = false);
|
||||
void test_knn(bool ui = false);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user