pmlpp/mlpp/data/data.cpp

1274 lines
37 KiB
C++
Raw Normal View History

//
// Data.cpp
// MLP
//
// Created by Marc Melikyan on 11/4/20.
//
#include "data.h"
#include "core/os/file_access.h"
2023-01-24 18:12:23 +01:00
#include "../lin_alg/lin_alg.h"
#include "../stat/stat.h"
2023-04-22 17:17:58 +02:00
#include "../lin_alg/lin_alg_old.h"
2023-12-28 17:41:20 +01:00
#include "../softmax_net/softmax_net.h"
2023-04-22 17:17:58 +02:00
#include "../stat/stat_old.h"
2023-12-28 21:06:16 +01:00
#include "data_old.h"
2023-01-24 19:00:54 +01:00
#include <algorithm>
#include <cmath>
#include <fstream>
2023-01-24 19:00:54 +01:00
#include <iostream>
#include <random>
#include <sstream>
// Returns the input vector reference currently stored in this container.
Ref<MLPPVector> MLPPDataESimple::get_input() {
	return this->_input;
}
void MLPPDataESimple::set_input(const Ref<MLPPVector> &val) {
_input = val;
}
// Returns the output vector reference currently stored in this container.
Ref<MLPPVector> MLPPDataESimple::get_output() {
	return this->_output;
}
void MLPPDataESimple::set_output(const Ref<MLPPVector> &val) {
_output = val;
}
void MLPPDataESimple::instance_data() {
_input.instance();
_output.instance();
}
// Registers the accessors, the "input" / "output" properties and
// instance_data() with ClassDB.
void MLPPDataESimple::_bind_methods() {
	ClassDB::bind_method(D_METHOD("get_input"), &MLPPDataESimple::get_input);
	ClassDB::bind_method(D_METHOD("set_input", "val"), &MLPPDataESimple::set_input);
	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "input", PROPERTY_HINT_RESOURCE_TYPE, "MLPPVector"), "set_input", "get_input");

	// "get_output" must point at get_output(); it used to be bound to
	// get_input(), so reading the "output" property returned the input.
	ClassDB::bind_method(D_METHOD("get_output"), &MLPPDataESimple::get_output);
	ClassDB::bind_method(D_METHOD("set_output", "val"), &MLPPDataESimple::set_output);
	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "output", PROPERTY_HINT_RESOURCE_TYPE, "MLPPVector"), "set_output", "get_output");

	ClassDB::bind_method(D_METHOD("instance_data"), &MLPPDataESimple::instance_data);
}
// Returns the input matrix reference currently stored in this container.
Ref<MLPPMatrix> MLPPDataSimple::get_input() {
	return this->_input;
}
void MLPPDataSimple::set_input(const Ref<MLPPMatrix> &val) {
_input = val;
}
// Returns the output vector reference currently stored in this container.
Ref<MLPPVector> MLPPDataSimple::get_output() {
	return this->_output;
}
void MLPPDataSimple::set_output(const Ref<MLPPVector> &val) {
_output = val;
}
void MLPPDataSimple::instance_data() {
_input.instance();
_output.instance();
}
// Registers the accessors, the "input" / "output" properties and
// instance_data() with ClassDB.
void MLPPDataSimple::_bind_methods() {
	ClassDB::bind_method(D_METHOD("get_input"), &MLPPDataSimple::get_input);
	ClassDB::bind_method(D_METHOD("set_input", "val"), &MLPPDataSimple::set_input);
	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "input", PROPERTY_HINT_RESOURCE_TYPE, "MLPPMatrix"), "set_input", "get_input");

	// "get_output" must point at get_output(); it used to be bound to
	// get_input(), so the MLPPVector "output" property handed back the input
	// matrix instead.
	ClassDB::bind_method(D_METHOD("get_output"), &MLPPDataSimple::get_output);
	ClassDB::bind_method(D_METHOD("set_output", "val"), &MLPPDataSimple::set_output);
	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "output", PROPERTY_HINT_RESOURCE_TYPE, "MLPPVector"), "set_output", "get_output");

	ClassDB::bind_method(D_METHOD("instance_data"), &MLPPDataSimple::instance_data);
}
// Returns the input matrix reference currently stored in this container.
Ref<MLPPMatrix> MLPPDataComplex::get_input() {
	return this->_input;
}
void MLPPDataComplex::set_input(const Ref<MLPPMatrix> &val) {
_input = val;
}
// Returns the output matrix reference currently stored in this container.
Ref<MLPPMatrix> MLPPDataComplex::get_output() {
	return this->_output;
}
void MLPPDataComplex::set_output(const Ref<MLPPMatrix> &val) {
_output = val;
}
void MLPPDataComplex::instance_data() {
_input.instance();
_output.instance();
}
// Registers the accessors, the "input" / "output" properties and
// instance_data() with ClassDB.
void MLPPDataComplex::_bind_methods() {
	ClassDB::bind_method(D_METHOD("get_input"), &MLPPDataComplex::get_input);
	ClassDB::bind_method(D_METHOD("set_input", "val"), &MLPPDataComplex::set_input);
	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "input", PROPERTY_HINT_RESOURCE_TYPE, "MLPPMatrix"), "set_input", "get_input");

	// "get_output" must point at get_output(); it used to be bound to
	// get_input(), so reading the "output" property returned the input.
	ClassDB::bind_method(D_METHOD("get_output"), &MLPPDataComplex::get_output);
	ClassDB::bind_method(D_METHOD("set_output", "val"), &MLPPDataComplex::set_output);
	ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "output", PROPERTY_HINT_RESOURCE_TYPE, "MLPPMatrix"), "set_output", "get_output");

	ClassDB::bind_method(D_METHOD("instance_data"), &MLPPDataComplex::instance_data);
}
// Loading Datasets
// Loads the CSV at `path` through set_data_supervised() using 30 feature
// columns, and returns the result in a newly instanced MLPPDataSimple.
Ref<MLPPDataSimple> MLPPData::load_breast_cancer(const String &path) {
	const int feature_count = 30;

	Ref<MLPPDataSimple> result;
	result.instance();
	result->instance_data();

	Ref<MLPPMatrix> inputs = result->get_input();
	Ref<MLPPVector> outputs = result->get_output();
	set_data_supervised(feature_count, path, inputs, outputs);

	return result;
}
// Loads the CSV at `path` through set_data_supervised() using 30 feature
// columns, and returns the result in a newly instanced MLPPDataSimple.
// The body is the same as load_breast_cancer(); only the file passed by the
// caller differs.
Ref<MLPPDataSimple> MLPPData::load_breast_cancer_svc(const String &path) {
	const int feature_count = 30;

	Ref<MLPPDataSimple> result;
	result.instance();
	result->instance_data();

	Ref<MLPPMatrix> inputs = result->get_input();
	Ref<MLPPVector> outputs = result->get_output();
	set_data_supervised(feature_count, path, inputs, outputs);

	return result;
}
// Loads the CSV at `path` with 4 feature columns. The output column is read
// into a temporary vector and converted with one_hot_rep(labels, 3) before
// being stored as the output of the returned MLPPDataComplex.
Ref<MLPPDataComplex> MLPPData::load_iris(const String &path) {
	const int feature_count = 4;
	const int class_count = 3;

	Ref<MLPPDataComplex> result;
	result.instance();
	result->instance_data();

	Ref<MLPPVector> labels;
	labels.instance();

	set_data_supervised(feature_count, path, result->get_input(), labels);
	result->set_output(one_hot_rep(labels, class_count));

	return result;
}
// Loads the CSV at `path` with 4 feature columns. The output column is read
// into a temporary vector and converted with one_hot_rep(labels, 3) before
// being stored as the output of the returned MLPPDataComplex.
//
// NOTE(review): the feature count here is identical to load_iris(); confirm
// that 4 really matches the column layout of the wine CSV this is used with.
Ref<MLPPDataComplex> MLPPData::load_wine(const String &path) {
	const int feature_count = 4;
	const int class_count = 3;

	Ref<MLPPDataComplex> result;
	result.instance();
	result->instance_data();

	Ref<MLPPVector> labels;
	labels.instance();

	set_data_supervised(feature_count, path, result->get_input(), labels);
	result->set_output(one_hot_rep(labels, class_count));

	return result;
}
// Loads the CSV at `path` with 784 feature columns. The output column is read
// into a temporary vector and converted with one_hot_rep(labels, 10) before
// being stored as the output of the returned MLPPDataComplex.
Ref<MLPPDataComplex> MLPPData::load_mnist_train(const String &path) {
	const int feature_count = 784;
	const int class_count = 10;

	Ref<MLPPDataComplex> result;
	result.instance();
	result->instance_data();

	Ref<MLPPVector> labels;
	labels.instance();

	set_data_supervised(feature_count, path, result->get_input(), labels);
	result->set_output(one_hot_rep(labels, class_count));

	return result;
}
// Loads the CSV at `path` with 784 feature columns. The output column is read
// into a temporary vector and converted with one_hot_rep(labels, 10) before
// being stored as the output of the returned MLPPDataComplex.
// The body is the same as load_mnist_train(); only the file passed by the
// caller differs.
Ref<MLPPDataComplex> MLPPData::load_mnist_test(const String &path) {
	const int feature_count = 784;
	const int class_count = 10;

	Ref<MLPPDataComplex> result;
	result.instance();
	result->instance_data();

	Ref<MLPPVector> labels;
	labels.instance();

	set_data_supervised(feature_count, path, result->get_input(), labels);
	result->set_output(one_hot_rep(labels, class_count));

	return result;
}
// Loads the CSV at `path` through set_data_supervised() using 13 feature
// columns, and returns the result in a newly instanced MLPPDataSimple.
Ref<MLPPDataSimple> MLPPData::load_california_housing(const String &path) {
	const int feature_count = 13; // k = 13

	Ref<MLPPDataSimple> result;
	result.instance();
	result->instance_data();

	Ref<MLPPMatrix> inputs = result->get_input();
	Ref<MLPPVector> outputs = result->get_output();
	set_data_supervised(feature_count, path, inputs, outputs);

	return result;
}
// Loads the two-column CSV at `path` through set_data_simple() and returns the
// result in a newly instanced MLPPDataESimple (a single input column, so k is
// implicitly 1).
Ref<MLPPDataESimple> MLPPData::load_fires_and_crime(const String &path) {
	Ref<MLPPDataESimple> result;
	result.instance();
	result->instance_data();

	Ref<MLPPVector> inputs = result->get_input();
	Ref<MLPPVector> outputs = result->get_output();
	set_data_simple(path, inputs, outputs);

	return result;
}
// MULTIVARIATE SUPERVISED
// Reads a CSV file in which every record holds `k` feature columns followed by
// one output column.
//
// The features of each record are collected as one Vector<real_t> and handed
// to input_set->set_from_vectors(); the output values are handed to
// output_set->set_from_vector().
//
// Records with fewer than k + 1 fields (for example the blank record produced
// by a trailing newline) are skipped; indexing them used to run past the end
// of the parsed line.
//
// Fails (leaving both sets untouched) when a reference is invalid, `k` is
// negative, or the file cannot be opened.
void MLPPData::set_data_supervised(int k, const String &file_name, Ref<MLPPMatrix> input_set, Ref<MLPPVector> output_set) {
	ERR_FAIL_COND(!input_set.is_valid() || !output_set.is_valid());
	ERR_FAIL_COND(k < 0);

	FileAccess *file = FileAccess::open(file_name, FileAccess::READ);
	ERR_FAIL_COND(!file);

	Vector<Vector<real_t>> input_set_tmp;
	Vector<real_t> output_set_tmp;

	while (!file->eof_reached()) {
		Vector<String> ll = file->get_csv_line();

		// Need indices 0..k to be valid.
		if (ll.size() <= k) {
			continue;
		}

		Vector<real_t> row;

		for (int i = 0; i < k; ++i) {
			row.push_back(static_cast<real_t>(ll[i].to_double()));
		}

		input_set_tmp.push_back(row);
		output_set_tmp.push_back(static_cast<real_t>(ll[k].to_double()));
	}

	file->close();
	memdelete(file);

	output_set->set_from_vector(output_set_tmp);
	input_set->set_from_vectors(input_set_tmp);
}
void MLPPData::set_data_unsupervised(int k, const String &file_name, Ref<MLPPMatrix> input_set) {
ERR_FAIL_COND(!input_set.is_valid());
MLPPLinAlg alg;
Vector<Vector<real_t>> input_set_tmp;
input_set_tmp.resize(k);
FileAccess *file = FileAccess::open(file_name, FileAccess::READ);
ERR_FAIL_COND(!file);
while (!file->eof_reached()) {
Vector<String> ll = file->get_csv_line();
for (int i = 0; i < k; ++i) {
input_set_tmp.write[i].push_back(static_cast<real_t>(ll[i].to_double()));
}
}
file->close();
memdelete(file);
input_set->set_from_vectors(input_set_tmp);
2023-04-22 14:23:51 +02:00
input_set = alg.transposenm(input_set);
}
// Reads a CSV file whose fields are consumed as (input, output) pairs:
// fields 0, 2, 4, ... of a record go to `input_set`, fields 1, 3, 5, ... to
// `output_set`.
//
// A trailing field without a partner (an odd field count, or the single empty
// field of a blank record) is ignored; reading its missing partner used to
// index past the end of the parsed line.
//
// Fails (leaving both sets untouched) when a reference is invalid or the file
// cannot be opened.
void MLPPData::set_data_simple(const String &file_name, Ref<MLPPVector> input_set, Ref<MLPPVector> output_set) {
	ERR_FAIL_COND(!input_set.is_valid() || !output_set.is_valid());

	FileAccess *file = FileAccess::open(file_name, FileAccess::READ);
	ERR_FAIL_COND(!file);

	Vector<real_t> input_set_tmp;
	Vector<real_t> output_set_tmp;

	while (!file->eof_reached()) {
		Vector<String> ll = file->get_csv_line();

		// Only consume complete pairs.
		for (int i = 0; i + 1 < ll.size(); i += 2) {
			input_set_tmp.push_back(static_cast<real_t>(ll[i].to_double()));
			output_set_tmp.push_back(static_cast<real_t>(ll[i + 1].to_double()));
		}
	}

	file->close();
	memdelete(file);

	input_set->set_from_vector(input_set_tmp);
	output_set->set_from_vector(output_set_tmp);
}
2023-01-24 19:20:18 +01:00
// Randomly partitions the rows of `data` into a test set and a training set.
//
// The row count used is the smaller of the input's and the output's size().y.
// Row indices are shuffled once and the same index is applied to both the
// input and the output matrix, so each input row stays paired with its output
// row. The first int(test_size * rows) shuffled rows form the test set, the
// remaining rows the training set.
//
// `test_size` must lie in [0, 1]; values outside that range (or NaN) used to
// produce negative or oversized row counts, and are now rejected.
//
// On any failed check the returned struct still holds instanced train / test
// containers, but they are left unfilled.
MLPPData::SplitComplexData MLPPData::train_test_split(Ref<MLPPDataComplex> data, real_t test_size) {
	SplitComplexData res;

	res.train.instance();
	res.train->instance_data();
	res.test.instance();
	res.test->instance_data();

	ERR_FAIL_COND_V(!data.is_valid(), res);
	// Negated range check so that NaN is rejected as well.
	ERR_FAIL_COND_V(!(test_size >= 0 && test_size <= 1), res);

	Ref<MLPPMatrix> orig_input = data->get_input();
	Ref<MLPPMatrix> orig_output = data->get_output();

	ERR_FAIL_COND_V(!orig_input.is_valid(), res);
	ERR_FAIL_COND_V(!orig_output.is_valid(), res);

	Size2i orig_input_size = orig_input->size();
	Size2i orig_output_size = orig_output->size();

	int is = MIN(orig_input_size.y, orig_output_size.y);

	Array indices;
	indices.resize(is);

	for (int i = 0; i < is; ++i) {
		indices[i] = i;
	}

	indices.shuffle();

	// Scratch rows reused for every copy below.
	Ref<MLPPVector> orig_input_row_tmp;
	orig_input_row_tmp.instance();
	orig_input_row_tmp->resize(orig_input_size.x);

	Ref<MLPPVector> orig_output_row_tmp;
	orig_output_row_tmp.instance();
	orig_output_row_tmp->resize(orig_output_size.x);

	int test_input_number = test_size * is; // implicit usage of floor

	Ref<MLPPMatrix> res_test_input = res.test->get_input();
	Ref<MLPPMatrix> res_test_output = res.test->get_output();

	res_test_input->resize(Size2i(orig_input_size.x, test_input_number));
	res_test_output->resize(Size2i(orig_output_size.x, test_input_number));

	for (int i = 0; i < test_input_number; ++i) {
		int index = indices[i];

		orig_input->row_get_into_mlpp_vector(index, orig_input_row_tmp);
		orig_output->row_get_into_mlpp_vector(index, orig_output_row_tmp);

		res_test_input->row_set_mlpp_vector(i, orig_input_row_tmp);
		res_test_output->row_set_mlpp_vector(i, orig_output_row_tmp);
	}

	Ref<MLPPMatrix> res_train_input = res.train->get_input();
	Ref<MLPPMatrix> res_train_output = res.train->get_output();

	int train_input_number = is - test_input_number;

	res_train_input->resize(Size2i(orig_input_size.x, train_input_number));
	res_train_output->resize(Size2i(orig_output_size.x, train_input_number));

	for (int i = 0; i < train_input_number; ++i) {
		// The training rows are the shuffled indices after the test rows.
		int index = indices[test_input_number + i];

		orig_input->row_get_into_mlpp_vector(index, orig_input_row_tmp);
		orig_output->row_get_into_mlpp_vector(index, orig_output_row_tmp);

		res_train_input->row_set_mlpp_vector(i, orig_input_row_tmp);
		res_train_output->row_set_mlpp_vector(i, orig_output_row_tmp);
	}

	return res;
}
2023-01-27 13:01:16 +01:00
// Array-returning wrapper around train_test_split(): element 0 is the
// training set, element 1 the test set.
Array MLPPData::train_test_split_bind(const Ref<MLPPDataComplex> &data, real_t test_size) {
	SplitComplexData split = train_test_split(data, test_size);

	Array result;
	result.push_back(split.train);
	result.push_back(split.test);

	return result;
}
2023-01-24 19:00:54 +01:00
// Loading Datasets
2023-01-27 13:01:16 +01:00
std::tuple<std::vector<std::vector<real_t>>, std::vector<real_t>> MLPPData::loadBreastCancer() {
2023-01-24 19:00:54 +01:00
const int BREAST_CANCER_SIZE = 30; // k = 30
2023-01-27 13:01:16 +01:00
std::vector<std::vector<real_t>> inputSet;
std::vector<real_t> outputSet;
2023-01-24 19:00:54 +01:00
setData(BREAST_CANCER_SIZE, "MLPP/Data/Datasets/BreastCancer.csv", inputSet, outputSet);
return { inputSet, outputSet };
}
2023-01-27 13:01:16 +01:00
std::tuple<std::vector<std::vector<real_t>>, std::vector<real_t>> MLPPData::loadBreastCancerSVC() {
2023-01-24 19:00:54 +01:00
const int BREAST_CANCER_SIZE = 30; // k = 30
2023-01-27 13:01:16 +01:00
std::vector<std::vector<real_t>> inputSet;
std::vector<real_t> outputSet;
2023-01-24 19:00:54 +01:00
setData(BREAST_CANCER_SIZE, "MLPP/Data/Datasets/BreastCancerSVM.csv", inputSet, outputSet);
return { inputSet, outputSet };
}
2023-01-27 13:01:16 +01:00
std::tuple<std::vector<std::vector<real_t>>, std::vector<std::vector<real_t>>> MLPPData::loadIris() {
2023-01-24 19:00:54 +01:00
const int IRIS_SIZE = 4;
const int ONE_HOT_NUM = 3;
2023-01-27 13:01:16 +01:00
std::vector<std::vector<real_t>> inputSet;
std::vector<real_t> tempOutputSet;
2023-12-28 21:06:16 +01:00
MLPPDataOld d;
2023-01-24 19:00:54 +01:00
setData(IRIS_SIZE, "/Users/marcmelikyan/Desktop/Data/Iris.csv", inputSet, tempOutputSet);
2023-12-28 21:06:16 +01:00
std::vector<std::vector<real_t>> outputSet = d.oneHotRep(tempOutputSet, ONE_HOT_NUM);
2023-01-24 19:00:54 +01:00
return { inputSet, outputSet };
}
2023-01-27 13:01:16 +01:00
std::tuple<std::vector<std::vector<real_t>>, std::vector<std::vector<real_t>>> MLPPData::loadWine() {
2023-01-24 19:00:54 +01:00
const int WINE_SIZE = 4;
const int ONE_HOT_NUM = 3;
2023-01-27 13:01:16 +01:00
std::vector<std::vector<real_t>> inputSet;
std::vector<real_t> tempOutputSet;
2023-12-28 21:06:16 +01:00
MLPPDataOld d;
2023-01-24 19:00:54 +01:00
setData(WINE_SIZE, "MLPP/Data/Datasets/Iris.csv", inputSet, tempOutputSet);
2023-12-28 21:06:16 +01:00
std::vector<std::vector<real_t>> outputSet = d.oneHotRep(tempOutputSet, ONE_HOT_NUM);
2023-01-24 19:00:54 +01:00
return { inputSet, outputSet };
}
2023-01-27 13:01:16 +01:00
std::tuple<std::vector<std::vector<real_t>>, std::vector<std::vector<real_t>>> MLPPData::loadMnistTrain() {
2023-01-24 19:00:54 +01:00
const int MNIST_SIZE = 784;
const int ONE_HOT_NUM = 10;
2023-01-27 13:01:16 +01:00
std::vector<std::vector<real_t>> inputSet;
std::vector<real_t> tempOutputSet;
2023-12-28 21:06:16 +01:00
MLPPDataOld d;
2023-01-24 19:00:54 +01:00
setData(MNIST_SIZE, "MLPP/Data/Datasets/MnistTrain.csv", inputSet, tempOutputSet);
2023-12-28 21:06:16 +01:00
std::vector<std::vector<real_t>> outputSet = d.oneHotRep(tempOutputSet, ONE_HOT_NUM);
2023-01-24 19:00:54 +01:00
return { inputSet, outputSet };
}
2023-01-27 13:01:16 +01:00
std::tuple<std::vector<std::vector<real_t>>, std::vector<std::vector<real_t>>> MLPPData::loadMnistTest() {
2023-01-24 19:00:54 +01:00
const int MNIST_SIZE = 784;
const int ONE_HOT_NUM = 10;
2023-01-27 13:01:16 +01:00
std::vector<std::vector<real_t>> inputSet;
std::vector<real_t> tempOutputSet;
2023-12-28 21:06:16 +01:00
MLPPDataOld d;
2023-01-24 19:00:54 +01:00
setData(MNIST_SIZE, "MLPP/Data/Datasets/MnistTest.csv", inputSet, tempOutputSet);
2023-12-28 21:06:16 +01:00
std::vector<std::vector<real_t>> outputSet = d.oneHotRep(tempOutputSet, ONE_HOT_NUM);
2023-01-24 19:00:54 +01:00
return { inputSet, outputSet };
}
2023-01-27 13:01:16 +01:00
std::tuple<std::vector<std::vector<real_t>>, std::vector<real_t>> MLPPData::loadCaliforniaHousing() {
2023-01-24 19:00:54 +01:00
const int CALIFORNIA_HOUSING_SIZE = 13; // k = 30
2023-01-27 13:01:16 +01:00
std::vector<std::vector<real_t>> inputSet;
std::vector<real_t> outputSet;
2023-01-24 19:00:54 +01:00
setData(CALIFORNIA_HOUSING_SIZE, "MLPP/Data/Datasets/CaliforniaHousing.csv", inputSet, outputSet);
return { inputSet, outputSet };
}
2023-01-27 13:01:16 +01:00
std::tuple<std::vector<real_t>, std::vector<real_t>> MLPPData::loadFiresAndCrime() {
std::vector<real_t> inputSet; // k is implicitly 1.
std::vector<real_t> outputSet;
2023-01-24 19:00:54 +01:00
setData("MLPP/Data/Datasets/FiresAndCrime.csv", inputSet, outputSet);
return { inputSet, outputSet };
}
2023-01-26 14:52:49 +01:00
// Note that inputs and outputs should be pairs (technically), but this
// implementation will separate them. (My implementation keeps them tied together.)
// Not yet sure whether this is intentional or not (or it's something like a compiler specific difference)
2023-01-27 13:01:16 +01:00
// Shuffles `inputSet` and `outputSet` and moves the first
// int(testSize * size) elements of each into a test set.
//
// Returns { train inputs, train outputs, test inputs, test outputs }.
//
// The two sets are still shuffled independently (as before), so input /
// output pairing is not preserved by this legacy routine.
//
// Fix: the previous loop copied element [i] and then erased the *first*
// element on every iteration. That picked every other element, left the
// picked ones in the training set while dropping others, and read out of
// bounds once testSize exceeded 0.5. The leading range is now copied and
// erased in one step, and the count is clamped to [0, size].
std::tuple<std::vector<std::vector<real_t>>, std::vector<std::vector<real_t>>, std::vector<std::vector<real_t>>, std::vector<std::vector<real_t>>> MLPPData::trainTestSplit(std::vector<std::vector<real_t>> inputSet, std::vector<std::vector<real_t>> outputSet, real_t testSize) {
	std::random_device rd;
	std::default_random_engine generator(rd());

	std::shuffle(inputSet.begin(), inputSet.end(), generator); // inputSet random shuffle
	std::shuffle(outputSet.begin(), outputSet.end(), generator); // outputSet random shuffle

	const int inputCount = static_cast<int>(inputSet.size());
	const int outputCount = static_cast<int>(outputSet.size());

	// The int conversion is the implicit floor; the clamp keeps the ranges valid.
	const int testInputNumber = std::max(0, std::min(inputCount, static_cast<int>(testSize * inputSet.size())));
	const int testOutputNumber = std::max(0, std::min(outputCount, static_cast<int>(testSize * outputSet.size())));

	std::vector<std::vector<real_t>> inputTestSet(inputSet.begin(), inputSet.begin() + testInputNumber);
	inputSet.erase(inputSet.begin(), inputSet.begin() + testInputNumber);

	std::vector<std::vector<real_t>> outputTestSet(outputSet.begin(), outputSet.begin() + testOutputNumber);
	outputSet.erase(outputSet.begin(), outputSet.begin() + testOutputNumber);

	return { inputSet, outputSet, inputTestSet, outputTestSet };
}
// MULTIVARIATE SUPERVISED
2023-01-27 13:01:16 +01:00
void MLPPData::setData(int k, std::string fileName, std::vector<std::vector<real_t>> &inputSet, std::vector<real_t> &outputSet) {
2023-04-22 17:17:58 +02:00
MLPPLinAlgOld alg;
2023-01-24 19:00:54 +01:00
std::string inputTemp;
std::string outputTemp;
inputSet.resize(k);
std::ifstream dataFile(fileName);
if (!dataFile.is_open()) {
std::cout << fileName << " failed to open." << std::endl;
}
std::string line;
while (std::getline(dataFile, line)) {
std::stringstream ss(line);
for (int i = 0; i < k; i++) {
std::getline(ss, inputTemp, ',');
inputSet[i].push_back(std::stod(inputTemp));
}
std::getline(ss, outputTemp, ',');
outputSet.push_back(std::stod(outputTemp));
}
inputSet = alg.transpose(inputSet);
dataFile.close();
}
2023-01-27 13:01:16 +01:00
void MLPPData::printData(std::vector<std::string> inputName, std::string outputName, std::vector<std::vector<real_t>> inputSet, std::vector<real_t> outputSet) {
2023-04-22 17:17:58 +02:00
MLPPLinAlgOld alg;
2023-01-24 19:00:54 +01:00
inputSet = alg.transpose(inputSet);
2023-02-12 18:03:17 +01:00
for (uint32_t i = 0; i < inputSet.size(); i++) {
2023-01-24 19:00:54 +01:00
std::cout << inputName[i] << std::endl;
2023-02-12 18:03:17 +01:00
for (uint32_t j = 0; j < inputSet[i].size(); j++) {
2023-01-24 19:00:54 +01:00
std::cout << inputSet[i][j] << std::endl;
}
}
std::cout << outputName << std::endl;
2023-02-12 18:03:17 +01:00
for (uint32_t i = 0; i < outputSet.size(); i++) {
2023-01-24 19:00:54 +01:00
std::cout << outputSet[i] << std::endl;
}
}
// UNSUPERVISED
2023-01-27 13:01:16 +01:00
void MLPPData::setData(int k, std::string fileName, std::vector<std::vector<real_t>> &inputSet) {
2023-04-22 17:17:58 +02:00
MLPPLinAlgOld alg;
2023-01-24 19:00:54 +01:00
std::string inputTemp;
inputSet.resize(k);
std::ifstream dataFile(fileName);
if (!dataFile.is_open()) {
std::cout << fileName << " failed to open." << std::endl;
}
std::string line;
while (std::getline(dataFile, line)) {
std::stringstream ss(line);
for (int i = 0; i < k; i++) {
std::getline(ss, inputTemp, ',');
inputSet[i].push_back(std::stod(inputTemp));
}
}
inputSet = alg.transpose(inputSet);
dataFile.close();
}
2023-01-27 13:01:16 +01:00
void MLPPData::printData(std::vector<std::string> inputName, std::vector<std::vector<real_t>> inputSet) {
2023-04-22 17:17:58 +02:00
MLPPLinAlgOld alg;
2023-01-24 19:00:54 +01:00
inputSet = alg.transpose(inputSet);
2023-02-12 18:03:17 +01:00
for (uint32_t i = 0; i < inputSet.size(); i++) {
2023-01-24 19:00:54 +01:00
std::cout << inputName[i] << std::endl;
2023-02-12 18:03:17 +01:00
for (uint32_t j = 0; j < inputSet[i].size(); j++) {
2023-01-24 19:00:54 +01:00
std::cout << inputSet[i][j] << std::endl;
}
}
}
// SIMPLE
2023-01-27 13:01:16 +01:00
// Reads a comma separated file in which every line holds one input value
// followed by one output value, appending them to `inputSet` / `outputSet`.
//
// If the file cannot be opened a message is printed and the function returns
// without touching the output containers. Lines that are empty or contain
// only whitespace (such as a trailing newline or a lone '\r') are skipped
// instead of being fed to std::stod.
void MLPPData::setData(std::string fileName, std::vector<real_t> &inputSet, std::vector<real_t> &outputSet) {
	std::ifstream dataFile(fileName);
	if (!dataFile.is_open()) {
		std::cout << "The file failed to open." << std::endl;
		return;
	}

	std::string inputTemp, outputTemp;

	std::string line;
	while (std::getline(dataFile, line)) {
		if (line.find_first_not_of(" \t\r\n") == std::string::npos) {
			continue;
		}

		std::stringstream ss(line);

		std::getline(ss, inputTemp, ',');
		std::getline(ss, outputTemp, ',');

		inputSet.push_back(std::stod(inputTemp));
		outputSet.push_back(std::stod(outputTemp));
	}

	dataFile.close();
}
2023-01-27 13:01:16 +01:00
void MLPPData::printData(std::string &inputName, std::string &outputName, std::vector<real_t> &inputSet, std::vector<real_t> &outputSet) {
2023-01-24 19:00:54 +01:00
std::cout << inputName << std::endl;
2023-02-12 18:03:17 +01:00
for (uint32_t i = 0; i < inputSet.size(); i++) {
2023-01-24 19:00:54 +01:00
std::cout << inputSet[i] << std::endl;
}
std::cout << outputName << std::endl;
2023-02-12 18:03:17 +01:00
for (uint32_t i = 0; i < inputSet.size(); i++) {
2023-01-24 19:00:54 +01:00
std::cout << outputSet[i] << std::endl;
}
}
// Images
2023-01-27 13:01:16 +01:00
std::vector<std::vector<real_t>> MLPPData::rgb2gray(std::vector<std::vector<std::vector<real_t>>> input) {
std::vector<std::vector<real_t>> grayScale;
2023-01-24 19:00:54 +01:00
grayScale.resize(input[0].size());
2023-02-12 18:03:17 +01:00
for (uint32_t i = 0; i < grayScale.size(); i++) {
2023-01-24 19:00:54 +01:00
grayScale[i].resize(input[0][i].size());
}
2023-02-12 18:03:17 +01:00
for (uint32_t i = 0; i < grayScale.size(); i++) {
for (uint32_t j = 0; j < grayScale[i].size(); j++) {
2023-01-24 19:00:54 +01:00
grayScale[i][j] = 0.299 * input[0][i][j] + 0.587 * input[1][i][j] + 0.114 * input[2][i][j];
}
}
return grayScale;
}
2023-01-27 13:01:16 +01:00
std::vector<std::vector<std::vector<real_t>>> MLPPData::rgb2ycbcr(std::vector<std::vector<std::vector<real_t>>> input) {
2023-04-22 17:17:58 +02:00
MLPPLinAlgOld alg;
2023-01-27 13:01:16 +01:00
std::vector<std::vector<std::vector<real_t>>> YCbCr;
2023-01-24 19:00:54 +01:00
YCbCr = alg.resize(YCbCr, input);
2023-02-12 18:03:17 +01:00
for (uint32_t i = 0; i < YCbCr[0].size(); i++) {
for (uint32_t j = 0; j < YCbCr[0][i].size(); j++) {
2023-01-24 19:00:54 +01:00
YCbCr[0][i][j] = 0.299 * input[0][i][j] + 0.587 * input[1][i][j] + 0.114 * input[2][i][j];
YCbCr[1][i][j] = -0.169 * input[0][i][j] - 0.331 * input[1][i][j] + 0.500 * input[2][i][j];
YCbCr[2][i][j] = 0.500 * input[0][i][j] - 0.419 * input[1][i][j] - 0.081 * input[2][i][j];
}
}
return YCbCr;
}
// Conversion formulas available here:
// https://www.rapidtables.com/convert/color/rgb-to-hsv.html
2023-01-27 13:01:16 +01:00
std::vector<std::vector<std::vector<real_t>>> MLPPData::rgb2hsv(std::vector<std::vector<std::vector<real_t>>> input) {
2023-04-22 17:17:58 +02:00
MLPPLinAlgOld alg;
2023-01-27 13:01:16 +01:00
std::vector<std::vector<std::vector<real_t>>> HSV;
2023-01-24 19:00:54 +01:00
HSV = alg.resize(HSV, input);
2023-02-12 18:03:17 +01:00
for (uint32_t i = 0; i < HSV[0].size(); i++) {
for (uint32_t j = 0; j < HSV[0][i].size(); j++) {
2023-01-27 13:01:16 +01:00
real_t rPrime = input[0][i][j] / 255;
real_t gPrime = input[1][i][j] / 255;
real_t bPrime = input[2][i][j] / 255;
2023-01-24 19:00:54 +01:00
2023-01-27 13:01:16 +01:00
real_t cMax = alg.max({ rPrime, gPrime, bPrime });
real_t cMin = alg.min({ rPrime, gPrime, bPrime });
real_t delta = cMax - cMin;
2023-01-24 19:00:54 +01:00
// H calculation.
if (delta == 0) {
HSV[0][i][j] = 0;
} else {
if (cMax == rPrime) {
HSV[0][i][j] = 60 * fmod(((gPrime - bPrime) / delta), 6);
} else if (cMax == gPrime) {
HSV[0][i][j] = 60 * ((bPrime - rPrime) / delta + 2);
} else { // cMax == bPrime
HSV[0][i][j] = 60 * ((rPrime - gPrime) / delta + 6);
}
}
// S calculation.
if (cMax == 0) {
HSV[1][i][j] = 0;
} else {
HSV[1][i][j] = delta / cMax;
}
// V calculation.
HSV[2][i][j] = cMax;
}
}
return HSV;
}
// http://machinethatsees.blogspot.com/2013/07/how-to-convert-rgb-to-xyz-or-vice-versa.html
2023-01-27 13:01:16 +01:00
std::vector<std::vector<std::vector<real_t>>> MLPPData::rgb2xyz(std::vector<std::vector<std::vector<real_t>>> input) {
2023-04-22 17:17:58 +02:00
MLPPLinAlgOld alg;
2023-01-27 13:01:16 +01:00
std::vector<std::vector<std::vector<real_t>>> XYZ;
2023-01-24 19:00:54 +01:00
XYZ = alg.resize(XYZ, input);
2023-01-27 13:01:16 +01:00
std::vector<std::vector<real_t>> RGB2XYZ = { { 0.4124564, 0.3575761, 0.1804375 }, { 0.2126726, 0.7151522, 0.0721750 }, { 0.0193339, 0.1191920, 0.9503041 } };
2023-01-24 19:00:54 +01:00
return alg.vector_wise_tensor_product(input, RGB2XYZ);
}
2023-01-27 13:01:16 +01:00
std::vector<std::vector<std::vector<real_t>>> MLPPData::xyz2rgb(std::vector<std::vector<std::vector<real_t>>> input) {
2023-04-22 17:17:58 +02:00
MLPPLinAlgOld alg;
2023-01-27 13:01:16 +01:00
std::vector<std::vector<std::vector<real_t>>> XYZ;
2023-01-24 19:00:54 +01:00
XYZ = alg.resize(XYZ, input);
2023-01-27 13:01:16 +01:00
std::vector<std::vector<real_t>> RGB2XYZ = alg.inverse({ { 0.4124564, 0.3575761, 0.1804375 }, { 0.2126726, 0.7151522, 0.0721750 }, { 0.0193339, 0.1191920, 0.9503041 } });
2023-01-24 19:00:54 +01:00
return alg.vector_wise_tensor_product(input, RGB2XYZ);
}
// TEXT-BASED & NLP
2023-01-25 00:21:31 +01:00
// Returns `text` with every char passed through tolower().
//
// Each char is converted to unsigned char first: handing tolower() a negative
// value (any byte >= 0x80 where char is signed) is undefined behaviour.
std::string MLPPData::toLower(std::string text) {
	for (char &c : text) {
		c = static_cast<char>(tolower(static_cast<unsigned char>(c)));
	}

	return text;
}
2023-01-25 00:21:31 +01:00
// Returns the characters of `text`, in order, as a vector<char>.
std::vector<char> MLPPData::split(std::string text) {
	std::vector<char> characters(text.begin(), text.end());
	return characters;
}
2023-12-28 17:41:20 +01:00
// Splits `data` into sentences at every '.' that is not immediately followed
// by another '.' (so the leading dots of an ellipsis do not split).
//
// The terminating period is not part of the returned sentence. Text after the
// last period is returned as a final sentence. Surrounding whitespace is left
// untouched. An empty string yields an empty vector.
//
// Fixes relative to the previous version:
//  - the test was inverted: a normal sentence end hit `continue`, and a split
//    only happened on a '.' followed by another '.';
//  - the last character of the text was never examined;
//  - an empty string produced a range ending at index -1.
Vector<String> MLPPData::split_sentences(String data) {
	Vector<String> sentences;

	const int length = data.length();
	int start_index = 0;

	for (int i = 0; i < length; ++i) {
		if (data[i] != '.') {
			continue;
		}

		// Part of a run of dots; only the last dot of the run ends a sentence.
		if (i + 1 < length && data[i + 1] == '.') {
			continue;
		}

		sentences.push_back(data.substr(start_index, i - start_index));
		start_index = i + 1;
	}

	if (start_index < length) {
		sentences.push_back(data.substr(start_index, length - start_index));
	}

	return sentences;
}
2023-12-28 17:41:20 +01:00
// Returns `data` with every " " removed from each element.
Vector<String> MLPPData::remove_spaces(Vector<String> data) {
	const int count = data.size();

	for (int idx = 0; idx < count; ++idx) {
		const String stripped = data[idx].replace(" ", "");
		data.write[idx] = stripped;
	}

	return data;
}
2023-12-28 17:41:20 +01:00
// Returns `data` without its empty strings; the order of the remaining
// elements is preserved.
//
// Iterates from the back so that removing an element never shifts an
// unvisited one. The previous forward loop advanced past the element that
// slid into the removed slot, so the second of two consecutive empty strings
// survived.
Vector<String> MLPPData::remove_empty(Vector<String> data) {
	for (int i = data.size() - 1; i >= 0; --i) {
		if (data[i].empty()) {
			data.remove(i);
		}
	}

	return data;
}
2023-12-28 17:41:20 +01:00
// Splits `text` into tokens.
//
//  - A space ends the current token (the space itself is dropped).
//  - ',', '!', '.' and '-' end the current token and are emitted as a token of
//    their own; a single space directly after the punctuation is consumed.
//  - The last character, when it is not a delimiter, ends the final token.
//
// Empty tokens are emitted when two delimiters are adjacent (see
// remove_empty()).
//
// Fix: the character after a punctuation mark used to be skipped
// unconditionally on the assumption that it was a space, so in input such as
// "a,b" the 'b' was silently lost. It is now skipped only when it is a space.
Vector<String> MLPPData::segment(String text) {
	Vector<String> segmented_data;

	const int length = text.length();
	int prev_delim = 0;

	for (int i = 0; i < length; i++) {
		if (text[i] == ' ') {
			segmented_data.push_back(text.substr(prev_delim, i - prev_delim));
			prev_delim = i + 1;
		} else if (text[i] == ',' || text[i] == '!' || text[i] == '.' || text[i] == '-') {
			segmented_data.push_back(text.substr(prev_delim, i - prev_delim));

			String punc;
			punc += text[i];
			segmented_data.push_back(punc);

			prev_delim = i + 1;

			// Swallow the space that conventionally follows punctuation.
			if (i + 1 < length && text[i + 1] == ' ') {
				prev_delim = i + 2;
				i++;
			}
		} else if (i == length - 1) {
			segmented_data.push_back(text.substr(prev_delim, length - prev_delim));
		}
	}

	return segmented_data;
}
2023-12-28 17:41:20 +01:00
// Maps every token of segment(text) to an integer id.
//
// Ids start at 1 and are handed out in order of first appearance; a token
// equal to an earlier one reuses that token's id.
Vector<int> MLPPData::tokenize(String text) {
	Vector<String> words = segment(text);

	Vector<int> ids;
	ids.resize(words.size());

	int last_id = 0;

	for (int i = 0; i < words.size(); i++) {
		// Look for any earlier occurrence; all of them carry the same id.
		int earlier = -1;

		for (int j = i - 1; j >= 0; j--) {
			if (words[i] == words[j]) {
				earlier = j;
				break;
			}
		}

		if (earlier >= 0) {
			ids.write[i] = ids[earlier];
		} else {
			last_id++;
			ids.write[i] = last_id;
		}
	}

	return ids;
}
2023-12-28 17:41:20 +01:00
// Lower-cases `text`, segments it, removes spaces from the tokens, and returns
// the tokens that are not listed in `stop_words` (order preserved).
Vector<String> MLPPData::remove_stop_words(String text) {
	Vector<String> tokens = remove_spaces(segment(text.to_lower()));

	for (int s = 0; s < stop_words.size(); s++) {
		int t = 0;

		while (t < tokens.size()) {
			if (tokens[t] == stop_words[s]) {
				// The next token slides into slot t, so do not advance.
				tokens.remove(t);
			} else {
				++t;
			}
		}
	}

	return tokens;
}
2023-12-28 17:41:20 +01:00
// Returns `segmented_data` without the elements that are listed in
// `stop_words` (order preserved).
//
// Fix: after removing element i the previous version decremented i but kept
// scanning the remaining stop words, indexing segmented_data[i] with the
// decremented value; when the removed element was the first one that index
// was -1. The stop-word scan now ends at the first match and the position is
// only advanced when nothing was removed.
Vector<String> MLPPData::remove_stop_words_vec(Vector<String> segmented_data) {
	int i = 0;

	while (i < segmented_data.size()) {
		bool is_stop_word = false;

		for (int j = 0; j < stop_words.size(); j++) {
			if (segmented_data[i] == stop_words[j]) {
				is_stop_word = true;
				break;
			}
		}

		if (is_stop_word) {
			// The next element slides into slot i, so do not advance.
			segmented_data.remove(i);
		} else {
			++i;
		}
	}

	return segmented_data;
}
2023-12-28 17:41:20 +01:00
// Removes every occurrence of an entry of `suffixes` that is directly followed
// by ' ', ',', '-', '.' or '!'.
//
// Four padding characters are appended to `text` before scanning and are still
// part of the returned string.
String MLPPData::stemming(String text) {
	int padding_size = 4;
	String padding = " "; // our padding

	text += String(padding).repeat(padding_size); // ' ' will be our padding value

	for (int pos = 0; pos < text.length(); pos++) {
		for (int s = 0; s < suffixes.size(); s++) {
			const int suffix_len = suffixes[s].length();

			if (!(text.substr(pos, suffix_len) == suffixes[s])) {
				continue;
			}

			// Only read the following character once the suffix itself matched.
			const int following = text[pos + suffix_len];
			const bool at_boundary = following == ' ' || following == ',' || following == '-' || following == '.' || following == '!';

			if (at_boundary) {
				text.erase(pos, suffix_len);
			}
		}
	}

	return text;
}
2023-12-28 17:41:20 +01:00
// Builds a bag-of-words matrix with one row per sentence and one column per
// word of the word list derived from `sentences`.
//
// Steps:
//   1) Lower-casing and stop word removal (both done by remove_stop_words)
//   2) Build the list of used words
//   3) For every sentence, mark (BAG_OF_WORDS_TYPE_BINARY) or count (any other
//      type) the occurrences of each listed word
Ref<MLPPMatrix> MLPPData::bag_of_words(Vector<String> sentences, BagOfWordsType type) {
	const Vector<String> vocabulary = remove_empty(remove_stop_words_vec(create_word_list(sentences)));
	const int vocabulary_size = vocabulary.size();
	const int sentence_count = sentences.size();

	Vector<Vector<String>> tokenized;
	tokenized.resize(sentence_count);

	for (int s = 0; s < sentence_count; ++s) {
		tokenized.write[s] = remove_stop_words(sentences[s]);
	}

	Ref<MLPPMatrix> bow;
	bow.instance();
	bow->resize(Size2i(vocabulary_size, sentence_count));
	bow->fill(0);

	const bool binary = (type == BAG_OF_WORDS_TYPE_BINARY);

	for (int s = 0; s < sentence_count; ++s) {
		const Vector<String> &words = tokenized[s];

		for (int w = 0; w < words.size(); ++w) {
			for (int v = 0; v < vocabulary_size; ++v) {
				if (words[w] != vocabulary[v]) {
					continue;
				}

				if (binary) {
					bow->element_set(s, v, 1);
				} else {
					bow->element_set(s, v, bow->element_get(s, v) + 1);
				}
			}
		}
	}

	return bow;
}
2023-12-28 17:41:20 +01:00
// Computes a TF-IDF matrix with one row per sentence and one column per word
// of the word list derived from `sentences`.
//
// TF(i, k)  = occurrences of word k in sentence i / number of words in sentence i
// IDF(k)    = log(number of sentences / number of sentences containing word k)
// result    = TF(i, k) * IDF(k)
//
// Word counts are taken after stop word removal (see remove_stop_words).
Ref<MLPPMatrix> MLPPData::tfidf(Vector<String> sentences) {
	Vector<String> word_list = remove_empty(remove_stop_words_vec(create_word_list(sentences)));

	Vector<Vector<String>> segmented_sentences;
	segmented_sentences.resize(sentences.size());

	for (int i = 0; i < sentences.size(); i++) {
		segmented_sentences.write[i] = remove_stop_words(sentences[i]);
	}

	Ref<MLPPMatrix> TF;
	TF.instance();
	TF->resize(Size2i(word_list.size(), segmented_sentences.size()));
	// The counts below are accumulated with element_get() + 1, so they have to
	// start from zero (bag_of_words() also fills explicitly after resizing).
	TF->fill(0);

	// frequency[k] = number of sentences that contain word k at least once
	Vector<int> frequency;
	frequency.resize(word_list.size());
	frequency.fill(0);

	Ref<MLPPVector> TF_row;
	TF_row.instance();
	TF_row->resize(word_list.size());

	for (int i = 0; i < segmented_sentences.size(); i++) {
		// Tracks which words were already counted for this sentence's document frequency
		Vector<bool> present;
		present.resize(word_list.size());
		present.fill(false);

		for (int j = 0; j < segmented_sentences[i].size(); j++) {
			for (int k = 0; k < word_list.size(); k++) {
				if (segmented_sentences[i][j] == word_list[k]) {
					TF->element_set(i, k, TF->element_get(i, k) + 1);

					if (!present[k]) {
						frequency.write[k]++;
						present.write[k] = true;
					}
				}
			}
		}

		const int sentence_length = segmented_sentences[i].size();

		if (sentence_length == 0) {
			// Nothing was counted; leave the row at zero instead of scaling by 1 / 0.
			continue;
		}

		TF->row_get_into_mlpp_vector(i, TF_row);
		TF_row->scalar_multiply(real_t(1) / real_t(sentence_length));
		TF->row_set_mlpp_vector(i, TF_row);
	}

	Vector<real_t> IDF;
	IDF.resize(frequency.size());

	for (int i = 0; i < IDF.size(); i++) {
		if (frequency[i] > 0) {
			IDF.write[i] = Math::log((real_t)segmented_sentences.size() / (real_t)frequency[i]);
		} else {
			// A word that was never matched has an all-zero TF column;
			// avoid log(x / 0) turning that column into NaN.
			IDF.write[i] = 0;
		}
	}

	Ref<MLPPMatrix> TFIDF;
	TFIDF.instance();
	Size2i tfidf_size = Size2i(word_list.size(), segmented_sentences.size());
	TFIDF->resize(tfidf_size);

	for (int i = 0; i < tfidf_size.y; i++) {
		for (int j = 0; j < tfidf_size.x; j++) {
			TFIDF->element_set(i, j, TF->element_get(i, j) * IDF[j]);
		}
	}

	return TFIDF;
}
2023-12-28 17:41:20 +01:00
// Trains word embeddings from `sentences` with an MLPPSoftmaxNet.
//
// Every word (after stop word removal) is paired with each neighbour that lies
// at most `windowSize` positions before or after it in the same sentence. The
// pairs are one-hot encoded through bag_of_words() and used as the training set:
//   - WORD_TO_VEC_TYPE_SKIPGRAM: the network is built as (output_set, input_set)
//   - anything else (CBOW, the default): the network is built as (input_set, output_set)
//
// `dimension` is forwarded to the MLPPSoftmaxNet constructor, `learning_rate`
// and `max_epoch` to train_gradient_descent().
//
// Returns the word list together with the embeddings reported by the model.
MLPPData::WordsToVecResult MLPPData::word_to_vec(Vector<String> sentences, WordToVecType type, int windowSize, int dimension, real_t learning_rate, int max_epoch) {
	WordsToVecResult res;

	res.word_list = remove_empty(remove_stop_words_vec(create_word_list(sentences)));

	Vector<Vector<String>> segmented_sentences;
	segmented_sentences.resize(sentences.size());

	for (int i = 0; i < sentences.size(); i++) {
		segmented_sentences.write[i] = remove_stop_words(sentences[i]);
	}

	// inputStrings[n] / outputStrings[n] form the n-th (word, neighbour) pair
	Vector<String> inputStrings;
	Vector<String> outputStrings;

	for (int i = 0; i < segmented_sentences.size(); i++) {
		const Vector<String> &sentence = segmented_sentences[i];
		const int sentence_size = sentence.size();

		for (int j = 0; j < sentence_size; j++) {
			for (int k = windowSize; k > 0; k--) {
				const int before = j - k;
				const int after = j + k;

				if (before >= 0) {
					inputStrings.push_back(sentence[j]);
					outputStrings.push_back(sentence[before]);
				}

				if (after <= sentence_size - 1) {
					inputStrings.push_back(sentence[j]);
					outputStrings.push_back(sentence[after]);
				}
			}
		}
	}

	int input_size = inputStrings.size();

	// Encode inputs and outputs in one go so both share the same word columns:
	// rows [0, input_size) are the inputs, the remaining rows are the outputs.
	inputStrings.append_array(outputStrings);

	Ref<MLPPMatrix> bow = bag_of_words(inputStrings, BAG_OF_WORDS_TYPE_BINARY);
	Size2i bow_size = bow->size();

	Ref<MLPPMatrix> input_set;
	Ref<MLPPMatrix> output_set;

	input_set.instance();
	output_set.instance();

	Ref<MLPPVector> row_tmp;
	row_tmp.instance();
	row_tmp->resize(bow_size.x);

	input_set->resize(Size2i(bow_size.x, input_size));

	for (int i = 0; i < input_size; i++) {
		bow->row_get_into_mlpp_vector(i, row_tmp);
		input_set->row_set_mlpp_vector(i, row_tmp);
	}

	output_set->resize(Size2i(bow_size.x, bow_size.y - input_size));
	Size2i output_set_size = output_set->size();

	for (int i = 0; i < output_set_size.y; i++) {
		bow->row_get_into_mlpp_vector(i + input_size, row_tmp);
		// These rows belong to output_set; writing them into input_set would
		// leave output_set unfilled and clobber the inputs copied above.
		output_set->row_set_mlpp_vector(i, row_tmp);
	}

	MLPPSoftmaxNet *model;

	if (type == WORD_TO_VEC_TYPE_SKIPGRAM) {
		model = memnew(MLPPSoftmaxNet(output_set, input_set, dimension));
	} else { // else = CBOW. We maintain it is a default.
		model = memnew(MLPPSoftmaxNet(input_set, output_set, dimension));
	}

	model->train_gradient_descent(learning_rate, max_epoch);

	res.word_embeddings = model->get_embeddings();
	memdelete(model);

	return res;
}
2023-12-28 17:41:20 +01:00
// Latent semantic analysis on the binary bag-of-words matrix of `sentences`.
// Runs an SVD, keeps the first `dim` diagonal entries of S and the first `dim`
// rows of Vt, and returns their product S_trunc * Vt_trunc.
Ref<MLPPMatrix> MLPPData::lsa(Vector<String> sentences, int dim) {
	MLPPLinAlg alg;

	Ref<MLPPMatrix> doc_word_data = bag_of_words(sentences, BAG_OF_WORDS_TYPE_BINARY);
	MLPPLinAlg::SVDResult svd = alg.svd(doc_word_data);

	const int vt_row_length = svd.Vt->size().x;

	// dim x dim block that receives the leading diagonal of S
	Ref<MLPPMatrix> s_block = alg.zeromatnm(dim, dim);

	// First `dim` rows of Vt
	Ref<MLPPMatrix> vt_block;
	vt_block.instance();
	vt_block->resize(Size2i(vt_row_length, dim));

	Ref<MLPPVector> row;
	row.instance();
	row->resize(vt_row_length);

	for (int r = 0; r < dim; ++r) {
		s_block->element_set(r, r, svd.S->element_get(r, r));

		svd.Vt->row_get_into_mlpp_vector(r, row);
		vt_block->row_set_mlpp_vector(r, row);
	}

	return s_block->multn(vt_block);
}
2023-12-28 17:41:20 +01:00
// Joins all sentences with single blanks and returns the word list of the
// combined text: stop words removed, then passed through vec_to_set()
// (presumably de-duplication; confirm against its definition) and remove_spaces().
Vector<String> MLPPData::create_word_list(Vector<String> sentences) {
	String joined = "";

	const int sentence_count = sentences.size();

	for (int s = 0; s < sentence_count; ++s) {
		if (s > 0) {
			joined += " ";
		}

		joined += sentences[s];
	}

	Vector<String> words = remove_stop_words(joined);
	Vector<String> word_set = vec_to_set(words);

	return remove_spaces(word_set);
}
// EXTRA
2023-01-25 00:21:31 +01:00
// Appends every line of the file `fileName` to `inputNames`.
// If the file cannot be opened a message is printed to stdout and
// `inputNames` is left unchanged.
void MLPPData::setInputNames(std::string fileName, std::vector<std::string> &inputNames) {
	std::ifstream names_file(fileName);

	if (!names_file.is_open()) {
		std::cout << fileName << " failed to open." << std::endl;
	}

	// On a stream that failed to open getline() fails immediately, so the loop is skipped.
	for (std::string line; std::getline(names_file, line);) {
		inputNames.push_back(line);
	}

	names_file.close();
}
2023-12-28 21:06:16 +01:00
// Min-max scales `p_X`: the matrix is transposed, every row of the transposed
// matrix is rescaled with (x - min) / (max - min), and the result is
// transposed back before it is returned.
//
// A row whose values are all identical (max == min) is mapped to 0 instead of
// dividing by zero.
//
// Returns an empty Ref if `p_X` is not valid.
Ref<MLPPMatrix> MLPPData::feature_scaling(const Ref<MLPPMatrix> &p_X) {
	ERR_FAIL_COND_V(!p_X.is_valid(), Ref<MLPPMatrix>());

	Ref<MLPPMatrix> X = p_X->transposen();

	Size2i x_size = X->size();

	Ref<MLPPVector> row_tmp;
	row_tmp.instance();
	row_tmp->resize(x_size.x);

	for (int i = 0; i < x_size.y; ++i) {
		// Extremes are taken from the untouched row before it is rewritten below
		X->row_get_into_mlpp_vector(i, row_tmp);

		const real_t mine = row_tmp->min_element();
		const real_t range = row_tmp->max_element() - mine;

		for (int j = 0; j < x_size.x; ++j) {
			if (range == 0) {
				// Constant row: (x - min) is 0 everywhere, so 0 is the scaled value
				X->element_set(i, j, 0);
			} else {
				X->element_set(i, j, (X->element_get(i, j) - mine) / range);
			}
		}
	}

	return X->transposen();
}
2023-01-24 19:20:18 +01:00
2023-02-08 01:26:37 +01:00
// Returns a new matrix of the same size as `p_X` in which the mean of each row
// has been subtracted from every element of that row. `p_X` is not modified.
Ref<MLPPMatrix> MLPPData::mean_centering(const Ref<MLPPMatrix> &p_X) {
	MLPPStat stat;

	const Size2i dims = p_X->size();

	Ref<MLPPMatrix> centered;
	centered.instance();
	centered->resize(dims);

	Ref<MLPPVector> row;
	row.instance();
	row->resize(dims.x);

	for (int r = 0; r < dims.y; ++r) {
		p_X->row_get_into_mlpp_vector(r, row);

		const real_t row_mean = stat.meanv(row);

		for (int c = 0; c < dims.x; ++c) {
			centered->element_set(r, c, p_X->element_get(r, c) - row_mean);
		}
	}

	return centered;
}
2023-12-28 21:06:16 +01:00
// Standardizes `p_X` row by row: (X_j - mu_j) / std_j, for every row j.
// The rows are first centred with mean_centering() and then divided by the
// standard deviation of the centred row.
//
// A row whose standard deviation is not strictly positive (e.g. a constant
// row) is left centred and is not scaled, which avoids turning it into NaN.
Ref<MLPPMatrix> MLPPData::mean_normalization(const Ref<MLPPMatrix> &p_X) {
	MLPPStat stat;

	Ref<MLPPMatrix> X = mean_centering(p_X);
	Size2i x_size = X->size();

	Ref<MLPPVector> x_row_tmp;
	x_row_tmp.instance();
	x_row_tmp->resize(x_size.x);

	for (int i = 0; i < x_size.y; i++) {
		X->row_get_into_mlpp_vector(i, x_row_tmp);

		const real_t std_dev = stat.standard_deviationv(x_row_tmp);

		// Written as a negated comparison so that a NaN deviation is skipped too
		if (!(std_dev > 0)) {
			continue;
		}

		x_row_tmp->scalar_multiply((real_t)1 / std_dev);
		X->row_set_mlpp_vector(i, x_row_tmp);
	}

	return X;
}
// One-hot encodes `temp_output_set`: the result has one row per entry and
// `n_class` columns; column j of row i is 1 when entry i (truncated to int)
// equals j, and 0 otherwise.
//
// Returns an empty Ref if `temp_output_set` is not valid.
Ref<MLPPMatrix> MLPPData::one_hot_rep(const Ref<MLPPVector> &temp_output_set, int n_class) {
	ERR_FAIL_COND_V(!temp_output_set.is_valid(), Ref<MLPPMatrix>());

	const int sample_count = temp_output_set->size();
	const real_t *labels = temp_output_set->ptr();

	Ref<MLPPMatrix> one_hot;
	one_hot.instance();
	one_hot->resize(Size2i(n_class, sample_count));

	for (int row = 0; row < sample_count; ++row) {
		const int label = static_cast<int>(labels[row]);

		for (int col = 0; col < n_class; ++col) {
			one_hot->element_set(row, col, (col == label) ? 1 : 0);
		}
	}

	return one_hot;
}
2023-12-28 21:06:16 +01:00
std::vector<real_t> MLPPData::reverseOneHot(std::vector<std::vector<real_t>> tempOutputSet) {
std::vector<real_t> outputSet;
//uint32_t n_class = tempOutputSet[0].size();
for (uint32_t i = 0; i < tempOutputSet.size(); i++) {
int current_class = 1;
for (uint32_t j = 0; j < tempOutputSet[i].size(); j++) {
if (tempOutputSet[i][j] == 1) {
break;
} else {
current_class++;
}
}
outputSet.push_back(current_class);
}
return outputSet;
}
2023-12-28 17:41:20 +01:00
void MLPPData::load_default_suffixes() {
// Our list of suffixes which we use to compare against
suffixes = String("eer er ion ity ment ness or sion ship th able ible al ant ary ful ic ious ous ive less y ed en ing ize ise ly ward wise").split_spaces();
}
void MLPPData::load_default_stop_words() {
stop_words = String("i me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it its itself they them their theirs themselves what which who whom this that these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don should now").split_spaces();
}
// Registers the script-visible methods of MLPPData with ClassDB.
void MLPPData::_bind_methods() {
	// Dataset loaders; each takes the path of the data file.
	ClassDB::bind_method(D_METHOD("load_breast_cancer", "path"), &MLPPData::load_breast_cancer);
	ClassDB::bind_method(D_METHOD("load_breast_cancer_svc", "path"), &MLPPData::load_breast_cancer_svc);
	ClassDB::bind_method(D_METHOD("load_iris", "path"), &MLPPData::load_iris);
	ClassDB::bind_method(D_METHOD("load_wine", "path"), &MLPPData::load_wine);
	ClassDB::bind_method(D_METHOD("load_mnist_train", "path"), &MLPPData::load_mnist_train);
	ClassDB::bind_method(D_METHOD("load_mnist_test", "path"), &MLPPData::load_mnist_test);
	ClassDB::bind_method(D_METHOD("load_california_housing", "path"), &MLPPData::load_california_housing);
	ClassDB::bind_method(D_METHOD("load_fires_and_crime", "path"), &MLPPData::load_fires_and_crime);

	// Exposed under the name "train_test_split", implemented by train_test_split_bind.
	ClassDB::bind_method(D_METHOD("train_test_split", "data", "test_size"), &MLPPData::train_test_split_bind);
}