diff --git a/mlpp/data/data.cpp b/mlpp/data/data.cpp
index b47597e..2647b92 100644
--- a/mlpp/data/data.cpp
+++ b/mlpp/data/data.cpp
@@ -6,9 +6,13 @@
 //

 #include "data.h"
+
+#include "core/os/file_access.h"
+
 #include "../lin_alg/lin_alg.h"
 #include "../softmax_net/softmax_net.h"
 #include "../stat/stat.h"
+
 #include
 #include
 #include
@@ -16,6 +20,226 @@
 #include
 #include

+void MLPPDataESimple::_bind_methods() {
+}
+
+void MLPPDataSimple::_bind_methods() {
+}
+
+void MLPPDataComplex::_bind_methods() {
+}
+
+// Loading Datasets
+//
+// Every loader returns a newly instanced container filled from the CSV file
+// at `path`. If the file cannot be opened, the set_data_* helper bails out
+// through ERR_FAIL_COND and the container is returned without any rows.
+
+// Breast cancer data: 30 feature columns followed by one label column.
+Ref<MLPPDataSimple> MLPPData::load_breast_cancer(const String &path) {
+	const int BREAST_CANCER_SIZE = 30; // k = 30
+
+	Ref<MLPPDataSimple> data;
+	data.instance();
+
+	set_data_supervised(BREAST_CANCER_SIZE, path, data->input, data->output);
+
+	return data;
+}
+
+// Same 30 + 1 column layout as load_breast_cancer().
+Ref<MLPPDataSimple> MLPPData::load_breast_cancer_svc(const String &path) {
+	const int BREAST_CANCER_SIZE = 30; // k = 30
+
+	Ref<MLPPDataSimple> data;
+	data.instance();
+
+	set_data_supervised(BREAST_CANCER_SIZE, path, data->input, data->output);
+
+	return data;
+}
+
+// Iris data: 4 feature columns; the label column is one-hot encoded (3 classes).
+Ref<MLPPDataComplex> MLPPData::load_iris(const String &path) {
+	const int IRIS_SIZE = 4;
+	const int ONE_HOT_NUM = 3;
+
+	std::vector<double> tempOutputSet;
+
+	Ref<MLPPDataComplex> data;
+	data.instance();
+
+	set_data_supervised(IRIS_SIZE, path, data->input, tempOutputSet);
+	data->output = oneHotRep(tempOutputSet, ONE_HOT_NUM);
+
+	return data;
+}
+
+// Wine data: read with 4 feature columns; label one-hot encoded (3 classes).
+// NOTE(review): the classic wine data set has 13 feature columns - confirm
+// that WINE_SIZE = 4 matches the CSV files this loader is used with.
+Ref<MLPPDataComplex> MLPPData::load_wine(const String &path) {
+	const int WINE_SIZE = 4;
+	const int ONE_HOT_NUM = 3;
+
+	std::vector<double> tempOutputSet;
+
+	Ref<MLPPDataComplex> data;
+	data.instance();
+
+	set_data_supervised(WINE_SIZE, path, data->input, tempOutputSet);
+	data->output = oneHotRep(tempOutputSet, ONE_HOT_NUM);
+
+	return data;
+}
+
+// MNIST training data: 784 feature columns; label one-hot encoded (10 classes).
+Ref<MLPPDataComplex> MLPPData::load_mnist_train(const String &path) {
+	const int MNIST_SIZE = 784;
+	const int ONE_HOT_NUM = 10;
+
+	std::vector<double> tempOutputSet;
+
+	Ref<MLPPDataComplex> data;
+	data.instance();
+
+	set_data_supervised(MNIST_SIZE, path, data->input, tempOutputSet);
+	data->output = oneHotRep(tempOutputSet, ONE_HOT_NUM);
+
+	return data;
+}
+
+// MNIST test data: same column layout as load_mnist_train().
+Ref<MLPPDataComplex> MLPPData::load_mnist_test(const String &path) {
+	const int MNIST_SIZE = 784;
+	const int ONE_HOT_NUM = 10;
+
+	std::vector<double> tempOutputSet;
+
+	Ref<MLPPDataComplex> data;
+	data.instance();
+
+	set_data_supervised(MNIST_SIZE, path, data->input, tempOutputSet);
+	data->output = oneHotRep(tempOutputSet, ONE_HOT_NUM);
+
+	return data;
+}
+
+// Housing data: 13 feature columns followed by one target column.
+Ref<MLPPDataSimple> MLPPData::load_california_housing(const String &path) {
+	const int CALIFORNIA_HOUSING_SIZE = 13; // k = 13
+
+	Ref<MLPPDataSimple> data;
+	data.instance();
+
+	set_data_supervised(CALIFORNIA_HOUSING_SIZE, path, data->input, data->output);
+
+	return data;
+}
+
+// Fires and crime data: each CSV row is read as (input, output) value pairs.
+Ref<MLPPDataESimple> MLPPData::load_fires_and_crime(const String &path) {
+	// k is implicitly 1.
+
+	Ref<MLPPDataESimple> data;
+	data.instance();
+
+	set_data_simple(path, data->input, data->output);
+
+	return data;
+}
+
+// MULTIVARIATE SUPERVISED
+
+// Parses a CSV file whose rows hold k feature values followed by one output
+// value. inputSet is replaced by one k-element vector per parsed row and the
+// output value of every parsed row is appended to outputSet. Rows with fewer
+// than k + 1 fields (such as the empty record that a trailing newline is
+// expected to produce) are skipped instead of being indexed out of range.
+void MLPPData::set_data_supervised(int k, const String &file_name, std::vector<std::vector<double>> &inputSet, std::vector<double> &outputSet) {
+	ERR_FAIL_COND(k < 0);
+
+	FileAccess *file = FileAccess::open(file_name, FileAccess::READ);
+
+	ERR_FAIL_COND(!file);
+
+	inputSet.clear();
+
+	while (!file->eof_reached()) {
+		Vector<String> ll = file->get_csv_line();
+
+		if (ll.size() <= k) {
+			continue;
+		}
+
+		inputSet.emplace_back();
+		std::vector<double> &row = inputSet.back();
+		row.reserve(static_cast<size_t>(k));
+
+		for (int i = 0; i < k; ++i) {
+			row.push_back(ll[i].to_double());
+		}
+
+		outputSet.push_back(ll[k].to_double());
+	}
+
+	memdelete(file);
+}
+
+// Parses a CSV file whose rows hold k feature values. inputSet is replaced by
+// one k-element vector per parsed row; blank records and rows with fewer than
+// k fields are skipped.
+void MLPPData::set_data_unsupervised(int k, const String &file_name, std::vector<std::vector<double>> &inputSet) {
+	ERR_FAIL_COND(k < 0);
+
+	FileAccess *file = FileAccess::open(file_name, FileAccess::READ);
+
+	ERR_FAIL_COND(!file);
+
+	inputSet.clear();
+
+	while (!file->eof_reached()) {
+		Vector<String> ll = file->get_csv_line();
+
+		// A blank record is expected to arrive as a single empty field.
+		const bool blank_line = ll.size() == 1 && ll[0].length() == 0;
+
+		if (blank_line || ll.size() < k) {
+			continue;
+		}
+
+		inputSet.emplace_back();
+		std::vector<double> &row = inputSet.back();
+		row.reserve(static_cast<size_t>(k));
+
+		for (int i = 0; i < k; ++i) {
+			row.push_back(ll[i].to_double());
+		}
+	}
+
+	memdelete(file);
+}
+
+// Parses a CSV file as consecutive (input, output) value pairs and appends
+// them to inputSet and outputSet. A dangling last field without a partner is
+// ignored, so a blank record or an odd field count no longer reads past the row.
+void MLPPData::set_data_simple(const String &file_name, std::vector<double> &inputSet, std::vector<double> &outputSet) {
+	FileAccess *file = FileAccess::open(file_name, FileAccess::READ);
+
+	ERR_FAIL_COND(!file);
+
+	while (!file->eof_reached()) {
+		Vector<String> ll = file->get_csv_line();
+
+		// i + 1 must stay inside the row because both fields of a pair are read.
+		for (int i = 0; i + 1 < ll.size(); i += 2) {
+			inputSet.push_back(ll[i].to_double());
+			outputSet.push_back(ll[i + 1].to_double());
+		}
+	}
+
+	memdelete(file);
+}

 // Loading Datasets
 std::tuple>, std::vector> MLPPData::loadBreastCancer() {
@@ -699,7 +923,7 @@ std::vector> MLPPData::featureScaling(std::vector> MLPPData::meanNormalization(std::vector> X) {
 	MLPPLinAlg alg;
-	MLPPStat stat;
+	MLPPStat stat;
 	// (X_j - mu_j) / std_j, for every j

 	X = meanCentering(X);
@@ -711,7 +935,7 @@ std::vector> MLPPData::meanNormalization(std::vector> MLPPData::meanCentering(std::vector> X) {
 	MLPPLinAlg alg;
-	MLPPStat stat;
+	MLPPStat stat;
 	for (int i = 0; i < X.size(); i++) {
 		double mean_i = stat.mean(X[i]);
 		for (int j = 0; j < X[i].size(); j++) {
@@ -754,3 +978,15 @@ std::vector MLPPData::reverseOneHot(std::vector> tem

 	return outputSet;
 }
+
+// Registers the path-based dataset loaders with ClassDB.
+void MLPPData::_bind_methods() {
+	ClassDB::bind_method(D_METHOD("load_breast_cancer", "path"), &MLPPData::load_breast_cancer);
+	ClassDB::bind_method(D_METHOD("load_breast_cancer_svc", "path"), &MLPPData::load_breast_cancer_svc);
+	ClassDB::bind_method(D_METHOD("load_iris", "path"), &MLPPData::load_iris);
+	ClassDB::bind_method(D_METHOD("load_wine", "path"), &MLPPData::load_wine);
+	ClassDB::bind_method(D_METHOD("load_mnist_train", "path"), &MLPPData::load_mnist_train);
+	ClassDB::bind_method(D_METHOD("load_mnist_test", "path"), &MLPPData::load_mnist_test);
+	ClassDB::bind_method(D_METHOD("load_california_housing", "path"), &MLPPData::load_california_housing);
+	ClassDB::bind_method(D_METHOD("load_fires_and_crime", "path"), &MLPPData::load_fires_and_crime);
+}
diff --git a/mlpp/data/data.h b/mlpp/data/data.h
index 82aa0b3..b639645 100644
--- a/mlpp/data/data.h
+++ b/mlpp/data/data.h
@@ -9,13 +9,65 @@
 // Created by Marc Melikyan on 11/4/20.
//

+#include "core/string/ustring.h"
+
+#include "core/object/reference.h"
+
 #include
 #include
 #include

+class MLPPDataESimple : public Reference {
+	GDCLASS(MLPPDataESimple, Reference);

-class MLPPData {
 public:
+	std::vector<double> input; // One scalar input per sample.
+	std::vector<double> output; // One scalar output per sample.
+
+protected:
+	static void _bind_methods();
+};
+
+class MLPPDataSimple : public Reference {
+	GDCLASS(MLPPDataSimple, Reference);
+
+public:
+	std::vector<std::vector<double>> input; // One feature row per sample.
+	std::vector<double> output; // One scalar output per sample.
+
+protected:
+	static void _bind_methods();
+};
+
+class MLPPDataComplex : public Reference {
+	GDCLASS(MLPPDataComplex, Reference);
+
+public:
+	std::vector<std::vector<double>> input; // One feature row per sample.
+	std::vector<std::vector<double>> output; // One output row per sample (one-hot rows in the loaders).
+
+protected:
+	static void _bind_methods();
+};
+
+class MLPPData : public Reference {
+	GDCLASS(MLPPData, Reference);
+
+public:
+	// Load Datasets
+	Ref<MLPPDataSimple> load_breast_cancer(const String &path);
+	Ref<MLPPDataSimple> load_breast_cancer_svc(const String &path);
+	Ref<MLPPDataComplex> load_iris(const String &path);
+	Ref<MLPPDataComplex> load_wine(const String &path);
+	Ref<MLPPDataComplex> load_mnist_train(const String &path);
+	Ref<MLPPDataComplex> load_mnist_test(const String &path);
+	Ref<MLPPDataSimple> load_california_housing(const String &path);
+	Ref<MLPPDataESimple> load_fires_and_crime(const String &path);
+
+	void set_data_supervised(int k, const String &file_name, std::vector<std::vector<double>> &inputSet, std::vector<double> &outputSet); // k features + 1 output per row.
+	void set_data_unsupervised(int k, const String &file_name, std::vector<std::vector<double>> &inputSet); // k features per row.
+	void set_data_simple(const String &file_name, std::vector<double> &inputSet, std::vector<double> &outputSet); // (input, output) pairs.
+
 	// Load Datasets
 	std::tuple>, std::vector> loadBreastCancer();
 	std::tuple>, std::vector> loadBreastCancerSVC();
@@ -92,8 +144,8 @@ public:
 		return setInputSet;
 	}

-private:
+protected:
+	static void _bind_methods();
 };
-

 #endif /* Data_hpp */
diff --git a/register_types.cpp b/register_types.cpp
index 875b840..78af669 100644
--- a/register_types.cpp
+++ b/register_types.cpp
@@ -23,10 +23,17 @@ SOFTWARE.
#include "register_types.h"

+#include "mlpp/data/data.h"
+
 #include "test/mlpp_tests.h"

 void register_pmlpp_types(ModuleRegistrationLevel p_level) {
 	if (p_level == MODULE_REGISTRATION_LEVEL_SCENE) {
+		ClassDB::register_class<MLPPDataESimple>(); // Data containers declared in mlpp/data/data.h.
+		ClassDB::register_class<MLPPDataSimple>();
+		ClassDB::register_class<MLPPDataComplex>();
+		ClassDB::register_class<MLPPData>();
+		ClassDB::register_class<MLPPTests>(); // NOTE(review): class name assumed from test/mlpp_tests.h - confirm.
 	}
 }