#ifndef MLPP_DATA_H #define MLPP_DATA_H // // Data.hpp // MLP // // Created by Marc Melikyan on 11/4/20. // #include "core/math/math_defs.h" #include "core/string/ustring.h" #include "core/variant/array.h" #include "core/object/reference.h" #include <string> #include <tuple> #include <vector> class MLPPDataESimple : public Reference { GDCLASS(MLPPDataESimple, Reference); public: std::vector<real_t> input; std::vector<real_t> output; protected: static void _bind_methods(); }; class MLPPDataSimple : public Reference { GDCLASS(MLPPDataSimple, Reference); public: std::vector<std::vector<real_t>> input; std::vector<real_t> output; protected: static void _bind_methods(); }; class MLPPDataComplex : public Reference { GDCLASS(MLPPDataComplex, Reference); public: std::vector<std::vector<real_t>> input; std::vector<std::vector<real_t>> output; protected: static void _bind_methods(); }; class MLPPData : public Reference { GDCLASS(MLPPData, Reference); public: // Load Datasets Ref<MLPPDataSimple> load_breast_cancer(const String &path); Ref<MLPPDataSimple> load_breast_cancer_svc(const String &path); Ref<MLPPDataComplex> load_iris(const String &path); Ref<MLPPDataComplex> load_wine(const String &path); Ref<MLPPDataComplex> load_mnist_train(const String &path); Ref<MLPPDataComplex> load_mnist_test(const String &path); Ref<MLPPDataSimple> load_california_housing(const String &path); Ref<MLPPDataESimple> load_fires_and_crime(const String &path); void set_data_supervised(int k, const String &file_name, std::vector<std::vector<real_t>> &inputSet, std::vector<real_t> &outputSet); void set_data_unsupervised(int k, const String &file_name, std::vector<std::vector<real_t>> &inputSet); void set_data_simple(const String &file_name, std::vector<real_t> &inputSet, std::vector<real_t> &outputSet); struct SplitComplexData { Ref<MLPPDataComplex> train; Ref<MLPPDataComplex> test; }; SplitComplexData train_test_split(const Ref<MLPPDataComplex> &data, real_t test_size); Array train_test_split_bind(const Ref<MLPPDataComplex> &data, real_t test_size); // Load Datasets std::tuple<std::vector<std::vector<real_t>>, std::vector<real_t>> loadBreastCancer(); std::tuple<std::vector<std::vector<real_t>>, std::vector<real_t>> loadBreastCancerSVC(); std::tuple<std::vector<std::vector<real_t>>, std::vector<std::vector<real_t>>> loadIris(); std::tuple<std::vector<std::vector<real_t>>, std::vector<std::vector<real_t>>> loadWine(); std::tuple<std::vector<std::vector<real_t>>, std::vector<std::vector<real_t>>> loadMnistTrain(); std::tuple<std::vector<std::vector<real_t>>, std::vector<std::vector<real_t>>> loadMnistTest(); std::tuple<std::vector<std::vector<real_t>>, std::vector<real_t>> loadCaliforniaHousing(); std::tuple<std::vector<real_t>, std::vector<real_t>> loadFiresAndCrime(); std::tuple<std::vector<std::vector<real_t>>, std::vector<std::vector<real_t>>, std::vector<std::vector<real_t>>, std::vector<std::vector<real_t>>> trainTestSplit(std::vector<std::vector<real_t>> inputSet, std::vector<std::vector<real_t>> outputSet, real_t testSize); // Supervised void setData(int k, std::string fileName, std::vector<std::vector<real_t>> &inputSet, std::vector<real_t> &outputSet); void printData(std::vector<std::string> inputName, std::string outputName, std::vector<std::vector<real_t>> inputSet, std::vector<real_t> outputSet); // Unsupervised void setData(int k, std::string fileName, std::vector<std::vector<real_t>> &inputSet); void printData(std::vector<std::string> inputName, std::vector<std::vector<real_t>> inputSet); // Simple void setData(std::string fileName, std::vector<real_t> &inputSet, std::vector<real_t> &outputSet); void printData(std::string &inputName, std::string &outputName, std::vector<real_t> &inputSet, std::vector<real_t> &outputSet); // Images std::vector<std::vector<real_t>> rgb2gray(std::vector<std::vector<std::vector<real_t>>> input); std::vector<std::vector<std::vector<real_t>>> rgb2ycbcr(std::vector<std::vector<std::vector<real_t>>> input); std::vector<std::vector<std::vector<real_t>>> rgb2hsv(std::vector<std::vector<std::vector<real_t>>> input); std::vector<std::vector<std::vector<real_t>>> rgb2xyz(std::vector<std::vector<std::vector<real_t>>> input); std::vector<std::vector<std::vector<real_t>>> xyz2rgb(std::vector<std::vector<std::vector<real_t>>> input); // Text-Based & NLP std::string toLower(std::string text); std::vector<char> split(std::string text); std::vector<std::string> splitSentences(std::string data); std::vector<std::string> removeSpaces(std::vector<std::string> data); std::vector<std::string> removeNullByte(std::vector<std::string> data); std::vector<std::string> segment(std::string text); std::vector<real_t> tokenize(std::string text); std::vector<std::string> removeStopWords(std::string text); std::vector<std::string> removeStopWords(std::vector<std::string> segmented_data); std::string stemming(std::string text); std::vector<std::vector<real_t>> BOW(std::vector<std::string> sentences, std::string = "Default"); std::vector<std::vector<real_t>> TFIDF(std::vector<std::string> sentences); std::tuple<std::vector<std::vector<real_t>>, std::vector<std::string>> word2Vec(std::vector<std::string> sentences, std::string type, int windowSize, int dimension, real_t learning_rate, int max_epoch); struct WordsToVecResult { std::vector<std::vector<real_t>> word_embeddings; std::vector<std::string> word_list; }; WordsToVecResult word_to_vec(std::vector<std::string> sentences, std::string type, int windowSize, int dimension, real_t learning_rate, int max_epoch); std::vector<std::vector<real_t>> LSA(std::vector<std::string> sentences, int dim); std::vector<std::string> createWordList(std::vector<std::string> sentences); // Extra void setInputNames(std::string fileName, std::vector<std::string> &inputNames); std::vector<std::vector<real_t>> featureScaling(std::vector<std::vector<real_t>> X); std::vector<std::vector<real_t>> meanNormalization(std::vector<std::vector<real_t>> X); std::vector<std::vector<real_t>> meanCentering(std::vector<std::vector<real_t>> X); std::vector<std::vector<real_t>> oneHotRep(std::vector<real_t> tempOutputSet, int n_class); std::vector<real_t> reverseOneHot(std::vector<std::vector<real_t>> tempOutputSet); template <class T> std::vector<T> vecToSet(std::vector<T> inputSet) { std::vector<T> setInputSet; for (int i = 0; i < inputSet.size(); i++) { bool new_element = true; for (int j = 0; j < setInputSet.size(); j++) { if (setInputSet[j] == inputSet[i]) { new_element = false; } } if (new_element) { setInputSet.push_back(inputSet[i]); } } return setInputSet; } protected: static void _bind_methods(); }; #endif /* Data_hpp */