#ifndef MLPP_DATA_H #define MLPP_DATA_H // // Data.hpp // MLP // // Created by Marc Melikyan on 11/4/20. // #include "core/math/math_defs.h" #include "core/string/ustring.h" #include "core/variant/array.h" #include "core/object/reference.h" #include "../lin_alg/mlpp_matrix.h" #include "../lin_alg/mlpp_vector.h" #include #include #include class MLPPDataESimple : public Reference { GDCLASS(MLPPDataESimple, Reference); public: Ref get_input(); void set_input(const Ref &val); Ref get_output(); void set_output(const Ref &val); void instance_data(); protected: static void _bind_methods(); Ref _input; Ref _output; }; class MLPPDataSimple : public Reference { GDCLASS(MLPPDataSimple, Reference); public: Ref get_input(); void set_input(const Ref &val); Ref get_output(); void set_output(const Ref &val); void instance_data(); protected: static void _bind_methods(); Ref _input; Ref _output; }; class MLPPDataComplex : public Reference { GDCLASS(MLPPDataComplex, Reference); public: Ref get_input(); void set_input(const Ref &val); Ref get_output(); void set_output(const Ref &val); void instance_data(); protected: static void _bind_methods(); Ref _input; Ref _output; }; class MLPPData : public Reference { GDCLASS(MLPPData, Reference); public: // Load Datasets Ref load_breast_cancer(const String &path); Ref load_breast_cancer_svc(const String &path); Ref load_iris(const String &path); Ref load_wine(const String &path); Ref load_mnist_train(const String &path); Ref load_mnist_test(const String &path); Ref load_california_housing(const String &path); Ref load_fires_and_crime(const String &path); void set_data_supervised(int k, const String &file_name, Ref input_set, Ref output_set); void set_data_unsupervised(int k, const String &file_name, Ref input_set); void set_data_simple(const String &file_name, Ref input_set, Ref output_set); struct SplitComplexData { Ref train; Ref test; }; SplitComplexData train_test_split(Ref data, real_t test_size); Array train_test_split_bind(const Ref &data, real_t test_size); // Load Datasets std::tuple>, std::vector> loadBreastCancer(); std::tuple>, std::vector> loadBreastCancerSVC(); std::tuple>, std::vector>> loadIris(); std::tuple>, std::vector>> loadWine(); std::tuple>, std::vector>> loadMnistTrain(); std::tuple>, std::vector>> loadMnistTest(); std::tuple>, std::vector> loadCaliforniaHousing(); std::tuple, std::vector> loadFiresAndCrime(); std::tuple>, std::vector>, std::vector>, std::vector>> trainTestSplit(std::vector> inputSet, std::vector> outputSet, real_t testSize); // Supervised void setData(int k, std::string fileName, std::vector> &inputSet, std::vector &outputSet); void printData(std::vector inputName, std::string outputName, std::vector> inputSet, std::vector outputSet); // Unsupervised void setData(int k, std::string fileName, std::vector> &inputSet); void printData(std::vector inputName, std::vector> inputSet); // Simple void setData(std::string fileName, std::vector &inputSet, std::vector &outputSet); void printData(std::string &inputName, std::string &outputName, std::vector &inputSet, std::vector &outputSet); // Images std::vector> rgb2gray(std::vector>> input); std::vector>> rgb2ycbcr(std::vector>> input); std::vector>> rgb2hsv(std::vector>> input); std::vector>> rgb2xyz(std::vector>> input); std::vector>> xyz2rgb(std::vector>> input); // Text-Based & NLP std::string toLower(std::string text); std::vector split(std::string text); Vector split_sentences(String data); Vector remove_spaces(Vector data); Vector remove_empty(Vector data); Vector segment(String text); Vector tokenize(String text); Vector remove_stop_words(String text); Vector remove_stop_words_vec(Vector segmented_data); String stemming(String text); enum BagOfWordsType { BAG_OF_WORDS_TYPE_DEFAULT = 0, BAG_OF_WORDS_TYPE_BINARY, }; Ref bag_of_words(Vector sentences, BagOfWordsType type = BAG_OF_WORDS_TYPE_DEFAULT); Ref tfidf(Vector sentences); struct WordsToVecResult { Ref word_embeddings; Vector word_list; }; enum WordToVecType { WORD_TO_VEC_TYPE_CBOW = 0, WORD_TO_VEC_TYPE_SKIPGRAM, }; WordsToVecResult word_to_vec(Vector sentences, WordToVecType type, int windowSize, int dimension, real_t learning_rate, int max_epoch); Ref lsa(Vector sentences, int dim); Vector create_word_list(Vector sentences); // Extra void setInputNames(std::string fileName, std::vector &inputNames); Ref feature_scaling(const Ref &X); Ref mean_centering(const Ref &X); Ref mean_normalization(const Ref &X); Ref one_hot_rep(const Ref &temp_output_set, int n_class); std::vector reverseOneHot(std::vector> tempOutputSet); template std::vector vecToSet(std::vector inputSet) { std::vector setInputSet; for (uint32_t i = 0; i < inputSet.size(); i++) { bool new_element = true; for (uint32_t j = 0; j < setInputSet.size(); j++) { if (setInputSet[j] == inputSet[i]) { new_element = false; } } if (new_element) { setInputSet.push_back(inputSet[i]); } } return setInputSet; } template Vector vec_to_set(Vector input_set) { Vector set_input_set; for (int i = 0; i < input_set.size(); i++) { bool new_element = true; for (int j = 0; j < set_input_set.size(); j++) { if (set_input_set[j] == input_set[i]) { new_element = false; } } if (new_element) { set_input_set.push_back(input_set[i]); } } return set_input_set; } Ref vec_to_setnv(const Ref &input_set) { Vector set_input_set; for (int i = 0; i < input_set->size(); i++) { bool new_element = true; for (int j = 0; j < set_input_set.size(); j++) { if (set_input_set[j] == input_set->element_get(i)) { new_element = false; } } if (new_element) { set_input_set.push_back(input_set->element_get(i)); } } Ref ret; ret.instance(); ret->set_from_vector(set_input_set); return ret; } void load_default_suffixes(); void load_default_stop_words(); Vector suffixes; Vector stop_words; protected: static void _bind_methods(); }; #endif /* Data_hpp */