// // Data.cpp // MLP // // Created by Marc Melikyan on 11/4/20. // #include "data.h" #include "core/os/file_access.h" #include "../lin_alg/lin_alg.h" #include "../stat/stat.h" #include "../lin_alg/lin_alg_old.h" #include "../softmax_net/softmax_net.h" #include "../stat/stat_old.h" #include "data_old.h" #include #include #include #include #include #include Ref MLPPDataESimple::get_input() { return _input; } void MLPPDataESimple::set_input(const Ref &val) { _input = val; } Ref MLPPDataESimple::get_output() { return _output; } void MLPPDataESimple::set_output(const Ref &val) { _output = val; } void MLPPDataESimple::instance_data() { _input.instance(); _output.instance(); } void MLPPDataESimple::_bind_methods() { ClassDB::bind_method(D_METHOD("get_input"), &MLPPDataESimple::get_input); ClassDB::bind_method(D_METHOD("set_input", "val"), &MLPPDataESimple::set_input); ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "input", PROPERTY_HINT_RESOURCE_TYPE, "MLPPVector"), "set_input", "get_input"); ClassDB::bind_method(D_METHOD("get_output"), &MLPPDataESimple::get_input); ClassDB::bind_method(D_METHOD("set_output", "val"), &MLPPDataESimple::set_output); ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "output", PROPERTY_HINT_RESOURCE_TYPE, "MLPPVector"), "set_output", "get_output"); ClassDB::bind_method(D_METHOD("instance_data"), &MLPPDataESimple::instance_data); } Ref MLPPDataSimple::get_input() { return _input; } void MLPPDataSimple::set_input(const Ref &val) { _input = val; } Ref MLPPDataSimple::get_output() { return _output; } void MLPPDataSimple::set_output(const Ref &val) { _output = val; } void MLPPDataSimple::instance_data() { _input.instance(); _output.instance(); } void MLPPDataSimple::_bind_methods() { ClassDB::bind_method(D_METHOD("get_input"), &MLPPDataSimple::get_input); ClassDB::bind_method(D_METHOD("set_input", "val"), &MLPPDataSimple::set_input); ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "input", PROPERTY_HINT_RESOURCE_TYPE, "MLPPMatrix"), "set_input", "get_input"); ClassDB::bind_method(D_METHOD("get_output"), &MLPPDataSimple::get_input); ClassDB::bind_method(D_METHOD("set_output", "val"), &MLPPDataSimple::set_output); ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "output", PROPERTY_HINT_RESOURCE_TYPE, "MLPPVector"), "set_output", "get_output"); ClassDB::bind_method(D_METHOD("instance_data"), &MLPPDataSimple::instance_data); } Ref MLPPDataComplex::get_input() { return _input; } void MLPPDataComplex::set_input(const Ref &val) { _input = val; } Ref MLPPDataComplex::get_output() { return _output; } void MLPPDataComplex::set_output(const Ref &val) { _output = val; } void MLPPDataComplex::instance_data() { _input.instance(); _output.instance(); } void MLPPDataComplex::_bind_methods() { ClassDB::bind_method(D_METHOD("get_input"), &MLPPDataComplex::get_input); ClassDB::bind_method(D_METHOD("set_input", "val"), &MLPPDataComplex::set_input); ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "input", PROPERTY_HINT_RESOURCE_TYPE, "MLPPMatrix"), "set_input", "get_input"); ClassDB::bind_method(D_METHOD("get_output"), &MLPPDataComplex::get_input); ClassDB::bind_method(D_METHOD("set_output", "val"), &MLPPDataComplex::set_output); ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "output", PROPERTY_HINT_RESOURCE_TYPE, "MLPPMatrix"), "set_output", "get_output"); ClassDB::bind_method(D_METHOD("instance_data"), &MLPPDataComplex::instance_data); } // Loading Datasets Ref MLPPData::load_breast_cancer(const String &path) { const int BREAST_CANCER_SIZE = 30; // k = 30 Ref data; data.instance(); data->instance_data(); set_data_supervised(BREAST_CANCER_SIZE, path, data->get_input(), data->get_output()); return data; } Ref MLPPData::load_breast_cancer_svc(const String &path) { const int BREAST_CANCER_SIZE = 30; // k = 30 Ref data; data.instance(); data->instance_data(); set_data_supervised(BREAST_CANCER_SIZE, path, data->get_input(), data->get_output()); return data; } Ref MLPPData::load_iris(const String &path) { const int IRIS_SIZE = 4; const int ONE_HOT_NUM = 3; Ref temp_output_set; temp_output_set.instance(); Ref data; data.instance(); data->instance_data(); set_data_supervised(IRIS_SIZE, path, data->get_input(), temp_output_set); data->set_output(one_hot_rep(temp_output_set, ONE_HOT_NUM)); return data; } Ref MLPPData::load_wine(const String &path) { const int WINE_SIZE = 4; const int ONE_HOT_NUM = 3; Ref temp_output_set; temp_output_set.instance(); Ref data; data.instance(); data->instance_data(); set_data_supervised(WINE_SIZE, path, data->get_input(), temp_output_set); data->set_output(one_hot_rep(temp_output_set, ONE_HOT_NUM)); return data; } Ref MLPPData::load_mnist_train(const String &path) { const int MNIST_SIZE = 784; const int ONE_HOT_NUM = 10; Ref temp_output_set; temp_output_set.instance(); Ref data; data.instance(); data->instance_data(); set_data_supervised(MNIST_SIZE, path, data->get_input(), temp_output_set); data->set_output(one_hot_rep(temp_output_set, ONE_HOT_NUM)); return data; } Ref MLPPData::load_mnist_test(const String &path) { const int MNIST_SIZE = 784; const int ONE_HOT_NUM = 10; Ref temp_output_set; temp_output_set.instance(); Ref data; data.instance(); data->instance_data(); set_data_supervised(MNIST_SIZE, path, data->get_input(), temp_output_set); data->set_output(one_hot_rep(temp_output_set, ONE_HOT_NUM)); return data; } Ref MLPPData::load_california_housing(const String &path) { const int CALIFORNIA_HOUSING_SIZE = 13; // k = 30 Ref data; data.instance(); data->instance_data(); set_data_supervised(CALIFORNIA_HOUSING_SIZE, path, data->get_input(), data->get_output()); return data; } Ref MLPPData::load_fires_and_crime(const String &path) { // k is implicitly 1. Ref data; data.instance(); data->instance_data(); set_data_simple(path, data->get_input(), data->get_output()); return data; } // MULTIVARIATE SUPERVISED void MLPPData::set_data_supervised(int k, const String &file_name, Ref input_set, Ref output_set) { ERR_FAIL_COND(!input_set.is_valid() || !output_set.is_valid()); MLPPLinAlg alg; Vector> input_set_tmp; Vector output_set_tmp; FileAccess *file = FileAccess::open(file_name, FileAccess::READ); ERR_FAIL_COND(!file); while (!file->eof_reached()) { Vector ll = file->get_csv_line(); Vector row; for (int i = 0; i < k; ++i) { row.push_back(static_cast(ll[i].to_double())); } input_set_tmp.push_back(row); output_set_tmp.push_back(static_cast(ll[k].to_double())); } file->close(); memdelete(file); output_set->set_from_vector(output_set_tmp); input_set->set_from_vectors(input_set_tmp); } void MLPPData::set_data_unsupervised(int k, const String &file_name, Ref input_set) { ERR_FAIL_COND(!input_set.is_valid()); MLPPLinAlg alg; Vector> input_set_tmp; input_set_tmp.resize(k); FileAccess *file = FileAccess::open(file_name, FileAccess::READ); ERR_FAIL_COND(!file); while (!file->eof_reached()) { Vector ll = file->get_csv_line(); for (int i = 0; i < k; ++i) { input_set_tmp.write[i].push_back(static_cast(ll[i].to_double())); } } file->close(); memdelete(file); input_set->set_from_vectors(input_set_tmp); input_set = alg.transposenm(input_set); } void MLPPData::set_data_simple(const String &file_name, Ref input_set, Ref output_set) { ERR_FAIL_COND(!input_set.is_valid() || !output_set.is_valid()); FileAccess *file = FileAccess::open(file_name, FileAccess::READ); ERR_FAIL_COND(!file); Vector input_set_tmp; Vector output_set_tmp; while (!file->eof_reached()) { Vector ll = file->get_csv_line(); for (int i = 0; i < ll.size(); i += 2) { input_set_tmp.push_back(static_cast(ll[i].to_double())); output_set_tmp.push_back(static_cast(ll[i + 1].to_double())); } } file->close(); memdelete(file); input_set->set_from_vector(input_set_tmp); output_set->set_from_vector(output_set_tmp); } MLPPData::SplitComplexData MLPPData::train_test_split(Ref data, real_t test_size) { SplitComplexData res; res.train.instance(); res.train->instance_data(); res.test.instance(); res.test->instance_data(); ERR_FAIL_COND_V(!data.is_valid(), res); Ref orig_input = data->get_input(); Ref orig_output = data->get_output(); ERR_FAIL_COND_V(!orig_input.is_valid(), res); ERR_FAIL_COND_V(!orig_output.is_valid(), res); Size2i orig_input_size = orig_input->size(); Size2i orig_output_size = orig_output->size(); int is = MIN(orig_input_size.y, orig_output_size.y); Array indices; indices.resize(is); for (int i = 0; i < is; ++i) { indices[i] = i; } indices.shuffle(); Ref orig_input_row_tmp; orig_input_row_tmp.instance(); orig_input_row_tmp->resize(orig_input_size.x); Ref orig_output_row_tmp; orig_output_row_tmp.instance(); orig_output_row_tmp->resize(orig_output_size.x); int test_input_number = test_size * is; // implicit usage of floor Ref res_test_input = res.test->get_input(); Ref res_test_output = res.test->get_output(); res_test_input->resize(Size2i(orig_input_size.x, test_input_number)); res_test_output->resize(Size2i(orig_output_size.x, test_input_number)); for (int i = 0; i < test_input_number; ++i) { int index = indices[i]; orig_input->row_get_into_mlpp_vector(index, orig_input_row_tmp); orig_output->row_get_into_mlpp_vector(index, orig_output_row_tmp); res_test_input->row_set_mlpp_vector(i, orig_input_row_tmp); res_test_output->row_set_mlpp_vector(i, orig_output_row_tmp); } Ref res_train_input = res.train->get_input(); Ref res_train_output = res.train->get_output(); int train_input_number = is - test_input_number; res_train_input->resize(Size2i(orig_input_size.x, train_input_number)); res_train_output->resize(Size2i(orig_output_size.x, train_input_number)); for (int i = 0; i < train_input_number; ++i) { int index = indices[test_input_number + i]; orig_input->row_get_into_mlpp_vector(index, orig_input_row_tmp); orig_output->row_get_into_mlpp_vector(index, orig_output_row_tmp); res_train_input->row_set_mlpp_vector(i, orig_input_row_tmp); res_train_output->row_set_mlpp_vector(i, orig_output_row_tmp); } return res; } Array MLPPData::train_test_split_bind(const Ref &data, real_t test_size) { SplitComplexData res = train_test_split(data, test_size); Array arr; arr.push_back(res.train); arr.push_back(res.test); return arr; } // Loading Datasets std::tuple>, std::vector> MLPPData::loadBreastCancer() { const int BREAST_CANCER_SIZE = 30; // k = 30 std::vector> inputSet; std::vector outputSet; setData(BREAST_CANCER_SIZE, "MLPP/Data/Datasets/BreastCancer.csv", inputSet, outputSet); return { inputSet, outputSet }; } std::tuple>, std::vector> MLPPData::loadBreastCancerSVC() { const int BREAST_CANCER_SIZE = 30; // k = 30 std::vector> inputSet; std::vector outputSet; setData(BREAST_CANCER_SIZE, "MLPP/Data/Datasets/BreastCancerSVM.csv", inputSet, outputSet); return { inputSet, outputSet }; } std::tuple>, std::vector>> MLPPData::loadIris() { const int IRIS_SIZE = 4; const int ONE_HOT_NUM = 3; std::vector> inputSet; std::vector tempOutputSet; MLPPDataOld d; setData(IRIS_SIZE, "/Users/marcmelikyan/Desktop/Data/Iris.csv", inputSet, tempOutputSet); std::vector> outputSet = d.oneHotRep(tempOutputSet, ONE_HOT_NUM); return { inputSet, outputSet }; } std::tuple>, std::vector>> MLPPData::loadWine() { const int WINE_SIZE = 4; const int ONE_HOT_NUM = 3; std::vector> inputSet; std::vector tempOutputSet; MLPPDataOld d; setData(WINE_SIZE, "MLPP/Data/Datasets/Iris.csv", inputSet, tempOutputSet); std::vector> outputSet = d.oneHotRep(tempOutputSet, ONE_HOT_NUM); return { inputSet, outputSet }; } std::tuple>, std::vector>> MLPPData::loadMnistTrain() { const int MNIST_SIZE = 784; const int ONE_HOT_NUM = 10; std::vector> inputSet; std::vector tempOutputSet; MLPPDataOld d; setData(MNIST_SIZE, "MLPP/Data/Datasets/MnistTrain.csv", inputSet, tempOutputSet); std::vector> outputSet = d.oneHotRep(tempOutputSet, ONE_HOT_NUM); return { inputSet, outputSet }; } std::tuple>, std::vector>> MLPPData::loadMnistTest() { const int MNIST_SIZE = 784; const int ONE_HOT_NUM = 10; std::vector> inputSet; std::vector tempOutputSet; MLPPDataOld d; setData(MNIST_SIZE, "MLPP/Data/Datasets/MnistTest.csv", inputSet, tempOutputSet); std::vector> outputSet = d.oneHotRep(tempOutputSet, ONE_HOT_NUM); return { inputSet, outputSet }; } std::tuple>, std::vector> MLPPData::loadCaliforniaHousing() { const int CALIFORNIA_HOUSING_SIZE = 13; // k = 30 std::vector> inputSet; std::vector outputSet; setData(CALIFORNIA_HOUSING_SIZE, "MLPP/Data/Datasets/CaliforniaHousing.csv", inputSet, outputSet); return { inputSet, outputSet }; } std::tuple, std::vector> MLPPData::loadFiresAndCrime() { std::vector inputSet; // k is implicitly 1. std::vector outputSet; setData("MLPP/Data/Datasets/FiresAndCrime.csv", inputSet, outputSet); return { inputSet, outputSet }; } // Note that inputs and outputs should be pairs (technically), but this // implementation will separate them. (My implementation keeps them tied together.) // Not yet sure whether this is intentional or not (or it's something like a compiler specific difference) std::tuple>, std::vector>, std::vector>, std::vector>> MLPPData::trainTestSplit(std::vector> inputSet, std::vector> outputSet, real_t testSize) { std::random_device rd; std::default_random_engine generator(rd()); std::shuffle(inputSet.begin(), inputSet.end(), generator); // inputSet random shuffle std::shuffle(outputSet.begin(), outputSet.end(), generator); // outputSet random shuffle) std::vector> inputTestSet; std::vector> outputTestSet; int testInputNumber = testSize * inputSet.size(); // implicit usage of floor int testOutputNumber = testSize * outputSet.size(); // implicit usage of floor for (int i = 0; i < testInputNumber; i++) { inputTestSet.push_back(inputSet[i]); inputSet.erase(inputSet.begin()); } for (int i = 0; i < testOutputNumber; i++) { outputTestSet.push_back(outputSet[i]); outputSet.erase(outputSet.begin()); } return { inputSet, outputSet, inputTestSet, outputTestSet }; } // MULTIVARIATE SUPERVISED void MLPPData::setData(int k, std::string fileName, std::vector> &inputSet, std::vector &outputSet) { MLPPLinAlgOld alg; std::string inputTemp; std::string outputTemp; inputSet.resize(k); std::ifstream dataFile(fileName); if (!dataFile.is_open()) { std::cout << fileName << " failed to open." << std::endl; } std::string line; while (std::getline(dataFile, line)) { std::stringstream ss(line); for (int i = 0; i < k; i++) { std::getline(ss, inputTemp, ','); inputSet[i].push_back(std::stod(inputTemp)); } std::getline(ss, outputTemp, ','); outputSet.push_back(std::stod(outputTemp)); } inputSet = alg.transpose(inputSet); dataFile.close(); } void MLPPData::printData(std::vector inputName, std::string outputName, std::vector> inputSet, std::vector outputSet) { MLPPLinAlgOld alg; inputSet = alg.transpose(inputSet); for (uint32_t i = 0; i < inputSet.size(); i++) { std::cout << inputName[i] << std::endl; for (uint32_t j = 0; j < inputSet[i].size(); j++) { std::cout << inputSet[i][j] << std::endl; } } std::cout << outputName << std::endl; for (uint32_t i = 0; i < outputSet.size(); i++) { std::cout << outputSet[i] << std::endl; } } // UNSUPERVISED void MLPPData::setData(int k, std::string fileName, std::vector> &inputSet) { MLPPLinAlgOld alg; std::string inputTemp; inputSet.resize(k); std::ifstream dataFile(fileName); if (!dataFile.is_open()) { std::cout << fileName << " failed to open." << std::endl; } std::string line; while (std::getline(dataFile, line)) { std::stringstream ss(line); for (int i = 0; i < k; i++) { std::getline(ss, inputTemp, ','); inputSet[i].push_back(std::stod(inputTemp)); } } inputSet = alg.transpose(inputSet); dataFile.close(); } void MLPPData::printData(std::vector inputName, std::vector> inputSet) { MLPPLinAlgOld alg; inputSet = alg.transpose(inputSet); for (uint32_t i = 0; i < inputSet.size(); i++) { std::cout << inputName[i] << std::endl; for (uint32_t j = 0; j < inputSet[i].size(); j++) { std::cout << inputSet[i][j] << std::endl; } } } // SIMPLE void MLPPData::setData(std::string fileName, std::vector &inputSet, std::vector &outputSet) { std::string inputTemp, outputTemp; std::ifstream dataFile(fileName); if (!dataFile.is_open()) { std::cout << "The file failed to open." << std::endl; } std::string line; while (std::getline(dataFile, line)) { std::stringstream ss(line); std::getline(ss, inputTemp, ','); std::getline(ss, outputTemp, ','); inputSet.push_back(std::stod(inputTemp)); outputSet.push_back(std::stod(outputTemp)); } dataFile.close(); } void MLPPData::printData(std::string &inputName, std::string &outputName, std::vector &inputSet, std::vector &outputSet) { std::cout << inputName << std::endl; for (uint32_t i = 0; i < inputSet.size(); i++) { std::cout << inputSet[i] << std::endl; } std::cout << outputName << std::endl; for (uint32_t i = 0; i < inputSet.size(); i++) { std::cout << outputSet[i] << std::endl; } } // Images std::vector> MLPPData::rgb2gray(std::vector>> input) { std::vector> grayScale; grayScale.resize(input[0].size()); for (uint32_t i = 0; i < grayScale.size(); i++) { grayScale[i].resize(input[0][i].size()); } for (uint32_t i = 0; i < grayScale.size(); i++) { for (uint32_t j = 0; j < grayScale[i].size(); j++) { grayScale[i][j] = 0.299 * input[0][i][j] + 0.587 * input[1][i][j] + 0.114 * input[2][i][j]; } } return grayScale; } std::vector>> MLPPData::rgb2ycbcr(std::vector>> input) { MLPPLinAlgOld alg; std::vector>> YCbCr; YCbCr = alg.resize(YCbCr, input); for (uint32_t i = 0; i < YCbCr[0].size(); i++) { for (uint32_t j = 0; j < YCbCr[0][i].size(); j++) { YCbCr[0][i][j] = 0.299 * input[0][i][j] + 0.587 * input[1][i][j] + 0.114 * input[2][i][j]; YCbCr[1][i][j] = -0.169 * input[0][i][j] - 0.331 * input[1][i][j] + 0.500 * input[2][i][j]; YCbCr[2][i][j] = 0.500 * input[0][i][j] - 0.419 * input[1][i][j] - 0.081 * input[2][i][j]; } } return YCbCr; } // Conversion formulas available here: // https://www.rapidtables.com/convert/color/rgb-to-hsv.html std::vector>> MLPPData::rgb2hsv(std::vector>> input) { MLPPLinAlgOld alg; std::vector>> HSV; HSV = alg.resize(HSV, input); for (uint32_t i = 0; i < HSV[0].size(); i++) { for (uint32_t j = 0; j < HSV[0][i].size(); j++) { real_t rPrime = input[0][i][j] / 255; real_t gPrime = input[1][i][j] / 255; real_t bPrime = input[2][i][j] / 255; real_t cMax = alg.max({ rPrime, gPrime, bPrime }); real_t cMin = alg.min({ rPrime, gPrime, bPrime }); real_t delta = cMax - cMin; // H calculation. if (delta == 0) { HSV[0][i][j] = 0; } else { if (cMax == rPrime) { HSV[0][i][j] = 60 * fmod(((gPrime - bPrime) / delta), 6); } else if (cMax == gPrime) { HSV[0][i][j] = 60 * ((bPrime - rPrime) / delta + 2); } else { // cMax == bPrime HSV[0][i][j] = 60 * ((rPrime - gPrime) / delta + 6); } } // S calculation. if (cMax == 0) { HSV[1][i][j] = 0; } else { HSV[1][i][j] = delta / cMax; } // V calculation. HSV[2][i][j] = cMax; } } return HSV; } // http://machinethatsees.blogspot.com/2013/07/how-to-convert-rgb-to-xyz-or-vice-versa.html std::vector>> MLPPData::rgb2xyz(std::vector>> input) { MLPPLinAlgOld alg; std::vector>> XYZ; XYZ = alg.resize(XYZ, input); std::vector> RGB2XYZ = { { 0.4124564, 0.3575761, 0.1804375 }, { 0.2126726, 0.7151522, 0.0721750 }, { 0.0193339, 0.1191920, 0.9503041 } }; return alg.vector_wise_tensor_product(input, RGB2XYZ); } std::vector>> MLPPData::xyz2rgb(std::vector>> input) { MLPPLinAlgOld alg; std::vector>> XYZ; XYZ = alg.resize(XYZ, input); std::vector> RGB2XYZ = alg.inverse({ { 0.4124564, 0.3575761, 0.1804375 }, { 0.2126726, 0.7151522, 0.0721750 }, { 0.0193339, 0.1191920, 0.9503041 } }); return alg.vector_wise_tensor_product(input, RGB2XYZ); } // TEXT-BASED & NLP std::string MLPPData::toLower(std::string text) { for (uint32_t i = 0; i < text.size(); i++) { text[i] = tolower(text[i]); } return text; } std::vector MLPPData::split(std::string text) { std::vector split_data; for (uint32_t i = 0; i < text.size(); i++) { split_data.push_back(text[i]); } return split_data; } Vector MLPPData::split_sentences(String data) { Vector sentences; int start_index = 0; for (int i = 0; i < data.length() - 1; ++i) { if (data[i] == '.' && data[i + 1] != '.') { continue; } if (data[i] == '.') { sentences.push_back(data.substr_index(start_index, i)); start_index = i + 1; } } if (start_index != data.length() - 1) { sentences.push_back(data.substr_index(start_index, data.length() - 1)); } return sentences; } Vector MLPPData::remove_spaces(Vector data) { for (int i = 0; i < data.size(); i++) { data.write[i] = data[i].replace(" ", ""); } return data; } Vector MLPPData::remove_empty(Vector data) { for (int i = 0; i < data.size(); ++i) { if (data[i].empty()) { data.remove(i); } } return data; } Vector MLPPData::segment(String text) { Vector segmented_data; int prev_delim = 0; for (int i = 0; i < text.length(); i++) { if (text[i] == ' ') { segmented_data.push_back(text.substr(prev_delim, i - prev_delim)); prev_delim = i + 1; } else if (text[i] == ',' || text[i] == '!' || text[i] == '.' || text[i] == '-') { segmented_data.push_back(text.substr(prev_delim, i - prev_delim)); String punc; punc += text[i]; segmented_data.push_back(punc); prev_delim = i + 2; i++; } else if (i == text.length() - 1) { segmented_data.push_back(text.substr(prev_delim, text.length() - prev_delim)); // hehe oops- forgot this } } return segmented_data; } Vector MLPPData::tokenize(String text) { int max_num = 0; bool new_num = true; Vector segmented_data = segment(text); Vector tokenized_data; tokenized_data.resize(segmented_data.size()); for (int i = 0; i < segmented_data.size(); i++) { for (int j = i - 1; j >= 0; j--) { if (segmented_data[i] == segmented_data[j]) { tokenized_data.write[i] = tokenized_data[j]; new_num = false; } } if (!new_num) { new_num = true; } else { max_num++; tokenized_data.write[i] = max_num; } } return tokenized_data; } Vector MLPPData::remove_stop_words(String text) { Vector segmented_data = remove_spaces(segment(text.to_lower())); for (int i = 0; i < stop_words.size(); i++) { for (int j = 0; j < segmented_data.size(); j++) { if (segmented_data[j] == stop_words[i]) { segmented_data.remove(j); --j; } } } return segmented_data; } Vector MLPPData::remove_stop_words_vec(Vector segmented_data) { for (int i = 0; i < segmented_data.size(); i++) { for (int j = 0; j < stop_words.size(); j++) { if (segmented_data[i] == stop_words[j]) { segmented_data.remove(i); --i; } } } return segmented_data; } String MLPPData::stemming(String text) { int padding_size = 4; String padding = " "; // our padding text += String(padding).repeat(padding_size); // ' ' will be our padding value for (int i = 0; i < text.length(); i++) { for (int j = 0; j < suffixes.size(); j++) { if (text.substr(i, suffixes[j].length()) == suffixes[j] && (text[i + suffixes[j].length()] == ' ' || text[i + suffixes[j].length()] == ',' || text[i + suffixes[j].length()] == '-' || text[i + suffixes[j].length()] == '.' || text[i + suffixes[j].length()] == '!')) { text.erase(i, suffixes[j].length()); } } } return text; } Ref MLPPData::bag_of_words(Vector sentences, BagOfWordsType type) { /* STEPS OF BOW: 1) To lowercase (done by remove_stop_words function by def) 2) Removing stop words 3) Obtain a list of the used words 4) Create a one hot encoded vector of the words and sentences 5) Sentence.size() x list.size() matrix */ Vector word_list = remove_empty(remove_stop_words_vec(create_word_list(sentences))); Vector> segmented_sentences; segmented_sentences.resize(sentences.size()); for (int i = 0; i < sentences.size(); i++) { segmented_sentences.write[i] = remove_stop_words(sentences[i]); } Ref bow; bow.instance(); bow->resize(Size2i(word_list.size(), segmented_sentences.size())); bow->fill(0); for (int i = 0; i < segmented_sentences.size(); i++) { for (int j = 0; j < segmented_sentences[i].size(); j++) { for (int k = 0; k < word_list.size(); k++) { if (segmented_sentences[i][j] == word_list[k]) { if (type == BAG_OF_WORDS_TYPE_BINARY) { bow->element_set(i, k, 1); } else { bow->element_set(i, k, bow->element_get(i, k) + 1); } } } } } return bow; } Ref MLPPData::tfidf(Vector sentences) { Vector word_list = remove_empty(remove_stop_words_vec(create_word_list(sentences))); Vector> segmented_sentences; segmented_sentences.resize(sentences.size()); for (int i = 0; i < sentences.size(); i++) { segmented_sentences.write[i] = remove_stop_words(sentences[i]); } Ref TF; TF.instance(); TF->resize(Size2i(word_list.size(), segmented_sentences.size())); Vector frequency; frequency.resize(word_list.size()); frequency.fill(0); Ref TF_row; TF_row.instance(); TF_row->resize(word_list.size()); for (int i = 0; i < segmented_sentences.size(); i++) { Vector present; present.resize(word_list.size()); present.fill(false); for (int j = 0; j < segmented_sentences[i].size(); j++) { for (int k = 0; k < word_list.size(); k++) { if (segmented_sentences[i][j] == word_list[k]) { TF->element_set(i, k, TF->element_get(i, k) + 1); if (!present[k]) { frequency.write[k]++; present.write[k] = true; } } } } TF->row_get_into_mlpp_vector(i, TF_row); TF_row->scalar_multiply(real_t(1) / real_t(segmented_sentences[i].size())); TF->row_set_mlpp_vector(i, TF_row); } Vector IDF; IDF.resize(frequency.size()); for (int i = 0; i < IDF.size(); i++) { IDF.write[i] = Math::log((real_t)segmented_sentences.size() / (real_t)frequency[i]); } Ref TFIDF; TFIDF.instance(); Size2i tfidf_size = Size2i(word_list.size(), segmented_sentences.size()); TFIDF->resize(tfidf_size); for (int i = 0; i < tfidf_size.y; i++) { for (int j = 0; j < tfidf_size.x; j++) { TFIDF->element_set(i, j, TF->element_get(i, j) * IDF[j]); } } return TFIDF; } MLPPData::WordsToVecResult MLPPData::word_to_vec(Vector sentences, WordToVecType type, int windowSize, int dimension, real_t learning_rate, int max_epoch) { WordsToVecResult res; res.word_list = remove_empty(remove_stop_words_vec(create_word_list(sentences))); Vector> segmented_sentences; segmented_sentences.resize(sentences.size()); for (int i = 0; i < sentences.size(); i++) { segmented_sentences.write[i] = remove_stop_words(sentences[i]); } Vector inputStrings; Vector outputStrings; for (int i = 0; i < segmented_sentences.size(); i++) { for (int j = 0; j < segmented_sentences[i].size(); j++) { for (int k = windowSize; k > 0; k--) { int jmk = (int)j - k; if (jmk >= 0) { inputStrings.push_back(segmented_sentences[i][j]); outputStrings.push_back(segmented_sentences[i][jmk]); } if (j + k <= segmented_sentences[i].size() - 1) { inputStrings.push_back(segmented_sentences[i][j]); outputStrings.push_back(segmented_sentences[i][j + k]); } } } } int input_size = inputStrings.size(); inputStrings.append_array(outputStrings); Ref bow = bag_of_words(inputStrings, BAG_OF_WORDS_TYPE_BINARY); Size2i bow_size = bow->size(); Ref input_set; Ref output_set; input_set.instance(); output_set.instance(); input_set->resize(Size2i(bow_size.x, input_size)); Ref row_tmp; row_tmp.instance(); row_tmp->resize(bow_size.x); for (int i = 0; i < input_size; i++) { bow->row_get_into_mlpp_vector(i, row_tmp); input_set->row_set_mlpp_vector(i, row_tmp); } output_set->resize(Size2i(bow_size.x, bow_size.y - input_size)); Size2i output_set_size = output_set->size(); for (int i = 0; i < output_set_size.y; i++) { bow->row_get_into_mlpp_vector(i + input_size, row_tmp); input_set->row_set_mlpp_vector(i, row_tmp); } MLPPSoftmaxNet *model; if (type == WORD_TO_VEC_TYPE_SKIPGRAM) { model = memnew(MLPPSoftmaxNet(output_set, input_set, dimension)); } else { // else = CBOW. We maintain it is a default. model = memnew(MLPPSoftmaxNet(input_set, output_set, dimension)); } model->train_gradient_descent(learning_rate, max_epoch); res.word_embeddings = model->get_embeddings(); memdelete(model); return res; } Ref MLPPData::lsa(Vector sentences, int dim) { MLPPLinAlg alg; Ref doc_word_data = bag_of_words(sentences, BAG_OF_WORDS_TYPE_BINARY); MLPPLinAlg::SVDResult svr_res = alg.svd(doc_word_data); Ref S_trunc = alg.zeromatnm(dim, dim); Ref Vt_trunc; Vt_trunc.instance(); Vt_trunc->resize(Size2i(svr_res.Vt->size().x, dim)); Ref row_rmp; row_rmp.instance(); row_rmp->resize(svr_res.Vt->size().x); for (int i = 0; i < dim; i++) { S_trunc->element_set(i, i, svr_res.S->element_get(i, i)); svr_res.Vt->row_get_into_mlpp_vector(i, row_rmp); Vt_trunc->row_set_mlpp_vector(i, row_rmp); } Ref embeddings = S_trunc->multn(Vt_trunc); return embeddings; } Vector MLPPData::create_word_list(Vector sentences) { String combined_text = ""; for (int i = 0; i < sentences.size(); i++) { if (i != 0) { combined_text += " "; } combined_text += sentences[i]; } return remove_spaces(vec_to_set(remove_stop_words(combined_text))); } // EXTRA void MLPPData::setInputNames(std::string fileName, std::vector &inputNames) { std::string inputNameTemp; std::ifstream dataFile(fileName); if (!dataFile.is_open()) { std::cout << fileName << " failed to open." << std::endl; } while (std::getline(dataFile, inputNameTemp)) { inputNames.push_back(inputNameTemp); } dataFile.close(); } Ref MLPPData::feature_scaling(const Ref &p_X) { Ref X = p_X->transposen(); Size2i x_size = X->size(); LocalVector max_elements; LocalVector min_elements; max_elements.resize(x_size.y); min_elements.resize(x_size.y); Ref row_tmp; row_tmp.instance(); row_tmp->resize(x_size.x); for (int i = 0; i < x_size.y; ++i) { X->row_get_into_mlpp_vector(i, row_tmp); max_elements[i] = row_tmp->max_element(); min_elements[i] = row_tmp->min_element(); } for (int i = 0; i < x_size.y; i++) { real_t maxe = max_elements[i]; real_t mine = min_elements[i]; for (int j = 0; j < x_size.x; j++) { real_t xij = X->element_get(i, j); X->element_set(i, j, (xij - mine) / (maxe - mine)); } } return X->transposen(); } Ref MLPPData::mean_centering(const Ref &p_X) { MLPPStat stat; Ref X; X.instance(); X->resize(p_X->size()); Size2i x_size = X->size(); Ref x_row_tmp; x_row_tmp.instance(); x_row_tmp->resize(x_size.x); for (int i = 0; i < x_size.y; ++i) { p_X->row_get_into_mlpp_vector(i, x_row_tmp); real_t mean_i = stat.meanv(x_row_tmp); for (int j = 0; j < x_size.x; ++j) { X->element_set(i, j, p_X->element_get(i, j) - mean_i); } } return X; } Ref MLPPData::mean_normalization(const Ref &p_X) { MLPPLinAlg alg; MLPPStat stat; // (X_j - mu_j) / std_j, for every j Ref X = mean_centering(p_X); Size2i x_size = X->size(); Ref x_row_tmp; x_row_tmp.instance(); x_row_tmp->resize(x_size.x); for (int i = 0; i < x_size.y; i++) { X->row_get_into_mlpp_vector(i, x_row_tmp); x_row_tmp->scalar_multiply((real_t)1 / stat.standard_deviationv(x_row_tmp)); X->row_set_mlpp_vector(i, x_row_tmp); } return X; } Ref MLPPData::one_hot_rep(const Ref &temp_output_set, int n_class) { ERR_FAIL_COND_V(!temp_output_set.is_valid(), Ref()); Ref output_set; output_set.instance(); int temp_output_set_size = temp_output_set->size(); const real_t *temp_output_set_ptr = temp_output_set->ptr(); output_set->resize(Size2i(n_class, temp_output_set_size)); for (int i = 0; i < temp_output_set_size; ++i) { for (int j = 0; j <= n_class - 1; ++j) { if (static_cast(temp_output_set_ptr[i]) == j) { output_set->element_set(i, j, 1); } else { output_set->element_set(i, j, 0); } } } return output_set; } std::vector MLPPData::reverseOneHot(std::vector> tempOutputSet) { std::vector outputSet; //uint32_t n_class = tempOutputSet[0].size(); for (uint32_t i = 0; i < tempOutputSet.size(); i++) { int current_class = 1; for (uint32_t j = 0; j < tempOutputSet[i].size(); j++) { if (tempOutputSet[i][j] == 1) { break; } else { current_class++; } } outputSet.push_back(current_class); } return outputSet; } void MLPPData::load_default_suffixes() { // Our list of suffixes which we use to compare against suffixes = String("eer er ion ity ment ness or sion ship th able ible al ant ary ful ic ious ous ive less y ed en ing ize ise ly ward wise").split_spaces(); } void MLPPData::load_default_stop_words() { stop_words = String("i me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it its itself they them their theirs themselves what which who whom this that these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don should now").split_spaces(); } void MLPPData::_bind_methods() { ClassDB::bind_method(D_METHOD("load_breast_cancer", "path"), &MLPPData::load_breast_cancer); ClassDB::bind_method(D_METHOD("load_breast_cancer_svc", "path"), &MLPPData::load_breast_cancer_svc); ClassDB::bind_method(D_METHOD("load_iris", "path"), &MLPPData::load_iris); ClassDB::bind_method(D_METHOD("load_wine", "path"), &MLPPData::load_wine); ClassDB::bind_method(D_METHOD("load_mnist_train", "path"), &MLPPData::load_mnist_train); ClassDB::bind_method(D_METHOD("load_mnist_test", "path"), &MLPPData::load_mnist_test); ClassDB::bind_method(D_METHOD("load_california_housing", "path"), &MLPPData::load_california_housing); ClassDB::bind_method(D_METHOD("load_fires_and_crime", "path"), &MLPPData::load_fires_and_crime); ClassDB::bind_method(D_METHOD("train_test_split", "data", "test_size"), &MLPPData::train_test_split_bind); }