From 6f10a7f5563279c1feecaa91c5738db82ef85d0b Mon Sep 17 00:00:00 2001 From: Relintai Date: Sat, 30 Dec 2023 00:12:58 +0100 Subject: [PATCH] Cleanups to MLPPData. --- mlpp/data/data.cpp | 255 ++++----------------------------------------- mlpp/data/data.h | 24 ----- 2 files changed, 20 insertions(+), 259 deletions(-) diff --git a/mlpp/data/data.cpp b/mlpp/data/data.cpp index a3cbbcd..02e0e4e 100644 --- a/mlpp/data/data.cpp +++ b/mlpp/data/data.cpp @@ -12,9 +12,7 @@ #include "../lin_alg/lin_alg.h" #include "../stat/stat.h" -#include "../lin_alg/lin_alg_old.h" #include "../softmax_net/softmax_net.h" -#include "../stat/stat_old.h" #include "data_old.h" #include @@ -407,241 +405,9 @@ Array MLPPData::train_test_split_bind(const Ref &data, real_t t return arr; } -// Loading Datasets -std::tuple>, std::vector> MLPPData::loadBreastCancer() { - const int BREAST_CANCER_SIZE = 30; // k = 30 - std::vector> inputSet; - std::vector outputSet; - - setData(BREAST_CANCER_SIZE, "MLPP/Data/Datasets/BreastCancer.csv", inputSet, outputSet); - return { inputSet, outputSet }; -} - -std::tuple>, std::vector> MLPPData::loadBreastCancerSVC() { - const int BREAST_CANCER_SIZE = 30; // k = 30 - std::vector> inputSet; - std::vector outputSet; - - setData(BREAST_CANCER_SIZE, "MLPP/Data/Datasets/BreastCancerSVM.csv", inputSet, outputSet); - return { inputSet, outputSet }; -} - -std::tuple>, std::vector>> MLPPData::loadIris() { - const int IRIS_SIZE = 4; - const int ONE_HOT_NUM = 3; - std::vector> inputSet; - std::vector tempOutputSet; - MLPPDataOld d; - - setData(IRIS_SIZE, "/Users/marcmelikyan/Desktop/Data/Iris.csv", inputSet, tempOutputSet); - std::vector> outputSet = d.oneHotRep(tempOutputSet, ONE_HOT_NUM); - return { inputSet, outputSet }; -} - -std::tuple>, std::vector>> MLPPData::loadWine() { - const int WINE_SIZE = 4; - const int ONE_HOT_NUM = 3; - std::vector> inputSet; - std::vector tempOutputSet; - MLPPDataOld d; - - setData(WINE_SIZE, "MLPP/Data/Datasets/Iris.csv", inputSet, tempOutputSet); - std::vector> outputSet = d.oneHotRep(tempOutputSet, ONE_HOT_NUM); - return { inputSet, outputSet }; -} - -std::tuple>, std::vector>> MLPPData::loadMnistTrain() { - const int MNIST_SIZE = 784; - const int ONE_HOT_NUM = 10; - std::vector> inputSet; - std::vector tempOutputSet; - MLPPDataOld d; - - setData(MNIST_SIZE, "MLPP/Data/Datasets/MnistTrain.csv", inputSet, tempOutputSet); - std::vector> outputSet = d.oneHotRep(tempOutputSet, ONE_HOT_NUM); - return { inputSet, outputSet }; -} - -std::tuple>, std::vector>> MLPPData::loadMnistTest() { - const int MNIST_SIZE = 784; - const int ONE_HOT_NUM = 10; - std::vector> inputSet; - std::vector tempOutputSet; - MLPPDataOld d; - - setData(MNIST_SIZE, "MLPP/Data/Datasets/MnistTest.csv", inputSet, tempOutputSet); - std::vector> outputSet = d.oneHotRep(tempOutputSet, ONE_HOT_NUM); - return { inputSet, outputSet }; -} - -std::tuple>, std::vector> MLPPData::loadCaliforniaHousing() { - const int CALIFORNIA_HOUSING_SIZE = 13; // k = 30 - std::vector> inputSet; - std::vector outputSet; - - setData(CALIFORNIA_HOUSING_SIZE, "MLPP/Data/Datasets/CaliforniaHousing.csv", inputSet, outputSet); - return { inputSet, outputSet }; -} - -std::tuple, std::vector> MLPPData::loadFiresAndCrime() { - std::vector inputSet; // k is implicitly 1. - std::vector outputSet; - - setData("MLPP/Data/Datasets/FiresAndCrime.csv", inputSet, outputSet); - return { inputSet, outputSet }; -} - -// Note that inputs and outputs should be pairs (technically), but this -// implementation will separate them. (My implementation keeps them tied together.) -// Not yet sure whether this is intentional or not (or it's something like a compiler specific difference) -std::tuple>, std::vector>, std::vector>, std::vector>> MLPPData::trainTestSplit(std::vector> inputSet, std::vector> outputSet, real_t testSize) { - std::random_device rd; - std::default_random_engine generator(rd()); - - std::shuffle(inputSet.begin(), inputSet.end(), generator); // inputSet random shuffle - std::shuffle(outputSet.begin(), outputSet.end(), generator); // outputSet random shuffle) - - std::vector> inputTestSet; - std::vector> outputTestSet; - - int testInputNumber = testSize * inputSet.size(); // implicit usage of floor - int testOutputNumber = testSize * outputSet.size(); // implicit usage of floor - - for (int i = 0; i < testInputNumber; i++) { - inputTestSet.push_back(inputSet[i]); - inputSet.erase(inputSet.begin()); - } - - for (int i = 0; i < testOutputNumber; i++) { - outputTestSet.push_back(outputSet[i]); - outputSet.erase(outputSet.begin()); - } - - return { inputSet, outputSet, inputTestSet, outputTestSet }; -} - -// MULTIVARIATE SUPERVISED - -void MLPPData::setData(int k, std::string fileName, std::vector> &inputSet, std::vector &outputSet) { - MLPPLinAlgOld alg; - std::string inputTemp; - std::string outputTemp; - - inputSet.resize(k); - - std::ifstream dataFile(fileName); - if (!dataFile.is_open()) { - std::cout << fileName << " failed to open." << std::endl; - } - - std::string line; - while (std::getline(dataFile, line)) { - std::stringstream ss(line); - - for (int i = 0; i < k; i++) { - std::getline(ss, inputTemp, ','); - inputSet[i].push_back(std::stod(inputTemp)); - } - - std::getline(ss, outputTemp, ','); - outputSet.push_back(std::stod(outputTemp)); - } - inputSet = alg.transpose(inputSet); - dataFile.close(); -} - -void MLPPData::printData(std::vector inputName, std::string outputName, std::vector> inputSet, std::vector outputSet) { - MLPPLinAlgOld alg; - inputSet = alg.transpose(inputSet); - for (uint32_t i = 0; i < inputSet.size(); i++) { - std::cout << inputName[i] << std::endl; - for (uint32_t j = 0; j < inputSet[i].size(); j++) { - std::cout << inputSet[i][j] << std::endl; - } - } - - std::cout << outputName << std::endl; - for (uint32_t i = 0; i < outputSet.size(); i++) { - std::cout << outputSet[i] << std::endl; - } -} - -// UNSUPERVISED - -void MLPPData::setData(int k, std::string fileName, std::vector> &inputSet) { - MLPPLinAlgOld alg; - std::string inputTemp; - - inputSet.resize(k); - - std::ifstream dataFile(fileName); - if (!dataFile.is_open()) { - std::cout << fileName << " failed to open." << std::endl; - } - - std::string line; - while (std::getline(dataFile, line)) { - std::stringstream ss(line); - - for (int i = 0; i < k; i++) { - std::getline(ss, inputTemp, ','); - inputSet[i].push_back(std::stod(inputTemp)); - } - } - inputSet = alg.transpose(inputSet); - dataFile.close(); -} - -void MLPPData::printData(std::vector inputName, std::vector> inputSet) { - MLPPLinAlgOld alg; - inputSet = alg.transpose(inputSet); - for (uint32_t i = 0; i < inputSet.size(); i++) { - std::cout << inputName[i] << std::endl; - for (uint32_t j = 0; j < inputSet[i].size(); j++) { - std::cout << inputSet[i][j] << std::endl; - } - } -} - -// SIMPLE - -void MLPPData::setData(std::string fileName, std::vector &inputSet, std::vector &outputSet) { - std::string inputTemp, outputTemp; - - std::ifstream dataFile(fileName); - if (!dataFile.is_open()) { - std::cout << "The file failed to open." << std::endl; - } - - std::string line; - - while (std::getline(dataFile, line)) { - std::stringstream ss(line); - - std::getline(ss, inputTemp, ','); - std::getline(ss, outputTemp, ','); - - inputSet.push_back(std::stod(inputTemp)); - outputSet.push_back(std::stod(outputTemp)); - } - - dataFile.close(); -} - -void MLPPData::printData(std::string &inputName, std::string &outputName, std::vector &inputSet, std::vector &outputSet) { - std::cout << inputName << std::endl; - for (uint32_t i = 0; i < inputSet.size(); i++) { - std::cout << inputSet[i] << std::endl; - } - - std::cout << outputName << std::endl; - for (uint32_t i = 0; i < inputSet.size(); i++) { - std::cout << outputSet[i] << std::endl; - } -} - // Images std::vector> MLPPData::rgb2gray(std::vector>> input) { + /* std::vector> grayScale; grayScale.resize(input[0].size()); for (uint32_t i = 0; i < grayScale.size(); i++) { @@ -653,9 +419,13 @@ std::vector> MLPPData::rgb2gray(std::vector>(); } std::vector>> MLPPData::rgb2ycbcr(std::vector>> input) { + /* MLPPLinAlgOld alg; std::vector>> YCbCr; YCbCr = alg.resize(YCbCr, input); @@ -667,11 +437,15 @@ std::vector>> MLPPData::rgb2ycbcr(std::vector>>(); } // Conversion formulas available here: // https://www.rapidtables.com/convert/color/rgb-to-hsv.html std::vector>> MLPPData::rgb2hsv(std::vector>> input) { + /* MLPPLinAlgOld alg; std::vector>> HSV; HSV = alg.resize(HSV, input); @@ -710,23 +484,34 @@ std::vector>> MLPPData::rgb2hsv(std::vector>>(); } // http://machinethatsees.blogspot.com/2013/07/how-to-convert-rgb-to-xyz-or-vice-versa.html std::vector>> MLPPData::rgb2xyz(std::vector>> input) { + /* MLPPLinAlgOld alg; std::vector>> XYZ; XYZ = alg.resize(XYZ, input); std::vector> RGB2XYZ = { { 0.4124564, 0.3575761, 0.1804375 }, { 0.2126726, 0.7151522, 0.0721750 }, { 0.0193339, 0.1191920, 0.9503041 } }; return alg.vector_wise_tensor_product(input, RGB2XYZ); + */ + + return std::vector>>(); } std::vector>> MLPPData::xyz2rgb(std::vector>> input) { + /* MLPPLinAlgOld alg; std::vector>> XYZ; XYZ = alg.resize(XYZ, input); std::vector> RGB2XYZ = alg.inverse({ { 0.4124564, 0.3575761, 0.1804375 }, { 0.2126726, 0.7151522, 0.0721750 }, { 0.0193339, 0.1191920, 0.9503041 } }); return alg.vector_wise_tensor_product(input, RGB2XYZ); + */ + + return std::vector>>(); } // TEXT-BASED & NLP diff --git a/mlpp/data/data.h b/mlpp/data/data.h index 565eef1..d7cc9e3 100644 --- a/mlpp/data/data.h +++ b/mlpp/data/data.h @@ -106,30 +106,6 @@ public: SplitComplexData train_test_split(Ref data, real_t test_size); Array train_test_split_bind(const Ref &data, real_t test_size); - // Load Datasets - std::tuple>, std::vector> loadBreastCancer(); - std::tuple>, std::vector> loadBreastCancerSVC(); - std::tuple>, std::vector>> loadIris(); - std::tuple>, std::vector>> loadWine(); - std::tuple>, std::vector>> loadMnistTrain(); - std::tuple>, std::vector>> loadMnistTest(); - std::tuple>, std::vector> loadCaliforniaHousing(); - std::tuple, std::vector> loadFiresAndCrime(); - - std::tuple>, std::vector>, std::vector>, std::vector>> trainTestSplit(std::vector> inputSet, std::vector> outputSet, real_t testSize); - - // Supervised - void setData(int k, std::string fileName, std::vector> &inputSet, std::vector &outputSet); - void printData(std::vector inputName, std::string outputName, std::vector> inputSet, std::vector outputSet); - - // Unsupervised - void setData(int k, std::string fileName, std::vector> &inputSet); - void printData(std::vector inputName, std::vector> inputSet); - - // Simple - void setData(std::string fileName, std::vector &inputSet, std::vector &outputSet); - void printData(std::string &inputName, std::string &outputName, std::vector &inputSet, std::vector &outputSet); - // Images std::vector> rgb2gray(std::vector>> input); std::vector>> rgb2ycbcr(std::vector>> input);