diff --git a/mlpp/data/data.cpp b/mlpp/data/data.cpp index 1705f5c..a3cbbcd 100644 --- a/mlpp/data/data.cpp +++ b/mlpp/data/data.cpp @@ -15,6 +15,7 @@ #include "../lin_alg/lin_alg_old.h" #include "../softmax_net/softmax_net.h" #include "../stat/stat_old.h" +#include "data_old.h" #include #include @@ -430,9 +431,10 @@ std::tuple>, std::vector>> M const int ONE_HOT_NUM = 3; std::vector> inputSet; std::vector tempOutputSet; + MLPPDataOld d; setData(IRIS_SIZE, "/Users/marcmelikyan/Desktop/Data/Iris.csv", inputSet, tempOutputSet); - std::vector> outputSet = oneHotRep(tempOutputSet, ONE_HOT_NUM); + std::vector> outputSet = d.oneHotRep(tempOutputSet, ONE_HOT_NUM); return { inputSet, outputSet }; } @@ -441,9 +443,10 @@ std::tuple>, std::vector>> M const int ONE_HOT_NUM = 3; std::vector> inputSet; std::vector tempOutputSet; + MLPPDataOld d; setData(WINE_SIZE, "MLPP/Data/Datasets/Iris.csv", inputSet, tempOutputSet); - std::vector> outputSet = oneHotRep(tempOutputSet, ONE_HOT_NUM); + std::vector> outputSet = d.oneHotRep(tempOutputSet, ONE_HOT_NUM); return { inputSet, outputSet }; } @@ -452,9 +455,10 @@ std::tuple>, std::vector>> M const int ONE_HOT_NUM = 10; std::vector> inputSet; std::vector tempOutputSet; + MLPPDataOld d; setData(MNIST_SIZE, "MLPP/Data/Datasets/MnistTrain.csv", inputSet, tempOutputSet); - std::vector> outputSet = oneHotRep(tempOutputSet, ONE_HOT_NUM); + std::vector> outputSet = d.oneHotRep(tempOutputSet, ONE_HOT_NUM); return { inputSet, outputSet }; } @@ -463,9 +467,10 @@ std::tuple>, std::vector>> M const int ONE_HOT_NUM = 10; std::vector> inputSet; std::vector tempOutputSet; + MLPPDataOld d; setData(MNIST_SIZE, "MLPP/Data/Datasets/MnistTest.csv", inputSet, tempOutputSet); - std::vector> outputSet = oneHotRep(tempOutputSet, ONE_HOT_NUM); + std::vector> outputSet = d.oneHotRep(tempOutputSet, ONE_HOT_NUM); return { inputSet, outputSet }; } @@ -1117,80 +1122,40 @@ void MLPPData::setInputNames(std::string fileName, std::vector &inp dataFile.close(); } -std::vector> MLPPData::featureScaling(std::vector> X) { - MLPPLinAlgOld alg; - X = alg.transpose(X); - std::vector max_elements, min_elements; - max_elements.resize(X.size()); - min_elements.resize(X.size()); +Ref MLPPData::feature_scaling(const Ref &p_X) { + Ref X = p_X->transposen(); - for (uint32_t i = 0; i < X.size(); i++) { - max_elements[i] = alg.max(X[i]); - min_elements[i] = alg.min(X[i]); + Size2i x_size = X->size(); + + LocalVector max_elements; + LocalVector min_elements; + + max_elements.resize(x_size.y); + min_elements.resize(x_size.y); + + Ref row_tmp; + row_tmp.instance(); + row_tmp->resize(x_size.x); + + for (int i = 0; i < x_size.y; ++i) { + X->row_get_into_mlpp_vector(i, row_tmp); + + max_elements[i] = row_tmp->max_element(); + min_elements[i] = row_tmp->min_element(); } - for (uint32_t i = 0; i < X.size(); i++) { - for (uint32_t j = 0; j < X[i].size(); j++) { - X[i][j] = (X[i][j] - min_elements[i]) / (max_elements[i] - min_elements[i]); + for (int i = 0; i < x_size.y; i++) { + real_t maxe = max_elements[i]; + real_t mine = min_elements[i]; + + for (int j = 0; j < x_size.x; j++) { + real_t xij = X->element_get(i, j); + + X->element_set(i, j, (xij - mine) / (maxe - mine)); } } - return alg.transpose(X); -} -std::vector> MLPPData::meanNormalization(std::vector> X) { - MLPPLinAlgOld alg; - MLPPStatOld stat; - // (X_j - mu_j) / std_j, for every j - - X = meanCentering(X); - for (uint32_t i = 0; i < X.size(); i++) { - X[i] = alg.scalarMultiply(1 / stat.standardDeviation(X[i]), X[i]); - } - return X; -} - -std::vector> MLPPData::meanCentering(std::vector> X) { - MLPPStatOld stat; - for (uint32_t i = 0; i < X.size(); i++) { - real_t mean_i = stat.mean(X[i]); - for (uint32_t j = 0; j < X[i].size(); j++) { - X[i][j] -= mean_i; - } - } - return X; -} - -std::vector> MLPPData::oneHotRep(std::vector tempOutputSet, int n_class) { - std::vector> outputSet; - outputSet.resize(tempOutputSet.size()); - for (uint32_t i = 0; i < tempOutputSet.size(); i++) { - for (int j = 0; j <= n_class - 1; j++) { - if (tempOutputSet[i] == j) { - outputSet[i].push_back(1); - } else { - outputSet[i].push_back(0); - } - } - } - return outputSet; -} - -std::vector MLPPData::reverseOneHot(std::vector> tempOutputSet) { - std::vector outputSet; - //uint32_t n_class = tempOutputSet[0].size(); - for (uint32_t i = 0; i < tempOutputSet.size(); i++) { - int current_class = 1; - for (uint32_t j = 0; j < tempOutputSet[i].size(); j++) { - if (tempOutputSet[i][j] == 1) { - break; - } else { - current_class++; - } - } - outputSet.push_back(current_class); - } - - return outputSet; + return X->transposen(); } Ref MLPPData::mean_centering(const Ref &p_X) { @@ -1207,7 +1172,7 @@ Ref MLPPData::mean_centering(const Ref &p_X) { x_row_tmp->resize(x_size.x); for (int i = 0; i < x_size.y; ++i) { - X->row_get_into_mlpp_vector(i, x_row_tmp); + p_X->row_get_into_mlpp_vector(i, x_row_tmp); real_t mean_i = stat.meanv(x_row_tmp); @@ -1219,6 +1184,30 @@ Ref MLPPData::mean_centering(const Ref &p_X) { return X; } +Ref MLPPData::mean_normalization(const Ref &p_X) { + MLPPLinAlg alg; + MLPPStat stat; + + // (X_j - mu_j) / std_j, for every j + + Ref X = mean_centering(p_X); + Size2i x_size = X->size(); + + Ref x_row_tmp; + x_row_tmp.instance(); + x_row_tmp->resize(x_size.x); + + for (int i = 0; i < x_size.y; i++) { + X->row_get_into_mlpp_vector(i, x_row_tmp); + + x_row_tmp->scalar_multiply((real_t)1 / stat.standard_deviationv(x_row_tmp)); + + X->row_set_mlpp_vector(i, x_row_tmp); + } + + return X; +} + Ref MLPPData::one_hot_rep(const Ref &temp_output_set, int n_class) { ERR_FAIL_COND_V(!temp_output_set.is_valid(), Ref()); @@ -1243,6 +1232,24 @@ Ref MLPPData::one_hot_rep(const Ref &temp_output_set, in return output_set; } +std::vector MLPPData::reverseOneHot(std::vector> tempOutputSet) { + std::vector outputSet; + //uint32_t n_class = tempOutputSet[0].size(); + for (uint32_t i = 0; i < tempOutputSet.size(); i++) { + int current_class = 1; + for (uint32_t j = 0; j < tempOutputSet[i].size(); j++) { + if (tempOutputSet[i][j] == 1) { + break; + } else { + current_class++; + } + } + outputSet.push_back(current_class); + } + + return outputSet; +} + void MLPPData::load_default_suffixes() { // Our list of suffixes which we use to compare against suffixes = String("eer er ion ity ment ness or sion ship th able ible al ant ary ful ic ious ous ive less y ed en ing ize ise ly ward wise").split_spaces(); diff --git a/mlpp/data/data.h b/mlpp/data/data.h index bb8b7e1..565eef1 100644 --- a/mlpp/data/data.h +++ b/mlpp/data/data.h @@ -176,14 +176,11 @@ public: // Extra void setInputNames(std::string fileName, std::vector &inputNames); - std::vector> featureScaling(std::vector> X); - std::vector> meanNormalization(std::vector> X); - std::vector> meanCentering(std::vector> X); - std::vector> oneHotRep(std::vector tempOutputSet, int n_class); - std::vector reverseOneHot(std::vector> tempOutputSet); - + Ref feature_scaling(const Ref &X); Ref mean_centering(const Ref &X); + Ref mean_normalization(const Ref &X); Ref one_hot_rep(const Ref &temp_output_set, int n_class); + std::vector reverseOneHot(std::vector> tempOutputSet); template std::vector vecToSet(std::vector inputSet) { diff --git a/mlpp/pca/pca_old.cpp b/mlpp/pca/pca_old.cpp index 2f916cf..1cd3b1b 100644 --- a/mlpp/pca/pca_old.cpp +++ b/mlpp/pca/pca_old.cpp @@ -5,7 +5,7 @@ // #include "pca_old.h" -#include "../data/data.h" +#include "../data/data_old.h" #include "../lin_alg/lin_alg_old.h" #include @@ -19,7 +19,7 @@ MLPPPCAOld::MLPPPCAOld(std::vector> inputSet, int k) : std::vector> MLPPPCAOld::principalComponents() { MLPPLinAlgOld alg; - MLPPData data; + MLPPDataOld data; MLPPLinAlgOld::SVDResultOld svr_res = alg.SVD(alg.cov(inputSet)); X_normalized = data.meanCentering(inputSet); diff --git a/test/mlpp_tests.cpp b/test/mlpp_tests.cpp index 2c815d4..f8c00f9 100644 --- a/test/mlpp_tests.cpp +++ b/test/mlpp_tests.cpp @@ -978,20 +978,18 @@ void MLPPTests::test_nlp_and_data(bool ui) { PLOG_MSG("LSA:"); PLOG_MSG(data.lsa(text_archive2, 2)->to_string()); - /* - std::vector> inputSet = { { 1, 2 }, { 2, 3 }, { 3, 4 }, { 4, 5 }, { 5, 6 } }; - std::cout << "Feature Scaling Example:" << std::endl; - alg.printMatrix(data.featureScaling(inputSet)); - std::cout << std::endl; + std::vector> input_set_vec = { { 1, 2 }, { 2, 3 }, { 3, 4 }, { 4, 5 }, { 5, 6 } }; - std::cout << "Mean Centering Example:" << std::endl; - alg.printMatrix(data.meanCentering(inputSet)); - std::cout << std::endl; + Ref input_set = Ref(memnew(MLPPMatrix(input_set_vec))); - std::cout << "Mean Normalization Example:" << std::endl; - alg.printMatrix(data.meanNormalization(inputSet)); - std::cout << std::endl; - */ + PLOG_MSG("Feature Scaling Example:"); + PLOG_MSG(data.feature_scaling(input_set)->to_string()); + + PLOG_MSG("Mean Centering Example:"); + PLOG_MSG(data.mean_centering(input_set)->to_string()); + + PLOG_MSG("Mean Normalization Example:"); + PLOG_MSG(data.mean_normalization(input_set)->to_string()); } void MLPPTests::test_outlier_finder(bool ui) { MLPPLinAlg alg; diff --git a/test/mlpp_tests_old.cpp b/test/mlpp_tests_old.cpp index 8d1c23e..57173c4 100644 --- a/test/mlpp_tests_old.cpp +++ b/test/mlpp_tests_old.cpp @@ -400,7 +400,6 @@ void MLPPTestsOld::test_outlier_finder(bool ui) { MLPPLinAlgOld alg; // Outlier Finder - //std::vector inputSet = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 23554332523523 }; std::vector inputSet = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 23554332 }; MLPPOutlierFinderOld outlierFinderOld(2); // Any datapoint outside of 2 stds from the mean is marked as an outlier. alg.printVector(outlierFinderOld.modelTest(inputSet));