From 5ad25ad918e765781a70adb3782f0101bc72f989 Mon Sep 17 00:00:00 2001 From: Relintai Date: Thu, 16 Feb 2023 21:07:31 +0100 Subject: [PATCH] Now MLPPBernoulliNB uses engine classes. --- mlpp/bernoulli_nb/bernoulli_nb.cpp | 121 ++++++++++++++++------------- mlpp/bernoulli_nb/bernoulli_nb.h | 22 +++--- mlpp/data/data.h | 45 +++++++++++ mlpp/lin_alg/lin_alg.cpp | 23 +++++- mlpp/lin_alg/lin_alg.h | 3 +- test/mlpp_tests.cpp | 4 +- 6 files changed, 146 insertions(+), 72 deletions(-) diff --git a/mlpp/bernoulli_nb/bernoulli_nb.cpp b/mlpp/bernoulli_nb/bernoulli_nb.cpp index 4f8a691..563f280 100644 --- a/mlpp/bernoulli_nb/bernoulli_nb.cpp +++ b/mlpp/bernoulli_nb/bernoulli_nb.cpp @@ -12,41 +12,51 @@ #include #include -std::vector MLPPBernoulliNB::model_set_test(std::vector> X) { - std::vector y_hat; - for (uint32_t i = 0; i < X.size(); i++) { - y_hat.push_back(model_test(X[i])); +Ref MLPPBernoulliNB::model_set_test(const Ref &X) { + Ref y_hat; + y_hat.instance(); + y_hat->resize(X->size().y); + + Ref x_row_tmp; + x_row_tmp.instance(); + x_row_tmp->resize(X->size().x); + + for (int i = 0; i < X->size().y; i++) { + X->get_row_into_mlpp_vector(i, x_row_tmp); + + y_hat->set_element(i, model_test(x_row_tmp)); } + return y_hat; } -real_t MLPPBernoulliNB::model_test(std::vector x) { +real_t MLPPBernoulliNB::model_test(const Ref &x) { real_t score_0 = 1; real_t score_1 = 1; - std::vector foundIndices; + Vector found_indices; - for (uint32_t j = 0; j < x.size(); j++) { - for (uint32_t k = 0; k < _vocab.size(); k++) { - if (x[j] == _vocab[k]) { - score_0 *= _theta[0][_vocab[k]]; - score_1 *= _theta[1][_vocab[k]]; + for (int j = 0; j < x->size(); j++) { + for (int k = 0; k < _vocab->size(); k++) { + if (x->get_element(j) == _vocab->get_element(k)) { + score_0 *= _theta[0][_vocab->get_element(k)]; + score_1 *= _theta[1][_vocab->get_element(k)]; - foundIndices.push_back(k); + found_indices.push_back(k); } } } - for (uint32_t i = 0; i < _vocab.size(); i++) { + for (int i = 0; i < _vocab->size(); i++) { bool found = false; - for (uint32_t j = 0; j < foundIndices.size(); j++) { - if (_vocab[i] == _vocab[foundIndices[j]]) { + for (int j = 0; j < found_indices.size(); j++) { + if (_vocab->get_element(i) == _vocab->get_element(found_indices[j])) { found = true; } } if (!found) { - score_0 *= 1 - _theta[0][_vocab[i]]; - score_1 *= 1 - _theta[1][_vocab[i]]; + score_0 *= 1 - _theta[0][_vocab->get_element(i)]; + score_1 *= 1 - _theta[1][_vocab->get_element(i)]; } } @@ -64,10 +74,11 @@ real_t MLPPBernoulliNB::model_test(std::vector x) { real_t MLPPBernoulliNB::score() { MLPPUtilities util; - return util.performance(_y_hat, _output_set); + + return util.performance_vec(_y_hat, _output_set); } -MLPPBernoulliNB::MLPPBernoulliNB(std::vector> p_input_set, std::vector p_output_set) { +MLPPBernoulliNB::MLPPBernoulliNB(const Ref &p_input_set, const Ref &p_output_set) { _input_set = p_input_set; _output_set = p_output_set; _class_num = 2; @@ -75,7 +86,9 @@ MLPPBernoulliNB::MLPPBernoulliNB(std::vector> p_input_set, s _prior_1 = 0; _prior_0 = 0; - _y_hat.resize(_output_set.size()); + _y_hat.instance(); + _y_hat->resize(_output_set->size()); + evaluate(); } @@ -89,7 +102,8 @@ MLPPBernoulliNB::~MLPPBernoulliNB() { void MLPPBernoulliNB::compute_vocab() { MLPPLinAlg alg; MLPPData data; - _vocab = data.vecToSet(alg.flatten(_input_set)); + + _vocab = data.vec_to_setnv(alg.flattenv(_input_set)); } void MLPPBernoulliNB::compute_theta() { @@ -98,43 +112,43 @@ void MLPPBernoulliNB::compute_theta() { // Setting all values in the hasmap by default to 0. for (int i = _class_num - 1; i >= 0; i--) { - for (uint32_t j = 0; j < _vocab.size(); j++) { - _theta[i][_vocab[j]] = 0; + for (int j = 0; j < _vocab->size(); j++) { + _theta.write[i][_vocab->get_element(j)] = 0; } } - for (uint32_t i = 0; i < _input_set.size(); i++) { - for (uint32_t j = 0; j < _input_set[0].size(); j++) { - _theta[_output_set[i]][_input_set[i][j]]++; + for (int i = 0; i < _input_set->size().y; i++) { + for (int j = 0; j < _input_set->size().x; j++) { + _theta.write[_output_set->get_element(i)][_input_set->get_element(i, j)]++; } } - for (uint32_t i = 0; i < _theta.size(); i++) { + for (int i = 0; i < _theta.size(); i++) { for (uint32_t j = 0; j < _theta[i].size(); j++) { if (i == 0) { - _theta[i][j] /= _prior_0 * _y_hat.size(); + _theta.write[i][j] /= _prior_0 * _y_hat->size(); } else { - _theta[i][j] /= _prior_1 * _y_hat.size(); + _theta.write[i][j] /= _prior_1 * _y_hat->size(); } } } } void MLPPBernoulliNB::evaluate() { - for (uint32_t i = 0; i < _output_set.size(); i++) { + for (int i = 0; i < _output_set->size(); i++) { // Pr(B | A) * Pr(A) real_t score_0 = 1; real_t score_1 = 1; real_t sum = 0; - for (uint32_t ii = 0; ii < _output_set.size(); ii++) { - if (_output_set[ii] == 1) { - sum += _output_set[ii]; + for (int ii = 0; ii < _output_set->size(); ii++) { + if (_output_set->get_element(ii) == 1) { + sum += 1; } } // Easy computation of priors, i.e. Pr(C_k) - _prior_1 = sum / _y_hat.size(); + _prior_1 = sum / _y_hat->size(); _prior_0 = 1 - _prior_1; // Evaluating Theta... @@ -143,47 +157,44 @@ void MLPPBernoulliNB::evaluate() { // Evaluating the vocab set... compute_vocab(); - std::vector foundIndices; + Vector found_indices; - for (uint32_t j = 0; j < _input_set.size(); j++) { - for (uint32_t k = 0; k < _vocab.size(); k++) { - if (_input_set[i][j] == _vocab[k]) { - score_0 += std::log(_theta[0][_vocab[k]]); - score_1 += std::log(_theta[1][_vocab[k]]); + for (int j = 0; j < _input_set->size().y; j++) { + for (int k = 0; k < _vocab->size(); k++) { + if (_input_set->get_element(i, j) == _vocab->get_element(k)) { + score_0 += Math::log(static_cast(_theta[0][_vocab->get_element(k)])); + score_1 += Math::log(static_cast(_theta[1][_vocab->get_element(k)])); - foundIndices.push_back(k); + found_indices.push_back(k); } } } - for (uint32_t ii = 0; ii < _vocab.size(); ii++) { + for (int ii = 0; ii < _vocab->size(); ii++) { bool found = false; - for (uint32_t j = 0; j < foundIndices.size(); j++) { - if (_vocab[ii] == _vocab[foundIndices[j]]) { + for (int j = 0; j < found_indices.size(); j++) { + if (_vocab->get_element(ii) == _vocab->get_element(found_indices[j])) { found = true; } } if (!found) { - score_0 += std::log(1 - _theta[0][_vocab[ii]]); - score_1 += std::log(1 - _theta[1][_vocab[ii]]); + score_0 += Math::log(1.0 - _theta[0][_vocab->get_element(ii)]); + score_1 += Math::log(1.0 - _theta[1][_vocab->get_element(ii)]); } } - score_0 += std::log(_prior_0); - score_1 += std::log(_prior_1); + score_0 += Math::log(_prior_0); + score_1 += Math::log(_prior_1); - score_0 = exp(score_0); - score_1 = exp(score_1); - - std::cout << score_0 << std::endl; - std::cout << score_1 << std::endl; + score_0 = Math::exp(score_0); + score_1 = Math::exp(score_1); // Assigning the traning example to a class if (score_0 > score_1) { - _y_hat[i] = 0; + _y_hat->set_element(i, 0); } else { - _y_hat[i] = 1; + _y_hat->set_element(i, 1); } } } diff --git a/mlpp/bernoulli_nb/bernoulli_nb.h b/mlpp/bernoulli_nb/bernoulli_nb.h index ec79e55..76de6cf 100644 --- a/mlpp/bernoulli_nb/bernoulli_nb.h +++ b/mlpp/bernoulli_nb/bernoulli_nb.h @@ -8,23 +8,25 @@ // Created by Marc Melikyan on 1/17/21. // +#include "core/containers/hash_map.h" +#include "core/containers/vector.h" #include "core/math/math_defs.h" #include "core/object/reference.h" -#include -#include +#include "../lin_alg/mlpp_matrix.h" +#include "../lin_alg/mlpp_vector.h" class MLPPBernoulliNB : public Reference { GDCLASS(MLPPBernoulliNB, Reference); public: - std::vector model_set_test(std::vector> X); - real_t model_test(std::vector x); + Ref model_set_test(const Ref &X); + real_t model_test(const Ref &x); real_t score(); - MLPPBernoulliNB(std::vector> p_input_set, std::vector p_output_set); + MLPPBernoulliNB(const Ref &p_input_set, const Ref &p_output_set); MLPPBernoulliNB(); ~MLPPBernoulliNB(); @@ -40,14 +42,14 @@ protected: real_t _prior_1; real_t _prior_0; - std::vector> _theta; - std::vector _vocab; + Vector> _theta; + Ref _vocab; int _class_num; // Datasets - std::vector> _input_set; - std::vector _output_set; - std::vector _y_hat; + Ref _input_set; + Ref _output_set; + Ref _y_hat; }; #endif /* BernoulliNB_hpp */ \ No newline at end of file diff --git a/mlpp/data/data.h b/mlpp/data/data.h index 6dbc396..af3e338 100644 --- a/mlpp/data/data.h +++ b/mlpp/data/data.h @@ -194,6 +194,51 @@ public: return setInputSet; } + template + Vector vec_to_set(Vector input_set) { + Vector set_input_set; + + for (int i = 0; i < input_set.size(); i++) { + bool new_element = true; + + for (int j = 0; j < set_input_set.size(); j++) { + if (set_input_set[j] == input_set[i]) { + new_element = false; + } + } + + if (new_element) { + set_input_set.push_back(input_set[i]); + } + } + + return set_input_set; + } + + Ref vec_to_setnv(const Ref &input_set) { + Vector set_input_set; + + for (int i = 0; i < input_set->size(); i++) { + bool new_element = true; + + for (int j = 0; j < set_input_set.size(); j++) { + if (set_input_set[j] == input_set->get_element(i)) { + new_element = false; + } + } + + if (new_element) { + set_input_set.push_back(input_set->get_element(i)); + } + } + + Ref ret; + ret.instance(); + ret->set_from_vector(set_input_set); + + return ret; + } + protected: static void _bind_methods(); }; diff --git a/mlpp/lin_alg/lin_alg.cpp b/mlpp/lin_alg/lin_alg.cpp index 1cfd29b..48e12e0 100644 --- a/mlpp/lin_alg/lin_alg.cpp +++ b/mlpp/lin_alg/lin_alg.cpp @@ -302,7 +302,7 @@ Ref MLPPLinAlg::kronecker_productm(const Ref &A, const R row.push_back(scalar_multiplynv(a_ptr[A->calculate_index(i, k)], row_tmp)); } - Ref flattened_row = flattenv(row); + Ref flattened_row = flattenvv(row); C->set_row_mlpp_vector(i * b_size.y + j, flattened_row); } @@ -1009,8 +1009,6 @@ Ref MLPPLinAlg::maxnvv(const Ref &a, const Refptr(); real_t *ret_ptr = ret->ptrw(); - real_t dist = 0; - for (int i = 0; i < a_size; i++) { real_t aa_i = aa[i]; real_t bb_i = ba[i]; @@ -1678,7 +1676,7 @@ std::vector MLPPLinAlg::flatten(std::vector> A) { return a; } -Ref MLPPLinAlg::flattenv(const Vector> &A) { +Ref MLPPLinAlg::flattenvv(const Vector> &A) { Ref a; a.instance(); @@ -1707,6 +1705,23 @@ Ref MLPPLinAlg::flattenv(const Vector> &A) { return a; } +Ref MLPPLinAlg::flattenv(const Ref &A) { + int data_size = A->data_size(); + + Ref res; + res.instance(); + res->resize(data_size); + + real_t *res_ptr = res->ptrw(); + const real_t *a_ptr = A->ptr(); + + for (int i = 0; i < data_size; ++i) { + res_ptr[i] = a_ptr[i]; + } + + return res; +} + std::vector MLPPLinAlg::solve(std::vector> A, std::vector b) { return mat_vec_mult(inverse(A), b); } diff --git a/mlpp/lin_alg/lin_alg.h b/mlpp/lin_alg/lin_alg.h index 26541d3..50da6d3 100644 --- a/mlpp/lin_alg/lin_alg.h +++ b/mlpp/lin_alg/lin_alg.h @@ -181,7 +181,8 @@ public: real_t sum_elements(std::vector> A); std::vector flatten(std::vector> A); - Ref flattenv(const Vector> &A); + Ref flattenvv(const Vector> &A); + Ref flattenv(const Ref &A); std::vector solve(std::vector> A, std::vector b); diff --git a/test/mlpp_tests.cpp b/test/mlpp_tests.cpp index 2cac3ae..1bb1e97 100644 --- a/test/mlpp_tests.cpp +++ b/test/mlpp_tests.cpp @@ -799,8 +799,8 @@ void MLPPTests::test_naive_bayes() { MLPPBernoulliNBOld BNBOld(alg.transpose(inputSet), outputSet); alg.printVector(BNBOld.modelSetTest(alg.transpose(inputSet))); - MLPPBernoulliNB BNB(alg.transpose(inputSet), outputSet); - alg.printVector(BNB.model_set_test(alg.transpose(inputSet))); + MLPPBernoulliNB BNB(alg.transposem(input_set), output_set); + PLOG_MSG(BNB.model_set_test(alg.transposem(input_set))->to_string()); MLPPGaussianNBOld GNBOld(alg.transpose(inputSet), outputSet, 2); alg.printVector(GNBOld.modelSetTest(alg.transpose(inputSet)));