Now MLPPBernoulliNB uses engine classes.

2025-02-18 19:34:20 +01:00 · 2023-02-16 21:07:31 +01:00 · 2023-02-16 21:07:31 +01:00 · 5ad25ad918
commit 5ad25ad918
parent 741475a4ab
6 changed files with 146 additions and 72 deletions
--- a/mlpp/bernoulli_nb/bernoulli_nb.cpp
+++ b/mlpp/bernoulli_nb/bernoulli_nb.cpp
@ -12,41 +12,51 @@
 #include <iostream>
 #include <random>

-std::vector<real_t> MLPPBernoulliNB::model_set_test(std::vector<std::vector<real_t>> X) {
-	std::vector<real_t> y_hat;
-	for (uint32_t i = 0; i < X.size(); i++) {
-		y_hat.push_back(model_test(X[i]));
+Ref<MLPPVector> MLPPBernoulliNB::model_set_test(const Ref<MLPPMatrix> &X) {
+	Ref<MLPPVector> y_hat;
+	y_hat.instance();
+	y_hat->resize(X->size().y);
+
+	Ref<MLPPVector> x_row_tmp;
+	x_row_tmp.instance();
+	x_row_tmp->resize(X->size().x);
+
+	for (int i = 0; i < X->size().y; i++) {
+		X->get_row_into_mlpp_vector(i, x_row_tmp);
+
+		y_hat->set_element(i, model_test(x_row_tmp));
 	}
+
 	return y_hat;
 }

-real_t MLPPBernoulliNB::model_test(std::vector<real_t> x) {
+real_t MLPPBernoulliNB::model_test(const Ref<MLPPVector> &x) {
 	real_t score_0 = 1;
 	real_t score_1 = 1;

-	std::vector<int> foundIndices;
+	Vector<int> found_indices;

-	for (uint32_t j = 0; j < x.size(); j++) {
-		for (uint32_t k = 0; k < _vocab.size(); k++) {
-			if (x[j] == _vocab[k]) {
-				score_0 *= _theta[0][_vocab[k]];
-				score_1 *= _theta[1][_vocab[k]];
+	for (int j = 0; j < x->size(); j++) {
+		for (int k = 0; k < _vocab->size(); k++) {
+			if (x->get_element(j) == _vocab->get_element(k)) {
+				score_0 *= _theta[0][_vocab->get_element(k)];
+				score_1 *= _theta[1][_vocab->get_element(k)];

-				foundIndices.push_back(k);
+				found_indices.push_back(k);
 			}
 		}
 	}

-	for (uint32_t i = 0; i < _vocab.size(); i++) {
+	for (int i = 0; i < _vocab->size(); i++) {
 		bool found = false;
-		for (uint32_t j = 0; j < foundIndices.size(); j++) {
-			if (_vocab[i] == _vocab[foundIndices[j]]) {
+		for (int j = 0; j < found_indices.size(); j++) {
+			if (_vocab->get_element(i) == _vocab->get_element(found_indices[j])) {
 				found = true;
 			}
 		}
 		if (!found) {
-			score_0 *= 1 - _theta[0][_vocab[i]];
-			score_1 *= 1 - _theta[1][_vocab[i]];
+			score_0 *= 1 - _theta[0][_vocab->get_element(i)];
+			score_1 *= 1 - _theta[1][_vocab->get_element(i)];
 		}
 	}

@ -64,10 +74,11 @@ real_t MLPPBernoulliNB::model_test(std::vector<real_t> x) {

 real_t MLPPBernoulliNB::score() {
 	MLPPUtilities util;
-	return util.performance(_y_hat, _output_set);
+
+	return util.performance_vec(_y_hat, _output_set);
 }

-MLPPBernoulliNB::MLPPBernoulliNB(std::vector<std::vector<real_t>> p_input_set, std::vector<real_t> p_output_set) {
+MLPPBernoulliNB::MLPPBernoulliNB(const Ref<MLPPMatrix> &p_input_set, const Ref<MLPPVector> &p_output_set) {
 	_input_set = p_input_set;
 	_output_set = p_output_set;
 	_class_num = 2;
@ -75,7 +86,9 @@ MLPPBernoulliNB::MLPPBernoulliNB(std::vector<std::vector<real_t>> p_input_set, s
 	_prior_1 = 0;
 	_prior_0 = 0;

-	_y_hat.resize(_output_set.size());
+	_y_hat.instance();
+	_y_hat->resize(_output_set->size());
+
 	evaluate();
 }

@ -89,7 +102,8 @@ MLPPBernoulliNB::~MLPPBernoulliNB() {
 void MLPPBernoulliNB::compute_vocab() {
 	MLPPLinAlg alg;
 	MLPPData data;
-	_vocab = data.vecToSet<real_t>(alg.flatten(_input_set));
+
+	_vocab = data.vec_to_setnv(alg.flattenv(_input_set));
 }

 void MLPPBernoulliNB::compute_theta() {
@ -98,43 +112,43 @@ void MLPPBernoulliNB::compute_theta() {

 	// Setting all values in the hashmap by default to 0.
 	for (int i = _class_num - 1; i >= 0; i--) {
-		for (uint32_t j = 0; j < _vocab.size(); j++) {
-			_theta[i][_vocab[j]] = 0;
+		for (int j = 0; j < _vocab->size(); j++) {
+			_theta.write[i][_vocab->get_element(j)] = 0;
 		}
 	}

-	for (uint32_t i = 0; i < _input_set.size(); i++) {
-		for (uint32_t j = 0; j < _input_set[0].size(); j++) {
-			_theta[_output_set[i]][_input_set[i][j]]++;
+	for (int i = 0; i < _input_set->size().y; i++) {
+		for (int j = 0; j < _input_set->size().x; j++) {
+			_theta.write[_output_set->get_element(i)][_input_set->get_element(i, j)]++;
 		}
 	}

-	for (uint32_t i = 0; i < _theta.size(); i++) {
+	for (int i = 0; i < _theta.size(); i++) {
 		for (uint32_t j = 0; j < _theta[i].size(); j++) {
 			if (i == 0) {
-				_theta[i][j] /= _prior_0 * _y_hat.size();
+				_theta.write[i][j] /= _prior_0 * _y_hat->size();
 			} else {
-				_theta[i][j] /= _prior_1 * _y_hat.size();
+				_theta.write[i][j] /= _prior_1 * _y_hat->size();
 			}
 		}
 	}
 }

 void MLPPBernoulliNB::evaluate() {
-	for (uint32_t i = 0; i < _output_set.size(); i++) {
+	for (int i = 0; i < _output_set->size(); i++) {
 		// Pr(B | A) * Pr(A)
 		real_t score_0 = 1;
 		real_t score_1 = 1;

 		real_t sum = 0;
-		for (uint32_t ii = 0; ii < _output_set.size(); ii++) {
-			if (_output_set[ii] == 1) {
-				sum += _output_set[ii];
+		for (int ii = 0; ii < _output_set->size(); ii++) {
+			if (_output_set->get_element(ii) == 1) {
+				sum += 1;
 			}
 		}

 		// Easy computation of priors, i.e. Pr(C_k)
-		_prior_1 = sum / _y_hat.size();
+		_prior_1 = sum / _y_hat->size();
 		_prior_0 = 1 - _prior_1;

 		// Evaluating Theta...
@ -143,47 +157,44 @@ void MLPPBernoulliNB::evaluate() {
 		// Evaluating the vocab set...
 		compute_vocab();

-		std::vector<int> foundIndices;
+		Vector<int> found_indices;

-		for (uint32_t j = 0; j < _input_set.size(); j++) {
-			for (uint32_t k = 0; k < _vocab.size(); k++) {
-				if (_input_set[i][j] == _vocab[k]) {
-					score_0 += std::log(_theta[0][_vocab[k]]);
-					score_1 += std::log(_theta[1][_vocab[k]]);
+		for (int j = 0; j < _input_set->size().y; j++) {
+			for (int k = 0; k < _vocab->size(); k++) {
+				if (_input_set->get_element(i, j) == _vocab->get_element(k)) {
+					score_0 += Math::log(static_cast<real_t>(_theta[0][_vocab->get_element(k)]));
+					score_1 += Math::log(static_cast<real_t>(_theta[1][_vocab->get_element(k)]));

-					foundIndices.push_back(k);
+					found_indices.push_back(k);
 				}
 			}
 		}

-		for (uint32_t ii = 0; ii < _vocab.size(); ii++) {
+		for (int ii = 0; ii < _vocab->size(); ii++) {
 			bool found = false;
-			for (uint32_t j = 0; j < foundIndices.size(); j++) {
-				if (_vocab[ii] == _vocab[foundIndices[j]]) {
+			for (int j = 0; j < found_indices.size(); j++) {
+				if (_vocab->get_element(ii) == _vocab->get_element(found_indices[j])) {
 					found = true;
 				}
 			}
 			if (!found) {
-				score_0 += std::log(1 - _theta[0][_vocab[ii]]);
-				score_1 += std::log(1 - _theta[1][_vocab[ii]]);
+				score_0 += Math::log(1.0 - _theta[0][_vocab->get_element(ii)]);
+				score_1 += Math::log(1.0 - _theta[1][_vocab->get_element(ii)]);
 			}
 		}

-		score_0 += std::log(_prior_0);
-		score_1 += std::log(_prior_1);
+		score_0 += Math::log(_prior_0);
+		score_1 += Math::log(_prior_1);

-		score_0 = exp(score_0);
-		score_1 = exp(score_1);
-
-		std::cout << score_0 << std::endl;
-		std::cout << score_1 << std::endl;
+		score_0 = Math::exp(score_0);
+		score_1 = Math::exp(score_1);

 		// Assigning the training example to a class

 		if (score_0 > score_1) {
-			_y_hat[i] = 0;
+			_y_hat->set_element(i, 0);
 		} else {
-			_y_hat[i] = 1;
+			_y_hat->set_element(i, 1);
 		}
 	}
 }
--- a/mlpp/bernoulli_nb/bernoulli_nb.h
+++ b/mlpp/bernoulli_nb/bernoulli_nb.h
@ -8,23 +8,25 @@
 //  Created by Marc Melikyan on 1/17/21.
 //

+#include "core/containers/hash_map.h"
+#include "core/containers/vector.h"
 #include "core/math/math_defs.h"

 #include "core/object/reference.h"

-#include <map>
-#include <vector>
+#include "../lin_alg/mlpp_matrix.h"
+#include "../lin_alg/mlpp_vector.h"

 class MLPPBernoulliNB : public Reference {
 	GDCLASS(MLPPBernoulliNB, Reference);

 public:
-	std::vector<real_t> model_set_test(std::vector<std::vector<real_t>> X);
-	real_t model_test(std::vector<real_t> x);
+	Ref<MLPPVector> model_set_test(const Ref<MLPPMatrix> &X);
+	real_t model_test(const Ref<MLPPVector> &x);

 	real_t score();

-	MLPPBernoulliNB(std::vector<std::vector<real_t>> p_input_set, std::vector<real_t> p_output_set);
+	MLPPBernoulliNB(const Ref<MLPPMatrix> &p_input_set, const Ref<MLPPVector> &p_output_set);

 	MLPPBernoulliNB();
 	~MLPPBernoulliNB();
@ -40,14 +42,14 @@ protected:
 	real_t _prior_1;
 	real_t _prior_0;

-	std::vector<std::map<real_t, int>> _theta;
-	std::vector<real_t> _vocab;
+	Vector<HashMap<real_t, int>> _theta;
+	Ref<MLPPVector> _vocab;
 	int _class_num;

 	// Datasets
-	std::vector<std::vector<real_t>> _input_set;
-	std::vector<real_t> _output_set;
-	std::vector<real_t> _y_hat;
+	Ref<MLPPMatrix> _input_set;
+	Ref<MLPPVector> _output_set;
+	Ref<MLPPVector> _y_hat;
 };

 #endif /* BernoulliNB_hpp */
--- a/mlpp/data/data.h
+++ b/mlpp/data/data.h
@ -194,6 +194,51 @@ public:
 		return setInputSet;
 	}

+	// Returns the distinct elements of input_set, each exactly once, in the
+	// order in which they first appear.
+	//
+	// Elements are compared with operator==. Every input element is checked
+	// against the values collected so far with a linear scan, so the cost is
+	// quadratic in the worst case.
+	//
+	// input_set: values to de-duplicate; an empty vector yields an empty result.
+	template <class T>
+	Vector<T> vec_to_set(Vector<T> input_set) {
+		Vector<T> set_input_set;
+
+		for (int i = 0; i < input_set.size(); i++) {
+			bool new_element = true;
+
+			for (int j = 0; j < set_input_set.size(); j++) {
+				if (set_input_set[j] == input_set[i]) {
+					new_element = false;
+					// A single match is conclusive; skip the rest of the scan.
+					break;
+				}
+			}
+
+			if (new_element) {
+				set_input_set.push_back(input_set[i]);
+			}
+		}
+
+		return set_input_set;
+	}
+
+	// Builds a new MLPPVector holding the distinct values of input_set, each
+	// exactly once, in the order in which they first appear.
+	//
+	// Values are compared with exact floating-point equality (operator==).
+	// Every input value is checked against the values collected so far with
+	// a linear scan.
+	//
+	// input_set: vector to de-duplicate. It is dereferenced without a null
+	//            check, so callers must pass a valid reference.
+	// Returns a freshly instanced vector filled via set_from_vector().
+	Ref<MLPPVector> vec_to_setnv(const Ref<MLPPVector> &input_set) {
+		Vector<real_t> set_input_set;
+
+		for (int i = 0; i < input_set->size(); i++) {
+			// Read the candidate once instead of on every inner iteration.
+			const real_t candidate = input_set->get_element(i);
+
+			bool new_element = true;
+
+			for (int j = 0; j < set_input_set.size(); j++) {
+				if (set_input_set[j] == candidate) {
+					new_element = false;
+					// A single match is conclusive; skip the rest of the scan.
+					break;
+				}
+			}
+
+			if (new_element) {
+				set_input_set.push_back(candidate);
+			}
+		}
+
+		Ref<MLPPVector> ret;
+		ret.instance();
+		ret->set_from_vector(set_input_set);
+
+		return ret;
+	}
+
 protected:
 	static void _bind_methods();
 };
--- a/mlpp/lin_alg/lin_alg.cpp
+++ b/mlpp/lin_alg/lin_alg.cpp
@ -302,7 +302,7 @@ Ref<MLPPMatrix> MLPPLinAlg::kronecker_productm(const Ref<MLPPMatrix> &A, const R
 				row.push_back(scalar_multiplynv(a_ptr[A->calculate_index(i, k)], row_tmp));
 			}

-			Ref<MLPPVector> flattened_row = flattenv(row);
+			Ref<MLPPVector> flattened_row = flattenvv(row);

 			C->set_row_mlpp_vector(i * b_size.y + j, flattened_row);
 		}
@ -1009,8 +1009,6 @@ Ref<MLPPVector> MLPPLinAlg::maxnvv(const Ref<MLPPVector> &a, const Ref<MLPPVecto
 	const real_t *ba = b->ptr();
 	real_t *ret_ptr = ret->ptrw();

-	real_t dist = 0;
-
 	for (int i = 0; i < a_size; i++) {
 		real_t aa_i = aa[i];
 		real_t bb_i = ba[i];
@ -1678,7 +1676,7 @@ std::vector<real_t> MLPPLinAlg::flatten(std::vector<std::vector<real_t>> A) {
 	return a;
 }

-Ref<MLPPVector> MLPPLinAlg::flattenv(const Vector<Ref<MLPPVector>> &A) {
+Ref<MLPPVector> MLPPLinAlg::flattenvv(const Vector<Ref<MLPPVector>> &A) {
 	Ref<MLPPVector> a;
 	a.instance();

@ -1707,6 +1705,23 @@ Ref<MLPPVector> MLPPLinAlg::flattenv(const Vector<Ref<MLPPVector>> &A) {
 	return a;
 }

+// Copies all A->data_size() elements of A, in the order they are laid out
+// behind A->ptr(), into a newly instanced vector of the same length.
+// A is dereferenced without a null check.
+Ref<MLPPVector> MLPPLinAlg::flattenv(const Ref<MLPPMatrix> &A) {
+	const int element_count = A->data_size();
+
+	Ref<MLPPVector> flat;
+	flat.instance();
+	flat->resize(element_count);
+
+	real_t *dst = flat->ptrw();
+	const real_t *src = A->ptr();
+
+	// Straight element-by-element copy of the backing storage.
+	int idx = 0;
+	while (idx < element_count) {
+		dst[idx] = src[idx];
+		++idx;
+	}
+
+	return flat;
+}
+
 std::vector<real_t> MLPPLinAlg::solve(std::vector<std::vector<real_t>> A, std::vector<real_t> b) {
 	return mat_vec_mult(inverse(A), b);
 }
--- a/mlpp/lin_alg/lin_alg.h
+++ b/mlpp/lin_alg/lin_alg.h
@ -181,7 +181,8 @@ public:
 	real_t sum_elements(std::vector<std::vector<real_t>> A);

 	std::vector<real_t> flatten(std::vector<std::vector<real_t>> A);
-	Ref<MLPPVector> flattenv(const Vector<Ref<MLPPVector>> &A);
+	Ref<MLPPVector> flattenvv(const Vector<Ref<MLPPVector>> &A);
+	Ref<MLPPVector> flattenv(const Ref<MLPPMatrix> &A);

 	std::vector<real_t> solve(std::vector<std::vector<real_t>> A, std::vector<real_t> b);

--- a/test/mlpp_tests.cpp
+++ b/test/mlpp_tests.cpp
@ -799,8 +799,8 @@ void MLPPTests::test_naive_bayes() {
 	MLPPBernoulliNBOld BNBOld(alg.transpose(inputSet), outputSet);
 	alg.printVector(BNBOld.modelSetTest(alg.transpose(inputSet)));

-	MLPPBernoulliNB BNB(alg.transpose(inputSet), outputSet);
-	alg.printVector(BNB.model_set_test(alg.transpose(inputSet)));
+	MLPPBernoulliNB BNB(alg.transposem(input_set), output_set);
+	PLOG_MSG(BNB.model_set_test(alg.transposem(input_set))->to_string());

 	MLPPGaussianNBOld GNBOld(alg.transpose(inputSet), outputSet, 2);
 	alg.printVector(GNBOld.modelSetTest(alg.transpose(inputSet)));