Added new optimizers for Neural Nets.

This commit is contained in:
novak_99 2022-01-18 22:26:57 -08:00
parent a4c36293f9
commit 2c83feb410
9 changed files with 408 additions and 347 deletions

View File

@@ -114,6 +114,157 @@ namespace MLPP {
forwardPass();
}
void ANN::Momentum(double learning_rate, int max_epoch, int mini_batch_size, double gamma, bool NAG, bool UI){
class Cost cost;
LinAlg alg;
double cost_prev = 0;
int epoch = 1;
// Creating the mini-batches
int n_mini_batch = n/mini_batch_size;
// Always evaluate the result per mini-batch; do the full forward pass only ONCE, at the end.
auto [inputMiniBatches, outputMiniBatches] = Utilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
// Initializing necessary components for Momentum.
std::vector<std::vector<std::vector<double>>> v_hidden;
std::vector<double> v_output;
while(true){
for(int i = 0; i < n_mini_batch; i++){
std::vector<double> y_hat = modelSetTest(inputMiniBatches[i]);
cost_prev = Cost(y_hat, outputMiniBatches[i]);
auto [cumulativeHiddenLayerWGrad, outputWGrad] = computeGradients(y_hat, outputMiniBatches[i]);
if(!network.empty() && v_hidden.empty()){ // Initializing our tensor
v_hidden = alg.resize(v_hidden, cumulativeHiddenLayerWGrad);
}
if(v_output.empty()){
v_output.resize(outputWGrad.size());
}
if(NAG){ // "A posteriori" calculation
updateParameters(v_hidden, v_output, 0); // DON'T update bias.
}
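// Classical momentum: v <- gamma * v + (learning_rate / n) * grad; the parameters are then shifted by v below.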
v_hidden = alg.addition(alg.scalarMultiply(gamma, v_hidden), alg.scalarMultiply(learning_rate/n, cumulativeHiddenLayerWGrad));
v_output = alg.addition(alg.scalarMultiply(gamma, v_output), alg.scalarMultiply(learning_rate/n, outputWGrad));
updateParameters(v_hidden, v_output, learning_rate); // subject to change. may want bias to have this matrix too.
y_hat = modelSetTest(inputMiniBatches[i]);
if(UI) { ANN::UI(epoch, cost_prev, y_hat, outputMiniBatches[i]); }
}
epoch++;
if(epoch > max_epoch) { break; }
}
forwardPass();
}
void ANN::Adagrad(double learning_rate, int max_epoch, int mini_batch_size, double e, bool UI){
class Cost cost;
LinAlg alg;
double cost_prev = 0;
int epoch = 1;
// Creating the mini-batches
int n_mini_batch = n/mini_batch_size;
// Always evaluate the result per mini-batch; do the full forward pass only ONCE, at the end.
auto [inputMiniBatches, outputMiniBatches] = Utilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
// Initializing necessary components for Adagrad.
std::vector<std::vector<std::vector<double>>> v_hidden;
std::vector<double> v_output;
while(true){
for(int i = 0; i < n_mini_batch; i++){
std::vector<double> y_hat = modelSetTest(inputMiniBatches[i]);
cost_prev = Cost(y_hat, outputMiniBatches[i]);
auto [cumulativeHiddenLayerWGrad, outputWGrad] = computeGradients(y_hat, outputMiniBatches[i]);
if(!network.empty() && v_hidden.empty()){ // Initializing our tensor
v_hidden = alg.resize(v_hidden, cumulativeHiddenLayerWGrad);
}
if(v_output.empty()){
v_output.resize(outputWGrad.size());
}
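// Adagrad: accumulate the squared gradients in v, then scale each gradient by learning_rate / (n * (sqrt(v) + e)).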
v_hidden = alg.addition(v_hidden, alg.exponentiate(cumulativeHiddenLayerWGrad, 2));
v_output = alg.addition(v_output, alg.exponentiate(outputWGrad, 2));
std::vector<std::vector<std::vector<double>>> hiddenLayerUpdations = alg.scalarMultiply(learning_rate/n, alg.elementWiseDivision(cumulativeHiddenLayerWGrad, alg.scalarAdd(e, alg.sqrt(v_hidden))));
std::vector<double> outputLayerUpdation = alg.scalarMultiply(learning_rate/n, alg.elementWiseDivision(outputWGrad, alg.scalarAdd(e, alg.sqrt(v_output))));
updateParameters(hiddenLayerUpdations, outputLayerUpdation, learning_rate); // subject to change. may want bias to have this matrix too.
y_hat = modelSetTest(inputMiniBatches[i]);
if(UI) { ANN::UI(epoch, cost_prev, y_hat, outputMiniBatches[i]); }
}
epoch++;
if(epoch > max_epoch) { break; }
}
forwardPass();
}
void ANN::Adadelta(double learning_rate, int max_epoch, int mini_batch_size, double b1, double e, bool UI){
class Cost cost;
LinAlg alg;
double cost_prev = 0;
int epoch = 1;
// Creating the mini-batches
int n_mini_batch = n/mini_batch_size;
// Always evaluate the result per mini-batch; do the full forward pass only ONCE, at the end.
auto [inputMiniBatches, outputMiniBatches] = Utilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
// Initializing necessary components for Adadelta.
std::vector<std::vector<std::vector<double>>> v_hidden;
std::vector<double> v_output;
while(true){
for(int i = 0; i < n_mini_batch; i++){
std::vector<double> y_hat = modelSetTest(inputMiniBatches[i]);
cost_prev = Cost(y_hat, outputMiniBatches[i]);
auto [cumulativeHiddenLayerWGrad, outputWGrad] = computeGradients(y_hat, outputMiniBatches[i]);
if(!network.empty() && v_hidden.empty()){ // Initializing our tensor
v_hidden = alg.resize(v_hidden, cumulativeHiddenLayerWGrad);
}
if(v_output.empty()){
v_output.resize(outputWGrad.size());
}
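// Exponentially decaying average of the squared gradients (decay b1); the step scales the gradient by learning_rate / (n * (sqrt(v) + e)).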
v_hidden = alg.addition(alg.scalarMultiply(b1, v_hidden), alg.scalarMultiply(1 - b1, alg.exponentiate(cumulativeHiddenLayerWGrad, 2)));
v_output = alg.addition(alg.scalarMultiply(b1, v_output), alg.scalarMultiply(1 - b1, alg.exponentiate(outputWGrad, 2)));
std::vector<std::vector<std::vector<double>>> hiddenLayerUpdations = alg.scalarMultiply(learning_rate/n, alg.elementWiseDivision(cumulativeHiddenLayerWGrad, alg.scalarAdd(e, alg.sqrt(v_hidden))));
std::vector<double> outputLayerUpdation = alg.scalarMultiply(learning_rate/n, alg.elementWiseDivision(outputWGrad, alg.scalarAdd(e, alg.sqrt(v_output))));
updateParameters(hiddenLayerUpdations, outputLayerUpdation, learning_rate); // subject to change. may want bias to have this matrix too.
y_hat = modelSetTest(inputMiniBatches[i]);
if(UI) { ANN::UI(epoch, cost_prev, y_hat, outputMiniBatches[i]); }
}
epoch++;
if(epoch > max_epoch) { break; }
}
forwardPass();
}
void ANN::Adam(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI){
class Cost cost;
LinAlg alg;
@@ -139,18 +290,9 @@ namespace MLPP {
cost_prev = Cost(y_hat, outputMiniBatches[i]);
auto [cumulativeHiddenLayerWGrad, outputWGrad] = computeGradients(y_hat, outputMiniBatches[i]);
if(!network.empty() && m_hidden.empty() && v_hidden.empty()){ // Initializing our tensor
m_hidden.resize(cumulativeHiddenLayerWGrad.size());
v_hidden.resize(cumulativeHiddenLayerWGrad.size());
for(int i = 0; i < cumulativeHiddenLayerWGrad.size(); i++){
m_hidden[i].resize(cumulativeHiddenLayerWGrad[i].size());
v_hidden[i].resize(cumulativeHiddenLayerWGrad[i].size());
for(int j = 0; j < cumulativeHiddenLayerWGrad[i].size(); j++){
m_hidden[i][j].resize(cumulativeHiddenLayerWGrad[i][j].size());
v_hidden[i][j].resize(cumulativeHiddenLayerWGrad[i][j].size());
}
}
m_hidden = alg.resize(m_hidden, cumulativeHiddenLayerWGrad);
v_hidden = alg.resize(v_hidden, cumulativeHiddenLayerWGrad);
}
if(m_output.empty() && v_output.empty()){
@@ -185,6 +327,198 @@ namespace MLPP {
forwardPass();
}
void ANN::Adamax(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI){
class Cost cost;
LinAlg alg;
double cost_prev = 0;
int epoch = 1;
// Creating the mini-batches
int n_mini_batch = n/mini_batch_size;
// Always evaluate the result per mini-batch; do the full forward pass only ONCE, at the end.
auto [inputMiniBatches, outputMiniBatches] = Utilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
// Initializing necessary components for Adamax.
std::vector<std::vector<std::vector<double>>> m_hidden;
std::vector<std::vector<std::vector<double>>> u_hidden;
std::vector<double> m_output;
std::vector<double> u_output;
while(true){
for(int i = 0; i < n_mini_batch; i++){
std::vector<double> y_hat = modelSetTest(inputMiniBatches[i]);
cost_prev = Cost(y_hat, outputMiniBatches[i]);
auto [cumulativeHiddenLayerWGrad, outputWGrad] = computeGradients(y_hat, outputMiniBatches[i]);
if(!network.empty() && m_hidden.empty() && u_hidden.empty()){ // Initializing our tensor
m_hidden = alg.resize(m_hidden, cumulativeHiddenLayerWGrad);
u_hidden = alg.resize(u_hidden, cumulativeHiddenLayerWGrad);
}
if(m_output.empty() && u_output.empty()){
m_output.resize(outputWGrad.size());
u_output.resize(outputWGrad.size());
}
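// Adamax: exponential average of the gradients for m (decay b1) and a running maximum of |grad| for u (decayed by b2); only m is bias-corrected.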
m_hidden = alg.addition(alg.scalarMultiply(b1, m_hidden), alg.scalarMultiply(1 - b1, cumulativeHiddenLayerWGrad));
u_hidden = alg.max(alg.scalarMultiply(b2, u_hidden), alg.abs(cumulativeHiddenLayerWGrad));
m_output = alg.addition(alg.scalarMultiply(b1, m_output), alg.scalarMultiply(1 - b1, outputWGrad));
u_output = alg.max(alg.scalarMultiply(b2, u_output), alg.abs(outputWGrad));
std::vector<std::vector<std::vector<double>>> m_hidden_hat = alg.scalarMultiply(1/(1 - pow(b1, epoch)), m_hidden);
std::vector<double> m_output_hat = alg.scalarMultiply(1/(1 - pow(b1, epoch)), m_output);
std::vector<std::vector<std::vector<double>>> hiddenLayerUpdations = alg.scalarMultiply(learning_rate/n, alg.elementWiseDivision(m_hidden_hat, alg.scalarAdd(e, u_hidden)));
std::vector<double> outputLayerUpdation = alg.scalarMultiply(learning_rate/n, alg.elementWiseDivision(m_output_hat, alg.scalarAdd(e, u_output)));
updateParameters(hiddenLayerUpdations, outputLayerUpdation, learning_rate); // subject to change. may want bias to have this matrix too.
y_hat = modelSetTest(inputMiniBatches[i]);
if(UI) { ANN::UI(epoch, cost_prev, y_hat, outputMiniBatches[i]); }
}
epoch++;
if(epoch > max_epoch) { break; }
}
forwardPass();
}
void ANN::Nadam(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI){
class Cost cost;
LinAlg alg;
double cost_prev = 0;
int epoch = 1;
// Creating the mini-batches
int n_mini_batch = n/mini_batch_size;
// Always evaluate the result per mini-batch; do the full forward pass only ONCE, at the end.
auto [inputMiniBatches, outputMiniBatches] = Utilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
// Initializing necessary components for Nadam.
std::vector<std::vector<std::vector<double>>> m_hidden;
std::vector<std::vector<std::vector<double>>> v_hidden;
std::vector<std::vector<std::vector<double>>> m_hidden_final;
std::vector<double> m_output;
std::vector<double> v_output;
while(true){
for(int i = 0; i < n_mini_batch; i++){
std::vector<double> y_hat = modelSetTest(inputMiniBatches[i]);
cost_prev = Cost(y_hat, outputMiniBatches[i]);
auto [cumulativeHiddenLayerWGrad, outputWGrad] = computeGradients(y_hat, outputMiniBatches[i]);
if(!network.empty() && m_hidden.empty() && v_hidden.empty()){ // Initializing our tensor
m_hidden = alg.resize(m_hidden, cumulativeHiddenLayerWGrad);
v_hidden = alg.resize(v_hidden, cumulativeHiddenLayerWGrad);
}
if(m_output.empty() && v_output.empty()){
m_output.resize(outputWGrad.size());
v_output.resize(outputWGrad.size());
}
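// Nadam: Adam-style first/second moments with bias correction, plus a Nesterov term; m_final mixes the corrected m with the current gradient.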
m_hidden = alg.addition(alg.scalarMultiply(b1, m_hidden), alg.scalarMultiply(1 - b1, cumulativeHiddenLayerWGrad));
v_hidden = alg.addition(alg.scalarMultiply(b2, v_hidden), alg.scalarMultiply(1 - b2, alg.exponentiate(cumulativeHiddenLayerWGrad, 2)));
m_output = alg.addition(alg.scalarMultiply(b1, m_output), alg.scalarMultiply(1 - b1, outputWGrad));
v_output = alg.addition(alg.scalarMultiply(b2, v_output), alg.scalarMultiply(1 - b2, alg.exponentiate(outputWGrad, 2)));
std::vector<std::vector<std::vector<double>>> m_hidden_hat = alg.scalarMultiply(1/(1 - pow(b1, epoch)), m_hidden);
std::vector<std::vector<std::vector<double>>> v_hidden_hat = alg.scalarMultiply(1/(1 - pow(b2, epoch)), v_hidden);
m_hidden_final = alg.addition(alg.scalarMultiply(b1, m_hidden_hat), alg.scalarMultiply((1 - b1)/(1 - pow(b1, epoch)), cumulativeHiddenLayerWGrad));
std::vector<double> m_output_hat = alg.scalarMultiply(1/(1 - pow(b1, epoch)), m_output);
std::vector<double> v_output_hat = alg.scalarMultiply(1/(1 - pow(b2, epoch)), v_output);
std::vector<double> m_output_final = alg.addition(alg.scalarMultiply(b1, m_output_hat), alg.scalarMultiply((1 - b1)/(1 - pow(b1, epoch)), outputWGrad));
std::vector<std::vector<std::vector<double>>> hiddenLayerUpdations = alg.scalarMultiply(learning_rate/n, alg.elementWiseDivision(m_hidden_final, alg.scalarAdd(e, alg.sqrt(v_hidden_hat))));
std::vector<double> outputLayerUpdation = alg.scalarMultiply(learning_rate/n, alg.elementWiseDivision(m_output_final, alg.scalarAdd(e, alg.sqrt(v_output_hat))));
updateParameters(hiddenLayerUpdations, outputLayerUpdation, learning_rate); // subject to change. may want bias to have this matrix too.
y_hat = modelSetTest(inputMiniBatches[i]);
if(UI) { ANN::UI(epoch, cost_prev, y_hat, outputMiniBatches[i]); }
}
epoch++;
if(epoch > max_epoch) { break; }
}
forwardPass();
}
void ANN::AMSGrad(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI){
class Cost cost;
LinAlg alg;
double cost_prev = 0;
int epoch = 1;
// Creating the mini-batches
int n_mini_batch = n/mini_batch_size;
// Always evaluate the result per mini-batch; do the full forward pass only ONCE, at the end.
auto [inputMiniBatches, outputMiniBatches] = Utilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
// Initializing necessary components for AMSGrad.
std::vector<std::vector<std::vector<double>>> m_hidden;
std::vector<std::vector<std::vector<double>>> v_hidden;
std::vector<std::vector<std::vector<double>>> v_hidden_hat;
std::vector<double> m_output;
std::vector<double> v_output;
std::vector<double> v_output_hat;
while(true){
for(int i = 0; i < n_mini_batch; i++){
std::vector<double> y_hat = modelSetTest(inputMiniBatches[i]);
cost_prev = Cost(y_hat, outputMiniBatches[i]);
auto [cumulativeHiddenLayerWGrad, outputWGrad] = computeGradients(y_hat, outputMiniBatches[i]);
if(!network.empty() && m_hidden.empty() && v_hidden.empty()){ // Initializing our tensor
m_hidden = alg.resize(m_hidden, cumulativeHiddenLayerWGrad);
v_hidden = alg.resize(v_hidden, cumulativeHiddenLayerWGrad);
v_hidden_hat = alg.resize(v_hidden_hat, cumulativeHiddenLayerWGrad);
}
if(m_output.empty() && v_output.empty()){
m_output.resize(outputWGrad.size());
v_output.resize(outputWGrad.size());
v_output_hat.resize(outputWGrad.size());
}
m_hidden = alg.addition(alg.scalarMultiply(b1, m_hidden), alg.scalarMultiply(1 - b1, cumulativeHiddenLayerWGrad));
v_hidden = alg.addition(alg.scalarMultiply(b2, v_hidden), alg.scalarMultiply(1 - b2, alg.exponentiate(cumulativeHiddenLayerWGrad, 2)));
m_output = alg.addition(alg.scalarMultiply(b1, m_output), alg.scalarMultiply(1 - b1, outputWGrad));
v_output = alg.addition(alg.scalarMultiply(b2, v_output), alg.scalarMultiply(1 - b2, alg.exponentiate(outputWGrad, 2)));
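// AMSGrad: keep the element-wise maximum of the second-moment estimates so the denominator in the update never decreases.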
v_hidden_hat = alg.max(v_hidden_hat, v_hidden);
v_output_hat = alg.max(v_output_hat, v_output);
std::vector<std::vector<std::vector<double>>> hiddenLayerUpdations = alg.scalarMultiply(learning_rate/n, alg.elementWiseDivision(m_hidden, alg.scalarAdd(e, alg.sqrt(v_hidden_hat))));
std::vector<double> outputLayerUpdation = alg.scalarMultiply(learning_rate/n, alg.elementWiseDivision(m_output, alg.scalarAdd(e, alg.sqrt(v_output_hat))));
updateParameters(hiddenLayerUpdations, outputLayerUpdation, learning_rate); // subject to change. may want bias to have this matrix too.
y_hat = modelSetTest(inputMiniBatches[i]);
if(UI) { ANN::UI(epoch, cost_prev, y_hat, outputMiniBatches[i]); }
}
epoch++;
if(epoch > max_epoch) { break; }
}
forwardPass();
}
double ANN::score(){
Utilities util;
forwardPass();

View File

@@ -24,7 +24,13 @@ class ANN{
double modelTest(std::vector<double> x);
void gradientDescent(double learning_rate, int max_epoch, bool UI = 1);
void MBGD(double learning_rate, int max_epoch, int mini_batch_size, bool UI = 1);
void Momentum(double learning_rate, int max_epoch, int mini_batch_size, double gamma, bool NAG, bool UI = 1);
void Adagrad(double learning_rate, int max_epoch, int mini_batch_size, double e, bool UI = 1);
void Adadelta(double learning_rate, int max_epoch, int mini_batch_size, double b1, double e, bool UI = 1);
void Adam(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI = 1);
void Adamax(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI = 1);
void Nadam(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI = 1);
void AMSGrad(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI = 1);
double score();
void save(std::string fileName);

View File

@@ -1130,4 +1130,29 @@ namespace MLPP{
}
return A;
}
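// Rank-3 tensor helpers for the new ANN optimizers: resize shapes A like B, while max and abs apply the matrix-level overloads slice by slice.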
std::vector<std::vector<std::vector<double>>> LinAlg::resize(std::vector<std::vector<std::vector<double>>> A, std::vector<std::vector<std::vector<double>>> B){
A.resize(B.size());
for(int i = 0; i < B.size(); i++){
A[i].resize(B[i].size());
for(int j = 0; j < B[i].size(); j++){
A[i][j].resize(B[i][j].size());
}
}
return A;
}
std::vector<std::vector<std::vector<double>>> LinAlg::max(std::vector<std::vector<std::vector<double>>> A, std::vector<std::vector<std::vector<double>>> B){
for(int i = 0; i < A.size(); i++){
A[i] = max(A[i], B[i]);
}
return A;
}
std::vector<std::vector<std::vector<double>>> LinAlg::abs(std::vector<std::vector<std::vector<double>>> A){
for(int i = 0; i < A.size(); i++){
A[i] = abs(A[i]);
}
return A;
}
}

View File

@@ -210,6 +210,12 @@ namespace MLPP{
std::vector<std::vector<std::vector<double>>> resize(std::vector<std::vector<std::vector<double>>> A, std::vector<std::vector<std::vector<double>>> B);
std::vector<std::vector<std::vector<double>>> hadamard_product(std::vector<std::vector<std::vector<double>>> A, std::vector<std::vector<std::vector<double>>> B);
std::vector<std::vector<std::vector<double>>> max(std::vector<std::vector<std::vector<double>>> A, std::vector<std::vector<std::vector<double>>> B);
std::vector<std::vector<std::vector<double>>> abs(std::vector<std::vector<std::vector<double>>> A);
private:
};

View File

@@ -166,327 +166,6 @@ namespace MLPP{
forwardPass();
}
void LinReg::Momentum(double learning_rate, int max_epoch, int mini_batch_size, double gamma, bool UI){
LinAlg alg;
Reg regularization;
double cost_prev = 0;
int epoch = 1;
// Creating the mini-batches
int n_mini_batch = n/mini_batch_size;
auto [inputMiniBatches, outputMiniBatches] = Utilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
// Initializing necessary components for Momentum.
std::vector<double> v = alg.zerovec(weights.size());
while(true){
for(int i = 0; i < n_mini_batch; i++){
std::vector<double> y_hat = Evaluate(inputMiniBatches[i]);
cost_prev = Cost(y_hat, outputMiniBatches[i]);
std::vector<double> error = alg.subtraction(y_hat, outputMiniBatches[i]);
// Calculating the weight gradients
std::vector<double> gradient = alg.scalarMultiply(1/outputMiniBatches[i].size(), alg.mat_vec_mult(alg.transpose(inputMiniBatches[i]), error));
std::vector<double> RegDerivTerm = regularization.regDerivTerm(weights, lambda, alpha, reg);
std::vector<double> weight_grad = alg.addition(gradient, RegDerivTerm); // Weight_grad_final
v = alg.addition(alg.scalarMultiply(gamma, v), alg.scalarMultiply(learning_rate, weight_grad));
weights = alg.subtraction(weights, v);
// Calculating the bias gradients
bias -= learning_rate * alg.sum_elements(error) / outputMiniBatches[i].size(); // As normal
y_hat = Evaluate(inputMiniBatches[i]);
if(UI) {
Utilities::CostInfo(epoch, cost_prev, Cost(y_hat, outputMiniBatches[i]));
Utilities::UI(weights, bias);
}
}
epoch++;
if(epoch > max_epoch) { break; }
}
forwardPass();
}
void LinReg::NAG(double learning_rate, int max_epoch, int mini_batch_size, double gamma, bool UI){
LinAlg alg;
Reg regularization;
double cost_prev = 0;
int epoch = 1;
// Creating the mini-batches
int n_mini_batch = n/mini_batch_size;
auto [inputMiniBatches, outputMiniBatches] = Utilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
// Initializing necessary components for Momentum.
std::vector<double> v = alg.zerovec(weights.size());
while(true){
for(int i = 0; i < n_mini_batch; i++){
weights = alg.subtraction(weights, alg.scalarMultiply(gamma, v)); // "Aposterori" calculation
std::vector<double> y_hat = Evaluate(inputMiniBatches[i]);
cost_prev = Cost(y_hat, outputMiniBatches[i]);
std::vector<double> error = alg.subtraction(y_hat, outputMiniBatches[i]);
// Calculating the weight gradients
std::vector<double> gradient = alg.scalarMultiply(1/outputMiniBatches[i].size(), alg.mat_vec_mult(alg.transpose(inputMiniBatches[i]), error));
std::vector<double> RegDerivTerm = regularization.regDerivTerm(weights, lambda, alpha, reg);
std::vector<double> weight_grad = alg.addition(gradient, RegDerivTerm); // Weight_grad_final
v = alg.addition(alg.scalarMultiply(gamma, v), alg.scalarMultiply(learning_rate, weight_grad));
weights = alg.subtraction(weights, v);
// Calculating the bias gradients
bias -= learning_rate * alg.sum_elements(error) / outputMiniBatches[i].size(); // As normal
y_hat = Evaluate(inputMiniBatches[i]);
if(UI) {
Utilities::CostInfo(epoch, cost_prev, Cost(y_hat, outputMiniBatches[i]));
Utilities::UI(weights, bias);
}
}
epoch++;
if(epoch > max_epoch) { break; }
}
forwardPass();
}
void LinReg::Adagrad(double learning_rate, int max_epoch, int mini_batch_size, double e, bool UI){
LinAlg alg;
Reg regularization;
double cost_prev = 0;
int epoch = 1;
// Creating the mini-batches
int n_mini_batch = n/mini_batch_size;
auto [inputMiniBatches, outputMiniBatches] = Utilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
// Initializing necessary components for Adagrad.
std::vector<double> v = alg.zerovec(weights.size());
while(true){
for(int i = 0; i < n_mini_batch; i++){
std::vector<double> y_hat = Evaluate(inputMiniBatches[i]);
cost_prev = Cost(y_hat, outputMiniBatches[i]);
std::vector<double> error = alg.subtraction(y_hat, outputMiniBatches[i]);
// Calculating the weight gradients
std::vector<double> gradient = alg.scalarMultiply(1/outputMiniBatches[i].size(), alg.mat_vec_mult(alg.transpose(inputMiniBatches[i]), error));
std::vector<double> RegDerivTerm = regularization.regDerivTerm(weights, lambda, alpha, reg);
std::vector<double> weight_grad = alg.addition(gradient, RegDerivTerm); // Weight_grad_final
v = alg.hadamard_product(weight_grad, weight_grad);
weights = alg.subtraction(weights, alg.scalarMultiply(learning_rate, alg.elementWiseDivision(weight_grad, alg.sqrt(alg.scalarAdd(e, v)))));
// Calculating the bias gradients
bias -= learning_rate * alg.sum_elements(error) / outputMiniBatches[i].size(); // As normal
y_hat = Evaluate(inputMiniBatches[i]);
if(UI) {
Utilities::CostInfo(epoch, cost_prev, Cost(y_hat, outputMiniBatches[i]));
Utilities::UI(weights, bias);
}
}
epoch++;
if(epoch > max_epoch) { break; }
}
forwardPass();
}
void LinReg::Adadelta(double learning_rate, int max_epoch, int mini_batch_size, double b1, double e, bool UI){
// Adagrad upgrade. Momentum is applied.
LinAlg alg;
Reg regularization;
double cost_prev = 0;
int epoch = 1;
// Creating the mini-batches
int n_mini_batch = n/mini_batch_size;
auto [inputMiniBatches, outputMiniBatches] = Utilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
// Initializing necessary components for Adagrad.
std::vector<double> v = alg.zerovec(weights.size());
while(true){
for(int i = 0; i < n_mini_batch; i++){
std::vector<double> y_hat = Evaluate(inputMiniBatches[i]);
cost_prev = Cost(y_hat, outputMiniBatches[i]);
std::vector<double> error = alg.subtraction(y_hat, outputMiniBatches[i]);
// Calculating the weight gradients
std::vector<double> gradient = alg.scalarMultiply(1/outputMiniBatches[i].size(), alg.mat_vec_mult(alg.transpose(inputMiniBatches[i]), error));
std::vector<double> RegDerivTerm = regularization.regDerivTerm(weights, lambda, alpha, reg);
std::vector<double> weight_grad = alg.addition(gradient, RegDerivTerm); // Weight_grad_final
v = alg.addition(alg.scalarMultiply(b1, v), alg.scalarMultiply(1 - b1, alg.hadamard_product(weight_grad, weight_grad)));
weights = alg.subtraction(weights, alg.scalarMultiply(learning_rate, alg.elementWiseDivision(weight_grad, alg.sqrt(alg.scalarAdd(e, v)))));
// Calculating the bias gradients
bias -= learning_rate * alg.sum_elements(error) / outputMiniBatches[i].size(); // As normal
y_hat = Evaluate(inputMiniBatches[i]);
if(UI) {
Utilities::CostInfo(epoch, cost_prev, Cost(y_hat, outputMiniBatches[i]));
Utilities::UI(weights, bias);
}
}
epoch++;
if(epoch > max_epoch) { break; }
}
forwardPass();
}
void LinReg::Adam(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI){
LinAlg alg;
Reg regularization;
double cost_prev = 0;
int epoch = 1;
// Creating the mini-batches
int n_mini_batch = n/mini_batch_size;
auto [inputMiniBatches, outputMiniBatches] = Utilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
// Initializing necessary components for Adam.
std::vector<double> m = alg.zerovec(weights.size());
std::vector<double> v = alg.zerovec(weights.size());
while(true){
for(int i = 0; i < n_mini_batch; i++){
std::vector<double> y_hat = Evaluate(inputMiniBatches[i]);
cost_prev = Cost(y_hat, outputMiniBatches[i]);
std::vector<double> error = alg.subtraction(y_hat, outputMiniBatches[i]);
// Calculating the weight gradients
std::vector<double> gradient = alg.scalarMultiply(1/outputMiniBatches[i].size(), alg.mat_vec_mult(alg.transpose(inputMiniBatches[i]), error));
std::vector<double> RegDerivTerm = regularization.regDerivTerm(weights, lambda, alpha, reg);
std::vector<double> weight_grad = alg.addition(gradient, RegDerivTerm); // Weight_grad_final
m = alg.addition(alg.scalarMultiply(b1, m), alg.scalarMultiply(1 - b1, weight_grad));
v = alg.addition(alg.scalarMultiply(b2, v), alg.scalarMultiply(1 - b2, alg.exponentiate(weight_grad, 2)));
std::vector<double> m_hat = alg.scalarMultiply(1/(1 - pow(b1, epoch)), m);
std::vector<double> v_hat = alg.scalarMultiply(1/(1 - pow(b2, epoch)), v);
weights = alg.subtraction(weights, alg.scalarMultiply(learning_rate, alg.elementWiseDivision(m_hat, alg.scalarAdd(e, alg.sqrt(v_hat)))));
// Calculating the bias gradients
bias -= learning_rate * alg.sum_elements(error) / outputMiniBatches[i].size(); // As normal
y_hat = Evaluate(inputMiniBatches[i]);
if(UI) {
Utilities::CostInfo(epoch, cost_prev, Cost(y_hat, outputMiniBatches[i]));
Utilities::UI(weights, bias);
}
}
epoch++;
if(epoch > max_epoch) { break; }
}
forwardPass();
}
void LinReg::Adamax(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI){
LinAlg alg;
Reg regularization;
double cost_prev = 0;
int epoch = 1;
// Creating the mini-batches
int n_mini_batch = n/mini_batch_size;
auto [inputMiniBatches, outputMiniBatches] = Utilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
std::vector<double> m = alg.zerovec(weights.size());
std::vector<double> u = alg.zerovec(weights.size());
while(true){
for(int i = 0; i < n_mini_batch; i++){
std::vector<double> y_hat = Evaluate(inputMiniBatches[i]);
cost_prev = Cost(y_hat, outputMiniBatches[i]);
std::vector<double> error = alg.subtraction(y_hat, outputMiniBatches[i]);
// Calculating the weight gradients
std::vector<double> gradient = alg.scalarMultiply(1/outputMiniBatches[i].size(), alg.mat_vec_mult(alg.transpose(inputMiniBatches[i]), error));
std::vector<double> RegDerivTerm = regularization.regDerivTerm(weights, lambda, alpha, reg);
std::vector<double> weight_grad = alg.addition(gradient, RegDerivTerm); // Weight_grad_final
m = alg.addition(alg.scalarMultiply(b1, m), alg.scalarMultiply(1 - b1, weight_grad));
u = alg.max(alg.scalarMultiply(b2, u), alg.abs(weight_grad));
std::vector<double> m_hat = alg.scalarMultiply(1/(1 - pow(b1, epoch)), m);
weights = alg.subtraction(weights, alg.scalarMultiply(learning_rate, alg.elementWiseDivision(m_hat, u)));
// Calculating the bias gradients
bias -= learning_rate * alg.sum_elements(error) / outputMiniBatches[i].size(); // As normal
y_hat = Evaluate(inputMiniBatches[i]);
if(UI) {
Utilities::CostInfo(epoch, cost_prev, Cost(y_hat, outputMiniBatches[i]));
Utilities::UI(weights, bias);
}
}
epoch++;
if(epoch > max_epoch) { break; }
}
forwardPass();
}
void LinReg::Nadam(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI){
LinAlg alg;
Reg regularization;
double cost_prev = 0;
int epoch = 1;
// Creating the mini-batches
int n_mini_batch = n/mini_batch_size;
auto [inputMiniBatches, outputMiniBatches] = Utilities::createMiniBatches(inputSet, outputSet, n_mini_batch);
// Initializing necessary components for Adam.
std::vector<double> m = alg.zerovec(weights.size());
std::vector<double> v = alg.zerovec(weights.size());
std::vector<double> m_final = alg.zerovec(weights.size());
while(true){
for(int i = 0; i < n_mini_batch; i++){
std::vector<double> y_hat = Evaluate(inputMiniBatches[i]);
cost_prev = Cost(y_hat, outputMiniBatches[i]);
std::vector<double> error = alg.subtraction(y_hat, outputMiniBatches[i]);
// Calculating the weight gradients
std::vector<double> gradient = alg.scalarMultiply(1/outputMiniBatches[i].size(), alg.mat_vec_mult(alg.transpose(inputMiniBatches[i]), error));
std::vector<double> RegDerivTerm = regularization.regDerivTerm(weights, lambda, alpha, reg);
std::vector<double> weight_grad = alg.addition(gradient, RegDerivTerm); // Weight_grad_final
m = alg.addition(alg.scalarMultiply(b1, m), alg.scalarMultiply(1 - b1, weight_grad));
v = alg.addition(alg.scalarMultiply(b2, v), alg.scalarMultiply(1 - b2, alg.exponentiate(weight_grad, 2)));
m_final = alg.addition(alg.scalarMultiply(b1, m), alg.scalarMultiply((1 - b1)/(1 - pow(b1, epoch)), weight_grad));
std::vector<double> m_hat = alg.scalarMultiply(1/(1 - pow(b1, epoch)), m);
std::vector<double> v_hat = alg.scalarMultiply(1/(1 - pow(b2, epoch)), v);
weights = alg.subtraction(weights, alg.scalarMultiply(learning_rate, alg.elementWiseDivision(m_final, alg.scalarAdd(e, alg.sqrt(v_hat)))));
// Calculating the bias gradients
bias -= learning_rate * alg.sum_elements(error) / outputMiniBatches[i].size(); // As normal
y_hat = Evaluate(inputMiniBatches[i]);
if(UI) {
Utilities::CostInfo(epoch, cost_prev, Cost(y_hat, outputMiniBatches[i]));
Utilities::UI(weights, bias);
}
}
epoch++;
if(epoch > max_epoch) { break; }
}
forwardPass();
}
void LinReg::normalEquation(){
LinAlg alg;
Stat stat;

View File

@@ -20,14 +20,7 @@ namespace MLPP{
void NewtonRaphson(double learning_rate, int max_epoch, bool UI);
void gradientDescent(double learning_rate, int max_epoch, bool UI = 1);
void SGD(double learning_rate, int max_epoch, bool UI = 1);
// void MBGD(double learning_rate, int max_epoch, int mini_batch_size, bool UI = 1);
// void Momentum(double learning_rate, int max_epoch, int mini_batch_size, double gamma, bool UI = 1);
// void NAG(double learning_rate, int max_epoch, int mini_batch_size, double gamma, bool UI = 1);
// void Adagrad(double learning_rate, int max_epoch, int mini_batch_size, double e, bool UI = 1);
// void Adadelta(double learning_rate, int max_epoch, int mini_batch_size, double b1, double e, bool UI = 1);
// void Adam(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI = 1);
// void Adamax(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI = 1);
// void Nadam(double learning_rate, int max_epoch, int mini_batch_size, double b1, double b2, double e, bool UI = 1);
void MBGD(double learning_rate, int max_epoch, int mini_batch_size, bool UI = 1);
void normalEquation();
double score();
void save(std::string fileName);

View File

@@ -91,7 +91,23 @@ The result will be the model's predictions for the entire dataset.
- Arcsch
- Arsech
- Arcoth
2. Possible Loss Functions
2. Possible Optimization Algorithms
- Batch Gradient Descent
- Mini-Batch Gradient Descent
- Stochastic Gradient Descent
- Gradient Descent with Momentum
- Nesterov Accelerated Gradient
- Adagrad Optimizer
- Adadelta Optimizer
- Adam Optimizer
- Adamax Optimizer
- Nadam Optimizer
- AMSGrad Optimizer
- 2nd Order Newton-Raphson Optimizer*
- Normal Equation*
* Only available for linear regression
3. Possible Loss Functions
- MSE
- RMSE
- MAE
@@ -99,11 +115,11 @@ The result will be the model's predictions for the entire dataset.
- Log Loss
- Cross Entropy
- Hinge Loss
3. Possible Regularization Methods
4. Possible Regularization Methods
- Lasso
- Ridge
- ElasticNet
4. Possible Weight Initialization Methods
5. Possible Weight Initialization Methods
- Uniform
- Xavier Normal
- Xavier Uniform

BIN
a.out

Binary file not shown.

View File

@@ -364,10 +364,12 @@ int main() {
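// XOR truth table: after the transpose, each row is a pair of bits and the target is their XOR.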
std::vector<std::vector<double>> inputSet = {{0,0,1,1}, {0,1,0,1}};
std::vector<double> outputSet = {0,1,1,0};
ANN ann(alg.transpose(inputSet), outputSet);
//ann.addLayer(10, "RELU", "Default", "Ridge", 0.0001);
ann.addLayer(10, "RELU", "Default", "XavierNormal");
//ann.addLayer(10, "RELU");
ann.addLayer(10, "Sigmoid");
ann.addOutputLayer("Sigmoid", "LogLoss");
ann.Adam(0.1, 800, 2, 0.9, 0.999, 1e-8, 1);
//ann.AMSGrad(0.1, 10000, 1, 0.9, 0.999, 0.000001, 1);
//ann.Adadelta(1, 1000, 2, 0.9, 0.000001, 1);
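// Momentum(learning_rate, max_epoch, mini_batch_size, gamma, NAG, UI); NAG = true enables the Nesterov-style look-ahead branch.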
ann.Momentum(0.1, 8000, 2, 0.9, true, 1);
//ann.MBGD(0.1, 1000, 2, 1);
alg.printVector(ann.modelSetTest(alg.transpose(inputSet)));
std::cout << "ACCURACY: " << 100 * ann.score() << "%" << std::endl;