2023-01-23 21:13:26 +01:00
//
// Data.cpp
// MLP
//
// Created by Marc Melikyan on 11/4/20.
//
2023-01-24 19:14:38 +01:00
# include "data.h"
2023-01-25 18:27:14 +01:00
# include "core/os/file_access.h"
2023-01-24 18:12:23 +01:00
# include "../lin_alg/lin_alg.h"
2023-04-27 18:37:59 +02:00
# include "../stat/stat.h"
2023-04-22 17:17:58 +02:00
# include "../lin_alg/lin_alg_old.h"
2023-12-28 17:41:20 +01:00
# include "../softmax_net/softmax_net.h"
2023-04-22 17:17:58 +02:00
# include "../stat/stat_old.h"
2023-01-25 18:27:14 +01:00
2023-01-24 19:00:54 +01:00
# include <algorithm>
2023-01-23 21:13:26 +01:00
# include <cmath>
# include <fstream>
2023-01-24 19:00:54 +01:00
# include <iostream>
# include <random>
2023-01-23 21:13:26 +01:00
# include <sstream>
2023-02-09 11:40:16 +01:00
// Returns the reference stored in _input.
Ref<MLPPVector> MLPPDataESimple::get_input() {
	return _input;
}
void MLPPDataESimple : : set_input ( const Ref < MLPPVector > & val ) {
_input = val ;
}
// Returns the reference stored in _output.
Ref<MLPPVector> MLPPDataESimple::get_output() {
	return _output;
}
void MLPPDataESimple : : set_output ( const Ref < MLPPVector > & val ) {
_output = val ;
}
void MLPPDataESimple : : instance_data ( ) {
_input . instance ( ) ;
_output . instance ( ) ;
}
2023-01-25 18:27:14 +01:00
void MLPPDataESimple : : _bind_methods ( ) {
2023-02-09 11:40:16 +01:00
ClassDB : : bind_method ( D_METHOD ( " get_input " ) , & MLPPDataESimple : : get_input ) ;
ClassDB : : bind_method ( D_METHOD ( " set_input " , " val " ) , & MLPPDataESimple : : set_input ) ;
ADD_PROPERTY ( PropertyInfo ( Variant : : OBJECT , " input " , PROPERTY_HINT_RESOURCE_TYPE , " MLPPVector " ) , " set_input " , " get_input " ) ;
ClassDB : : bind_method ( D_METHOD ( " get_output " ) , & MLPPDataESimple : : get_input ) ;
ClassDB : : bind_method ( D_METHOD ( " set_output " , " val " ) , & MLPPDataESimple : : set_output ) ;
ADD_PROPERTY ( PropertyInfo ( Variant : : OBJECT , " output " , PROPERTY_HINT_RESOURCE_TYPE , " MLPPVector " ) , " set_output " , " get_output " ) ;
ClassDB : : bind_method ( D_METHOD ( " instance_data " ) , & MLPPDataESimple : : instance_data ) ;
}
// Returns the reference stored in _input.
Ref<MLPPMatrix> MLPPDataSimple::get_input() {
	return _input;
}
void MLPPDataSimple : : set_input ( const Ref < MLPPMatrix > & val ) {
_input = val ;
}
// Returns the reference stored in _output.
Ref<MLPPVector> MLPPDataSimple::get_output() {
	return _output;
}
void MLPPDataSimple : : set_output ( const Ref < MLPPVector > & val ) {
_output = val ;
}
void MLPPDataSimple : : instance_data ( ) {
_input . instance ( ) ;
_output . instance ( ) ;
2023-01-25 18:27:14 +01:00
}
void MLPPDataSimple : : _bind_methods ( ) {
2023-02-09 11:40:16 +01:00
ClassDB : : bind_method ( D_METHOD ( " get_input " ) , & MLPPDataSimple : : get_input ) ;
ClassDB : : bind_method ( D_METHOD ( " set_input " , " val " ) , & MLPPDataSimple : : set_input ) ;
ADD_PROPERTY ( PropertyInfo ( Variant : : OBJECT , " input " , PROPERTY_HINT_RESOURCE_TYPE , " MLPPMatrix " ) , " set_input " , " get_input " ) ;
ClassDB : : bind_method ( D_METHOD ( " get_output " ) , & MLPPDataSimple : : get_input ) ;
ClassDB : : bind_method ( D_METHOD ( " set_output " , " val " ) , & MLPPDataSimple : : set_output ) ;
ADD_PROPERTY ( PropertyInfo ( Variant : : OBJECT , " output " , PROPERTY_HINT_RESOURCE_TYPE , " MLPPVector " ) , " set_output " , " get_output " ) ;
ClassDB : : bind_method ( D_METHOD ( " instance_data " ) , & MLPPDataSimple : : instance_data ) ;
}
// Returns the reference stored in _input.
Ref<MLPPMatrix> MLPPDataComplex::get_input() {
	return _input;
}
void MLPPDataComplex : : set_input ( const Ref < MLPPMatrix > & val ) {
_input = val ;
}
// Returns the reference stored in _output.
Ref<MLPPMatrix> MLPPDataComplex::get_output() {
	return _output;
}
void MLPPDataComplex : : set_output ( const Ref < MLPPMatrix > & val ) {
_output = val ;
}
void MLPPDataComplex : : instance_data ( ) {
_input . instance ( ) ;
_output . instance ( ) ;
2023-01-25 18:27:14 +01:00
}
void MLPPDataComplex : : _bind_methods ( ) {
2023-02-09 11:40:16 +01:00
ClassDB : : bind_method ( D_METHOD ( " get_input " ) , & MLPPDataComplex : : get_input ) ;
ClassDB : : bind_method ( D_METHOD ( " set_input " , " val " ) , & MLPPDataComplex : : set_input ) ;
ADD_PROPERTY ( PropertyInfo ( Variant : : OBJECT , " input " , PROPERTY_HINT_RESOURCE_TYPE , " MLPPMatrix " ) , " set_input " , " get_input " ) ;
ClassDB : : bind_method ( D_METHOD ( " get_output " ) , & MLPPDataComplex : : get_input ) ;
ClassDB : : bind_method ( D_METHOD ( " set_output " , " val " ) , & MLPPDataComplex : : set_output ) ;
ADD_PROPERTY ( PropertyInfo ( Variant : : OBJECT , " output " , PROPERTY_HINT_RESOURCE_TYPE , " MLPPMatrix " ) , " set_output " , " get_output " ) ;
ClassDB : : bind_method ( D_METHOD ( " instance_data " ) , & MLPPDataComplex : : instance_data ) ;
2023-01-25 18:27:14 +01:00
}
// Loading Datasets
// Creates an MLPPDataSimple and fills it from the CSV at `path` via set_data_supervised(),
// using 30 feature columns per row.
Ref<MLPPDataSimple> MLPPData::load_breast_cancer(const String &path) {
	const int feature_count = 30; // k = 30

	Ref<MLPPDataSimple> dataset;
	dataset.instance();
	dataset->instance_data();

	Ref<MLPPMatrix> inputs = dataset->get_input();
	Ref<MLPPVector> outputs = dataset->get_output();
	set_data_supervised(feature_count, path, inputs, outputs);

	return dataset;
}
// Creates an MLPPDataSimple and fills it from the CSV at `path` via set_data_supervised(),
// using 30 feature columns per row. The body matches load_breast_cancer(); only the file
// the caller passes in is expected to differ.
Ref<MLPPDataSimple> MLPPData::load_breast_cancer_svc(const String &path) {
	const int feature_count = 30; // k = 30

	Ref<MLPPDataSimple> dataset;
	dataset.instance();
	dataset->instance_data();

	Ref<MLPPMatrix> inputs = dataset->get_input();
	Ref<MLPPVector> outputs = dataset->get_output();
	set_data_supervised(feature_count, path, inputs, outputs);

	return dataset;
}
// Creates an MLPPDataComplex from the CSV at `path`: 4 feature columns per row are read by
// set_data_supervised(), and the label column is expanded with one_hot_rep() over 3 classes.
Ref<MLPPDataComplex> MLPPData::load_iris(const String &path) {
	const int feature_count = 4;
	const int class_count = 3;

	Ref<MLPPDataComplex> dataset;
	dataset.instance();
	dataset->instance_data();

	// Raw labels are collected here first, then converted to the one-hot output matrix.
	Ref<MLPPVector> labels;
	labels.instance();

	set_data_supervised(feature_count, path, dataset->get_input(), labels);
	dataset->set_output(one_hot_rep(labels, class_count));

	return dataset;
}
// Creates an MLPPDataComplex from the CSV at `path`: the feature columns of each row are read
// by set_data_supervised(), and the label column is expanded with one_hot_rep() over 3 classes.
Ref<MLPPDataComplex> MLPPData::load_wine(const String &path) {
	// The Wine dataset has 13 feature columns; the former value of 4 duplicated the iris
	// loader and would have treated the fifth feature as the class label.
	// NOTE(review): confirm against the CSV actually shipped with the project.
	const int WINE_SIZE = 13;
	const int ONE_HOT_NUM = 3;

	Ref<MLPPVector> temp_output_set;
	temp_output_set.instance();

	Ref<MLPPDataComplex> data;
	data.instance();
	data->instance_data();

	set_data_supervised(WINE_SIZE, path, data->get_input(), temp_output_set);
	data->set_output(one_hot_rep(temp_output_set, ONE_HOT_NUM));

	return data;
}
// Creates an MLPPDataComplex from the CSV at `path`: 784 feature columns per row are read by
// set_data_supervised(), and the label column is expanded with one_hot_rep() over 10 classes.
Ref<MLPPDataComplex> MLPPData::load_mnist_train(const String &path) {
	const int feature_count = 784;
	const int class_count = 10;

	Ref<MLPPDataComplex> dataset;
	dataset.instance();
	dataset->instance_data();

	// Raw labels are collected here first, then converted to the one-hot output matrix.
	Ref<MLPPVector> labels;
	labels.instance();

	set_data_supervised(feature_count, path, dataset->get_input(), labels);
	dataset->set_output(one_hot_rep(labels, class_count));

	return dataset;
}
// Creates an MLPPDataComplex from the CSV at `path`: 784 feature columns per row are read by
// set_data_supervised(), and the label column is expanded with one_hot_rep() over 10 classes.
// The body matches load_mnist_train(); only the file the caller passes in is expected to differ.
Ref<MLPPDataComplex> MLPPData::load_mnist_test(const String &path) {
	const int feature_count = 784;
	const int class_count = 10;

	Ref<MLPPDataComplex> dataset;
	dataset.instance();
	dataset->instance_data();

	// Raw labels are collected here first, then converted to the one-hot output matrix.
	Ref<MLPPVector> labels;
	labels.instance();

	set_data_supervised(feature_count, path, dataset->get_input(), labels);
	dataset->set_output(one_hot_rep(labels, class_count));

	return dataset;
}
// Creates an MLPPDataSimple and fills it from the CSV at `path` via set_data_supervised(),
// using 13 feature columns per row.
Ref<MLPPDataSimple> MLPPData::load_california_housing(const String &path) {
	const int feature_count = 13; // k = 13

	Ref<MLPPDataSimple> dataset;
	dataset.instance();
	dataset->instance_data();

	Ref<MLPPMatrix> inputs = dataset->get_input();
	Ref<MLPPVector> outputs = dataset->get_output();
	set_data_supervised(feature_count, path, inputs, outputs);

	return dataset;
}
// Creates an MLPPDataESimple and fills its input and output vectors from the CSV at `path`
// via set_data_simple().
Ref<MLPPDataESimple> MLPPData::load_fires_and_crime(const String &path) {
	// k is implicitly 1.
	Ref<MLPPDataESimple> dataset;
	dataset.instance();
	dataset->instance_data();

	Ref<MLPPVector> inputs = dataset->get_input();
	Ref<MLPPVector> outputs = dataset->get_output();
	set_data_simple(path, inputs, outputs);

	return dataset;
}
// MULTIVARIATE SUPERVISED
2023-02-09 11:40:16 +01:00
void MLPPData : : set_data_supervised ( int k , const String & file_name , Ref < MLPPMatrix > input_set , Ref < MLPPVector > output_set ) {
ERR_FAIL_COND ( ! input_set . is_valid ( ) | | ! output_set . is_valid ( ) ) ;
2023-01-25 18:27:14 +01:00
MLPPLinAlg alg ;
2023-02-09 11:40:16 +01:00
Vector < Vector < real_t > > input_set_tmp ;
Vector < real_t > output_set_tmp ;
2023-01-25 18:27:14 +01:00
FileAccess * file = FileAccess : : open ( file_name , FileAccess : : READ ) ;
ERR_FAIL_COND ( ! file ) ;
while ( ! file - > eof_reached ( ) ) {
Vector < String > ll = file - > get_csv_line ( ) ;
2023-12-27 12:44:52 +01:00
Vector < real_t > row ;
2023-01-25 18:27:14 +01:00
for ( int i = 0 ; i < k ; + + i ) {
2023-12-27 12:44:52 +01:00
row . push_back ( static_cast < real_t > ( ll [ i ] . to_double ( ) ) ) ;
2023-01-25 18:27:14 +01:00
}
2023-12-27 12:44:52 +01:00
input_set_tmp . push_back ( row ) ;
2023-02-09 11:40:16 +01:00
output_set_tmp . push_back ( static_cast < real_t > ( ll [ k ] . to_double ( ) ) ) ;
2023-01-25 18:27:14 +01:00
}
2023-02-09 11:40:16 +01:00
file - > close ( ) ;
2023-01-25 18:27:14 +01:00
memdelete ( file ) ;
2023-02-09 11:40:16 +01:00
output_set - > set_from_vector ( output_set_tmp ) ;
input_set - > set_from_vectors ( input_set_tmp ) ;
2023-01-25 18:27:14 +01:00
}
2023-02-09 11:40:16 +01:00
void MLPPData : : set_data_unsupervised ( int k , const String & file_name , Ref < MLPPMatrix > input_set ) {
ERR_FAIL_COND ( ! input_set . is_valid ( ) ) ;
2023-01-25 18:27:14 +01:00
MLPPLinAlg alg ;
2023-02-09 11:40:16 +01:00
Vector < Vector < real_t > > input_set_tmp ;
input_set_tmp . resize ( k ) ;
2023-01-25 18:27:14 +01:00
FileAccess * file = FileAccess : : open ( file_name , FileAccess : : READ ) ;
ERR_FAIL_COND ( ! file ) ;
while ( ! file - > eof_reached ( ) ) {
Vector < String > ll = file - > get_csv_line ( ) ;
for ( int i = 0 ; i < k ; + + i ) {
2023-02-09 11:40:16 +01:00
input_set_tmp . write [ i ] . push_back ( static_cast < real_t > ( ll [ i ] . to_double ( ) ) ) ;
2023-01-25 18:27:14 +01:00
}
}
2023-02-09 11:40:16 +01:00
file - > close ( ) ;
2023-01-25 18:27:14 +01:00
memdelete ( file ) ;
2023-02-09 11:40:16 +01:00
input_set - > set_from_vectors ( input_set_tmp ) ;
2023-04-22 14:23:51 +02:00
input_set = alg . transposenm ( input_set ) ;
2023-01-25 18:27:14 +01:00
}
2023-02-09 11:40:16 +01:00
void MLPPData : : set_data_simple ( const String & file_name , Ref < MLPPVector > input_set , Ref < MLPPVector > output_set ) {
ERR_FAIL_COND ( ! input_set . is_valid ( ) | | ! output_set . is_valid ( ) ) ;
2023-01-25 18:27:14 +01:00
FileAccess * file = FileAccess : : open ( file_name , FileAccess : : READ ) ;
ERR_FAIL_COND ( ! file ) ;
2023-02-09 11:40:16 +01:00
Vector < real_t > input_set_tmp ;
Vector < real_t > output_set_tmp ;
2023-01-25 18:27:14 +01:00
while ( ! file - > eof_reached ( ) ) {
Vector < String > ll = file - > get_csv_line ( ) ;
for ( int i = 0 ; i < ll . size ( ) ; i + = 2 ) {
2023-02-09 11:40:16 +01:00
input_set_tmp . push_back ( static_cast < real_t > ( ll [ i ] . to_double ( ) ) ) ;
output_set_tmp . push_back ( static_cast < real_t > ( ll [ i + 1 ] . to_double ( ) ) ) ;
2023-01-25 18:27:14 +01:00
}
}
2023-02-09 11:40:16 +01:00
file - > close ( ) ;
2023-01-25 18:27:14 +01:00
memdelete ( file ) ;
2023-02-09 11:40:16 +01:00
input_set - > set_from_vector ( input_set_tmp ) ;
output_set - > set_from_vector ( output_set_tmp ) ;
2023-01-25 18:27:14 +01:00
}
2023-01-24 19:20:18 +01:00
2023-02-09 11:40:16 +01:00
// Randomly partitions the rows of `data` into a test set and a training set, keeping each
// input row together with the output row at the same index.
//
// data      - source dataset; its input and output matrices must both be valid.
// test_size - fraction of rows to place in the test set. Values outside [0, 1] (and NaN)
//             are clamped so the resulting row counts are always within range.
//
// Returns a SplitComplexData whose `train` and `test` members are always instanced; they are
// left unfilled when `data` or one of its matrices is invalid. Only the first
// MIN(input rows, output rows) rows take part in the split.
MLPPData::SplitComplexData MLPPData::train_test_split(Ref<MLPPDataComplex> data, real_t test_size) {
	SplitComplexData res;

	res.train.instance();
	res.train->instance_data();
	res.test.instance();
	res.test->instance_data();

	ERR_FAIL_COND_V(!data.is_valid(), res);

	Ref<MLPPMatrix> orig_input = data->get_input();
	Ref<MLPPMatrix> orig_output = data->get_output();

	ERR_FAIL_COND_V(!orig_input.is_valid(), res);
	ERR_FAIL_COND_V(!orig_output.is_valid(), res);

	Size2i orig_input_size = orig_input->size();
	Size2i orig_output_size = orig_output->size();

	int is = MIN(orig_input_size.y, orig_output_size.y);

	// Shuffle row indices rather than the rows themselves, so inputs and outputs stay paired.
	Array indices;
	indices.resize(is);

	for (int i = 0; i < is; ++i) {
		indices[i] = i;
	}

	indices.shuffle();

	// Scratch vectors reused for every row copy.
	Ref<MLPPVector> orig_input_row_tmp;
	orig_input_row_tmp.instance();
	orig_input_row_tmp->resize(orig_input_size.x);

	Ref<MLPPVector> orig_output_row_tmp;
	orig_output_row_tmp.instance();
	orig_output_row_tmp->resize(orig_output_size.x);

	// Clamp the fraction before converting to a row count; an unchecked value could produce a
	// negative count or one larger than the number of available rows. The negated comparison
	// also maps NaN to 0.
	real_t test_fraction = test_size;
	if (!(test_fraction > 0)) {
		test_fraction = 0;
	} else if (test_fraction > 1) {
		test_fraction = 1;
	}

	int test_input_number = test_fraction * is; // implicit usage of floor
	if (test_input_number > is) {
		test_input_number = is;
	}

	Ref<MLPPMatrix> res_test_input = res.test->get_input();
	Ref<MLPPMatrix> res_test_output = res.test->get_output();

	res_test_input->resize(Size2i(orig_input_size.x, test_input_number));
	res_test_output->resize(Size2i(orig_output_size.x, test_input_number));

	// The first `test_input_number` shuffled indices form the test set.
	for (int i = 0; i < test_input_number; ++i) {
		int index = indices[i];

		orig_input->row_get_into_mlpp_vector(index, orig_input_row_tmp);
		orig_output->row_get_into_mlpp_vector(index, orig_output_row_tmp);

		res_test_input->row_set_mlpp_vector(i, orig_input_row_tmp);
		res_test_output->row_set_mlpp_vector(i, orig_output_row_tmp);
	}

	Ref<MLPPMatrix> res_train_input = res.train->get_input();
	Ref<MLPPMatrix> res_train_output = res.train->get_output();

	int train_input_number = is - test_input_number;

	res_train_input->resize(Size2i(orig_input_size.x, train_input_number));
	res_train_output->resize(Size2i(orig_output_size.x, train_input_number));

	// The remaining shuffled indices form the training set.
	for (int i = 0; i < train_input_number; ++i) {
		int index = indices[test_input_number + i];

		orig_input->row_get_into_mlpp_vector(index, orig_input_row_tmp);
		orig_output->row_get_into_mlpp_vector(index, orig_output_row_tmp);

		res_train_input->row_set_mlpp_vector(i, orig_input_row_tmp);
		res_train_output->row_set_mlpp_vector(i, orig_output_row_tmp);
	}

	return res;
}
2023-01-27 13:01:16 +01:00
// Array-returning wrapper around train_test_split(): element 0 is the training set and
// element 1 is the test set.
Array MLPPData::train_test_split_bind(const Ref<MLPPDataComplex> &data, real_t test_size) {
	SplitComplexData split = train_test_split(data, test_size);

	Array result;
	result.push_back(split.train);
	result.push_back(split.test);

	return result;
}
2023-01-24 19:00:54 +01:00
// Loading Datasets
2023-01-27 13:01:16 +01:00
std : : tuple < std : : vector < std : : vector < real_t > > , std : : vector < real_t > > MLPPData : : loadBreastCancer ( ) {
2023-01-24 19:00:54 +01:00
const int BREAST_CANCER_SIZE = 30 ; // k = 30
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < real_t > > inputSet ;
std : : vector < real_t > outputSet ;
2023-01-24 19:00:54 +01:00
setData ( BREAST_CANCER_SIZE , " MLPP/Data/Datasets/BreastCancer.csv " , inputSet , outputSet ) ;
return { inputSet , outputSet } ;
}
2023-01-27 13:01:16 +01:00
std : : tuple < std : : vector < std : : vector < real_t > > , std : : vector < real_t > > MLPPData : : loadBreastCancerSVC ( ) {
2023-01-24 19:00:54 +01:00
const int BREAST_CANCER_SIZE = 30 ; // k = 30
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < real_t > > inputSet ;
std : : vector < real_t > outputSet ;
2023-01-24 19:00:54 +01:00
setData ( BREAST_CANCER_SIZE , " MLPP/Data/Datasets/BreastCancerSVM.csv " , inputSet , outputSet ) ;
return { inputSet , outputSet } ;
}
2023-01-27 13:01:16 +01:00
std : : tuple < std : : vector < std : : vector < real_t > > , std : : vector < std : : vector < real_t > > > MLPPData : : loadIris ( ) {
2023-01-24 19:00:54 +01:00
const int IRIS_SIZE = 4 ;
const int ONE_HOT_NUM = 3 ;
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < real_t > > inputSet ;
std : : vector < real_t > tempOutputSet ;
2023-01-24 19:00:54 +01:00
setData ( IRIS_SIZE , " /Users/marcmelikyan/Desktop/Data/Iris.csv " , inputSet , tempOutputSet ) ;
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < real_t > > outputSet = oneHotRep ( tempOutputSet , ONE_HOT_NUM ) ;
2023-01-24 19:00:54 +01:00
return { inputSet , outputSet } ;
}
2023-01-27 13:01:16 +01:00
std : : tuple < std : : vector < std : : vector < real_t > > , std : : vector < std : : vector < real_t > > > MLPPData : : loadWine ( ) {
2023-01-24 19:00:54 +01:00
const int WINE_SIZE = 4 ;
const int ONE_HOT_NUM = 3 ;
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < real_t > > inputSet ;
std : : vector < real_t > tempOutputSet ;
2023-01-24 19:00:54 +01:00
setData ( WINE_SIZE , " MLPP/Data/Datasets/Iris.csv " , inputSet , tempOutputSet ) ;
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < real_t > > outputSet = oneHotRep ( tempOutputSet , ONE_HOT_NUM ) ;
2023-01-24 19:00:54 +01:00
return { inputSet , outputSet } ;
}
2023-01-27 13:01:16 +01:00
std : : tuple < std : : vector < std : : vector < real_t > > , std : : vector < std : : vector < real_t > > > MLPPData : : loadMnistTrain ( ) {
2023-01-24 19:00:54 +01:00
const int MNIST_SIZE = 784 ;
const int ONE_HOT_NUM = 10 ;
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < real_t > > inputSet ;
std : : vector < real_t > tempOutputSet ;
2023-01-24 19:00:54 +01:00
setData ( MNIST_SIZE , " MLPP/Data/Datasets/MnistTrain.csv " , inputSet , tempOutputSet ) ;
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < real_t > > outputSet = oneHotRep ( tempOutputSet , ONE_HOT_NUM ) ;
2023-01-24 19:00:54 +01:00
return { inputSet , outputSet } ;
}
2023-01-27 13:01:16 +01:00
std : : tuple < std : : vector < std : : vector < real_t > > , std : : vector < std : : vector < real_t > > > MLPPData : : loadMnistTest ( ) {
2023-01-24 19:00:54 +01:00
const int MNIST_SIZE = 784 ;
const int ONE_HOT_NUM = 10 ;
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < real_t > > inputSet ;
std : : vector < real_t > tempOutputSet ;
2023-01-24 19:00:54 +01:00
setData ( MNIST_SIZE , " MLPP/Data/Datasets/MnistTest.csv " , inputSet , tempOutputSet ) ;
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < real_t > > outputSet = oneHotRep ( tempOutputSet , ONE_HOT_NUM ) ;
2023-01-24 19:00:54 +01:00
return { inputSet , outputSet } ;
}
2023-01-27 13:01:16 +01:00
std : : tuple < std : : vector < std : : vector < real_t > > , std : : vector < real_t > > MLPPData : : loadCaliforniaHousing ( ) {
2023-01-24 19:00:54 +01:00
const int CALIFORNIA_HOUSING_SIZE = 13 ; // k = 30
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < real_t > > inputSet ;
std : : vector < real_t > outputSet ;
2023-01-24 19:00:54 +01:00
setData ( CALIFORNIA_HOUSING_SIZE , " MLPP/Data/Datasets/CaliforniaHousing.csv " , inputSet , outputSet ) ;
return { inputSet , outputSet } ;
}
2023-01-27 13:01:16 +01:00
std : : tuple < std : : vector < real_t > , std : : vector < real_t > > MLPPData : : loadFiresAndCrime ( ) {
std : : vector < real_t > inputSet ; // k is implicitly 1.
std : : vector < real_t > outputSet ;
2023-01-24 19:00:54 +01:00
setData ( " MLPP/Data/Datasets/FiresAndCrime.csv " , inputSet , outputSet ) ;
return { inputSet , outputSet } ;
}
2023-01-26 14:52:49 +01:00
// Note that inputs and outputs should be pairs (technically), but this
// implementation will separate them. (My implementation keeps them tied together.)
// Not yet sure whether this is intentional or not (or it's something like a compiler specific difference)
2023-01-27 13:01:16 +01:00
std : : tuple < std : : vector < std : : vector < real_t > > , std : : vector < std : : vector < real_t > > , std : : vector < std : : vector < real_t > > , std : : vector < std : : vector < real_t > > > MLPPData : : trainTestSplit ( std : : vector < std : : vector < real_t > > inputSet , std : : vector < std : : vector < real_t > > outputSet , real_t testSize ) {
2023-01-24 19:00:54 +01:00
std : : random_device rd ;
std : : default_random_engine generator ( rd ( ) ) ;
std : : shuffle ( inputSet . begin ( ) , inputSet . end ( ) , generator ) ; // inputSet random shuffle
std : : shuffle ( outputSet . begin ( ) , outputSet . end ( ) , generator ) ; // outputSet random shuffle)
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < real_t > > inputTestSet ;
std : : vector < std : : vector < real_t > > outputTestSet ;
2023-01-24 19:00:54 +01:00
int testInputNumber = testSize * inputSet . size ( ) ; // implicit usage of floor
int testOutputNumber = testSize * outputSet . size ( ) ; // implicit usage of floor
for ( int i = 0 ; i < testInputNumber ; i + + ) {
inputTestSet . push_back ( inputSet [ i ] ) ;
inputSet . erase ( inputSet . begin ( ) ) ;
}
for ( int i = 0 ; i < testOutputNumber ; i + + ) {
outputTestSet . push_back ( outputSet [ i ] ) ;
outputSet . erase ( outputSet . begin ( ) ) ;
}
return { inputSet , outputSet , inputTestSet , outputTestSet } ;
}
// MULTIVARIATE SUPERVISED
2023-01-27 13:01:16 +01:00
void MLPPData : : setData ( int k , std : : string fileName , std : : vector < std : : vector < real_t > > & inputSet , std : : vector < real_t > & outputSet ) {
2023-04-22 17:17:58 +02:00
MLPPLinAlgOld alg ;
2023-01-24 19:00:54 +01:00
std : : string inputTemp ;
std : : string outputTemp ;
inputSet . resize ( k ) ;
std : : ifstream dataFile ( fileName ) ;
if ( ! dataFile . is_open ( ) ) {
std : : cout < < fileName < < " failed to open. " < < std : : endl ;
}
std : : string line ;
while ( std : : getline ( dataFile , line ) ) {
std : : stringstream ss ( line ) ;
for ( int i = 0 ; i < k ; i + + ) {
std : : getline ( ss , inputTemp , ' , ' ) ;
inputSet [ i ] . push_back ( std : : stod ( inputTemp ) ) ;
}
std : : getline ( ss , outputTemp , ' , ' ) ;
outputSet . push_back ( std : : stod ( outputTemp ) ) ;
}
inputSet = alg . transpose ( inputSet ) ;
dataFile . close ( ) ;
}
2023-01-27 13:01:16 +01:00
void MLPPData : : printData ( std : : vector < std : : string > inputName , std : : string outputName , std : : vector < std : : vector < real_t > > inputSet , std : : vector < real_t > outputSet ) {
2023-04-22 17:17:58 +02:00
MLPPLinAlgOld alg ;
2023-01-24 19:00:54 +01:00
inputSet = alg . transpose ( inputSet ) ;
2023-02-12 18:03:17 +01:00
for ( uint32_t i = 0 ; i < inputSet . size ( ) ; i + + ) {
2023-01-24 19:00:54 +01:00
std : : cout < < inputName [ i ] < < std : : endl ;
2023-02-12 18:03:17 +01:00
for ( uint32_t j = 0 ; j < inputSet [ i ] . size ( ) ; j + + ) {
2023-01-24 19:00:54 +01:00
std : : cout < < inputSet [ i ] [ j ] < < std : : endl ;
}
}
std : : cout < < outputName < < std : : endl ;
2023-02-12 18:03:17 +01:00
for ( uint32_t i = 0 ; i < outputSet . size ( ) ; i + + ) {
2023-01-24 19:00:54 +01:00
std : : cout < < outputSet [ i ] < < std : : endl ;
}
}
// UNSUPERVISED
2023-01-27 13:01:16 +01:00
void MLPPData : : setData ( int k , std : : string fileName , std : : vector < std : : vector < real_t > > & inputSet ) {
2023-04-22 17:17:58 +02:00
MLPPLinAlgOld alg ;
2023-01-24 19:00:54 +01:00
std : : string inputTemp ;
inputSet . resize ( k ) ;
std : : ifstream dataFile ( fileName ) ;
if ( ! dataFile . is_open ( ) ) {
std : : cout < < fileName < < " failed to open. " < < std : : endl ;
}
std : : string line ;
while ( std : : getline ( dataFile , line ) ) {
std : : stringstream ss ( line ) ;
for ( int i = 0 ; i < k ; i + + ) {
std : : getline ( ss , inputTemp , ' , ' ) ;
inputSet [ i ] . push_back ( std : : stod ( inputTemp ) ) ;
}
}
inputSet = alg . transpose ( inputSet ) ;
dataFile . close ( ) ;
}
2023-01-27 13:01:16 +01:00
void MLPPData : : printData ( std : : vector < std : : string > inputName , std : : vector < std : : vector < real_t > > inputSet ) {
2023-04-22 17:17:58 +02:00
MLPPLinAlgOld alg ;
2023-01-24 19:00:54 +01:00
inputSet = alg . transpose ( inputSet ) ;
2023-02-12 18:03:17 +01:00
for ( uint32_t i = 0 ; i < inputSet . size ( ) ; i + + ) {
2023-01-24 19:00:54 +01:00
std : : cout < < inputName [ i ] < < std : : endl ;
2023-02-12 18:03:17 +01:00
for ( uint32_t j = 0 ; j < inputSet [ i ] . size ( ) ; j + + ) {
2023-01-24 19:00:54 +01:00
std : : cout < < inputSet [ i ] [ j ] < < std : : endl ;
}
}
}
// SIMPLE
2023-01-27 13:01:16 +01:00
void MLPPData : : setData ( std : : string fileName , std : : vector < real_t > & inputSet , std : : vector < real_t > & outputSet ) {
2023-01-24 19:00:54 +01:00
std : : string inputTemp , outputTemp ;
std : : ifstream dataFile ( fileName ) ;
if ( ! dataFile . is_open ( ) ) {
std : : cout < < " The file failed to open. " < < std : : endl ;
}
std : : string line ;
while ( std : : getline ( dataFile , line ) ) {
std : : stringstream ss ( line ) ;
std : : getline ( ss , inputTemp , ' , ' ) ;
std : : getline ( ss , outputTemp , ' , ' ) ;
inputSet . push_back ( std : : stod ( inputTemp ) ) ;
outputSet . push_back ( std : : stod ( outputTemp ) ) ;
}
dataFile . close ( ) ;
}
2023-01-27 13:01:16 +01:00
void MLPPData : : printData ( std : : string & inputName , std : : string & outputName , std : : vector < real_t > & inputSet , std : : vector < real_t > & outputSet ) {
2023-01-24 19:00:54 +01:00
std : : cout < < inputName < < std : : endl ;
2023-02-12 18:03:17 +01:00
for ( uint32_t i = 0 ; i < inputSet . size ( ) ; i + + ) {
2023-01-24 19:00:54 +01:00
std : : cout < < inputSet [ i ] < < std : : endl ;
}
std : : cout < < outputName < < std : : endl ;
2023-02-12 18:03:17 +01:00
for ( uint32_t i = 0 ; i < inputSet . size ( ) ; i + + ) {
2023-01-24 19:00:54 +01:00
std : : cout < < outputSet [ i ] < < std : : endl ;
}
}
// Images
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < real_t > > MLPPData : : rgb2gray ( std : : vector < std : : vector < std : : vector < real_t > > > input ) {
std : : vector < std : : vector < real_t > > grayScale ;
2023-01-24 19:00:54 +01:00
grayScale . resize ( input [ 0 ] . size ( ) ) ;
2023-02-12 18:03:17 +01:00
for ( uint32_t i = 0 ; i < grayScale . size ( ) ; i + + ) {
2023-01-24 19:00:54 +01:00
grayScale [ i ] . resize ( input [ 0 ] [ i ] . size ( ) ) ;
}
2023-02-12 18:03:17 +01:00
for ( uint32_t i = 0 ; i < grayScale . size ( ) ; i + + ) {
for ( uint32_t j = 0 ; j < grayScale [ i ] . size ( ) ; j + + ) {
2023-01-24 19:00:54 +01:00
grayScale [ i ] [ j ] = 0.299 * input [ 0 ] [ i ] [ j ] + 0.587 * input [ 1 ] [ i ] [ j ] + 0.114 * input [ 2 ] [ i ] [ j ] ;
}
}
return grayScale ;
}
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < std : : vector < real_t > > > MLPPData : : rgb2ycbcr ( std : : vector < std : : vector < std : : vector < real_t > > > input ) {
2023-04-22 17:17:58 +02:00
MLPPLinAlgOld alg ;
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < std : : vector < real_t > > > YCbCr ;
2023-01-24 19:00:54 +01:00
YCbCr = alg . resize ( YCbCr , input ) ;
2023-02-12 18:03:17 +01:00
for ( uint32_t i = 0 ; i < YCbCr [ 0 ] . size ( ) ; i + + ) {
for ( uint32_t j = 0 ; j < YCbCr [ 0 ] [ i ] . size ( ) ; j + + ) {
2023-01-24 19:00:54 +01:00
YCbCr [ 0 ] [ i ] [ j ] = 0.299 * input [ 0 ] [ i ] [ j ] + 0.587 * input [ 1 ] [ i ] [ j ] + 0.114 * input [ 2 ] [ i ] [ j ] ;
YCbCr [ 1 ] [ i ] [ j ] = - 0.169 * input [ 0 ] [ i ] [ j ] - 0.331 * input [ 1 ] [ i ] [ j ] + 0.500 * input [ 2 ] [ i ] [ j ] ;
YCbCr [ 2 ] [ i ] [ j ] = 0.500 * input [ 0 ] [ i ] [ j ] - 0.419 * input [ 1 ] [ i ] [ j ] - 0.081 * input [ 2 ] [ i ] [ j ] ;
}
}
return YCbCr ;
}
// Conversion formulas available here:
// https://www.rapidtables.com/convert/color/rgb-to-hsv.html
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < std : : vector < real_t > > > MLPPData : : rgb2hsv ( std : : vector < std : : vector < std : : vector < real_t > > > input ) {
2023-04-22 17:17:58 +02:00
MLPPLinAlgOld alg ;
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < std : : vector < real_t > > > HSV ;
2023-01-24 19:00:54 +01:00
HSV = alg . resize ( HSV , input ) ;
2023-02-12 18:03:17 +01:00
for ( uint32_t i = 0 ; i < HSV [ 0 ] . size ( ) ; i + + ) {
for ( uint32_t j = 0 ; j < HSV [ 0 ] [ i ] . size ( ) ; j + + ) {
2023-01-27 13:01:16 +01:00
real_t rPrime = input [ 0 ] [ i ] [ j ] / 255 ;
real_t gPrime = input [ 1 ] [ i ] [ j ] / 255 ;
real_t bPrime = input [ 2 ] [ i ] [ j ] / 255 ;
2023-01-24 19:00:54 +01:00
2023-01-27 13:01:16 +01:00
real_t cMax = alg . max ( { rPrime , gPrime , bPrime } ) ;
real_t cMin = alg . min ( { rPrime , gPrime , bPrime } ) ;
real_t delta = cMax - cMin ;
2023-01-24 19:00:54 +01:00
// H calculation.
if ( delta = = 0 ) {
HSV [ 0 ] [ i ] [ j ] = 0 ;
} else {
if ( cMax = = rPrime ) {
HSV [ 0 ] [ i ] [ j ] = 60 * fmod ( ( ( gPrime - bPrime ) / delta ) , 6 ) ;
} else if ( cMax = = gPrime ) {
HSV [ 0 ] [ i ] [ j ] = 60 * ( ( bPrime - rPrime ) / delta + 2 ) ;
} else { // cMax == bPrime
HSV [ 0 ] [ i ] [ j ] = 60 * ( ( rPrime - gPrime ) / delta + 6 ) ;
}
}
// S calculation.
if ( cMax = = 0 ) {
HSV [ 1 ] [ i ] [ j ] = 0 ;
} else {
HSV [ 1 ] [ i ] [ j ] = delta / cMax ;
}
// V calculation.
HSV [ 2 ] [ i ] [ j ] = cMax ;
}
}
return HSV ;
}
// http://machinethatsees.blogspot.com/2013/07/how-to-convert-rgb-to-xyz-or-vice-versa.html
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < std : : vector < real_t > > > MLPPData : : rgb2xyz ( std : : vector < std : : vector < std : : vector < real_t > > > input ) {
2023-04-22 17:17:58 +02:00
MLPPLinAlgOld alg ;
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < std : : vector < real_t > > > XYZ ;
2023-01-24 19:00:54 +01:00
XYZ = alg . resize ( XYZ , input ) ;
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < real_t > > RGB2XYZ = { { 0.4124564 , 0.3575761 , 0.1804375 } , { 0.2126726 , 0.7151522 , 0.0721750 } , { 0.0193339 , 0.1191920 , 0.9503041 } } ;
2023-01-24 19:00:54 +01:00
return alg . vector_wise_tensor_product ( input , RGB2XYZ ) ;
}
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < std : : vector < real_t > > > MLPPData : : xyz2rgb ( std : : vector < std : : vector < std : : vector < real_t > > > input ) {
2023-04-22 17:17:58 +02:00
MLPPLinAlgOld alg ;
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < std : : vector < real_t > > > XYZ ;
2023-01-24 19:00:54 +01:00
XYZ = alg . resize ( XYZ , input ) ;
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < real_t > > RGB2XYZ = alg . inverse ( { { 0.4124564 , 0.3575761 , 0.1804375 } , { 0.2126726 , 0.7151522 , 0.0721750 } , { 0.0193339 , 0.1191920 , 0.9503041 } } ) ;
2023-01-24 19:00:54 +01:00
return alg . vector_wise_tensor_product ( input , RGB2XYZ ) ;
}
// TEXT-BASED & NLP
2023-01-25 00:21:31 +01:00
std : : string MLPPData : : toLower ( std : : string text ) {
2023-02-12 18:03:17 +01:00
for ( uint32_t i = 0 ; i < text . size ( ) ; i + + ) {
2023-01-24 19:00:54 +01:00
text [ i ] = tolower ( text [ i ] ) ;
}
return text ;
}
2023-01-25 00:21:31 +01:00
std : : vector < char > MLPPData : : split ( std : : string text ) {
2023-01-24 19:00:54 +01:00
std : : vector < char > split_data ;
2023-02-12 18:03:17 +01:00
for ( uint32_t i = 0 ; i < text . size ( ) ; i + + ) {
2023-01-24 19:00:54 +01:00
split_data . push_back ( text [ i ] ) ;
}
return split_data ;
}
2023-12-28 17:41:20 +01:00
// Splits `data` into sentences at every '.' that is not directly followed by
// another '.' (so the inner dots of an ellipsis do not split).
//
// The terminating period is not part of the returned sentence, empty pieces
// are skipped, and any text after the last period is returned as a final
// sentence. An empty input yields an empty vector.
Vector<String> MLPPData::split_sentences(String data) {
	Vector<String> sentences;

	const int length = data.length();
	int start_index = 0;

	for (int i = 0; i < length; ++i) {
		if (data[i] != '.') {
			continue;
		}

		// A period followed by another period belongs to an ellipsis.
		if (i + 1 < length && data[i + 1] == '.') {
			continue;
		}

		if (i > start_index) {
			sentences.push_back(data.substr(start_index, i - start_index));
		}

		start_index = i + 1;
	}

	// Trailing text that was not closed by a period.
	if (start_index < length) {
		sentences.push_back(data.substr(start_index, length - start_index));
	}

	return sentences;
}
2023-12-28 17:41:20 +01:00
// Returns `data` with every space character stripped from each entry.
// The number and order of entries is unchanged.
Vector<String> MLPPData::remove_spaces(Vector<String> data) {
	const int entry_count = data.size();

	for (int idx = 0; idx < entry_count; ++idx) {
		const String stripped = data[idx].replace(" ", "");
		data.write[idx] = stripped;
	}

	return data;
}
2023-12-28 17:41:20 +01:00
// Returns `data` without its empty strings; the remaining entries keep their
// relative order.
Vector<String> MLPPData::remove_empty(Vector<String> data) {
	// Walk backwards so that removing entry i never shifts an entry that has
	// not been examined yet (a forward walk would skip the element that
	// follows each removal).
	for (int i = data.size() - 1; i >= 0; --i) {
		if (data[i].empty()) {
			data.remove(i);
		}
	}

	return data;
}
2023-12-28 17:41:20 +01:00
// Splits `text` into tokens.
//
// - A space ends the current token.
// - ',', '!', '.' and '-' end the current token and are emitted as their own
//   one-character token; a single space directly after the punctuation is
//   consumed together with it.
// - The last token is emitted when the final character of the text is reached.
//
// Tokens may be empty (e.g. for two consecutive spaces); callers that care
// have to filter them out.
Vector<String> MLPPData::segment(String text) {
	Vector<String> segmented_data;

	const int length = text.length();
	int prev_delim = 0;

	for (int i = 0; i < length; i++) {
		if (text[i] == ' ') {
			segmented_data.push_back(text.substr(prev_delim, i - prev_delim));
			prev_delim = i + 1;
		} else if (text[i] == ',' || text[i] == '!' || text[i] == '.' || text[i] == '-') {
			segmented_data.push_back(text.substr(prev_delim, i - prev_delim));

			String punc;
			punc += text[i];
			segmented_data.push_back(punc);

			prev_delim = i + 1;

			// Only swallow the next character when it really is the separating
			// space; otherwise ("a,b", "well-known") it starts the next token
			// and must not be dropped.
			if (i + 1 < length && text[i + 1] == ' ') {
				++i;
				prev_delim = i + 1;
			}
		} else if (i == length - 1) {
			// Final word of the text, not terminated by a delimiter.
			segmented_data.push_back(text.substr(prev_delim, length - prev_delim));
		}
	}

	return segmented_data;
}
2023-12-28 17:41:20 +01:00
// Maps every token produced by segment(text) to an integer id.
// Ids start at 1 and are handed out in order of first appearance; a repeated
// token receives the id of its earlier occurrence.
Vector<int> MLPPData::tokenize(String text) {
	const Vector<String> words = segment(text);
	const int word_count = words.size();

	Vector<int> tokens;
	tokens.resize(word_count);

	int last_id = 0;

	for (int current = 0; current < word_count; ++current) {
		// Look for an earlier occurrence of the same word.
		int earlier = current - 1;
		while (earlier >= 0 && words[current] != words[earlier]) {
			--earlier;
		}

		if (earlier >= 0) {
			tokens.write[current] = tokens[earlier];
		} else {
			++last_id;
			tokens.write[current] = last_id;
		}
	}

	return tokens;
}
2023-12-28 17:41:20 +01:00
// Lower-cases `text`, segments it, strips spaces from the tokens and returns
// the tokens that are not contained in `stop_words`, in their original order.
Vector<String> MLPPData::remove_stop_words(String text) {
	const Vector<String> words = remove_spaces(segment(text.to_lower()));

	Vector<String> kept;

	for (int w = 0; w < words.size(); ++w) {
		bool is_stop_word = false;

		for (int s = 0; s < stop_words.size() && !is_stop_word; ++s) {
			is_stop_word = (words[w] == stop_words[s]);
		}

		if (!is_stop_word) {
			kept.push_back(words[w]);
		}
	}

	return kept;
}
2023-12-28 17:41:20 +01:00
// Returns `segmented_data` without the entries that appear in `stop_words`.
// Unlike remove_stop_words() the input is taken as-is (no lower-casing or
// segmentation). The remaining entries keep their relative order.
Vector<String> MLPPData::remove_stop_words_vec(Vector<String> segmented_data) {
	// Walk backwards so a removal never shifts an unexamined entry, and stop
	// comparing as soon as the entry is gone: continuing the inner loop after
	// the removal would index the vector with a stale (possibly -1) position.
	for (int i = segmented_data.size() - 1; i >= 0; --i) {
		for (int j = 0; j < stop_words.size(); ++j) {
			if (segmented_data[i] == stop_words[j]) {
				segmented_data.remove(i);
				break;
			}
		}
	}

	return segmented_data;
}
2023-12-28 17:41:20 +01:00
// Strips every entry of `suffixes` from `text` wherever the suffix is
// directly followed by ' ', ',', '-', '.' or '!'.
//
// Four spaces are appended to the text before scanning so that a suffix at the
// very end of the input is followed by a space as well; this padding is still
// part of the returned string.
String MLPPData::stemming(String text) {
	const int padding_size = 4;
	const String padding = " "; // our padding

	text += String(padding).repeat(padding_size);

	// text.length() is re-read on every pass because erase() shortens the text.
	for (int pos = 0; pos < text.length(); ++pos) {
		for (int s = 0; s < suffixes.size(); ++s) {
			const String &suffix = suffixes[s];
			const int suffix_length = suffix.length();

			if (text.substr(pos, suffix_length) != suffix) {
				continue;
			}

			// Character right behind the candidate suffix.
			const auto following = text[pos + suffix_length];
			const bool at_word_end = following == ' ' || following == ',' || following == '-' || following == '.' || following == '!';

			if (at_word_end) {
				text.erase(pos, suffix_length);
			}
		}
	}

	return text;
}
2023-12-28 17:41:20 +01:00
// Builds a bag-of-words matrix for `sentences`.
//
// Steps:
//   1) Build the vocabulary: create_word_list() followed by stop-word and
//      empty-entry removal.
//   2) Tokenize every sentence with remove_stop_words() (which also
//      lower-cases).
//   3) Fill a matrix with one row per sentence and one column per vocabulary
//      word.
//
// type: BAG_OF_WORDS_TYPE_BINARY stores 1 when the word occurs in the
//       sentence; any other value stores the number of occurrences.
Ref<MLPPMatrix> MLPPData::bag_of_words(Vector<String> sentences, BagOfWordsType type) {
	const Vector<String> vocabulary = remove_empty(remove_stop_words_vec(create_word_list(sentences)));

	const int sentence_count = sentences.size();

	Vector<Vector<String>> tokenized;
	tokenized.resize(sentence_count);

	for (int s = 0; s < sentence_count; ++s) {
		tokenized.write[s] = remove_stop_words(sentences[s]);
	}

	Ref<MLPPMatrix> bow;
	bow.instance();
	bow->resize(Size2i(vocabulary.size(), sentence_count));
	bow->fill(0);

	const bool binary = (type == BAG_OF_WORDS_TYPE_BINARY);

	for (int s = 0; s < sentence_count; ++s) {
		const Vector<String> &tokens = tokenized[s];

		for (int t = 0; t < tokens.size(); ++t) {
			for (int v = 0; v < vocabulary.size(); ++v) {
				if (tokens[t] != vocabulary[v]) {
					continue;
				}

				if (binary) {
					bow->element_set(s, v, 1);
				} else {
					bow->element_set(s, v, bow->element_get(s, v) + 1);
				}
			}
		}
	}

	return bow;
}
2023-12-28 17:41:20 +01:00
// Computes a TF-IDF matrix for `sentences`.
//
// The vocabulary and the per-sentence tokens are built exactly like in
// bag_of_words(). The result has one row per sentence and one column per
// vocabulary word:
//   TF(i, k)  = occurrences of word k in sentence i / token count of sentence i
//   IDF(k)    = log(sentence count / number of sentences containing word k)
//   result    = TF(i, k) * IDF(k)
//
// Sentences without tokens keep an all-zero row, and a word that occurs in no
// sentence gets an IDF of 0, so neither case produces inf/NaN entries.
Ref<MLPPMatrix> MLPPData::tfidf(Vector<String> sentences) {
	Vector<String> word_list = remove_empty(remove_stop_words_vec(create_word_list(sentences)));

	Vector<Vector<String>> segmented_sentences;
	segmented_sentences.resize(sentences.size());

	for (int i = 0; i < sentences.size(); i++) {
		segmented_sentences.write[i] = remove_stop_words(sentences[i]);
	}

	Ref<MLPPMatrix> TF;
	TF.instance();
	TF->resize(Size2i(word_list.size(), segmented_sentences.size()));
	// The counts below are accumulated with element_get() + 1, so the matrix
	// has to start from zero (same as bag_of_words()).
	TF->fill(0);

	// Number of sentences each vocabulary word occurs in.
	Vector<int> frequency;
	frequency.resize(word_list.size());
	frequency.fill(0);

	Ref<MLPPVector> TF_row;
	TF_row.instance();
	TF_row->resize(word_list.size());

	for (int i = 0; i < segmented_sentences.size(); i++) {
		// Tracks which words were already counted for this sentence.
		Vector<bool> present;
		present.resize(word_list.size());
		present.fill(false);

		const Vector<String> &words = segmented_sentences[i];

		for (int j = 0; j < words.size(); j++) {
			for (int k = 0; k < word_list.size(); k++) {
				if (words[j] == word_list[k]) {
					TF->element_set(i, k, TF->element_get(i, k) + 1);

					if (!present[k]) {
						frequency.write[k]++;
						present.write[k] = true;
					}
				}
			}
		}

		// Normalize the counts by the sentence length; an empty sentence has
		// nothing to normalize and would otherwise divide by zero.
		if (words.size() > 0) {
			TF->row_get_into_mlpp_vector(i, TF_row);
			TF_row->scalar_multiply(real_t(1) / real_t(words.size()));
			TF->row_set_mlpp_vector(i, TF_row);
		}
	}

	Vector<real_t> IDF;
	IDF.resize(frequency.size());

	for (int i = 0; i < IDF.size(); i++) {
		if (frequency[i] > 0) {
			IDF.write[i] = Math::log(real_t(segmented_sentences.size()) / real_t(frequency[i]));
		} else {
			IDF.write[i] = 0;
		}
	}

	Ref<MLPPMatrix> TFIDF;
	TFIDF.instance();
	Size2i tfidf_size = Size2i(word_list.size(), segmented_sentences.size());
	TFIDF->resize(tfidf_size);

	for (int i = 0; i < tfidf_size.y; i++) {
		for (int j = 0; j < tfidf_size.x; j++) {
			TFIDF->element_set(i, j, TF->element_get(i, j) * IDF[j]);
		}
	}

	return TFIDF;
}
2023-12-28 17:41:20 +01:00
// Trains word embeddings for `sentences` with an MLPPSoftmaxNet.
//
// sentences:     input sentences; tokenized with remove_stop_words().
// type:          WORD_TO_VEC_TYPE_SKIPGRAM constructs the network with
//                (output_set, input_set); every other value (CBOW, the
//                default) constructs it with (input_set, output_set).
// windowSize:    number of neighbours taken on each side of a word.
// dimension:     forwarded to the MLPPSoftmaxNet constructor.
// learning_rate, max_epoch: forwarded to train_gradient_descent().
//
// Returns the vocabulary (word_list) and the embeddings reported by the model.
MLPPData::WordsToVecResult MLPPData::word_to_vec(Vector<String> sentences, WordToVecType type, int windowSize, int dimension, real_t learning_rate, int max_epoch) {
	WordsToVecResult res;

	res.word_list = remove_empty(remove_stop_words_vec(create_word_list(sentences)));

	Vector<Vector<String>> segmented_sentences;
	segmented_sentences.resize(sentences.size());

	for (int i = 0; i < sentences.size(); i++) {
		segmented_sentences.write[i] = remove_stop_words(sentences[i]);
	}

	// Parallel lists: inputStrings[n] is a word, outputStrings[n] one of the
	// words found within windowSize positions of it.
	Vector<String> inputStrings;
	Vector<String> outputStrings;

	for (int i = 0; i < segmented_sentences.size(); i++) {
		const Vector<String> &words = segmented_sentences[i];

		for (int j = 0; j < words.size(); j++) {
			for (int k = windowSize; k > 0; k--) {
				const int jmk = j - k;

				if (jmk >= 0) {
					inputStrings.push_back(words[j]);
					outputStrings.push_back(words[jmk]);
				}

				if (j + k <= words.size() - 1) {
					inputStrings.push_back(words[j]);
					outputStrings.push_back(words[j + k]);
				}
			}
		}
	}

	int input_size = inputStrings.size();

	// Encode both lists in one go so they share the same vocabulary columns;
	// the first input_size rows of the result belong to the inputs, the rest
	// to the outputs.
	inputStrings.append_array(outputStrings);

	Ref<MLPPMatrix> bow = bag_of_words(inputStrings, BAG_OF_WORDS_TYPE_BINARY);
	Size2i bow_size = bow->size();

	Ref<MLPPMatrix> input_set;
	Ref<MLPPMatrix> output_set;
	input_set.instance();
	output_set.instance();

	Ref<MLPPVector> row_tmp;
	row_tmp.instance();
	row_tmp->resize(bow_size.x);

	input_set->resize(Size2i(bow_size.x, input_size));

	for (int i = 0; i < input_size; i++) {
		bow->row_get_into_mlpp_vector(i, row_tmp);
		input_set->row_set_mlpp_vector(i, row_tmp);
	}

	output_set->resize(Size2i(bow_size.x, bow_size.y - input_size));
	Size2i output_set_size = output_set->size();

	for (int i = 0; i < output_set_size.y; i++) {
		bow->row_get_into_mlpp_vector(i + input_size, row_tmp);
		// These rows belong to the output half of the encoding.
		output_set->row_set_mlpp_vector(i, row_tmp);
	}

	MLPPSoftmaxNet *model;

	if (type == WORD_TO_VEC_TYPE_SKIPGRAM) {
		model = memnew(MLPPSoftmaxNet(output_set, input_set, dimension));
	} else { // else = CBOW. We maintain it is a default.
		model = memnew(MLPPSoftmaxNet(input_set, output_set, dimension));
	}

	model->train_gradient_descent(learning_rate, max_epoch);

	res.word_embeddings = model->get_embeddings();
	memdelete(model);

	return res;
}
2023-12-28 17:41:20 +01:00
// Latent semantic analysis on the binary bag-of-words matrix of `sentences`.
//
// The matrix is decomposed with MLPPLinAlg::svd(); the first `dim` diagonal
// entries of S and the first `dim` rows of Vt are kept, and their product
// (S_trunc * Vt_trunc) is returned.
// NOTE(review): `dim` is not checked against the size of S / Vt here.
Ref<MLPPMatrix> MLPPData::lsa(Vector<String> sentences, int dim) {
	MLPPLinAlg alg;

	Ref<MLPPMatrix> doc_word_matrix = bag_of_words(sentences, BAG_OF_WORDS_TYPE_BINARY);
	MLPPLinAlg::SVDResult svd = alg.svd(doc_word_matrix);

	const int vt_columns = svd.Vt->size().x;

	Ref<MLPPMatrix> s_truncated = alg.zeromatnm(dim, dim);

	Ref<MLPPMatrix> vt_truncated;
	vt_truncated.instance();
	vt_truncated->resize(Size2i(vt_columns, dim));

	// Scratch row used to copy rows from Vt into the truncated matrix.
	Ref<MLPPVector> row_buffer;
	row_buffer.instance();
	row_buffer->resize(vt_columns);

	for (int d = 0; d < dim; ++d) {
		s_truncated->element_set(d, d, svd.S->element_get(d, d));

		svd.Vt->row_get_into_mlpp_vector(d, row_buffer);
		vt_truncated->row_set_mlpp_vector(d, row_buffer);
	}

	return s_truncated->multn(vt_truncated);
}
2023-12-28 17:41:20 +01:00
// Builds the word list for a set of sentences: the sentences are joined with
// single spaces, passed through remove_stop_words(), reduced with
// vec_to_set(), and finally stripped of spaces.
Vector<String> MLPPData::create_word_list(Vector<String> sentences) {
	String combined_text;

	const int sentence_count = sentences.size();

	for (int s = 0; s < sentence_count; ++s) {
		if (s > 0) {
			combined_text += " ";
		}

		combined_text += sentences[s];
	}

	Vector<String> words = remove_stop_words(combined_text);
	Vector<String> unique_words = vec_to_set(words);

	return remove_spaces(unique_words);
}
// EXTRA
2023-01-25 00:21:31 +01:00
// Appends every line of the text file `fileName` to `inputNames`.
//
// fileName:   path handed to std::ifstream.
// inputNames: output vector; lines are pushed to the back in file order.
//
// When the file cannot be opened a message is written to std::cout and
// execution continues; the read loop below then does not append anything.
void MLPPData : : setInputNames ( std : : string fileName , std : : vector < std : : string > & inputNames ) {
2023-01-24 19:00:54 +01:00
std : : string inputNameTemp ;
std : : ifstream dataFile ( fileName ) ;
// Report the failure, but do not return early.
if ( ! dataFile . is_open ( ) ) {
std : : cout < < fileName < < " failed to open. " < < std : : endl ;
}
// One entry per line.
while ( std : : getline ( dataFile , inputNameTemp ) ) {
inputNames . push_back ( inputNameTemp ) ;
}
dataFile . close ( ) ;
}
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < real_t > > MLPPData : : featureScaling ( std : : vector < std : : vector < real_t > > X ) {
2023-04-22 17:17:58 +02:00
MLPPLinAlgOld alg ;
2023-01-24 19:00:54 +01:00
X = alg . transpose ( X ) ;
2023-01-27 13:01:16 +01:00
std : : vector < real_t > max_elements , min_elements ;
2023-01-24 19:00:54 +01:00
max_elements . resize ( X . size ( ) ) ;
min_elements . resize ( X . size ( ) ) ;
2023-02-12 18:03:17 +01:00
for ( uint32_t i = 0 ; i < X . size ( ) ; i + + ) {
2023-01-24 19:00:54 +01:00
max_elements [ i ] = alg . max ( X [ i ] ) ;
min_elements [ i ] = alg . min ( X [ i ] ) ;
}
2023-02-12 18:03:17 +01:00
for ( uint32_t i = 0 ; i < X . size ( ) ; i + + ) {
for ( uint32_t j = 0 ; j < X [ i ] . size ( ) ; j + + ) {
2023-01-24 19:00:54 +01:00
X [ i ] [ j ] = ( X [ i ] [ j ] - min_elements [ i ] ) / ( max_elements [ i ] - min_elements [ i ] ) ;
}
}
return alg . transpose ( X ) ;
}
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < real_t > > MLPPData : : meanNormalization ( std : : vector < std : : vector < real_t > > X ) {
2023-04-22 17:17:58 +02:00
MLPPLinAlgOld alg ;
MLPPStatOld stat ;
2023-01-24 19:00:54 +01:00
// (X_j - mu_j) / std_j, for every j
X = meanCentering ( X ) ;
2023-02-12 18:03:17 +01:00
for ( uint32_t i = 0 ; i < X . size ( ) ; i + + ) {
2023-01-24 19:00:54 +01:00
X [ i ] = alg . scalarMultiply ( 1 / stat . standardDeviation ( X [ i ] ) , X [ i ] ) ;
}
return X ;
}
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < real_t > > MLPPData : : meanCentering ( std : : vector < std : : vector < real_t > > X ) {
2023-04-22 17:17:58 +02:00
MLPPStatOld stat ;
2023-02-12 18:03:17 +01:00
for ( uint32_t i = 0 ; i < X . size ( ) ; i + + ) {
2023-01-27 13:01:16 +01:00
real_t mean_i = stat . mean ( X [ i ] ) ;
2023-02-12 18:03:17 +01:00
for ( uint32_t j = 0 ; j < X [ i ] . size ( ) ; j + + ) {
2023-01-24 19:00:54 +01:00
X [ i ] [ j ] - = mean_i ;
}
}
return X ;
}
2023-01-27 13:01:16 +01:00
std : : vector < std : : vector < real_t > > MLPPData : : oneHotRep ( std : : vector < real_t > tempOutputSet , int n_class ) {
std : : vector < std : : vector < real_t > > outputSet ;
2023-01-24 19:00:54 +01:00
outputSet . resize ( tempOutputSet . size ( ) ) ;
2023-02-12 18:03:17 +01:00
for ( uint32_t i = 0 ; i < tempOutputSet . size ( ) ; i + + ) {
2023-01-24 19:00:54 +01:00
for ( int j = 0 ; j < = n_class - 1 ; j + + ) {
if ( tempOutputSet [ i ] = = j ) {
outputSet [ i ] . push_back ( 1 ) ;
} else {
outputSet [ i ] . push_back ( 0 ) ;
}
}
}
return outputSet ;
}
2023-01-23 21:13:26 +01:00
2023-01-27 13:01:16 +01:00
std : : vector < real_t > MLPPData : : reverseOneHot ( std : : vector < std : : vector < real_t > > tempOutputSet ) {
std : : vector < real_t > outputSet ;
2023-02-12 18:03:17 +01:00
//uint32_t n_class = tempOutputSet[0].size();
for ( uint32_t i = 0 ; i < tempOutputSet . size ( ) ; i + + ) {
2023-01-24 19:00:54 +01:00
int current_class = 1 ;
2023-02-12 18:03:17 +01:00
for ( uint32_t j = 0 ; j < tempOutputSet [ i ] . size ( ) ; j + + ) {
2023-01-24 19:00:54 +01:00
if ( tempOutputSet [ i ] [ j ] = = 1 ) {
break ;
} else {
current_class + + ;
}
}
outputSet . push_back ( current_class ) ;
}
return outputSet ;
2023-01-23 21:13:26 +01:00
}
2023-01-24 19:20:18 +01:00
2023-02-08 01:26:37 +01:00
// Returns a new matrix in which the mean of each row of `p_X` has been
// subtracted from that row. `p_X` itself is not modified.
// Returns a null reference when `p_X` is not valid.
Ref<MLPPMatrix> MLPPData::mean_centering(const Ref<MLPPMatrix> &p_X) {
	ERR_FAIL_COND_V(!p_X.is_valid(), Ref<MLPPMatrix>());

	MLPPStat stat;

	Ref<MLPPMatrix> X;
	X.instance();
	X->resize(p_X->size());

	Size2i x_size = X->size();

	Ref<MLPPVector> x_row_tmp;
	x_row_tmp.instance();
	x_row_tmp->resize(x_size.x);

	for (int i = 0; i < x_size.y; ++i) {
		// The mean has to come from the input: X was only resized so far and
		// holds no data yet.
		p_X->row_get_into_mlpp_vector(i, x_row_tmp);

		real_t mean_i = stat.meanv(x_row_tmp);

		for (int j = 0; j < x_size.x; ++j) {
			X->element_set(i, j, p_X->element_get(i, j) - mean_i);
		}
	}

	return X;
}
2023-02-09 11:40:16 +01:00
// One-hot encodes the labels in `temp_output_set`.
//
// Each label is truncated to int and compared with the column indices
// [0, n_class). The result has one row per label and n_class columns, with a
// 1 in the matching column and 0 everywhere else.
// Returns a null reference when `temp_output_set` is not valid.
Ref<MLPPMatrix> MLPPData::one_hot_rep(const Ref<MLPPVector> &temp_output_set, int n_class) {
	ERR_FAIL_COND_V(!temp_output_set.is_valid(), Ref<MLPPMatrix>());

	Ref<MLPPMatrix> output_set;
	output_set.instance();

	const int sample_count = temp_output_set->size();
	const real_t *labels = temp_output_set->ptr();

	output_set->resize(Size2i(n_class, sample_count));

	for (int sample = 0; sample < sample_count; ++sample) {
		for (int cls = 0; cls < n_class; ++cls) {
			const bool is_hot = (static_cast<int>(labels[sample]) == cls);
			output_set->element_set(sample, cls, is_hot ? 1 : 0);
		}
	}

	return output_set;
}
2023-12-28 17:41:20 +01:00
// Replaces `suffixes` with the built-in suffix list below.
// The list is written as one space-separated literal and turned into
// individual entries with String::split_spaces().
void MLPPData : : load_default_suffixes ( ) {
// Our list of suffixes which we use to compare against
suffixes = String ( " eer er ion ity ment ness or sion ship th able ible al ant ary ful ic ious ous ive less y ed en ing ize ise ly ward wise " ) . split_spaces ( ) ;
}
// Replaces `stop_words` with the built-in stop-word list below.
// The list is written as one space-separated literal and turned into
// individual entries with String::split_spaces().
void MLPPData : : load_default_stop_words ( ) {
stop_words = String ( " i me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it its itself they them their theirs themselves what which who whom this that these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don should now " ) . split_spaces ( ) ;
}
2023-01-25 18:27:14 +01:00
// Registers the MLPPData methods that are exposed through ClassDB.
void MLPPData : : _bind_methods ( ) {
// Dataset loaders; each is bound with a single "path" argument.
ClassDB : : bind_method ( D_METHOD ( " load_breast_cancer " , " path " ) , & MLPPData : : load_breast_cancer ) ;
ClassDB : : bind_method ( D_METHOD ( " load_breast_cancer_svc " , " path " ) , & MLPPData : : load_breast_cancer_svc ) ;
ClassDB : : bind_method ( D_METHOD ( " load_iris " , " path " ) , & MLPPData : : load_iris ) ;
ClassDB : : bind_method ( D_METHOD ( " load_wine " , " path " ) , & MLPPData : : load_wine ) ;
ClassDB : : bind_method ( D_METHOD ( " load_mnist_train " , " path " ) , & MLPPData : : load_mnist_train ) ;
ClassDB : : bind_method ( D_METHOD ( " load_mnist_test " , " path " ) , & MLPPData : : load_mnist_test ) ;
ClassDB : : bind_method ( D_METHOD ( " load_california_housing " , " path " ) , & MLPPData : : load_california_housing ) ;
ClassDB : : bind_method ( D_METHOD ( " load_fires_and_crime " , " path " ) , & MLPPData : : load_fires_and_crime ) ;
2023-01-26 14:52:49 +01:00
// The exposed name "train_test_split" is bound to the train_test_split_bind member.
ClassDB : : bind_method ( D_METHOD ( " train_test_split " , " data " , " test_size " ) , & MLPPData : : train_test_split_bind ) ;
2023-01-25 18:27:14 +01:00
}