Converted some methods in MLPPData.

This commit is contained in:
Relintai 2023-12-28 17:41:20 +01:00
parent 65854b55fc
commit 14150405b0
2 changed files with 236 additions and 284 deletions

View File

@ -12,11 +12,9 @@
#include "../lin_alg/lin_alg.h" #include "../lin_alg/lin_alg.h"
#include "../stat/stat.h" #include "../stat/stat.h"
#ifdef OLD_CLASSES_ENABLED
#include "../lin_alg/lin_alg_old.h" #include "../lin_alg/lin_alg_old.h"
#include "../softmax_net/softmax_net_old.h" #include "../softmax_net/softmax_net.h"
#include "../stat/stat_old.h" #include "../stat/stat_old.h"
#endif
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
@ -520,7 +518,6 @@ std::tuple<std::vector<std::vector<real_t>>, std::vector<std::vector<real_t>>, s
// MULTIVARIATE SUPERVISED // MULTIVARIATE SUPERVISED
void MLPPData::setData(int k, std::string fileName, std::vector<std::vector<real_t>> &inputSet, std::vector<real_t> &outputSet) { void MLPPData::setData(int k, std::string fileName, std::vector<std::vector<real_t>> &inputSet, std::vector<real_t> &outputSet) {
#ifdef OLD_CLASSES_ENABLED
MLPPLinAlgOld alg; MLPPLinAlgOld alg;
std::string inputTemp; std::string inputTemp;
std::string outputTemp; std::string outputTemp;
@ -546,11 +543,9 @@ void MLPPData::setData(int k, std::string fileName, std::vector<std::vector<real
} }
inputSet = alg.transpose(inputSet); inputSet = alg.transpose(inputSet);
dataFile.close(); dataFile.close();
#endif
} }
void MLPPData::printData(std::vector<std::string> inputName, std::string outputName, std::vector<std::vector<real_t>> inputSet, std::vector<real_t> outputSet) { void MLPPData::printData(std::vector<std::string> inputName, std::string outputName, std::vector<std::vector<real_t>> inputSet, std::vector<real_t> outputSet) {
#ifdef OLD_CLASSES_ENABLED
MLPPLinAlgOld alg; MLPPLinAlgOld alg;
inputSet = alg.transpose(inputSet); inputSet = alg.transpose(inputSet);
for (uint32_t i = 0; i < inputSet.size(); i++) { for (uint32_t i = 0; i < inputSet.size(); i++) {
@ -564,13 +559,11 @@ void MLPPData::printData(std::vector<std::string> inputName, std::string outputN
for (uint32_t i = 0; i < outputSet.size(); i++) { for (uint32_t i = 0; i < outputSet.size(); i++) {
std::cout << outputSet[i] << std::endl; std::cout << outputSet[i] << std::endl;
} }
#endif
} }
// UNSUPERVISED // UNSUPERVISED
void MLPPData::setData(int k, std::string fileName, std::vector<std::vector<real_t>> &inputSet) { void MLPPData::setData(int k, std::string fileName, std::vector<std::vector<real_t>> &inputSet) {
#ifdef OLD_CLASSES_ENABLED
MLPPLinAlgOld alg; MLPPLinAlgOld alg;
std::string inputTemp; std::string inputTemp;
@ -592,11 +585,9 @@ void MLPPData::setData(int k, std::string fileName, std::vector<std::vector<real
} }
inputSet = alg.transpose(inputSet); inputSet = alg.transpose(inputSet);
dataFile.close(); dataFile.close();
#endif
} }
void MLPPData::printData(std::vector<std::string> inputName, std::vector<std::vector<real_t>> inputSet) { void MLPPData::printData(std::vector<std::string> inputName, std::vector<std::vector<real_t>> inputSet) {
#ifdef OLD_CLASSES_ENABLED
MLPPLinAlgOld alg; MLPPLinAlgOld alg;
inputSet = alg.transpose(inputSet); inputSet = alg.transpose(inputSet);
for (uint32_t i = 0; i < inputSet.size(); i++) { for (uint32_t i = 0; i < inputSet.size(); i++) {
@ -605,7 +596,6 @@ void MLPPData::printData(std::vector<std::string> inputName, std::vector<std::ve
std::cout << inputSet[i][j] << std::endl; std::cout << inputSet[i][j] << std::endl;
} }
} }
#endif
} }
// SIMPLE // SIMPLE
@ -661,7 +651,6 @@ std::vector<std::vector<real_t>> MLPPData::rgb2gray(std::vector<std::vector<std:
} }
std::vector<std::vector<std::vector<real_t>>> MLPPData::rgb2ycbcr(std::vector<std::vector<std::vector<real_t>>> input) { std::vector<std::vector<std::vector<real_t>>> MLPPData::rgb2ycbcr(std::vector<std::vector<std::vector<real_t>>> input) {
#ifdef OLD_CLASSES_ENABLED
MLPPLinAlgOld alg; MLPPLinAlgOld alg;
std::vector<std::vector<std::vector<real_t>>> YCbCr; std::vector<std::vector<std::vector<real_t>>> YCbCr;
YCbCr = alg.resize(YCbCr, input); YCbCr = alg.resize(YCbCr, input);
@ -673,15 +662,11 @@ std::vector<std::vector<std::vector<real_t>>> MLPPData::rgb2ycbcr(std::vector<st
} }
} }
return YCbCr; return YCbCr;
#else
return std::vector<std::vector<std::vector<real_t>>>();
#endif
} }
// Conversion formulas available here: // Conversion formulas available here:
// https://www.rapidtables.com/convert/color/rgb-to-hsv.html // https://www.rapidtables.com/convert/color/rgb-to-hsv.html
std::vector<std::vector<std::vector<real_t>>> MLPPData::rgb2hsv(std::vector<std::vector<std::vector<real_t>>> input) { std::vector<std::vector<std::vector<real_t>>> MLPPData::rgb2hsv(std::vector<std::vector<std::vector<real_t>>> input) {
#ifdef OLD_CLASSES_ENABLED
MLPPLinAlgOld alg; MLPPLinAlgOld alg;
std::vector<std::vector<std::vector<real_t>>> HSV; std::vector<std::vector<std::vector<real_t>>> HSV;
HSV = alg.resize(HSV, input); HSV = alg.resize(HSV, input);
@ -720,34 +705,23 @@ std::vector<std::vector<std::vector<real_t>>> MLPPData::rgb2hsv(std::vector<std:
} }
} }
return HSV; return HSV;
#else
return std::vector<std::vector<std::vector<real_t>>>();
#endif
} }
// http://machinethatsees.blogspot.com/2013/07/how-to-convert-rgb-to-xyz-or-vice-versa.html // http://machinethatsees.blogspot.com/2013/07/how-to-convert-rgb-to-xyz-or-vice-versa.html
std::vector<std::vector<std::vector<real_t>>> MLPPData::rgb2xyz(std::vector<std::vector<std::vector<real_t>>> input) { std::vector<std::vector<std::vector<real_t>>> MLPPData::rgb2xyz(std::vector<std::vector<std::vector<real_t>>> input) {
#ifdef OLD_CLASSES_ENABLED
MLPPLinAlgOld alg; MLPPLinAlgOld alg;
std::vector<std::vector<std::vector<real_t>>> XYZ; std::vector<std::vector<std::vector<real_t>>> XYZ;
XYZ = alg.resize(XYZ, input); XYZ = alg.resize(XYZ, input);
std::vector<std::vector<real_t>> RGB2XYZ = { { 0.4124564, 0.3575761, 0.1804375 }, { 0.2126726, 0.7151522, 0.0721750 }, { 0.0193339, 0.1191920, 0.9503041 } }; std::vector<std::vector<real_t>> RGB2XYZ = { { 0.4124564, 0.3575761, 0.1804375 }, { 0.2126726, 0.7151522, 0.0721750 }, { 0.0193339, 0.1191920, 0.9503041 } };
return alg.vector_wise_tensor_product(input, RGB2XYZ); return alg.vector_wise_tensor_product(input, RGB2XYZ);
#else
return std::vector<std::vector<std::vector<real_t>>>();
#endif
} }
std::vector<std::vector<std::vector<real_t>>> MLPPData::xyz2rgb(std::vector<std::vector<std::vector<real_t>>> input) { std::vector<std::vector<std::vector<real_t>>> MLPPData::xyz2rgb(std::vector<std::vector<std::vector<real_t>>> input) {
#ifdef OLD_CLASSES_ENABLED
MLPPLinAlgOld alg; MLPPLinAlgOld alg;
std::vector<std::vector<std::vector<real_t>>> XYZ; std::vector<std::vector<std::vector<real_t>>> XYZ;
XYZ = alg.resize(XYZ, input); XYZ = alg.resize(XYZ, input);
std::vector<std::vector<real_t>> RGB2XYZ = alg.inverse({ { 0.4124564, 0.3575761, 0.1804375 }, { 0.2126726, 0.7151522, 0.0721750 }, { 0.0193339, 0.1191920, 0.9503041 } }); std::vector<std::vector<real_t>> RGB2XYZ = alg.inverse({ { 0.4124564, 0.3575761, 0.1804375 }, { 0.2126726, 0.7151522, 0.0721750 }, { 0.0193339, 0.1191920, 0.9503041 } });
return alg.vector_wise_tensor_product(input, RGB2XYZ); return alg.vector_wise_tensor_product(input, RGB2XYZ);
#else
return std::vector<std::vector<std::vector<real_t>>>();
#endif
} }
// TEXT-BASED & NLP // TEXT-BASED & NLP
@ -766,54 +740,58 @@ std::vector<char> MLPPData::split(std::string text) {
return split_data; return split_data;
} }
std::vector<std::string> MLPPData::splitSentences(std::string data) { Vector<String> MLPPData::split_sentences(String data) {
std::vector<std::string> sentences; Vector<String> sentences;
std::string currentStr = "";
for (uint32_t i = 0; i < data.length(); i++) { int start_index = 0;
currentStr.push_back(data[i]);
for (int i = 0; i < data.length() - 1; ++i) {
if (data[i] == '.' && data[i + 1] != '.') { if (data[i] == '.' && data[i + 1] != '.') {
sentences.push_back(currentStr); continue;
currentStr = ""; }
i++;
if (data[i] == '.') {
sentences.push_back(data.substr_index(start_index, i));
start_index = i + 1;
} }
} }
if (start_index != data.length() - 1) {
sentences.push_back(data.substr_index(start_index, data.length() - 1));
}
return sentences; return sentences;
} }
std::vector<std::string> MLPPData::removeSpaces(std::vector<std::string> data) { Vector<String> MLPPData::remove_spaces(Vector<String> data) {
for (uint32_t i = 0; i < data.size(); i++) { for (int i = 0; i < data.size(); i++) {
auto it = data[i].begin(); data.write[i] = data[i].replace(" ", "");
for (uint32_t j = 0; j < data[i].length(); j++) {
if (data[i][j] == ' ') {
data[i].erase(it);
}
it++;
}
} }
return data; return data;
} }
std::vector<std::string> MLPPData::removeNullByte(std::vector<std::string> data) { Vector<String> MLPPData::remove_empty(Vector<String> data) {
for (uint32_t i = 0; i < data.size(); i++) { for (int i = 0; i < data.size(); ++i) {
if (data[i] == "\0") { if (data[i].empty()) {
data.erase(data.begin() + i); data.remove(i);
} }
} }
return data; return data;
} }
std::vector<std::string> MLPPData::segment(std::string text) { Vector<String> MLPPData::segment(String text) {
std::vector<std::string> segmented_data; Vector<String> segmented_data;
int prev_delim = 0; int prev_delim = 0;
for (uint32_t i = 0; i < text.length(); i++) {
for (int i = 0; i < text.length(); i++) {
if (text[i] == ' ') { if (text[i] == ' ') {
segmented_data.push_back(text.substr(prev_delim, i - prev_delim)); segmented_data.push_back(text.substr(prev_delim, i - prev_delim));
prev_delim = i + 1; prev_delim = i + 1;
} else if (text[i] == ',' || text[i] == '!' || text[i] == '.' || text[i] == '-') { } else if (text[i] == ',' || text[i] == '!' || text[i] == '.' || text[i] == '-') {
segmented_data.push_back(text.substr(prev_delim, i - prev_delim)); segmented_data.push_back(text.substr(prev_delim, i - prev_delim));
std::string punc; String punc;
punc.push_back(text[i]); punc += text[i];
segmented_data.push_back(punc); segmented_data.push_back(punc);
prev_delim = i + 2; prev_delim = i + 2;
i++; i++;
@ -825,16 +803,17 @@ std::vector<std::string> MLPPData::segment(std::string text) {
return segmented_data; return segmented_data;
} }
std::vector<real_t> MLPPData::tokenize(std::string text) { Vector<int> MLPPData::tokenize(String text) {
int max_num = 0; int max_num = 0;
bool new_num = true; bool new_num = true;
std::vector<std::string> segmented_data = segment(text); Vector<String> segmented_data = segment(text);
std::vector<real_t> tokenized_data; Vector<int> tokenized_data;
tokenized_data.resize(segmented_data.size()); tokenized_data.resize(segmented_data.size());
for (uint32_t i = 0; i < segmented_data.size(); i++) {
for (int i = 0; i < segmented_data.size(); i++) {
for (int j = i - 1; j >= 0; j--) { for (int j = i - 1; j >= 0; j--) {
if (segmented_data[i] == segmented_data[j]) { if (segmented_data[i] == segmented_data[j]) {
tokenized_data[i] = tokenized_data[j]; tokenized_data.write[i] = tokenized_data[j];
new_num = false; new_num = false;
} }
} }
@ -842,50 +821,49 @@ std::vector<real_t> MLPPData::tokenize(std::string text) {
new_num = true; new_num = true;
} else { } else {
max_num++; max_num++;
tokenized_data[i] = max_num; tokenized_data.write[i] = max_num;
} }
} }
return tokenized_data; return tokenized_data;
} }
std::vector<std::string> MLPPData::removeStopWords(std::string text) { Vector<String> MLPPData::remove_stop_words(String text) {
std::vector<std::string> stopWords = { "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now" }; Vector<String> segmented_data = remove_spaces(segment(text.to_lower()));
std::vector<std::string> segmented_data = removeSpaces(segment(toLower(text)));
for (uint32_t i = 0; i < stopWords.size(); i++) { for (int i = 0; i < stop_words.size(); i++) {
for (uint32_t j = 0; j < segmented_data.size(); j++) { for (int j = 0; j < segmented_data.size(); j++) {
if (segmented_data[j] == stopWords[i]) { if (segmented_data[j] == stop_words[i]) {
segmented_data.erase(segmented_data.begin() + j); segmented_data.remove(j);
--j;
} }
} }
} }
return segmented_data; return segmented_data;
} }
std::vector<std::string> MLPPData::removeStopWords(std::vector<std::string> segmented_data) { Vector<String> MLPPData::remove_stop_words_vec(Vector<String> segmented_data) {
std::vector<std::string> stopWords = { "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now" }; for (int i = 0; i < segmented_data.size(); i++) {
for (uint32_t i = 0; i < segmented_data.size(); i++) { for (int j = 0; j < stop_words.size(); j++) {
for (uint32_t j = 0; j < stopWords.size(); j++) { if (segmented_data[i] == stop_words[j]) {
if (segmented_data[i] == stopWords[j]) { segmented_data.remove(i);
segmented_data.erase(segmented_data.begin() + i); --i;
} }
} }
} }
return segmented_data; return segmented_data;
} }
std::string MLPPData::stemming(std::string text) { String MLPPData::stemming(String text) {
// Our list of suffixes which we use to compare against
std::vector<std::string> suffixes = { "eer", "er", "ion", "ity", "ment", "ness", "or", "sion", "ship", "th", "able", "ible", "al", "ant", "ary", "ful", "ic", "ious", "ous", "ive", "less", "y", "ed", "en", "ing", "ize", "ise", "ly", "ward", "wise" };
int padding_size = 4; int padding_size = 4;
char padding = ' '; // our padding String padding = " "; // our padding
for (int i = 0; i < padding_size; i++) { text += String(padding).repeat(padding_size); // ' ' will be our padding value
text[text.length() + i] = padding; // ' ' will be our padding value
}
for (uint32_t i = 0; i < text.size(); i++) { for (int i = 0; i < text.length(); i++) {
for (uint32_t j = 0; j < suffixes.size(); j++) { for (int j = 0; j < suffixes.size(); j++) {
if (text.substr(i, suffixes[j].length()) == suffixes[j] && (text[i + suffixes[j].length()] == ' ' || text[i + suffixes[j].length()] == ',' || text[i + suffixes[j].length()] == '-' || text[i + suffixes[j].length()] == '.' || text[i + suffixes[j].length()] == '!')) { if (text.substr(i, suffixes[j].length()) == suffixes[j] && (text[i + suffixes[j].length()] == ' ' || text[i + suffixes[j].length()] == ',' || text[i + suffixes[j].length()] == '-' || text[i + suffixes[j].length()] == '.' || text[i + suffixes[j].length()] == '!')) {
text.erase(i, suffixes[j].length()); text.erase(i, suffixes[j].length());
} }
@ -895,196 +873,130 @@ std::string MLPPData::stemming(std::string text) {
return text; return text;
} }
std::vector<std::vector<real_t>> MLPPData::BOW(std::vector<std::string> sentences, std::string type) { Ref<MLPPMatrix> MLPPData::bag_of_words(Vector<String> sentences, BagOfWordsType type) {
/* /*
STEPS OF BOW: STEPS OF BOW:
1) To lowercase (done by removeStopWords function by def) 1) To lowercase (done by remove_stop_words function by def)
2) Removing stop words 2) Removing stop words
3) Obtain a list of the used words 3) Obtain a list of the used words
4) Create a one hot encoded vector of the words and sentences 4) Create a one hot encoded vector of the words and sentences
5) Sentence.size() x list.size() matrix 5) Sentence.size() x list.size() matrix
*/ */
std::vector<std::string> wordList = removeNullByte(removeStopWords(createWordList(sentences))); Vector<String> word_list = remove_empty(remove_stop_words_vec(create_word_list(sentences)));
std::vector<std::vector<std::string>> segmented_sentences; Vector<Vector<String>> segmented_sentences;
segmented_sentences.resize(sentences.size()); segmented_sentences.resize(sentences.size());
for (uint32_t i = 0; i < sentences.size(); i++) { for (int i = 0; i < sentences.size(); i++) {
segmented_sentences[i] = removeStopWords(sentences[i]); segmented_sentences.write[i] = remove_stop_words(sentences[i]);
} }
std::vector<std::vector<real_t>> bow; Ref<MLPPMatrix> bow;
bow.instance();
bow->resize(Size2i(word_list.size(), sentences.size()));
bow->fill(0);
bow.resize(sentences.size()); for (int i = 0; i < segmented_sentences.size(); i++) {
for (uint32_t i = 0; i < bow.size(); i++) { for (int j = 0; j < segmented_sentences[i].size(); j++) {
bow[i].resize(wordList.size()); for (int k = 0; k < word_list.size(); k++) {
} if (segmented_sentences[i][j] == word_list[k]) {
if (type == BAG_OF_WORDS_TYPE_BINARY) {
for (uint32_t i = 0; i < segmented_sentences.size(); i++) { bow->element_set(i, k, 1);
for (uint32_t j = 0; j < segmented_sentences[i].size(); j++) {
for (uint32_t k = 0; k < wordList.size(); k++) {
if (segmented_sentences[i][j] == wordList[k]) {
if (type == "Binary") {
bow[i][k] = 1;
} else { } else {
bow[i][k]++; bow->element_set(i, k, bow->element_get(i, k) + 1);
} }
} }
} }
} }
} }
return bow; return bow;
} }
std::vector<std::vector<real_t>> MLPPData::TFIDF(std::vector<std::string> sentences) { Ref<MLPPMatrix> MLPPData::tfidf(Vector<String> sentences) {
#ifdef OLD_CLASSES_ENABLED Vector<String> word_list = remove_empty(remove_stop_words_vec(create_word_list(sentences)));
MLPPLinAlgOld alg;
std::vector<std::string> wordList = removeNullByte(removeStopWords(createWordList(sentences)));
std::vector<std::vector<std::string>> segmented_sentences; Vector<Vector<String>> segmented_sentences;
segmented_sentences.resize(sentences.size()); segmented_sentences.resize(sentences.size());
for (uint32_t i = 0; i < sentences.size(); i++) { for (int i = 0; i < sentences.size(); i++) {
segmented_sentences[i] = removeStopWords(sentences[i]); segmented_sentences.write[i] = remove_stop_words(sentences[i]);
} }
std::vector<std::vector<real_t>> TF; Ref<MLPPMatrix> TF;
std::vector<int> frequency; TF.instance();
frequency.resize(wordList.size()); TF->resize(Size2i(word_list.size(), segmented_sentences.size()));
TF.resize(segmented_sentences.size());
for (uint32_t i = 0; i < TF.size(); i++) { Vector<int> frequency;
TF[i].resize(wordList.size()); frequency.resize(word_list.size());
} frequency.fill(0);
for (uint32_t i = 0; i < segmented_sentences.size(); i++) {
std::vector<bool> present(wordList.size(), false); Ref<MLPPVector> TF_row;
for (uint32_t j = 0; j < segmented_sentences[i].size(); j++) { TF_row.instance();
for (uint32_t k = 0; k < wordList.size(); k++) { TF_row->resize(word_list.size());
if (segmented_sentences[i][j] == wordList[k]) {
TF[i][k]++; for (int i = 0; i < segmented_sentences.size(); i++) {
Vector<bool> present;
present.resize(word_list.size());
present.fill(false);
for (int j = 0; j < segmented_sentences[i].size(); j++) {
for (int k = 0; k < word_list.size(); k++) {
if (segmented_sentences[i][j] == word_list[k]) {
TF->element_set(i, k, TF->element_get(i, k) + 1);
if (!present[k]) { if (!present[k]) {
frequency[k]++; frequency.write[k]++;
present[k] = true; present.write[k] = true;
} }
} }
} }
} }
TF[i] = alg.scalarMultiply(real_t(1) / real_t(segmented_sentences[i].size()), TF[i]);
TF->row_get_into_mlpp_vector(i, TF_row);
TF_row->scalar_multiply(real_t(1) / real_t(segmented_sentences[i].size()));
TF->row_set_mlpp_vector(i, TF_row);
} }
std::vector<real_t> IDF; Vector<real_t> IDF;
IDF.resize(frequency.size()); IDF.resize(frequency.size());
for (uint32_t i = 0; i < IDF.size(); i++) { for (int i = 0; i < IDF.size(); i++) {
IDF[i] = std::log((real_t)segmented_sentences.size() / (real_t)frequency[i]); IDF.write[i] = Math::log((real_t)segmented_sentences.size() / (real_t)frequency[i]);
} }
std::vector<std::vector<real_t>> TFIDF; Ref<MLPPMatrix> TFIDF;
TFIDF.resize(segmented_sentences.size()); TFIDF.instance();
for (uint32_t i = 0; i < TFIDF.size(); i++) { Size2i tfidf_size = Size2i(word_list.size(), segmented_sentences.size());
TFIDF[i].resize(wordList.size()); TFIDF->resize(tfidf_size);
}
for (uint32_t i = 0; i < TFIDF.size(); i++) { for (int i = 0; i < tfidf_size.y; i++) {
for (uint32_t j = 0; j < TFIDF[i].size(); j++) { for (int j = 0; j < tfidf_size.x; j++) {
TFIDF[i][j] = TF[i][j] * IDF[j]; TFIDF->element_set(i, j, TF->element_get(i, j) * IDF[j]);
} }
} }
return TFIDF; return TFIDF;
#else
return std::vector<std::vector<real_t>>();
#endif
} }
std::tuple<std::vector<std::vector<real_t>>, std::vector<std::string>> MLPPData::word2Vec(std::vector<std::string> sentences, std::string type, int windowSize, int dimension, real_t learning_rate, int max_epoch) { MLPPData::WordsToVecResult MLPPData::word_to_vec(Vector<String> sentences, WordToVecType type, int windowSize, int dimension, real_t learning_rate, int max_epoch) {
#ifdef OLD_CLASSES_ENABLED
std::vector<std::string> wordList = removeNullByte(removeStopWords(createWordList(sentences)));
std::vector<std::vector<std::string>> segmented_sentences;
segmented_sentences.resize(sentences.size());
for (uint32_t i = 0; i < sentences.size(); i++) {
segmented_sentences[i] = removeStopWords(sentences[i]);
}
std::vector<std::string> inputStrings;
std::vector<std::string> outputStrings;
for (uint32_t i = 0; i < segmented_sentences.size(); i++) {
for (uint32_t j = 0; j < segmented_sentences[i].size(); j++) {
for (int k = windowSize; k > 0; k--) {
if (j - k >= 0) {
inputStrings.push_back(segmented_sentences[i][j]);
outputStrings.push_back(segmented_sentences[i][j - k]);
}
if (j + k <= segmented_sentences[i].size() - 1) {
inputStrings.push_back(segmented_sentences[i][j]);
outputStrings.push_back(segmented_sentences[i][j + k]);
}
}
}
}
uint32_t inputSize = inputStrings.size();
inputStrings.insert(inputStrings.end(), outputStrings.begin(), outputStrings.end());
std::vector<std::vector<real_t>> BOW = MLPPData::BOW(inputStrings, "Binary");
std::vector<std::vector<real_t>> inputSet;
std::vector<std::vector<real_t>> outputSet;
for (uint32_t i = 0; i < inputSize; i++) {
inputSet.push_back(BOW[i]);
}
for (uint32_t i = inputSize; i < BOW.size(); i++) {
outputSet.push_back(BOW[i]);
}
MLPPSoftmaxNetOld *model;
if (type == "Skipgram") {
model = new MLPPSoftmaxNetOld(outputSet, inputSet, dimension);
} else { // else = CBOW. We maintain it is a default.
model = new MLPPSoftmaxNetOld(inputSet, outputSet, dimension);
}
model->gradientDescent(learning_rate, max_epoch, true);
std::vector<std::vector<real_t>> wordEmbeddings = model->getEmbeddings();
delete model;
return { wordEmbeddings, wordList };
#else
return std::tuple<std::vector<std::vector<real_t>>, std::vector<std::string>>();
#endif
}
struct WordsToVecResult {
std::vector<std::vector<real_t>> word_embeddings;
std::vector<std::string> word_list;
};
MLPPData::WordsToVecResult MLPPData::word_to_vec(std::vector<std::string> sentences, std::string type, int windowSize, int dimension, real_t learning_rate, int max_epoch) {
WordsToVecResult res; WordsToVecResult res;
#ifdef OLD_CLASSES_ENABLED res.word_list = remove_empty(remove_stop_words_vec(create_word_list(sentences)));
res.word_list = removeNullByte(removeStopWords(createWordList(sentences)));
std::vector<std::vector<std::string>> segmented_sentences; Vector<Vector<String>> segmented_sentences;
segmented_sentences.resize(sentences.size()); segmented_sentences.resize(sentences.size());
for (uint32_t i = 0; i < sentences.size(); i++) { for (int i = 0; i < sentences.size(); i++) {
segmented_sentences[i] = removeStopWords(sentences[i]); segmented_sentences.write[i] = remove_stop_words(sentences[i]);
} }
std::vector<std::string> inputStrings; Vector<String> inputStrings;
std::vector<std::string> outputStrings; Vector<String> outputStrings;
for (uint32_t i = 0; i < segmented_sentences.size(); i++) { for (int i = 0; i < segmented_sentences.size(); i++) {
for (uint32_t j = 0; j < segmented_sentences[i].size(); j++) { for (int j = 0; j < segmented_sentences[i].size(); j++) {
for (int k = windowSize; k > 0; k--) { for (int k = windowSize; k > 0; k--) {
int jmk = (int)j - k; int jmk = (int)j - k;
@ -1101,70 +1013,99 @@ MLPPData::WordsToVecResult MLPPData::word_to_vec(std::vector<std::string> senten
} }
} }
uint32_t inputSize = inputStrings.size(); int input_size = inputStrings.size();
inputStrings.insert(inputStrings.end(), outputStrings.begin(), outputStrings.end()); inputStrings.append_array(outputStrings);
std::vector<std::vector<real_t>> BOW = MLPPData::BOW(inputStrings, "Binary"); Ref<MLPPMatrix> bow = bag_of_words(inputStrings, BAG_OF_WORDS_TYPE_BINARY);
Size2i bow_size = bow->size();
std::vector<std::vector<real_t>> inputSet; Ref<MLPPMatrix> input_set;
std::vector<std::vector<real_t>> outputSet; Ref<MLPPMatrix> output_set;
for (uint32_t i = 0; i < inputSize; i++) { input_set.instance();
inputSet.push_back(BOW[i]); output_set.instance();
input_set->resize(Size2i(bow_size.x, input_size));
Ref<MLPPVector> row_tmp;
row_tmp.instance();
row_tmp->resize(bow_size.x);
for (int i = 0; i < input_size; i++) {
bow->row_get_into_mlpp_vector(i, row_tmp);
input_set->row_set_mlpp_vector(i, row_tmp);
} }
for (uint32_t i = inputSize; i < BOW.size(); i++) { output_set->resize(Size2i(bow_size.x, bow_size.y - input_size));
outputSet.push_back(BOW[i]); Size2i output_set_size = output_set->size();
for (int i = 0; i < output_set_size.y; i++) {
bow->row_get_into_mlpp_vector(i + input_size, row_tmp);
input_set->row_set_mlpp_vector(i, row_tmp);
} }
MLPPSoftmaxNetOld *model; MLPPSoftmaxNet *model;
if (type == "Skipgram") { if (type == WORD_TO_VEC_TYPE_SKIPGRAM) {
model = new MLPPSoftmaxNetOld(outputSet, inputSet, dimension); model = memnew(MLPPSoftmaxNet(output_set, input_set, dimension));
} else { // else = CBOW. We maintain it is a default. } else { // else = CBOW. We maintain it is a default.
model = new MLPPSoftmaxNetOld(inputSet, outputSet, dimension); model = memnew(MLPPSoftmaxNet(input_set, output_set, dimension));
} }
model->gradientDescent(learning_rate, max_epoch, false); model->train_gradient_descent(learning_rate, max_epoch);
res.word_embeddings = model->getEmbeddings(); res.word_embeddings = model->get_embeddings();
delete model; memdelete(model);
#endif
return res; return res;
} }
std::vector<std::vector<real_t>> MLPPData::LSA(std::vector<std::string> sentences, int dim) { Ref<MLPPMatrix> MLPPData::lsa(Vector<String> sentences, int dim) {
#ifdef OLD_CLASSES_ENABLED MLPPLinAlg alg;
MLPPLinAlgOld alg;
std::vector<std::vector<real_t>> docWordData = BOW(sentences, "Binary"); Ref<MLPPMatrix> doc_word_data = bag_of_words(sentences, BAG_OF_WORDS_TYPE_BINARY);
MLPPLinAlg::SVDResult svr_res = alg.svd(doc_word_data);
Ref<MLPPMatrix> S_trunc = alg.zeromatnm(dim, dim);
Ref<MLPPMatrix> Vt_trunc;
Vt_trunc.instance();
Vt_trunc->resize(Size2i(svr_res.Vt->size().x, dim));
Ref<MLPPVector> row_rmp;
row_rmp.instance();
row_rmp->resize(svr_res.Vt->size().x);
MLPPLinAlgOld::SVDResultOld svr_res = alg.SVD(docWordData);
std::vector<std::vector<real_t>> S_trunc = alg.zeromat(dim, dim);
std::vector<std::vector<real_t>> Vt_trunc;
for (int i = 0; i < dim; i++) { for (int i = 0; i < dim; i++) {
S_trunc[i][i] = svr_res.S[i][i]; S_trunc->element_set(i, i, svr_res.S->element_get(i, i));
Vt_trunc.push_back(svr_res.Vt[i]);
svr_res.Vt->row_get_into_mlpp_vector(i, row_rmp);
Vt_trunc->row_set_mlpp_vector(i, row_rmp);
} }
std::vector<std::vector<real_t>> embeddings = alg.matmult(S_trunc, Vt_trunc); Ref<MLPPMatrix> embeddings = S_trunc->multn(Vt_trunc);
return embeddings; return embeddings;
#else
return std::vector<std::vector<real_t>>();
#endif
} }
std::vector<std::string> MLPPData::createWordList(std::vector<std::string> sentences) { struct SVDResult {
std::string combinedText = ""; Ref<MLPPMatrix> U;
for (uint32_t i = 0; i < sentences.size(); i++) { Ref<MLPPMatrix> S;
Ref<MLPPMatrix> Vt;
};
// Builds the vocabulary of `sentences`.
//
// The sentences are joined with single spaces, run through
// remove_stop_words, de-duplicated with vec_to_set and finally stripped of
// spaces with remove_spaces.
//
// sentences: the input sentences.
// Returns the resulting word list.
Vector<String> MLPPData::create_word_list(Vector<String> sentences) {
	String joined;

	const int count = sentences.size();
	for (int idx = 0; idx < count; ++idx) {
		// Every sentence except the first is preceded by a separator space.
		joined += (idx == 0) ? sentences[idx] : String(" ") + sentences[idx];
	}

	return remove_spaces(vec_to_set(remove_stop_words(joined)));
}
// EXTRA // EXTRA
@ -1183,7 +1124,6 @@ void MLPPData::setInputNames(std::string fileName, std::vector<std::string> &inp
} }
std::vector<std::vector<real_t>> MLPPData::featureScaling(std::vector<std::vector<real_t>> X) { std::vector<std::vector<real_t>> MLPPData::featureScaling(std::vector<std::vector<real_t>> X) {
#ifdef OLD_CLASSES_ENABLED
MLPPLinAlgOld alg; MLPPLinAlgOld alg;
X = alg.transpose(X); X = alg.transpose(X);
std::vector<real_t> max_elements, min_elements; std::vector<real_t> max_elements, min_elements;
@ -1201,13 +1141,9 @@ std::vector<std::vector<real_t>> MLPPData::featureScaling(std::vector<std::vecto
} }
} }
return alg.transpose(X); return alg.transpose(X);
#else
return std::vector<std::vector<real_t>>();
#endif
} }
std::vector<std::vector<real_t>> MLPPData::meanNormalization(std::vector<std::vector<real_t>> X) { std::vector<std::vector<real_t>> MLPPData::meanNormalization(std::vector<std::vector<real_t>> X) {
#ifdef OLD_CLASSES_ENABLED
MLPPLinAlgOld alg; MLPPLinAlgOld alg;
MLPPStatOld stat; MLPPStatOld stat;
// (X_j - mu_j) / std_j, for every j // (X_j - mu_j) / std_j, for every j
@ -1217,13 +1153,9 @@ std::vector<std::vector<real_t>> MLPPData::meanNormalization(std::vector<std::ve
X[i] = alg.scalarMultiply(1 / stat.standardDeviation(X[i]), X[i]); X[i] = alg.scalarMultiply(1 / stat.standardDeviation(X[i]), X[i]);
} }
return X; return X;
#else
return std::vector<std::vector<real_t>>();
#endif
} }
std::vector<std::vector<real_t>> MLPPData::meanCentering(std::vector<std::vector<real_t>> X) { std::vector<std::vector<real_t>> MLPPData::meanCentering(std::vector<std::vector<real_t>> X) {
#ifdef OLD_CLASSES_ENABLED
MLPPStatOld stat; MLPPStatOld stat;
for (uint32_t i = 0; i < X.size(); i++) { for (uint32_t i = 0; i < X.size(); i++) {
real_t mean_i = stat.mean(X[i]); real_t mean_i = stat.mean(X[i]);
@ -1232,9 +1164,6 @@ std::vector<std::vector<real_t>> MLPPData::meanCentering(std::vector<std::vector
} }
} }
return X; return X;
#else
return std::vector<std::vector<real_t>>();
#endif
} }
std::vector<std::vector<real_t>> MLPPData::oneHotRep(std::vector<real_t> tempOutputSet, int n_class) { std::vector<std::vector<real_t>> MLPPData::oneHotRep(std::vector<real_t> tempOutputSet, int n_class) {
@ -1320,6 +1249,15 @@ Ref<MLPPMatrix> MLPPData::one_hot_rep(const Ref<MLPPVector> &temp_output_set, in
return output_set; return output_set;
} }
void MLPPData::load_default_suffixes() {
// Our list of suffixes which we use to compare against
suffixes = String("eer er ion ity ment ness or sion ship th able ible al ant ary ful ic ious ous ive less y ed en ing ize ise ly ward wise").split_spaces();
}
void MLPPData::load_default_stop_words() {
stop_words = String("i me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it its itself they them their theirs themselves what which who whom this that these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don should now").split_spaces();
}
void MLPPData::_bind_methods() { void MLPPData::_bind_methods() {
ClassDB::bind_method(D_METHOD("load_breast_cancer", "path"), &MLPPData::load_breast_cancer); ClassDB::bind_method(D_METHOD("load_breast_cancer", "path"), &MLPPData::load_breast_cancer);
ClassDB::bind_method(D_METHOD("load_breast_cancer_svc", "path"), &MLPPData::load_breast_cancer_svc); ClassDB::bind_method(D_METHOD("load_breast_cancer_svc", "path"), &MLPPData::load_breast_cancer_svc);

View File

@ -140,31 +140,39 @@ public:
// Text-Based & NLP // Text-Based & NLP
std::string toLower(std::string text); std::string toLower(std::string text);
std::vector<char> split(std::string text); std::vector<char> split(std::string text);
std::vector<std::string> splitSentences(std::string data); Vector<String> split_sentences(String data);
std::vector<std::string> removeSpaces(std::vector<std::string> data); Vector<String> remove_spaces(Vector<String> data);
std::vector<std::string> removeNullByte(std::vector<std::string> data); Vector<String> remove_empty(Vector<String> data);
std::vector<std::string> segment(std::string text); Vector<String> segment(String text);
std::vector<real_t> tokenize(std::string text); Vector<int> tokenize(String text);
std::vector<std::string> removeStopWords(std::string text); Vector<String> remove_stop_words(String text);
std::vector<std::string> removeStopWords(std::vector<std::string> segmented_data); Vector<String> remove_stop_words_vec(Vector<String> segmented_data);
std::string stemming(std::string text); String stemming(String text);
std::vector<std::vector<real_t>> BOW(std::vector<std::string> sentences, std::string = "Default"); enum BagOfWordsType {
std::vector<std::vector<real_t>> TFIDF(std::vector<std::string> sentences); BAG_OF_WORDS_TYPE_DEFAULT = 0,
BAG_OF_WORDS_TYPE_BINARY,
std::tuple<std::vector<std::vector<real_t>>, std::vector<std::string>> word2Vec(std::vector<std::string> sentences, std::string type, int windowSize, int dimension, real_t learning_rate, int max_epoch);
struct WordsToVecResult {
std::vector<std::vector<real_t>> word_embeddings;
std::vector<std::string> word_list;
}; };
WordsToVecResult word_to_vec(std::vector<std::string> sentences, std::string type, int windowSize, int dimension, real_t learning_rate, int max_epoch); Ref<MLPPMatrix> bag_of_words(Vector<String> sentences, BagOfWordsType type = BAG_OF_WORDS_TYPE_DEFAULT);
Ref<MLPPMatrix> tfidf(Vector<String> sentences);
std::vector<std::vector<real_t>> LSA(std::vector<std::string> sentences, int dim); struct WordsToVecResult {
Ref<MLPPMatrix> word_embeddings;
Vector<String> word_list;
};
std::vector<std::string> createWordList(std::vector<std::string> sentences); enum WordToVecType {
WORD_TO_VEC_TYPE_CBOW = 0,
WORD_TO_VEC_TYPE_SKIPGRAM,
};
WordsToVecResult word_to_vec(Vector<String> sentences, WordToVecType type, int windowSize, int dimension, real_t learning_rate, int max_epoch);
Ref<MLPPMatrix> lsa(Vector<String> sentences, int dim);
Vector<String> create_word_list(Vector<String> sentences);
// Extra // Extra
void setInputNames(std::string fileName, std::vector<std::string> &inputNames); void setInputNames(std::string fileName, std::vector<std::string> &inputNames);
@ -239,6 +247,12 @@ public:
return ret; return ret;
} }
void load_default_suffixes();
void load_default_stop_words();
Vector<String> suffixes;
Vector<String> stop_words;
protected: protected:
static void _bind_methods(); static void _bind_methods();
}; };