From 14150405b042871e516baffcc1b32014a08d3281 Mon Sep 17 00:00:00 2001 From: Relintai Date: Thu, 28 Dec 2023 17:41:20 +0100 Subject: [PATCH] Converted some methods in MLPPData. --- mlpp/data/data.cpp | 468 ++++++++++++++++++++------------------------- mlpp/data/data.h | 52 +++-- 2 files changed, 236 insertions(+), 284 deletions(-) diff --git a/mlpp/data/data.cpp b/mlpp/data/data.cpp index efc330e..4084dd5 100644 --- a/mlpp/data/data.cpp +++ b/mlpp/data/data.cpp @@ -12,11 +12,9 @@ #include "../lin_alg/lin_alg.h" #include "../stat/stat.h" -#ifdef OLD_CLASSES_ENABLED #include "../lin_alg/lin_alg_old.h" -#include "../softmax_net/softmax_net_old.h" +#include "../softmax_net/softmax_net.h" #include "../stat/stat_old.h" -#endif #include #include @@ -520,7 +518,6 @@ std::tuple>, std::vector>, s // MULTIVARIATE SUPERVISED void MLPPData::setData(int k, std::string fileName, std::vector> &inputSet, std::vector &outputSet) { -#ifdef OLD_CLASSES_ENABLED MLPPLinAlgOld alg; std::string inputTemp; std::string outputTemp; @@ -546,11 +543,9 @@ void MLPPData::setData(int k, std::string fileName, std::vector inputName, std::string outputName, std::vector> inputSet, std::vector outputSet) { -#ifdef OLD_CLASSES_ENABLED MLPPLinAlgOld alg; inputSet = alg.transpose(inputSet); for (uint32_t i = 0; i < inputSet.size(); i++) { @@ -564,13 +559,11 @@ void MLPPData::printData(std::vector inputName, std::string outputN for (uint32_t i = 0; i < outputSet.size(); i++) { std::cout << outputSet[i] << std::endl; } -#endif } // UNSUPERVISED void MLPPData::setData(int k, std::string fileName, std::vector> &inputSet) { -#ifdef OLD_CLASSES_ENABLED MLPPLinAlgOld alg; std::string inputTemp; @@ -592,11 +585,9 @@ void MLPPData::setData(int k, std::string fileName, std::vector inputName, std::vector> inputSet) { -#ifdef OLD_CLASSES_ENABLED MLPPLinAlgOld alg; inputSet = alg.transpose(inputSet); for (uint32_t i = 0; i < inputSet.size(); i++) { @@ -605,7 +596,6 @@ void MLPPData::printData(std::vector inputName, std::vector> MLPPData::rgb2gray(std::vector>> MLPPData::rgb2ycbcr(std::vector>> input) { -#ifdef OLD_CLASSES_ENABLED MLPPLinAlgOld alg; std::vector>> YCbCr; YCbCr = alg.resize(YCbCr, input); @@ -673,15 +662,11 @@ std::vector>> MLPPData::rgb2ycbcr(std::vector>>(); -#endif } // Conversion formulas available here: // https://www.rapidtables.com/convert/color/rgb-to-hsv.html std::vector>> MLPPData::rgb2hsv(std::vector>> input) { -#ifdef OLD_CLASSES_ENABLED MLPPLinAlgOld alg; std::vector>> HSV; HSV = alg.resize(HSV, input); @@ -720,34 +705,23 @@ std::vector>> MLPPData::rgb2hsv(std::vector>>(); -#endif } // http://machinethatsees.blogspot.com/2013/07/how-to-convert-rgb-to-xyz-or-vice-versa.html std::vector>> MLPPData::rgb2xyz(std::vector>> input) { -#ifdef OLD_CLASSES_ENABLED MLPPLinAlgOld alg; std::vector>> XYZ; XYZ = alg.resize(XYZ, input); std::vector> RGB2XYZ = { { 0.4124564, 0.3575761, 0.1804375 }, { 0.2126726, 0.7151522, 0.0721750 }, { 0.0193339, 0.1191920, 0.9503041 } }; return alg.vector_wise_tensor_product(input, RGB2XYZ); -#else - return std::vector>>(); -#endif } std::vector>> MLPPData::xyz2rgb(std::vector>> input) { -#ifdef OLD_CLASSES_ENABLED MLPPLinAlgOld alg; std::vector>> XYZ; XYZ = alg.resize(XYZ, input); std::vector> RGB2XYZ = alg.inverse({ { 0.4124564, 0.3575761, 0.1804375 }, { 0.2126726, 0.7151522, 0.0721750 }, { 0.0193339, 0.1191920, 0.9503041 } }); return alg.vector_wise_tensor_product(input, RGB2XYZ); -#else - return std::vector>>(); -#endif } // TEXT-BASED & NLP @@ -766,54 +740,58 @@ std::vector MLPPData::split(std::string text) { return split_data; } -std::vector MLPPData::splitSentences(std::string data) { - std::vector sentences; - std::string currentStr = ""; +Vector MLPPData::split_sentences(String data) { + Vector sentences; - for (uint32_t i = 0; i < data.length(); i++) { - currentStr.push_back(data[i]); + int start_index = 0; + + for (int i = 0; i < data.length() - 1; ++i) { if (data[i] == '.' && data[i + 1] != '.') { - sentences.push_back(currentStr); - currentStr = ""; - i++; + continue; + } + + if (data[i] == '.') { + sentences.push_back(data.substr_index(start_index, i)); + start_index = i + 1; } } + + if (start_index != data.length() - 1) { + sentences.push_back(data.substr_index(start_index, data.length() - 1)); + } + return sentences; } -std::vector MLPPData::removeSpaces(std::vector data) { - for (uint32_t i = 0; i < data.size(); i++) { - auto it = data[i].begin(); - for (uint32_t j = 0; j < data[i].length(); j++) { - if (data[i][j] == ' ') { - data[i].erase(it); - } - it++; - } +Vector MLPPData::remove_spaces(Vector data) { + for (int i = 0; i < data.size(); i++) { + data.write[i] = data[i].replace(" ", ""); } return data; } -std::vector MLPPData::removeNullByte(std::vector data) { - for (uint32_t i = 0; i < data.size(); i++) { - if (data[i] == "\0") { - data.erase(data.begin() + i); +Vector MLPPData::remove_empty(Vector data) { + for (int i = 0; i < data.size(); ++i) { + if (data[i].empty()) { + data.remove(i); } } + return data; } -std::vector MLPPData::segment(std::string text) { - std::vector segmented_data; +Vector MLPPData::segment(String text) { + Vector segmented_data; int prev_delim = 0; - for (uint32_t i = 0; i < text.length(); i++) { + + for (int i = 0; i < text.length(); i++) { if (text[i] == ' ') { segmented_data.push_back(text.substr(prev_delim, i - prev_delim)); prev_delim = i + 1; } else if (text[i] == ',' || text[i] == '!' || text[i] == '.' || text[i] == '-') { segmented_data.push_back(text.substr(prev_delim, i - prev_delim)); - std::string punc; - punc.push_back(text[i]); + String punc; + punc += text[i]; segmented_data.push_back(punc); prev_delim = i + 2; i++; @@ -825,16 +803,17 @@ std::vector MLPPData::segment(std::string text) { return segmented_data; } -std::vector MLPPData::tokenize(std::string text) { +Vector MLPPData::tokenize(String text) { int max_num = 0; bool new_num = true; - std::vector segmented_data = segment(text); - std::vector tokenized_data; + Vector segmented_data = segment(text); + Vector tokenized_data; tokenized_data.resize(segmented_data.size()); - for (uint32_t i = 0; i < segmented_data.size(); i++) { + + for (int i = 0; i < segmented_data.size(); i++) { for (int j = i - 1; j >= 0; j--) { if (segmented_data[i] == segmented_data[j]) { - tokenized_data[i] = tokenized_data[j]; + tokenized_data.write[i] = tokenized_data[j]; new_num = false; } } @@ -842,50 +821,49 @@ std::vector MLPPData::tokenize(std::string text) { new_num = true; } else { max_num++; - tokenized_data[i] = max_num; + tokenized_data.write[i] = max_num; } } + return tokenized_data; } -std::vector MLPPData::removeStopWords(std::string text) { - std::vector stopWords = { "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now" }; - std::vector segmented_data = removeSpaces(segment(toLower(text))); +Vector MLPPData::remove_stop_words(String text) { + Vector segmented_data = remove_spaces(segment(text.to_lower())); - for (uint32_t i = 0; i < stopWords.size(); i++) { - for (uint32_t j = 0; j < segmented_data.size(); j++) { - if (segmented_data[j] == stopWords[i]) { - segmented_data.erase(segmented_data.begin() + j); + for (int i = 0; i < stop_words.size(); i++) { + for (int j = 0; j < segmented_data.size(); j++) { + if (segmented_data[j] == stop_words[i]) { + segmented_data.remove(j); + --j; } } } + return segmented_data; } -std::vector MLPPData::removeStopWords(std::vector segmented_data) { - std::vector stopWords = { "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now" }; - for (uint32_t i = 0; i < segmented_data.size(); i++) { - for (uint32_t j = 0; j < stopWords.size(); j++) { - if (segmented_data[i] == stopWords[j]) { - segmented_data.erase(segmented_data.begin() + i); +Vector MLPPData::remove_stop_words_vec(Vector segmented_data) { + for (int i = 0; i < segmented_data.size(); i++) { + for (int j = 0; j < stop_words.size(); j++) { + if (segmented_data[i] == stop_words[j]) { + segmented_data.remove(i); + --i; } } } + return segmented_data; } -std::string MLPPData::stemming(std::string text) { - // Our list of suffixes which we use to compare against - std::vector suffixes = { "eer", "er", "ion", "ity", "ment", "ness", "or", "sion", "ship", "th", "able", "ible", "al", "ant", "ary", "ful", "ic", "ious", "ous", "ive", "less", "y", "ed", "en", "ing", "ize", "ise", "ly", "ward", "wise" }; +String MLPPData::stemming(String text) { int padding_size = 4; - char padding = ' '; // our padding + String padding = " "; // our padding - for (int i = 0; i < padding_size; i++) { - text[text.length() + i] = padding; // ' ' will be our padding value - } + text += String(padding).repeat(padding_size); // ' ' will be our padding value - for (uint32_t i = 0; i < text.size(); i++) { - for (uint32_t j = 0; j < suffixes.size(); j++) { + for (int i = 0; i < text.length(); i++) { + for (int j = 0; j < suffixes.size(); j++) { if (text.substr(i, suffixes[j].length()) == suffixes[j] && (text[i + suffixes[j].length()] == ' ' || text[i + suffixes[j].length()] == ',' || text[i + suffixes[j].length()] == '-' || text[i + suffixes[j].length()] == '.' || text[i + suffixes[j].length()] == '!')) { text.erase(i, suffixes[j].length()); } @@ -895,196 +873,130 @@ std::string MLPPData::stemming(std::string text) { return text; } -std::vector> MLPPData::BOW(std::vector sentences, std::string type) { +Ref MLPPData::bag_of_words(Vector sentences, BagOfWordsType type) { /* STEPS OF BOW: - 1) To lowercase (done by removeStopWords function by def) + 1) To lowercase (done by remove_stop_words function by def) 2) Removing stop words 3) Obtain a list of the used words 4) Create a one hot encoded vector of the words and sentences 5) Sentence.size() x list.size() matrix */ - std::vector wordList = removeNullByte(removeStopWords(createWordList(sentences))); + Vector word_list = remove_empty(remove_stop_words_vec(create_word_list(sentences))); - std::vector> segmented_sentences; + Vector> segmented_sentences; segmented_sentences.resize(sentences.size()); - for (uint32_t i = 0; i < sentences.size(); i++) { - segmented_sentences[i] = removeStopWords(sentences[i]); + for (int i = 0; i < sentences.size(); i++) { + segmented_sentences.write[i] = remove_stop_words(sentences[i]); } - std::vector> bow; + Ref bow; + bow.instance(); + bow->resize(Size2i(word_list.size(), sentences.size())); + bow->fill(0); - bow.resize(sentences.size()); - for (uint32_t i = 0; i < bow.size(); i++) { - bow[i].resize(wordList.size()); - } - - for (uint32_t i = 0; i < segmented_sentences.size(); i++) { - for (uint32_t j = 0; j < segmented_sentences[i].size(); j++) { - for (uint32_t k = 0; k < wordList.size(); k++) { - if (segmented_sentences[i][j] == wordList[k]) { - if (type == "Binary") { - bow[i][k] = 1; + for (int i = 0; i < segmented_sentences.size(); i++) { + for (int j = 0; j < segmented_sentences[i].size(); j++) { + for (int k = 0; k < word_list.size(); k++) { + if (segmented_sentences[i][j] == word_list[k]) { + if (type == BAG_OF_WORDS_TYPE_BINARY) { + bow->element_set(i, k, 1); } else { - bow[i][k]++; + bow->element_set(i, k, bow->element_get(i, k) + 1); } } } } } + return bow; } -std::vector> MLPPData::TFIDF(std::vector sentences) { -#ifdef OLD_CLASSES_ENABLED - MLPPLinAlgOld alg; - std::vector wordList = removeNullByte(removeStopWords(createWordList(sentences))); +Ref MLPPData::tfidf(Vector sentences) { + Vector word_list = remove_empty(remove_stop_words_vec(create_word_list(sentences))); - std::vector> segmented_sentences; + Vector> segmented_sentences; segmented_sentences.resize(sentences.size()); - for (uint32_t i = 0; i < sentences.size(); i++) { - segmented_sentences[i] = removeStopWords(sentences[i]); + for (int i = 0; i < sentences.size(); i++) { + segmented_sentences.write[i] = remove_stop_words(sentences[i]); } - std::vector> TF; - std::vector frequency; - frequency.resize(wordList.size()); - TF.resize(segmented_sentences.size()); - for (uint32_t i = 0; i < TF.size(); i++) { - TF[i].resize(wordList.size()); - } - for (uint32_t i = 0; i < segmented_sentences.size(); i++) { - std::vector present(wordList.size(), false); - for (uint32_t j = 0; j < segmented_sentences[i].size(); j++) { - for (uint32_t k = 0; k < wordList.size(); k++) { - if (segmented_sentences[i][j] == wordList[k]) { - TF[i][k]++; + Ref TF; + TF.instance(); + TF->resize(Size2i(word_list.size(), segmented_sentences.size())); + + Vector frequency; + frequency.resize(word_list.size()); + frequency.fill(0); + + Ref TF_row; + TF_row.instance(); + TF_row->resize(word_list.size()); + + for (int i = 0; i < segmented_sentences.size(); i++) { + Vector present; + present.resize(word_list.size()); + present.fill(false); + + for (int j = 0; j < segmented_sentences[i].size(); j++) { + for (int k = 0; k < word_list.size(); k++) { + if (segmented_sentences[i][j] == word_list[k]) { + TF->element_set(i, k, TF->element_get(i, k) + 1); + if (!present[k]) { - frequency[k]++; - present[k] = true; + frequency.write[k]++; + present.write[k] = true; } } } } - TF[i] = alg.scalarMultiply(real_t(1) / real_t(segmented_sentences[i].size()), TF[i]); + + TF->row_get_into_mlpp_vector(i, TF_row); + TF_row->scalar_multiply(real_t(1) / real_t(segmented_sentences[i].size())); + TF->row_set_mlpp_vector(i, TF_row); } - std::vector IDF; + Vector IDF; IDF.resize(frequency.size()); - for (uint32_t i = 0; i < IDF.size(); i++) { - IDF[i] = std::log((real_t)segmented_sentences.size() / (real_t)frequency[i]); + for (int i = 0; i < IDF.size(); i++) { + IDF.write[i] = Math::log((real_t)segmented_sentences.size() / (real_t)frequency[i]); } - std::vector> TFIDF; - TFIDF.resize(segmented_sentences.size()); - for (uint32_t i = 0; i < TFIDF.size(); i++) { - TFIDF[i].resize(wordList.size()); - } + Ref TFIDF; + TFIDF.instance(); + Size2i tfidf_size = Size2i(word_list.size(), segmented_sentences.size()); + TFIDF->resize(tfidf_size); - for (uint32_t i = 0; i < TFIDF.size(); i++) { - for (uint32_t j = 0; j < TFIDF[i].size(); j++) { - TFIDF[i][j] = TF[i][j] * IDF[j]; + for (int i = 0; i < tfidf_size.y; i++) { + for (int j = 0; j < tfidf_size.x; j++) { + TFIDF->element_set(i, j, TF->element_get(i, j) * IDF[j]); } } return TFIDF; -#else - return std::vector>(); -#endif } -std::tuple>, std::vector> MLPPData::word2Vec(std::vector sentences, std::string type, int windowSize, int dimension, real_t learning_rate, int max_epoch) { -#ifdef OLD_CLASSES_ENABLED - std::vector wordList = removeNullByte(removeStopWords(createWordList(sentences))); - - std::vector> segmented_sentences; - segmented_sentences.resize(sentences.size()); - - for (uint32_t i = 0; i < sentences.size(); i++) { - segmented_sentences[i] = removeStopWords(sentences[i]); - } - - std::vector inputStrings; - std::vector outputStrings; - - for (uint32_t i = 0; i < segmented_sentences.size(); i++) { - for (uint32_t j = 0; j < segmented_sentences[i].size(); j++) { - for (int k = windowSize; k > 0; k--) { - if (j - k >= 0) { - inputStrings.push_back(segmented_sentences[i][j]); - - outputStrings.push_back(segmented_sentences[i][j - k]); - } - if (j + k <= segmented_sentences[i].size() - 1) { - inputStrings.push_back(segmented_sentences[i][j]); - outputStrings.push_back(segmented_sentences[i][j + k]); - } - } - } - } - - uint32_t inputSize = inputStrings.size(); - - inputStrings.insert(inputStrings.end(), outputStrings.begin(), outputStrings.end()); - - std::vector> BOW = MLPPData::BOW(inputStrings, "Binary"); - - std::vector> inputSet; - std::vector> outputSet; - - for (uint32_t i = 0; i < inputSize; i++) { - inputSet.push_back(BOW[i]); - } - - for (uint32_t i = inputSize; i < BOW.size(); i++) { - outputSet.push_back(BOW[i]); - } - - MLPPSoftmaxNetOld *model; - - if (type == "Skipgram") { - model = new MLPPSoftmaxNetOld(outputSet, inputSet, dimension); - } else { // else = CBOW. We maintain it is a default. - model = new MLPPSoftmaxNetOld(inputSet, outputSet, dimension); - } - - model->gradientDescent(learning_rate, max_epoch, true); - - std::vector> wordEmbeddings = model->getEmbeddings(); - delete model; - return { wordEmbeddings, wordList }; -#else - return std::tuple>, std::vector>(); -#endif -} - -struct WordsToVecResult { - std::vector> word_embeddings; - std::vector word_list; -}; - -MLPPData::WordsToVecResult MLPPData::word_to_vec(std::vector sentences, std::string type, int windowSize, int dimension, real_t learning_rate, int max_epoch) { +MLPPData::WordsToVecResult MLPPData::word_to_vec(Vector sentences, WordToVecType type, int windowSize, int dimension, real_t learning_rate, int max_epoch) { WordsToVecResult res; -#ifdef OLD_CLASSES_ENABLED - res.word_list = removeNullByte(removeStopWords(createWordList(sentences))); + res.word_list = remove_empty(remove_stop_words_vec(create_word_list(sentences))); - std::vector> segmented_sentences; + Vector> segmented_sentences; segmented_sentences.resize(sentences.size()); - for (uint32_t i = 0; i < sentences.size(); i++) { - segmented_sentences[i] = removeStopWords(sentences[i]); + for (int i = 0; i < sentences.size(); i++) { + segmented_sentences.write[i] = remove_stop_words(sentences[i]); } - std::vector inputStrings; - std::vector outputStrings; + Vector inputStrings; + Vector outputStrings; - for (uint32_t i = 0; i < segmented_sentences.size(); i++) { - for (uint32_t j = 0; j < segmented_sentences[i].size(); j++) { + for (int i = 0; i < segmented_sentences.size(); i++) { + for (int j = 0; j < segmented_sentences[i].size(); j++) { for (int k = windowSize; k > 0; k--) { int jmk = (int)j - k; @@ -1101,70 +1013,99 @@ MLPPData::WordsToVecResult MLPPData::word_to_vec(std::vector senten } } - uint32_t inputSize = inputStrings.size(); + int input_size = inputStrings.size(); - inputStrings.insert(inputStrings.end(), outputStrings.begin(), outputStrings.end()); + inputStrings.append_array(outputStrings); - std::vector> BOW = MLPPData::BOW(inputStrings, "Binary"); + Ref bow = bag_of_words(inputStrings, BAG_OF_WORDS_TYPE_BINARY); + Size2i bow_size = bow->size(); - std::vector> inputSet; - std::vector> outputSet; + Ref input_set; + Ref output_set; - for (uint32_t i = 0; i < inputSize; i++) { - inputSet.push_back(BOW[i]); + input_set.instance(); + output_set.instance(); + + input_set->resize(Size2i(bow_size.x, input_size)); + + Ref row_tmp; + row_tmp.instance(); + row_tmp->resize(bow_size.x); + + for (int i = 0; i < input_size; i++) { + bow->row_get_into_mlpp_vector(i, row_tmp); + input_set->row_set_mlpp_vector(i, row_tmp); } - for (uint32_t i = inputSize; i < BOW.size(); i++) { - outputSet.push_back(BOW[i]); + output_set->resize(Size2i(bow_size.x, bow_size.y - input_size)); + Size2i output_set_size = output_set->size(); + + for (int i = 0; i < output_set_size.y; i++) { + bow->row_get_into_mlpp_vector(i + input_size, row_tmp); + input_set->row_set_mlpp_vector(i, row_tmp); } - MLPPSoftmaxNetOld *model; + MLPPSoftmaxNet *model; - if (type == "Skipgram") { - model = new MLPPSoftmaxNetOld(outputSet, inputSet, dimension); + if (type == WORD_TO_VEC_TYPE_SKIPGRAM) { + model = memnew(MLPPSoftmaxNet(output_set, input_set, dimension)); } else { // else = CBOW. We maintain it is a default. - model = new MLPPSoftmaxNetOld(inputSet, outputSet, dimension); + model = memnew(MLPPSoftmaxNet(input_set, output_set, dimension)); } - model->gradientDescent(learning_rate, max_epoch, false); + model->train_gradient_descent(learning_rate, max_epoch); - res.word_embeddings = model->getEmbeddings(); - delete model; -#endif + res.word_embeddings = model->get_embeddings(); + memdelete(model); return res; } -std::vector> MLPPData::LSA(std::vector sentences, int dim) { -#ifdef OLD_CLASSES_ENABLED - MLPPLinAlgOld alg; - std::vector> docWordData = BOW(sentences, "Binary"); +Ref MLPPData::lsa(Vector sentences, int dim) { + MLPPLinAlg alg; + + Ref doc_word_data = bag_of_words(sentences, BAG_OF_WORDS_TYPE_BINARY); + + MLPPLinAlg::SVDResult svr_res = alg.svd(doc_word_data); + + Ref S_trunc = alg.zeromatnm(dim, dim); + Ref Vt_trunc; + Vt_trunc.instance(); + Vt_trunc->resize(Size2i(svr_res.Vt->size().x, dim)); + + Ref row_rmp; + row_rmp.instance(); + row_rmp->resize(svr_res.Vt->size().x); - MLPPLinAlgOld::SVDResultOld svr_res = alg.SVD(docWordData); - std::vector> S_trunc = alg.zeromat(dim, dim); - std::vector> Vt_trunc; for (int i = 0; i < dim; i++) { - S_trunc[i][i] = svr_res.S[i][i]; - Vt_trunc.push_back(svr_res.Vt[i]); + S_trunc->element_set(i, i, svr_res.S->element_get(i, i)); + + svr_res.Vt->row_get_into_mlpp_vector(i, row_rmp); + Vt_trunc->row_set_mlpp_vector(i, row_rmp); } - std::vector> embeddings = alg.matmult(S_trunc, Vt_trunc); + Ref embeddings = S_trunc->multn(Vt_trunc); return embeddings; -#else - return std::vector>(); -#endif } -std::vector MLPPData::createWordList(std::vector sentences) { - std::string combinedText = ""; - for (uint32_t i = 0; i < sentences.size(); i++) { +struct SVDResult { + Ref U; + Ref S; + Ref Vt; +}; + +Vector MLPPData::create_word_list(Vector sentences) { + String combined_text = ""; + + for (int i = 0; i < sentences.size(); i++) { if (i != 0) { - combinedText += " "; + combined_text += " "; } - combinedText += sentences[i]; + + combined_text += sentences[i]; } - return removeSpaces(vecToSet(removeStopWords(combinedText))); + return remove_spaces(vec_to_set(remove_stop_words(combined_text))); } // EXTRA @@ -1183,7 +1124,6 @@ void MLPPData::setInputNames(std::string fileName, std::vector &inp } std::vector> MLPPData::featureScaling(std::vector> X) { -#ifdef OLD_CLASSES_ENABLED MLPPLinAlgOld alg; X = alg.transpose(X); std::vector max_elements, min_elements; @@ -1201,13 +1141,9 @@ std::vector> MLPPData::featureScaling(std::vector>(); -#endif } std::vector> MLPPData::meanNormalization(std::vector> X) { -#ifdef OLD_CLASSES_ENABLED MLPPLinAlgOld alg; MLPPStatOld stat; // (X_j - mu_j) / std_j, for every j @@ -1217,13 +1153,9 @@ std::vector> MLPPData::meanNormalization(std::vector>(); -#endif } std::vector> MLPPData::meanCentering(std::vector> X) { -#ifdef OLD_CLASSES_ENABLED MLPPStatOld stat; for (uint32_t i = 0; i < X.size(); i++) { real_t mean_i = stat.mean(X[i]); @@ -1232,9 +1164,6 @@ std::vector> MLPPData::meanCentering(std::vector>(); -#endif } std::vector> MLPPData::oneHotRep(std::vector tempOutputSet, int n_class) { @@ -1320,6 +1249,15 @@ Ref MLPPData::one_hot_rep(const Ref &temp_output_set, in return output_set; } +void MLPPData::load_default_suffixes() { + // Our list of suffixes which we use to compare against + suffixes = String("eer er ion ity ment ness or sion ship th able ible al ant ary ful ic ious ous ive less y ed en ing ize ise ly ward wise").split_spaces(); +} + +void MLPPData::load_default_stop_words() { + stop_words = String("i me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it its itself they them their theirs themselves what which who whom this that these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don should now").split_spaces(); +} + void MLPPData::_bind_methods() { ClassDB::bind_method(D_METHOD("load_breast_cancer", "path"), &MLPPData::load_breast_cancer); ClassDB::bind_method(D_METHOD("load_breast_cancer_svc", "path"), &MLPPData::load_breast_cancer_svc); diff --git a/mlpp/data/data.h b/mlpp/data/data.h index dac293e..bb8b7e1 100644 --- a/mlpp/data/data.h +++ b/mlpp/data/data.h @@ -140,31 +140,39 @@ public: // Text-Based & NLP std::string toLower(std::string text); std::vector split(std::string text); - std::vector splitSentences(std::string data); - std::vector removeSpaces(std::vector data); - std::vector removeNullByte(std::vector data); - std::vector segment(std::string text); - std::vector tokenize(std::string text); - std::vector removeStopWords(std::string text); - std::vector removeStopWords(std::vector segmented_data); + Vector split_sentences(String data); + Vector remove_spaces(Vector data); + Vector remove_empty(Vector data); + Vector segment(String text); + Vector tokenize(String text); + Vector remove_stop_words(String text); + Vector remove_stop_words_vec(Vector segmented_data); - std::string stemming(std::string text); + String stemming(String text); - std::vector> BOW(std::vector sentences, std::string = "Default"); - std::vector> TFIDF(std::vector sentences); - - std::tuple>, std::vector> word2Vec(std::vector sentences, std::string type, int windowSize, int dimension, real_t learning_rate, int max_epoch); - - struct WordsToVecResult { - std::vector> word_embeddings; - std::vector word_list; + enum BagOfWordsType { + BAG_OF_WORDS_TYPE_DEFAULT = 0, + BAG_OF_WORDS_TYPE_BINARY, }; - WordsToVecResult word_to_vec(std::vector sentences, std::string type, int windowSize, int dimension, real_t learning_rate, int max_epoch); + Ref bag_of_words(Vector sentences, BagOfWordsType type = BAG_OF_WORDS_TYPE_DEFAULT); + Ref tfidf(Vector sentences); - std::vector> LSA(std::vector sentences, int dim); + struct WordsToVecResult { + Ref word_embeddings; + Vector word_list; + }; - std::vector createWordList(std::vector sentences); + enum WordToVecType { + WORD_TO_VEC_TYPE_CBOW = 0, + WORD_TO_VEC_TYPE_SKIPGRAM, + }; + + WordsToVecResult word_to_vec(Vector sentences, WordToVecType type, int windowSize, int dimension, real_t learning_rate, int max_epoch); + + Ref lsa(Vector sentences, int dim); + + Vector create_word_list(Vector sentences); // Extra void setInputNames(std::string fileName, std::vector &inputNames); @@ -239,6 +247,12 @@ public: return ret; } + void load_default_suffixes(); + void load_default_stop_words(); + + Vector suffixes; + Vector stop_words; + protected: static void _bind_methods(); };