diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8209dc4..b7d2c9b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -62,4 +62,6 @@ add_library(slowmokit
         src/slowmokit/methods/metrics/f1score.hpp
         src/slowmokit/methods/metrics/f1score.cpp
         src/slowmokit/methods/metrics/mean_squared_error.hpp
-        src/slowmokit/methods/metrics/mean_squared_error.cpp)
\ No newline at end of file
+        src/slowmokit/methods/metrics/mean_squared_error.cpp
+        src/slowmokit/methods/neighbors/categorical_nb/categorical_nb.cpp
+        src/slowmokit/methods/neighbors/categorical_nb/categorical_nb.hpp)
diff --git a/docs/methods/neighbors/categorical_nb/categorical_nb.md b/docs/methods/neighbors/categorical_nb/categorical_nb.md
new file mode 100644
index 0000000..1d8e1ca
--- /dev/null
+++ b/docs/methods/neighbors/categorical_nb/categorical_nb.md
@@ -0,0 +1,42 @@
+# Categorical Naive Bayes
+
+Categorical Naive Bayes model.
+
+Categorical Naive Bayes computes the likelihood as: occurrences of the instance in the current label divided by the total occurrences of the instance.
+
+It computes the prior probability as: occurrences of the current class divided by the size of the training array.
+
+And finally prior*likelihood gives the probability of each class, according to which the output class is predicted.
+
+
+## Parameters
+
+| Name | Definition | Type |
+|--------|----------------------------------------|-------------------|
+| `xTrain` |The training set containing the features|`vector<vector<string>>`|
+| `yTrain` |The set containing the class corresponding to the respective xTrain instance|`vector<string>`|
+
+## Methods
+
+| Name | Definition | Return value |
+|--------------------------------------------------|---------------------------------------------|-----------|
+| `fit(vector<vector<string>> xTrain, vector<string> yTrain)` |fit the class instance with the training data|`void`|
+| `predict(vector<string> xTest)` |predict the label for the xTest vector of features|`string`|
+
+## Example
+
+```cpp
+std::vector<std::vector<std::string>> xTrain = {{"fifa", "yes", "no", "no"},
+                                                {"fifa", "no", "yes", "no"},
+                                                {"fifa", "no", "no", "yes"},
+                                                {"cc", "no", "no", "yes"},
+                                                {"fifa", "yes", "yes", "yes"},
+                                                {"cc", "yes", "yes", "yes"},
+                                                {"cc", "no", "no", "yes"},
+                                                {"cc", "yes", "no", "no"}};
+std::vector<std::string> yTrain = {"m", "m", "m", "m", "f", "f", "f", "f"};
+std::vector<std::string> xTest = {"fifa", "no", "yes", "yes"};
+CategoricalNB<std::string> classifier;
+classifier.fit(xTrain, yTrain);
+std::cout << classifier.predict(xTest);
+```
diff --git a/src/slowmokit/methods/neighbors/categorical_nb/categorical_nb.cpp b/src/slowmokit/methods/neighbors/categorical_nb/categorical_nb.cpp
new file mode 100644
index 0000000..9a7f3c1
--- /dev/null
+++ b/src/slowmokit/methods/neighbors/categorical_nb/categorical_nb.cpp
@@ -0,0 +1,113 @@
+/**
+ * @file methods/neighbors/categorical_nb/categorical_nb.cpp
+ *
+ * Implementation of the Categorical Naive Bayes algorithm
+ */
+
+// Example:
+// std::vector<std::vector<std::string>> xTrain = {{"fifa", "yes", "no", "no"},
+//                                                 {"fifa", "no", "yes", "no"},
+//                                                 {"fifa", "no", "no", "yes"},
+//                                                 {"cc", "no", "no", "yes"},
+//                                                 {"fifa", "yes", "yes", "yes"},
+//                                                 {"cc", "yes", "yes", "yes"},
+//                                                 {"cc", "no", "no", "yes"},
+//                                                 {"cc", "yes", "no", "no"}};
+// std::vector<std::string> yTrain = {"m", "m", "m", "m", "f", "f", "f", "f"};
+// std::vector<std::string> xTest = {"fifa", "no", "yes", "yes"};
+// CategoricalNB<std::string> classifier;
+// classifier.fit(xTrain, yTrain);
+// std::cout << classifier.predict(xTest);
+
+#include "categorical_nb.hpp"
+
+template<class T>
+void CategoricalNB<T>::fit(std::vector<std::vector<std::string>> xTrain,
+                           std::vector<std::string> yTrain)
+{
+  // posterior = (prior * likelihood) / evidence
+  // since evidence is the same for every label, it can be ignored
+
+  if (xTrain.size() == 0 || yTrain.size() == 0)
+  {
+    throw "Make sure that you have at least one train example";
+  }
+  if (xTrain.size() != yTrain.size())
+  {
+    throw "Number of instances and target values must be equal";
+  }
+
+  featureSize = xTrain[0].size();
+  std::map<std::string, int>
+      occurences; // occurrences of each label in the training set,
+                  // used below to compute the priors
+
+  for (auto category : yTrain)
+  {
+    occurences[category]++; // incrementing the occurrence of each label
+  }
+  for (auto current : occurences)
+  {
+    priors[current.first] =
+        double(current.second) /
+        yTrain.size(); // prior = occurrences of the label / total set size
+  }
+
+  std::map<std::string, std::map<std::pair<std::string, int>, int>>
+      counts; // count of each (category, column) pair per label
+
+  for (int i = 0; i < (int) (xTrain.size()); i++) // iterate the training grid
+  {
+    std::vector<std::string> current = xTrain[i]; // current row
+    int j = 0; // current column (feature index)
+    for (auto curr : current)
+    {
+      counts[yTrain[i]][{curr, j}]++; // tally this category for this label
+      j++;
+    }
+  }
+  for (auto current : counts) // turn the raw counts into likelihoods
+  {
+    for (auto e : current.second)
+    {
+      likelihoods[current.first][{e.first.first, e.first.second}] =
+          ((double(e.second)) / (occurences[current.first]));
+      // likelihood[label][{category, column}] = count / label occurrences
+    }
+  }
+}
+
+template<class T>
+std::string CategoricalNB<T>::predict(std::vector<std::string> xTest)
+{
+  if (((int) (xTest.size())) != featureSize)
+  {
+    throw "The number of features in training and testing set must be same";
+  }
+  std::map<std::string, double> probs;
+  for (auto curr : priors) // posterior = prior * likelihood, per label
+  {
+    probs[curr.first] = curr.second;
+    int j = 0;
+    for (auto feature : xTest)
+    {
+      probs[curr.first] *= likelihoods[curr.first][{feature, j}];
+      j++;
+    }
+  }
+  double maxProb = 0;
+  std::string out;
+  for (auto prob : probs)
+  {
+    if (prob.second > maxProb) // pick the label with the highest posterior
+    {
+      maxProb = prob.second;
+      out = prob.first;
+    }
+  }
+  return out;
+}
+
+// explicit instantiation: the template is defined in this .cpp, so it must
+// be instantiated here for callers that link against the library
+template class CategoricalNB<std::string>;
diff --git a/src/slowmokit/methods/neighbors/categorical_nb/categorical_nb.hpp b/src/slowmokit/methods/neighbors/categorical_nb/categorical_nb.hpp
new file mode 100644
index 0000000..1be2369
--- /dev/null
+++ b/src/slowmokit/methods/neighbors/categorical_nb/categorical_nb.hpp
@@ -0,0 +1,38 @@
+/**
+ * @file methods/neighbors/categorical_nb/categorical_nb.hpp
+ *
+ * The header file including the Categorical Naive Bayes algorithm
+ */
+
+#ifndef SLOWMOKIT_CATEGORICAL_NB_HPP
+#define SLOWMOKIT_CATEGORICAL_NB_HPP
+
+#include "../../../core.hpp"
+
+template<class T> class CategoricalNB
+{
+
+  public:
+  /**
+   * @brief Fitting the training set into instance of class
+   * @param xTrain all training 2-d feature x values
+   * @param yTrain all training 1-d string y values
+   */
+  void fit(std::vector<std::vector<std::string>> xTrain,
+           std::vector<std::string> yTrain);
+
+  /**
+   * @brief Predicting the class for xTest on the basis of training set
+   * @param xTest all testing feature x values
+   * @return string denoting the class label of xTest
+   */
+  std::string predict(std::vector<std::string> xTest);
+
+  private:
+  std::map<std::string, double> priors;   // P(label) learned by fit()
+  std::map<std::string, std::map<std::pair<std::string, int>, double>>
+      likelihoods;                        // P(category at column | label)
+  int featureSize;                        // number of features seen in fit()
+};
+
+#endif // SLOWMOKIT_CATEGORICAL_NB_HPP