diff --git a/KNNClassifier.py b/KNNClassifier.py
new file mode 100644
index 0000000..272200b
--- /dev/null
+++ b/KNNClassifier.py
@@ -0,0 +1,29 @@
+import numpy as np
+
+
+class KNNClassifier:
+    def __init__(self, k: int = 3) -> None:
+        self.x_train: np.ndarray = np.array([])
+        self.y_train: np.ndarray = np.array([])
+        self.k: int = k
+
+    def fit(self, x: np.ndarray, y: np.ndarray) -> None:
+        # kNN is a lazy learner: fitting only stores the training data.
+        self.x_train = x
+        self.y_train = y
+
+    def predict(self, x: np.ndarray, y: np.ndarray | None = None) -> np.ndarray | tuple[np.ndarray, float]:
+        predictions = []
+        for i in range(x.shape[0]):
+            # Euclidean distance from this query point to every training sample
+            distances = np.linalg.norm(self.x_train - x[i], axis=1)
+            # Indices of the k nearest training samples
+            k_indices = np.argsort(distances)[:self.k]
+            k_labels = self.y_train[k_indices]
+            # Majority vote; np.bincount requires non-negative integer labels
+            prediction = np.bincount(k_labels).argmax()
+            predictions.append(prediction)
+        y_pred = np.array(predictions)
+        if y is None:
+            return y_pred
+        return y_pred, float(np.mean(y_pred != y))
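As a quick sanity check of the classifier above (not part of the committed files), here is a minimal usage sketch on invented toy data. With k=3 the majority vote over the three nearest neighbors decides each label, and `np.bincount(...).argmax()` breaks ties in favor of the smaller class label:

```python
import numpy as np
from KNNClassifier import KNNClassifier

# Two well-separated toy classes in 2-D (invented for illustration)
x_train = np.array([[0.0, 0.0], [0.1, 0.2], [1.0, 1.0], [0.9, 1.1]])
y_train = np.array([0, 0, 1, 1])

knn = KNNClassifier(k=3)
knn.fit(x_train, y_train)

x_test = np.array([[0.05, 0.1], [1.05, 0.95]])
print(knn.predict(x_test))                        # [0 1]

# With ground-truth labels, predict also returns the error rate
preds, err = knn.predict(x_test, np.array([0, 1]))
print(preds, err)                                 # [0 1] 0.0
```

This mirrors the Task 2 contract described in the README below: the test labels are optional, and supplying them adds the error rate to the return value.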
diff --git a/README.md b/README.md
index 00dbeff..8cac645 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,84 @@
-# MachineLearning
\ No newline at end of file
+# Lab 3 – k-Nearest Neighbors (kNN) Classifier
+
+This branch contains the implementation and report for Lab 3: kNN Classifier, assigned as part of the Machine Learning course in the Robotics Engineering program.
+
+*🧭 Assignment Objectives*
+
+- Load and preprocess a suitable classification dataset
+
+- Implement a flexible k-Nearest Neighbors (kNN) classifier
+
+- Evaluate classifier performance across various settings and configurations
+
+- Generate meaningful statistics and visualize results
+
+*📁 Task 1 – Dataset Options*
+
+🔹 Large Option – MNIST Handwritten Digits
+
+  - 70,000 grayscale images (28×28 pixels) of handwritten digits (0–9)
+
+  - Pre-split into:
+
+    - 60,000 training samples
+
+    - 10,000 test samples
+
+🔸 Smaller Option – Wine Dataset
+
+  - 178 observations, 13 chemical descriptors
+
+  - 3 wine classes
+
+*⚙️ Task 2 – Implementing the kNN Classifier*
+
+  - Create a function with the following inputs:
+
+    - train_X: (n × d) training data
+
+    - train_y: (n × 1) training labels
+
+    - test_X: (m × d) test data
+
+    - k: number of neighbors
+
+    - (Optional) test_y: (m × 1) ground-truth labels
+
+  - The function should:
+
+    - Validate the argument count (the equivalent of MATLAB's nargin)
+
+    - Ensure training and test dimensions match
+
+    - Check that k > 0 and k ≤ n
+
+    - Classify the test data according to the kNN rule
+
+    - If ground truth is provided, also compute and return the error rate
+
+*📊 Task 3 – Evaluation and Testing*
+
+  - MNIST Evaluation:
+    Perform binary classification:
+
+    - For each digit (0–9), classify it vs. the remaining nine (e.g., "is it a 1?" vs. "not a 1")
+
+    - Use multiple k values: k = 1, 2, 3, 4, 5, 10, 15, 20, 30, 40, 50
+
+    - Suggestion: avoid k values divisible by the number of classes to reduce ties
+
+✅ For each configuration:
+
+  - Compute the confusion matrix
+
+  - Derive metrics: accuracy, precision, recall, F1-score
+
+  - Aggregate results (e.g., average and standard deviation, or interquartile range)
+
+📈 Present results in:
+
+  - Plots (e.g., accuracy vs. k)
+
+  - Tables summarizing classification performance for each digit and k value
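Task 3 derives all metrics from the confusion matrix; the helper functions in main.py below implement them per class. For reference, a small worked example on an invented binary matrix (rows are true classes, columns are predictions):

```python
import numpy as np

cm = np.array([[50,  5],    # 50 true negatives, 5 false positives
               [10, 35]])   # 10 false negatives, 35 true positives
tn, fp, fn, tp = cm.ravel()

accuracy = (tp + tn) / cm.sum()                      # 85 / 100 = 0.85
precision = tp / (tp + fp)                           # 35 / 40  = 0.875
recall = tp / (tp + fn)                              # 35 / 45  ≈ 0.778
f1 = 2 * precision * recall / (precision + recall)   # ≈ 0.824
```

The same numbers can be cross-checked against sklearn.metrics (accuracy_score, precision_score, recall_score, f1_score).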
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..14eb67b
--- /dev/null
+++ b/main.py
@@ -0,0 +1,179 @@
+import numpy as np
+import KNNClassifier
+from tabulate import tabulate
+import matplotlib.pyplot as plt
+from sklearn.datasets import load_wine
+from sklearn.model_selection import train_test_split
+
+
+def confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
+    # Assumes integer labels in the range 0..C-1
+    num_classes = max(y_true.max(), y_pred.max()) + 1
+    cm = np.zeros((num_classes, num_classes), dtype=int)
+    for i in range(len(y_true)):
+        cm[y_true[i], y_pred[i]] += 1
+    return cm
+
+
+def precision(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
+    # Per-class precision: TP / (TP + FP); classes never predicted yield NaN
+    cm = confusion_matrix(y_true, y_pred)
+    return np.diag(cm) / np.sum(cm, axis=0)
+
+
+def recall(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
+    # Per-class recall: TP / (TP + FN); classes absent from y_true yield NaN
+    cm = confusion_matrix(y_true, y_pred)
+    return np.diag(cm) / np.sum(cm, axis=1)
+
+
+def f1_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
+    # Macro F1: harmonic mean of the macro-averaged precision and recall
+    precision_val = np.nanmean(precision(y_true, y_pred))
+    recall_val = np.nanmean(recall(y_true, y_pred))
+    denominator = precision_val + recall_val
+    if denominator == 0:
+        return 0.0
+    return 2 * (precision_val * recall_val) / denominator
+
+
+if __name__ == "__main__":
+    wine = load_wine()
+
+    data: np.ndarray = wine.data
+    target: np.ndarray = wine.target
+
+    x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)
+
+    # Min-max normalize both sets using the training statistics only
+    train_min = x_train.min(axis=0)
+    train_max = x_train.max(axis=0)
+    x_train = (x_train - train_min) / (train_max - train_min)
+    x_test = (x_test - train_min) / (train_max - train_min)
+
+    # Values of k to test
+    k: list[int] = [1, 2, 3, 4, 5, 10, 15, 20, 30, 40, 50]
+
+    # Mimic MATLAB's nargin check: four data arguments plus at least one k value
+    num_mandatory_args = 5
+    num_received_args = len([x_train, y_train, x_test, y_test] + list(k))
+    if num_received_args < num_mandatory_args:
+        raise ValueError("All arguments must be provided")
+
+    classes = np.unique(y_train)
+    for k_val in k:
+        assert 0 < k_val <= x_train.shape[0], "k must be between 1 and the number of training samples"
+        assert isinstance(k_val, int), "k must be an integer"
+
+    # Accuracy results storage
+    accuracy_results: dict = {cls: {k_val: [] for k_val in k} for cls in classes}
+
+    mean_val_prec: list = []
+    median_val_prec: list = []
+    aad_val_prec: list = []
+
+    mean_val_rec: list = []
+    median_val_rec: list = []
+    aad_val_rec: list = []
+
+    mean_val_f1: list = []
+    median_val_f1: list = []
+    aad_val_f1: list = []
+
+    first_quantile_prec: list = []
+    third_quantile_prec: list = []
+
+    first_quantile_rec: list = []
+    third_quantile_rec: list = []
+
+    first_quantile_f1: list = []
+    third_quantile_f1: list = []
+
+    for cls in classes:
+        # Create one-vs-all labels for training and testing
+        binary_y_train = np.where(y_train == cls, 1, 0)
+        binary_y_test = np.where(y_test == cls, 1, 0)
+
+        precision_val: list = []
+        recall_val: list = []
+        f1_score_val: list = []
+
+        for neighbors in k:
+            # Create a k-NN classifier
+            kNN = KNNClassifier.KNNClassifier(k=neighbors)
+            # Train on binary labels
+            kNN.fit(x_train, binary_y_train)
+            # Predict on the test set; pass the binary labels so the returned
+            # error rate is measured against the one-vs-all ground truth
+            y_pred: np.ndarray
+            error_rate: float
+            y_pred, error_rate = kNN.predict(x_test, binary_y_test)
+            print(f"Class {cls}, k={neighbors}, Error rate: {error_rate}")
+
+            # Compute accuracy
+            accuracy = np.mean(y_pred == binary_y_test)
+            accuracy_results[cls][neighbors].append(accuracy)
+
+            # Compute the confusion matrix (also recomputed inside the metric helpers)
+            conf_matrix: np.ndarray = confusion_matrix(binary_y_test, y_pred)
+            # Compute precision, recall, and F1 score
+            precision_val.append(precision(binary_y_test, y_pred))
+            recall_val.append(recall(binary_y_test, y_pred))
+            f1_score_val.append(f1_score(binary_y_test, y_pred))
+
+        # Compute statistical measures of precision, recall, and F1 score over all k
+        mean_val_prec.append(np.mean(precision_val))
+        median_val_prec.append(np.median(precision_val))
+        aad_val_prec.append(np.mean(np.abs(np.array(precision_val) - mean_val_prec[-1])))
+        first_quantile_prec.append(np.percentile(precision_val, 25))
+        third_quantile_prec.append(np.percentile(precision_val, 75))
+
+        mean_val_rec.append(np.mean(recall_val))
+        median_val_rec.append(np.median(recall_val))
+        aad_val_rec.append(np.mean(np.abs(np.array(recall_val) - mean_val_rec[-1])))
+        first_quantile_rec.append(np.percentile(recall_val, 25))
+        third_quantile_rec.append(np.percentile(recall_val, 75))
+
+        mean_val_f1.append(np.mean(f1_score_val))
+        median_val_f1.append(np.median(f1_score_val))
+        aad_val_f1.append(np.mean(np.abs(np.array(f1_score_val) - mean_val_f1[-1])))
+        first_quantile_f1.append(np.percentile(f1_score_val, 25))
+        third_quantile_f1.append(np.percentile(f1_score_val, 75))
+
+        # Print a summary table for each class
+        headers = ["Metric", "Mean", "Median", "AAD", "1st Quantile", "3rd Quantile"]
+        table_data = [
+            ["Precision", mean_val_prec[-1], median_val_prec[-1], aad_val_prec[-1],
+             first_quantile_prec[-1], third_quantile_prec[-1]],
+            ["Recall", mean_val_rec[-1], median_val_rec[-1], aad_val_rec[-1],
+             first_quantile_rec[-1], third_quantile_rec[-1]],
+            ["F1 Score", mean_val_f1[-1], median_val_f1[-1], aad_val_f1[-1],
+             first_quantile_f1[-1], third_quantile_f1[-1]]
+        ]
+        print(f"Class {cls} Statistics over all k values:")
+        print(tabulate(table_data, headers=headers, tablefmt="grid"))
+
+    # Print the aggregate table over all classes
+    headers = ["Metric", "Mean", "Median", "AAD", "1st Quantile", "3rd Quantile"]
+    total_data = [
+        ["Precision", np.mean(mean_val_prec), np.median(mean_val_prec), np.mean(aad_val_prec),
+         np.percentile(mean_val_prec, 25), np.percentile(mean_val_prec, 75)],
+        ["Recall", np.mean(mean_val_rec), np.median(mean_val_rec), np.mean(aad_val_rec),
+         np.percentile(mean_val_rec, 25), np.percentile(mean_val_rec, 75)],
+        ["F1 Score", np.mean(mean_val_f1), np.median(mean_val_f1), np.mean(aad_val_f1),
+         np.percentile(mean_val_f1, 25), np.percentile(mean_val_f1, 75)]
+    ]
+    print("Total Statistics for all classes:")
+    print(tabulate(total_data, headers=headers, tablefmt="grid"))
+
+    # Plot accuracy vs. k for each one-vs-all problem
+    plt.figure(figsize=(12, 8))
+    for cls in classes:
+        plt.plot(k, [100 * np.mean(acc) for acc in accuracy_results[cls].values()],
+                 label=f'Class {cls}', alpha=0.4, marker='o', markersize=5, linewidth=2)
+    plt.xlabel('Number of Neighbors (k)')
+    plt.ylabel('Accuracy (%)')
+    plt.title('k-NN Accuracy for Each Class vs. All Others')
+    plt.legend(loc='best')
+    plt.grid(True, linestyle='--', alpha=0.15)
+    plt.xticks(k)
+    plt.show()
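Beyond what the assignment requires, the custom classifier can be cross-checked against scikit-learn. A sketch, assuming the normalized x_train/x_test and y_train/y_test from main.py are in scope; results should match KNNClassifier up to tie-breaking among equidistant neighbors:

```python
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# One-vs-all labels for a single class (class 0 chosen arbitrarily)
binary_y_train = np.where(y_train == 0, 1, 0)
binary_y_test = np.where(y_test == 0, 1, 0)

for neighbors in (1, 3, 5):
    sk = KNeighborsClassifier(n_neighbors=neighbors)
    sk.fit(x_train, binary_y_train)
    print(f"k={neighbors}: sklearn accuracy = {sk.score(x_test, binary_y_test):.3f}")
```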
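The per-query loop in KNNClassifier.predict is fine for the 178-sample Wine set but becomes slow for the README's MNIST option (10,000 test against 60,000 training samples). A possible vectorized variant (the helper name predict_vectorized is invented here) computes all squared distances at once; memory then becomes the constraint, so chunking x_test may be needed:

```python
import numpy as np

def predict_vectorized(x_train: np.ndarray, y_train: np.ndarray,
                       x_test: np.ndarray, k: int) -> np.ndarray:
    # Squared Euclidean distances via ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2
    d2 = (np.sum(x_test ** 2, axis=1)[:, None]
          - 2.0 * (x_test @ x_train.T)
          + np.sum(x_train ** 2, axis=1)[None, :])
    # Indices of the k smallest distances in each row (order within the k is arbitrary)
    k_idx = np.argpartition(d2, kth=k - 1, axis=1)[:, :k]
    k_labels = y_train[k_idx]          # shape (m, k)
    # Majority vote per test sample; bincount needs non-negative integer labels
    return np.array([np.bincount(row).argmax() for row in k_labels])
```

Because np.argpartition only partially sorts each row, this avoids the full argsort of the loop version while producing the same k nearest neighbors.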