From 8618dad0def5dbabff492b474ee3ea53503fb7b5 Mon Sep 17 00:00:00 2001
From: utsavt9936 <49581123+utsavt9936@users.noreply.github.com>
Date: Wed, 26 Jun 2019 23:25:03 +0530
Subject: [PATCH 1/2] Assignment 2

---
 .../Week_2/Utsav_week2/costFunction.m         |  35 ++++
 .../Week_2/Utsav_week2/costFunctionReg.m      |  34 ++++
 .../Week_2/Utsav_week2/ex2.m                  | 151 ++++++++++++++++++
 .../Week_2/Utsav_week2/ex2_reg.m              | 136 ++++++++++++++++
 .../Week_2/Utsav_week2/mapFeature.m           |  21 +++
 .../Week_2/Utsav_week2/plotData.m             |  31 ++++
 .../Week_2/Utsav_week2/plotDecisionBoundary.m |  48 ++++++
 .../Week_2/Utsav_week2/predict.m              |  35 ++++
 .../Week_2/Utsav_week2/sigmoid.m              |  18 +++
 9 files changed, 509 insertions(+)
 create mode 100644 Programming Assignment/Week_2/Utsav_week2/costFunction.m
 create mode 100644 Programming Assignment/Week_2/Utsav_week2/costFunctionReg.m
 create mode 100644 Programming Assignment/Week_2/Utsav_week2/ex2.m
 create mode 100644 Programming Assignment/Week_2/Utsav_week2/ex2_reg.m
 create mode 100644 Programming Assignment/Week_2/Utsav_week2/mapFeature.m
 create mode 100644 Programming Assignment/Week_2/Utsav_week2/plotData.m
 create mode 100644 Programming Assignment/Week_2/Utsav_week2/plotDecisionBoundary.m
 create mode 100644 Programming Assignment/Week_2/Utsav_week2/predict.m
 create mode 100644 Programming Assignment/Week_2/Utsav_week2/sigmoid.m

diff --git a/Programming Assignment/Week_2/Utsav_week2/costFunction.m b/Programming Assignment/Week_2/Utsav_week2/costFunction.m
new file mode 100644
index 0000000..b13ffa4
--- /dev/null
+++ b/Programming Assignment/Week_2/Utsav_week2/costFunction.m
@@ -0,0 +1,35 @@
+function [J, grad] = costFunction(theta, X, y)
+%COSTFUNCTION Compute cost and gradient for logistic regression
+%   J = COSTFUNCTION(theta, X, y) computes the cost of using theta as the
+%   parameter for logistic regression and the gradient of the cost
+%   w.r.t. the parameters.
+
+% Initialize some useful values
+m = length(y); % number of training examples
+
+% You need to return the following variables correctly
+J = 0;
+grad = zeros(size(theta));
+
+% ====================== YOUR CODE HERE ======================
+% Instructions: Compute the cost of a particular choice of theta.
+%               You should set J to the cost.
+%               Compute the partial derivatives and set grad to the partial
+%               derivatives of the cost w.r.t. each parameter in theta
+J=(-1/(m))*sum(y.*log(sigmoid(X*theta))+(1-y).*log(1-sigmoid(X*theta)));
+
+grad=(1/(m))*X'*(sigmoid(X*theta)-y);
+%
+% Note: grad should have the same dimensions as theta
+%
+
+
+
+
+
+
+
+
+% =============================================================
+
+end
diff --git a/Programming Assignment/Week_2/Utsav_week2/costFunctionReg.m b/Programming Assignment/Week_2/Utsav_week2/costFunctionReg.m
new file mode 100644
index 0000000..3410aaf
--- /dev/null
+++ b/Programming Assignment/Week_2/Utsav_week2/costFunctionReg.m
@@ -0,0 +1,34 @@
+function [J, grad] = costFunctionReg(theta, X, y, lambda)
+%COSTFUNCTIONREG Compute cost and gradient for logistic regression with regularization
+%   J = COSTFUNCTIONREG(theta, X, y, lambda) computes the cost of using
+%   theta as the parameter for regularized logistic regression and the
+%   gradient of the cost w.r.t. the parameters.
+
+% Initialize some useful values
+m = length(y); % number of training examples
+
+% You need to return the following variables correctly
+J = 0;
+grad = zeros(size(theta));
+
+% ====================== YOUR CODE HERE ======================
+% Instructions: Compute the cost of a particular choice of theta.
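+%               (reminder: the regularized cost being implemented here is
+%                   J = (1/m)*sum(-y.*log(h) - (1-y).*log(1-h))
+%                       + (lambda/(2*m))*sum(theta(2:end).^2)
+%                with h = sigmoid(X*theta); theta(1) is not regularized)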
+%               You should set J to the cost.
+%               Compute the partial derivatives and set grad to the partial
+%               derivatives of the cost w.r.t. each parameter in theta
+
+J=(-1/(m))*(sum(y.*log(sigmoid(X*theta))+(1-y).*log(1-sigmoid(X*theta)))-(lambda/2)*(sum(theta.^2)-theta(1)^2));
+
+grad(1)=(1/(m))*(X(:,1)'*(sigmoid(X*theta)-y));
+for i=2:size(X,2)
+    grad(i)=(1/(m))*(X(:,i)'*(sigmoid(X*theta)-y)+lambda*theta(i));
+end
+
+
+
+
+
+
+% =============================================================
+
+end
diff --git a/Programming Assignment/Week_2/Utsav_week2/ex2.m b/Programming Assignment/Week_2/Utsav_week2/ex2.m
new file mode 100644
index 0000000..103fe11
--- /dev/null
+++ b/Programming Assignment/Week_2/Utsav_week2/ex2.m
@@ -0,0 +1,151 @@
+%% Machine Learning Online Class - Exercise 2: Logistic Regression
+%
+%  Instructions
+%  ------------
+%
+%  This file contains code that helps you get started on the logistic
+%  regression exercise. You will need to complete the following functions
+%  in this exercise:
+%
+%     sigmoid.m
+%     costFunction.m
+%     predict.m
+%     costFunctionReg.m
+%
+%  For this exercise, you will not need to change any code in this file,
+%  or any other files other than those mentioned above.
+%

+%% Initialization
+clear ; close all; clc
+
+%% Load Data
+%  The first two columns contain the exam scores and the third column
+%  contains the label.
+
+data = load('ex2data1.txt');
+X = data(:, [1, 2]); y = data(:, 3);
+
+%% ==================== Part 1: Plotting ====================
+%  We start the exercise by first plotting the data to understand
+%  the problem we are working with.
+
+fprintf(['Plotting data with + indicating (y = 1) examples and o ' ...
+         'indicating (y = 0) examples.\n']);
+
+plotData(X, y);
+
+% Put some labels
+hold on;
+% Labels and Legend
+xlabel('Exam 1 score')
+ylabel('Exam 2 score')
+
+% Specified in plot order
+legend('Admitted', 'Not admitted')
+hold off;
+
+fprintf('\nProgram paused. Press enter to continue.\n');
+pause;
+
+
+%% ============ Part 2: Compute Cost and Gradient ============
+%  In this part of the exercise, you will implement the cost and gradient
+%  for logistic regression. You need to complete the code in
+%  costFunction.m
+
+%  Setup the data matrix appropriately, and add ones for the intercept term
+[m, n] = size(X);
+
+% Add intercept term to x and X_test
+X = [ones(m, 1) X];
+
+% Initialize fitting parameters
+initial_theta = zeros(n + 1, 1);
+
+% Compute and display initial cost and gradient
+[cost, grad] = costFunction(initial_theta, X, y);
+
+fprintf('Cost at initial theta (zeros): %f\n', cost);
+fprintf('Expected cost (approx): 0.693\n');
+fprintf('Gradient at initial theta (zeros): \n');
+fprintf(' %f \n', grad);
+fprintf('Expected gradients (approx):\n -0.1000\n -12.0092\n -11.2628\n');
+
+% Compute and display cost and gradient with non-zero theta
+test_theta = [-24; 0.2; 0.2];
+[cost, grad] = costFunction(test_theta, X, y);
+
+fprintf('\nCost at test theta: %f\n', cost);
+fprintf('Expected cost (approx): 0.218\n');
+fprintf('Gradient at test theta: \n');
+fprintf(' %f \n', grad);
+fprintf('Expected gradients (approx):\n 0.043\n 2.566\n 2.647\n');
+
+fprintf('\nProgram paused. Press enter to continue.\n');
+pause;
+
+
+%% ============= Part 3: Optimizing using fminunc =============
+%  In this exercise, you will use a built-in function (fminunc) to find the
+%  optimal parameters theta.
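+%
+%  Note: optimset('GradObj', 'on') tells fminunc that costFunction also
+%  returns the gradient as its second output, so fminunc does not have to
+%  approximate it numerically. (On recent MATLAB releases the equivalent
+%  is optimoptions('fminunc', 'SpecifyObjectiveGradient', true); optimset
+%  still works in Octave.)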
+
+%  Set options for fminunc
+options = optimset('GradObj', 'on', 'MaxIter', 400);
+
+%  Run fminunc to obtain the optimal theta
+%  This function will return theta and the cost
+[theta, cost] = ...
+    fminunc(@(t)(costFunction(t, X, y)), initial_theta, options);
+
+% Print theta to screen
+fprintf('Cost at theta found by fminunc: %f\n', cost);
+fprintf('Expected cost (approx): 0.203\n');
+fprintf('theta: \n');
+fprintf(' %f \n', theta);
+fprintf('Expected theta (approx):\n');
+fprintf(' -25.161\n 0.206\n 0.201\n');
+
+% Plot Boundary
+plotDecisionBoundary(theta, X, y);
+
+% Put some labels
+hold on;
+% Labels and Legend
+xlabel('Exam 1 score')
+ylabel('Exam 2 score')
+
+% Specified in plot order
+legend('Admitted', 'Not admitted')
+hold off;
+
+fprintf('\nProgram paused. Press enter to continue.\n');
+pause;
+
+%% ============== Part 4: Predict and Accuracies ==============
+%  After learning the parameters, you'll want to use them to predict the
+%  outcomes on unseen data. In this part, you will use the logistic
+%  regression model to predict the probability that a student with score
+%  45 on exam 1 and score 85 on exam 2 will be admitted.
+%
+%  Furthermore, you will compute the training and test set accuracies of
+%  our model.
+%
+%  Your task is to complete the code in predict.m
+
+%  Predict probability for a student with score 45 on exam 1
+%  and score 85 on exam 2
+
+prob = sigmoid([1 45 85] * theta);
+fprintf(['For a student with scores 45 and 85, we predict an admission ' ...
+         'probability of %f\n'], prob);
+fprintf('Expected value: 0.775 +/- 0.002\n\n');
+
+% Compute accuracy on our training set
+p = predict(theta, X);
+
+fprintf('Train Accuracy: %f\n', mean(double(p == y)) * 100);
+fprintf('Expected accuracy (approx): 89.0\n');
+fprintf('\n');
+
+
diff --git a/Programming Assignment/Week_2/Utsav_week2/ex2_reg.m b/Programming Assignment/Week_2/Utsav_week2/ex2_reg.m
new file mode 100644
index 0000000..f363318
--- /dev/null
+++ b/Programming Assignment/Week_2/Utsav_week2/ex2_reg.m
@@ -0,0 +1,136 @@
+%% Machine Learning Online Class - Exercise 2: Logistic Regression
+%
+%  Instructions
+%  ------------
+%
+%  This file contains code that helps you get started on the second part
+%  of the exercise which covers regularization with logistic regression.
+%
+%  You will need to complete the following functions in this exercise:
+%
+%     sigmoid.m
+%     costFunction.m
+%     predict.m
+%     costFunctionReg.m
+%
+%  For this exercise, you will not need to change any code in this file,
+%  or any other files other than those mentioned above.
+%
+
+%% Initialization
+clear ; close all; clc
+
+%% Load Data
+%  The first two columns contain the X values and the third column
+%  contains the label (y).
+
+data = load('ex2data2.txt');
+X = data(:, [1, 2]); y = data(:, 3);
+
+plotData(X, y);
+
+% Put some labels
+hold on;
+
+% Labels and Legend
+xlabel('Microchip Test 1')
+ylabel('Microchip Test 2')
+
+% Specified in plot order
+legend('y = 1', 'y = 0')
+hold off;
+
+
+%% =========== Part 1: Regularized Logistic Regression ============
+%  In this part, you are given a dataset with data points that are not
+%  linearly separable. However, you would still like to use logistic
+%  regression to classify the data points.
+%
+%  To do so, you introduce more features to use -- in particular, you add
+%  polynomial features to our data matrix (similar to polynomial
+%  regression).
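+%  (mapFeature below expands x1, x2 into every monomial x1^i * x2^j with
+%  i + j <= 6, giving 28 features including the column of ones)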
+%
+
+% Add Polynomial Features
+
+% Note that mapFeature also adds a column of ones for us, so the intercept
+% term is handled
+X = mapFeature(X(:,1), X(:,2));
+
+% Initialize fitting parameters
+initial_theta = zeros(size(X, 2), 1);
+
+% Set regularization parameter lambda to 1
+lambda = 1;
+
+% Compute and display initial cost and gradient for regularized logistic
+% regression
+[cost, grad] = costFunctionReg(initial_theta, X, y, lambda);
+
+fprintf('Cost at initial theta (zeros): %f\n', cost);
+fprintf('Expected cost (approx): 0.693\n');
+fprintf('Gradient at initial theta (zeros) - first five values only:\n');
+fprintf(' %f \n', grad(1:5));
+fprintf('Expected gradients (approx) - first five values only:\n');
+fprintf(' 0.0085\n 0.0188\n 0.0001\n 0.0503\n 0.0115\n');
+
+fprintf('\nProgram paused. Press enter to continue.\n');
+pause;
+
+% Compute and display cost and gradient
+% with all-ones theta and lambda = 10
+test_theta = ones(size(X,2),1);
+[cost, grad] = costFunctionReg(test_theta, X, y, 10);
+
+fprintf('\nCost at test theta (with lambda = 10): %f\n', cost);
+fprintf('Expected cost (approx): 3.16\n');
+fprintf('Gradient at test theta - first five values only:\n');
+fprintf(' %f \n', grad(1:5));
+fprintf('Expected gradients (approx) - first five values only:\n');
+fprintf(' 0.3460\n 0.1614\n 0.1948\n 0.2269\n 0.0922\n');
+
+fprintf('\nProgram paused. Press enter to continue.\n');
+pause;
+
+%% ============= Part 2: Regularization and Accuracies =============
+%  Optional Exercise:
+%  In this part, you will get to try different values of lambda and
+%  see how regularization affects the decision boundary
+%
+%  Try the following values of lambda (0, 1, 10, 100).
+%
+%  How does the decision boundary change when you vary lambda? How does
+%  the training set accuracy vary?
+%
+
+% Initialize fitting parameters
+initial_theta = zeros(size(X, 2), 1);
+
+% Set regularization parameter lambda to 1 (you should vary this)
+lambda = 1;
+
+% Set Options
+options = optimset('GradObj', 'on', 'MaxIter', 400);
+
+% Optimize
+[theta, J, exit_flag] = ...
+    fminunc(@(t)(costFunctionReg(t, X, y, lambda)), initial_theta, options);
+
+% Plot Boundary
+plotDecisionBoundary(theta, X, y);
+hold on;
+title(sprintf('lambda = %g', lambda))
+
+% Labels and Legend
+xlabel('Microchip Test 1')
+ylabel('Microchip Test 2')
+
+legend('y = 1', 'y = 0', 'Decision boundary')
+hold off;
+
+% Compute accuracy on our training set
+p = predict(theta, X);
+
+fprintf('Train Accuracy: %f\n', mean(double(p == y)) * 100);
+fprintf('Expected accuracy (with lambda = 1): 83.1 (approx)\n');
+
diff --git a/Programming Assignment/Week_2/Utsav_week2/mapFeature.m b/Programming Assignment/Week_2/Utsav_week2/mapFeature.m
new file mode 100644
index 0000000..d02a72a
--- /dev/null
+++ b/Programming Assignment/Week_2/Utsav_week2/mapFeature.m
@@ -0,0 +1,21 @@
+function out = mapFeature(X1, X2)
+% MAPFEATURE Feature mapping function to polynomial features
+%
+%   MAPFEATURE(X1, X2) maps the two input features
+%   to quadratic features used in the regularization exercise.
+%
+%   Returns a new feature array with more features, comprising
+%   X1, X2, X1.^2, X2.^2, X1*X2, X1*X2.^2, etc..
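+%   For degree = 6 the output has 1 + 2 + ... + 7 = 28 columns, the first
+%   being the all-ones bias column.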
+%
+%   Inputs X1, X2 must be the same size
+%
+
+degree = 6;
+out = ones(size(X1(:,1)));
+for i = 1:degree
+    for j = 0:i
+        out(:, end+1) = (X1.^(i-j)).*(X2.^j);
+    end
+end
+
+end
\ No newline at end of file
diff --git a/Programming Assignment/Week_2/Utsav_week2/plotData.m b/Programming Assignment/Week_2/Utsav_week2/plotData.m
new file mode 100644
index 0000000..febf337
--- /dev/null
+++ b/Programming Assignment/Week_2/Utsav_week2/plotData.m
@@ -0,0 +1,31 @@
+function plotData(X, y)
+%PLOTDATA Plots the data points X and y into a new figure
+%   PLOTDATA(x,y) plots the data points with + for the positive examples
+%   and o for the negative examples. X is assumed to be a Mx2 matrix.
+
+% Create New Figure
+figure; hold on;
+
+% ====================== YOUR CODE HERE ======================
+% Instructions: Plot the positive and negative examples on a
+%               2D plot, using the option 'k+' for the positive
+%               examples and 'ko' for the negative examples.
+%
+
+pos=find(y==1);neg=find(y==0);
+plot(X(pos,1),X(pos,2),'k+','LineWidth',2,'MarkerSize',7);
+plot(X(neg,1),X(neg,2),'ko','MarkerFaceColor','y','MarkerSize',7);
+
+
+
+
+
+
+
+
+% =========================================================================
+
+
+
+hold off;
+
+end
diff --git a/Programming Assignment/Week_2/Utsav_week2/plotDecisionBoundary.m b/Programming Assignment/Week_2/Utsav_week2/plotDecisionBoundary.m
new file mode 100644
index 0000000..cd36314
--- /dev/null
+++ b/Programming Assignment/Week_2/Utsav_week2/plotDecisionBoundary.m
@@ -0,0 +1,48 @@
+function plotDecisionBoundary(theta, X, y)
+%PLOTDECISIONBOUNDARY Plots the data points X and y into a new figure with
+%the decision boundary defined by theta
+%   PLOTDECISIONBOUNDARY(theta, X,y) plots the data points with + for the
+%   positive examples and o for the negative examples. X is assumed to be
+%   either
+%   1) Mx3 matrix, where the first column is an all-ones column for the
+%      intercept.
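+%      (in this case the boundary is the straight line where
+%       theta(1) + theta(2)*x1 + theta(3)*x2 = 0)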
+%   2) MxN, N>3 matrix, where the first column is all-ones
+
+% Plot Data
+plotData(X(:,2:3), y);
+hold on
+
+if size(X, 2) <= 3
+    % Only need 2 points to define a line, so choose two endpoints
+    plot_x = [min(X(:,2))-2, max(X(:,2))+2];
+
+    % Calculate the decision boundary line
+    plot_y = (-1./theta(3)).*(theta(2).*plot_x + theta(1));
+
+    % Plot, and adjust axes for better viewing
+    plot(plot_x, plot_y)
+
+    % Legend, specific for the exercise
+    legend('Admitted', 'Not admitted', 'Decision Boundary')
+    axis([30, 100, 30, 100])
+else
+    % Here is the grid range
+    u = linspace(-1, 1.5, 50);
+    v = linspace(-1, 1.5, 50);
+
+    z = zeros(length(u), length(v));
+    % Evaluate z = theta*x over the grid
+    for i = 1:length(u)
+        for j = 1:length(v)
+            z(i,j) = mapFeature(u(i), v(j))*theta;
+        end
+    end
+    z = z'; % important to transpose z before calling contour
+
+    % Plot z = 0
+    % Notice you need to specify the range [0, 0]
+    contour(u, v, z, [0, 0], 'LineWidth', 2)
+end
+hold off
+
+end
diff --git a/Programming Assignment/Week_2/Utsav_week2/predict.m b/Programming Assignment/Week_2/Utsav_week2/predict.m
new file mode 100644
index 0000000..58c3c37
--- /dev/null
+++ b/Programming Assignment/Week_2/Utsav_week2/predict.m
@@ -0,0 +1,35 @@
+function p = predict(theta, X)
+%PREDICT Predict whether the label is 0 or 1 using learned logistic
+%regression parameters theta
+%   p = PREDICT(theta, X) computes the predictions for X using a
+%   threshold at 0.5 (i.e., if sigmoid(theta'*x) >= 0.5, predict 1)
+
+m = size(X, 1); % Number of training examples
+
+% You need to return the following variables correctly
+p = zeros(m, 1);
+
+% ====================== YOUR CODE HERE ======================
+% Instructions: Complete the following code to make predictions using
+%               your learned logistic regression parameters.
+%               You should set p to a vector of 0's and 1's
+%
+for i=1:m
+    if sigmoid(theta'*X(i,:)')>=0.5
+        p(i)=1;
+    else
+        p(i)=0;
+    end
+
+end
+
+
+
+
+
+
+
+% =========================================================================
+
+
+end
diff --git a/Programming Assignment/Week_2/Utsav_week2/sigmoid.m b/Programming Assignment/Week_2/Utsav_week2/sigmoid.m
new file mode 100644
index 0000000..5d4868a
--- /dev/null
+++ b/Programming Assignment/Week_2/Utsav_week2/sigmoid.m
@@ -0,0 +1,18 @@
+function g = sigmoid(z)
+%SIGMOID Compute sigmoid function
+%   g = SIGMOID(z) computes the sigmoid of z.
+
+% You need to return the following variables correctly
+g = zeros(size(z));
+
+% ====================== YOUR CODE HERE ======================
+% Instructions: Compute the sigmoid of each value of z (z can be a matrix,
+%               vector or scalar).
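+%
+%               Sanity check after implementing: sigmoid(0) must return
+%               0.5, sigmoid of a large positive z approaches 1, and
+%               sigmoid of a large negative z approaches 0; the same
+%               holds element-wise for vectors and matrices.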
+
+
+g=1./(1+exp(-z));
+
+
+% =============================================================
+
+end

From c84c3fb9fcde6dc6e1b30bac0342defd05e7e4f3 Mon Sep 17 00:00:00 2001
From: utsavt9936 <49581123+utsavt9936@users.noreply.github.com>
Date: Wed, 26 Jun 2019 23:32:31 +0530
Subject: [PATCH 2/2] Assignment 3

---
 .../Chirag_week3/Utsav_week3/dataset3Params.m |  57 ++++++++
 .../Chirag_week3/Utsav_week3/emailFeatures.m  |  63 ++++++++
 .../Chirag_week3/Utsav_week3/gaussianKernel.m |  26 ++++
 .../Chirag_week3/Utsav_week3/processEmail.m   | 135 ++++++++++++++++++
 4 files changed, 281 insertions(+)
 create mode 100644 Programming Assignment/week_3/Chirag_week3/Utsav_week3/dataset3Params.m
 create mode 100644 Programming Assignment/week_3/Chirag_week3/Utsav_week3/emailFeatures.m
 create mode 100644 Programming Assignment/week_3/Chirag_week3/Utsav_week3/gaussianKernel.m
 create mode 100644 Programming Assignment/week_3/Chirag_week3/Utsav_week3/processEmail.m

diff --git a/Programming Assignment/week_3/Chirag_week3/Utsav_week3/dataset3Params.m b/Programming Assignment/week_3/Chirag_week3/Utsav_week3/dataset3Params.m
new file mode 100644
index 0000000..5cff76e
--- /dev/null
+++ b/Programming Assignment/week_3/Chirag_week3/Utsav_week3/dataset3Params.m
@@ -0,0 +1,57 @@
+function [C, sigma] = dataset3Params(X, y, Xval, yval)
+%DATASET3PARAMS returns your choice of C and sigma for Part 3 of the exercise
+%where you select the optimal (C, sigma) learning parameters to use for SVM
+%with RBF kernel
+%   [C, sigma] = DATASET3PARAMS(X, y, Xval, yval) returns your choice of C and
+%   sigma. You should complete this function to return the optimal C and
+%   sigma based on a cross-validation set.
+%
+
+% You need to return the following variables correctly.
+C = 1;
+sigma = 0.3;
+c=[0.01; 0.03; 0.1; 0.3; 1; 3; 10; 30];
+s=[0.01; 0.03; 0.1; 0.3; 1; 3; 10; 30];
+
+% ====================== YOUR CODE HERE ======================
+% Instructions: Fill in this function to return the optimal C and sigma
+%               learning parameters found using the cross validation set.
+%               You can use svmPredict to predict the labels on the cross
+%               validation set. For example,
+%                   predictions = svmPredict(model, Xval);
+%               will return the predictions on the cross validation set.
+%
+%  Note: You can compute the prediction error using
+%        mean(double(predictions ~= yval))
+%
+eff=0;        % best cross-validation accuracy found so far
+m=C; k=sigma; % best (C, sigma) found so far (defaults if nothing improves)
+for i=1:8
+    for j=1:8
+        C=c(i);
+        sigma=s(j);
+        model= svmTrain(X, y, C, @(x1, x2) gaussianKernel(x1, x2, sigma));
+        visualizeBoundary(X, y, model); % optional: redraws on each of the 64 trials
+        predictions=svmPredict(model,Xval);
+        e=mean(double(predictions == yval))*100; % validation accuracy in percent
+        if e>eff
+            eff=e;
+            m=C;
+            k=sigma;
+        end
+    end
+end
+
+
+
+
+
+
+C=m;
+sigma=k;
+
+
+% =========================================================================
+
+end
diff --git a/Programming Assignment/week_3/Chirag_week3/Utsav_week3/emailFeatures.m b/Programming Assignment/week_3/Chirag_week3/Utsav_week3/emailFeatures.m
new file mode 100644
index 0000000..0cdc2a2
--- /dev/null
+++ b/Programming Assignment/week_3/Chirag_week3/Utsav_week3/emailFeatures.m
@@ -0,0 +1,63 @@
+function x = emailFeatures(word_indices)
+%EMAILFEATURES takes in a word_indices vector and produces a feature vector
+%from the word indices
+%   x = EMAILFEATURES(word_indices) takes in a word_indices vector and
+%   produces a feature vector from the word indices.
+
+% Total number of words in the dictionary
+n = 1899;
+
+% You need to return the following variables correctly.
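+% (x is a binary indicator vector over the n = 1899 vocabulary words)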
+x = zeros(n, 1);
+
+% ====================== YOUR CODE HERE ======================
+% Instructions: Fill in this function to return a feature vector for the
+%               given email (word_indices). To help make it easier to
+%               process the emails, we have already pre-processed each
+%               email and converted each word in the email into an index in
+%               a fixed dictionary (of 1899 words). The variable
+%               word_indices contains the list of indices of the words
+%               which occur in one email.
+%
+%               Concretely, if an email has the text:
+%
+%                  The quick brown fox jumped over the lazy dog.
+%
+%               Then, the word_indices vector for this text might look
+%               like:
+%
+%                   60  100   33   44   10     53  60  58   5
+%
+%               where, we have mapped each word onto a number, for example:
+%
+%                   the   -- 60
+%                   quick -- 100
+%                   ...
+%
+%              (note: the above numbers are just an example and are not the
+%               actual mappings).
+%
+%              Your task is to take one such word_indices vector and construct
+%              a binary feature vector that indicates whether a particular
+%              word occurs in the email. That is, x(i) = 1 when word i
+%              is present in the email. Concretely, if the word 'the' (say,
+%              index 60) appears in the email, then x(60) = 1. The feature
+%              vector should look like:
+%
+%              x = [ 0 0 0 0 1 0 0 0 ... 0 0 0 0 1 ... 0 0 0 1 0 ..];
+%
+%
+
+
+
+
+
+
+
+
+% =========================================================================
+for i=1:length(word_indices)
+    x(word_indices(i))=1;
+end
+
+end
diff --git a/Programming Assignment/week_3/Chirag_week3/Utsav_week3/gaussianKernel.m b/Programming Assignment/week_3/Chirag_week3/Utsav_week3/gaussianKernel.m
new file mode 100644
index 0000000..176fa6e
--- /dev/null
+++ b/Programming Assignment/week_3/Chirag_week3/Utsav_week3/gaussianKernel.m
@@ -0,0 +1,26 @@
+function sim = gaussianKernel(x1, x2, sigma)
+%GAUSSIANKERNEL returns a radial basis function kernel between x1 and x2
+%   sim = gaussianKernel(x1, x2) returns a Gaussian kernel between x1 and x2
+%   and returns the value in sim
+
+% Ensure that x1 and x2 are column vectors
+x1 = x1(:); x2 = x2(:);
+
+% You need to return the following variables correctly.
+sim = 0;
+
+% ====================== YOUR CODE HERE ======================
+% Instructions: Fill in this function to return the similarity between x1
+%               and x2 computed using a Gaussian kernel with bandwidth
+%               sigma
+%
+%
+sim=exp(-sum((x1-x2).*(x1-x2))/(2*sigma^2));
+
+
+
+
+
+% =============================================================
+
+end
diff --git a/Programming Assignment/week_3/Chirag_week3/Utsav_week3/processEmail.m b/Programming Assignment/week_3/Chirag_week3/Utsav_week3/processEmail.m
new file mode 100644
index 0000000..dc39721
--- /dev/null
+++ b/Programming Assignment/week_3/Chirag_week3/Utsav_week3/processEmail.m
@@ -0,0 +1,135 @@
+function word_indices = processEmail(email_contents)
+%PROCESSEMAIL preprocesses the body of an email and
+%returns a list of word_indices
+%   word_indices = PROCESSEMAIL(email_contents) preprocesses
+%   the body of an email and returns a list of indices of the
+%   words contained in the email.
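+%   The preprocessing below lower-cases the text, strips HTML tags, and
+%   normalizes numbers, URLs, email addresses and dollar signs to the
+%   tokens 'number', 'httpaddr', 'emailaddr' and 'dollar' before each
+%   word is stemmed.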
+%
+
+% Load Vocabulary
+vocabList = getVocabList();
+
+% Init return value
+word_indices = [];
+
+% ========================== Preprocess Email ===========================
+
+% Find the Headers ( \n\n and remove )
+% Uncomment the following lines if you are working with raw emails with the
+% full headers
+
+% hdrstart = strfind(email_contents, ([char(10) char(10)]));
+% email_contents = email_contents(hdrstart(1):end);
+
+% Lower case
+email_contents = lower(email_contents);
+
+% Strip all HTML
+% Looks for any expression that starts with <, ends with >, and contains
+% no < or > in between, and replaces it with a space
+email_contents = regexprep(email_contents, '<[^<>]+>', ' ');
+
+% Handle Numbers
+% Look for one or more characters between 0-9
+email_contents = regexprep(email_contents, '[0-9]+', 'number');
+
+% Handle URLS
+% Look for strings starting with http:// or https://
+email_contents = regexprep(email_contents, ...
+                           '(http|https)://[^\s]*', 'httpaddr');
+
+% Handle Email Addresses
+% Look for strings with @ in the middle
+email_contents = regexprep(email_contents, '[^\s]+@[^\s]+', 'emailaddr');
+
+% Handle $ sign
+email_contents = regexprep(email_contents, '[$]+', 'dollar');
+
+
+% ========================== Tokenize Email ===========================
+
+% Output the email to screen as well
+fprintf('\n==== Processed Email ====\n\n');
+
+% Process file
+l = 0;
+
+while ~isempty(email_contents)
+
+    % Tokenize and also get rid of any punctuation
+    [str, email_contents] = ...
+       strtok(email_contents, ...
+              [' @$/#.-:&*+=[]?!(){},''">_<;%' char(10) char(13)]);
+
+    % Remove any non alphanumeric characters
+    str = regexprep(str, '[^a-zA-Z0-9]', '');
+
+    % Stem the word
+    % (the porterStemmer sometimes has issues, so we use a try catch block)
+    try str = porterStemmer(strtrim(str));
+    catch str = ''; continue;
+    end;
+
+    % Skip the word if it is too short
+    if length(str) < 1
+       continue;
+    end
+
+    % Look up the word in the dictionary and add to word_indices if
+    % found
+    % ====================== YOUR CODE HERE ======================
+    % Instructions: Fill in this function to add the index of str to
+    %               word_indices if it is in the vocabulary. At this point
+    %               of the code, you have a stemmed word from the email in
+    %               the variable str. You should look up str in the
+    %               vocabulary list (vocabList). If a match exists, you
+    %               should add the index of the word to the word_indices
+    %               vector. Concretely, if str = 'action', then you should
+    %               look up the vocabulary list to find where in vocabList
+    %               'action' appears. For example, if vocabList{18} =
+    %               'action', then, you should add 18 to the word_indices
+    %               vector (e.g., word_indices = [word_indices ; 18]; ).
+    %
+    % Note: vocabList{idx} returns the word with index idx in the
+    %       vocabulary list.
+    %
+    % Note: You can use strcmp(str1, str2) to compare two strings (str1 and
+    %       str2). It will return 1 only if the two strings are equivalent.
+    %
+
+    for j=1:length(vocabList)
+        if strcmp(str,vocabList{j})
+            word_indices = [word_indices ; j];
+        end
+    end
+
+
+
+
+
+
+
+
+
+    % =============================================================
+
+    % Print to screen, ensuring that the output lines are not too long
+    if (l + length(str) + 1) > 78
+        fprintf('\n');
+        l = 0;
+    end
+    fprintf('%s ', str);
+    l = l + length(str) + 1;
+
+end
+
+% Print footer
+fprintf('\n\n=========================\n');
+
+end
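Reviewer note on costFunctionReg.m: the per-parameter gradient loop above can also be written in fully vectorized form. A minimal sketch, assuming the same theta, X, y and lambda conventions as the files in this patch (h and reg are illustrative local names, not part of the assignment):

    h = sigmoid(X * theta);                 % m x 1 vector of predictions
    reg = (lambda / m) * theta;             % regularization term for each parameter
    reg(1) = 0;                             % the intercept theta(1) is not regularized
    grad = (1 / m) * (X' * (h - y)) + reg;  % same values as the loop produces

Zeroing the first element of reg reproduces the special case the loop handles separately for grad(1).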