diff --git a/ds701_book/04-Linear-Algebra-Improvements.md b/ds701_book/04-Linear-Algebra-Improvements.md new file mode 100644 index 00000000..06e7c740 --- /dev/null +++ b/ds701_book/04-Linear-Algebra-Improvements.md @@ -0,0 +1,12 @@ +# Notes on Improvements to make to 04-Linear-Algebra-Refresher + +Consider reordering the entire lecture to start with simple systems of linear +equations and how to represent them as vectors and matrices, and then how to +solve them, types of solutions, what the A matrices says about the solutions. +Then go into the geometry of linear algebra, etc. See Strang MIT OCW linear +algebra course for ideas. + +Make the figures interactive. +For example on scalar multiplications of vectors, have a slider to change scaler +values between -2 and 2. + diff --git a/ds701_book/05-DTW-example.ipynb b/ds701_book/05-DTW-example.ipynb new file mode 100644 index 00000000..dbf27ef4 --- /dev/null +++ b/ds701_book/05-DTW-example.ipynb @@ -0,0 +1,383 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DTW Example\n", + "\n", + "Based on chatgpt prompt." 
def dtw(x, y):
    """
    Compute the Dynamic Time Warping (DTW) alignment of two 1-D time series.

    The local cost of pairing x[i] with y[j] is abs(x[i] - y[j]); the
    accumulated cost of a cell is its local cost plus the cheapest of its
    three predecessors (up, left, diagonal).

    Parameters
    ----------
    x, y : non-empty sequences of numbers (lists or 1-D arrays).

    Returns
    -------
    dtw_matrix : np.ndarray of shape (len(x), len(y))
        Accumulated cost matrix (the padding row/column is stripped).
    distance : float
        Total DTW cost, i.e. dtw_matrix[-1, -1].
    warping_path : list of (i, j) tuples
        Index pairs from (0, 0) to (len(x) - 1, len(y) - 1).

    Raises
    ------
    ValueError
        If either series is empty (there is no alignment to compute).
    """
    n, m = len(x), len(y)
    if n == 0 or m == 0:
        raise ValueError("dtw requires two non-empty time series")

    # One extra row/column of +inf so that border cells need no special case;
    # the single 0 at [0, 0] is the only valid starting predecessor.
    padded = np.full((n + 1, m + 1), np.inf)
    padded[0, 0] = 0

    # Populate the accumulated cost matrix.
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = abs(x[i - 1] - y[j - 1])
            last_min = min(
                padded[i - 1, j],      # advance in x only
                padded[i, j - 1],      # advance in y only
                padded[i - 1, j - 1],  # advance in both
            )
            padded[i, j] = cost + last_min

    dtw_matrix = padded[1:, 1:]  # Remove the extra padding.

    # Backtrack from the last cell to (0, 0), always moving to the cheapest
    # predecessor. np.argmin returns the first minimum, so ties are resolved
    # in the order: up, left, diagonal.
    i, j = n - 1, m - 1
    warping_path = [(i, j)]

    while i > 0 or j > 0:
        if i == 0:
            j -= 1
        elif j == 0:
            i -= 1
        else:
            steps = [dtw_matrix[i - 1, j], dtw_matrix[i, j - 1], dtw_matrix[i - 1, j - 1]]
            argmin = np.argmin(steps)
            if argmin == 0:
                i -= 1
            elif argmin == 1:
                j -= 1
            else:
                i -= 1
                j -= 1
        warping_path.append((i, j))

    warping_path.reverse()
    return dtw_matrix, dtw_matrix[-1, -1], warping_path
}, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DTW distance between the two time series: 14.8877\n" + ] + } + ], + "source": [ + "\n", + "# Compute DTW.\n", + "dtw_matrix, distance, path = dtw(x, y)\n", + "print(f\"DTW distance between the two time series: {distance:.4f}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "L2 norm between the two time series: 6.7984\n" + ] + } + ], + "source": [ + "# also calculate the l2 norm between the two\n", + "l2_norm = np.linalg.norm(x - y)\n", + "print(f\"L2 norm between the two time series: {l2_norm:.4f}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# Plot the time series.\n", + "plt.figure(figsize=(12, 6))\n", + "plt.subplot(2, 1, 1)\n", + "plt.plot(t, x, label='Time Series 1')\n", + "plt.plot(t, y, label='Time Series 2')\n", + "plt.title('Artificial Time Series')\n", + "plt.legend()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
def update_plot(shift):
    """
    Redraw the DTW demo for one phase shift (callback for the slider).

    Reads the module-level sample times `t` and reference series `x`,
    builds y = sin(t + shift), aligns x and y with `dtw`, and draws a
    two-panel figure: the two series on top, the accumulated cost matrix
    with the warping path below. The DTW distance and the L2 norm of
    (x - y) are shown in the lower panel's title.

    Parameters
    ----------
    shift : float
        Phase shift (radians) applied to the second sine wave.
    """
    y = np.sin(t + shift)  # Phase-shifted sine wave.

    # Compute DTW and, for comparison, the plain Euclidean distance.
    dtw_matrix, distance, path = dtw(x, y)
    l2_norm = np.linalg.norm(x - y)

    # Build exactly one figure per call. The previous plt.clf() acted on
    # whatever figure was current (creating a blank one if none existed)
    # rather than on the figure drawn below, so it is dropped.
    fig, (ax_series, ax_cost) = plt.subplots(2, 1, figsize=(12, 6))

    # Plot the time series.
    ax_series.plot(t, x, label='Time Series 1')
    ax_series.plot(t, y, label='Time Series 2')
    ax_series.set_title(f'Artificial Time Series (Shift: {shift:.2f})')
    ax_series.legend()

    # Plot the DTW matrix and warping path. The matrix is transposed so that
    # series 1 indices run along the horizontal axis.
    image = ax_cost.imshow(dtw_matrix.T, origin='lower', cmap='gray', interpolation='nearest')
    path_x, path_y = zip(*path)
    ax_cost.plot(path_x, path_y, 'r')  # Warping path.
    ax_cost.set_title(f'DTW Matrix with Warping Path\nDTW Distance: {distance:.4f}, L2 Norm: {l2_norm:.4f}')
    ax_cost.set_xlabel('Time Series 1 Index')
    ax_cost.set_ylabel('Time Series 2 Index')
    fig.colorbar(image, ax=ax_cost)

    fig.tight_layout()
    plt.show()
# Two noisy sine waves, each carrying four narrow Gaussian bumps.
# Series 1 has its bumps exactly at BUMP_CENTERS; series 2 is phase-shifted
# and each of its bumps is displaced by its own random offset.
BUMP_CENTERS = (np.pi / 2, np.pi, 3 * np.pi / 2, 2 * np.pi)
NOISE_STD = 0.1
MAX_OFFSET = 0.5  # offsets are drawn uniformly from [-MAX_OFFSET, MAX_OFFSET)


def gaussian_bump(t, center, shift=0.0, width=0.1):
    """Unnormalised Gaussian bump exp(-0.5 * ((t - center + shift) / width)**2)."""
    return np.exp(-0.5 * ((t - center + shift) / width) ** 2)


t = np.linspace(0, 2 * np.pi, 100)

# First series: noisy sine plus one bump at every centre.
x = np.sin(t) + np.random.normal(0, NOISE_STD, t.shape)
x += sum(gaussian_bump(t, center) for center in BUMP_CENTERS)

# Second series: the sine gets a random phase offset in [-0.5, 0.5), then
# every bump gets an independent offset from the same range.
y = np.sin(t + np.random.uniform(-MAX_OFFSET, MAX_OFFSET)) + np.random.normal(0, NOISE_STD, t.shape)
for center in BUMP_CENTERS:
    y += gaussian_bump(t, center, shift=np.random.uniform(-MAX_OFFSET, MAX_OFFSET))

# Plot the new time series
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(t, x, label='Time Series 1')
ax.plot(t, y, label='Time Series 2')
ax.set_title('Artificial Time Series with Gaussian Noise and Distributions')
ax.legend()
plt.show()
"execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "L2 norm between the two time series: 4.4303\n" + ] + } + ], + "source": [ + "# also calculate the l2 norm between the two\n", + "l2_norm = np.linalg.norm(x - y)\n", + "print(f\"L2 norm between the two time series: {l2_norm:.4f}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/ds701_book/09-GMM-EM-Convergence.qmd b/ds701_book/09-GMM-EM-Convergence.qmd new file mode 100644 index 00000000..6d3150f7 --- /dev/null +++ b/ds701_book/09-GMM-EM-Convergence.qmd @@ -0,0 +1,56 @@ +--- +title: "GMM EM Convergence" +--- + +The convergence criteria for the Expectation-Maximization (EM) algorithm generally revolve around assessing the change in either the model parameters or the likelihood function across iterations. Here are the common convergence criteria used: + +1. Log-Likelihood Convergence (Most Common) + +The EM algorithm seeks to maximize the log-likelihood of the observed data under the current model parameters. A common convergence criterion is based on the change in the log-likelihood value between successive iterations. The algorithm stops when the difference between the log-likelihood in two consecutive iterations is smaller than a predefined threshold (tolerance), typically denoted as tol. + +Convergence criterion: + +Where: + + •  is the log-likelihood at iteration , + •  is a small positive number (e.g., ). + +2. 
Parameter Convergence + +Instead of focusing on the log-likelihood, another approach is to check whether the model parameters (means, covariances, and mixture weights) have stabilized. This can be useful when the log-likelihood changes only marginally but the parameter values continue to evolve. + +Convergence criterion: + +$$\|\theta^{(t+1)} - \theta^{(t)}\|_2 < \epsilon$$ + +Where: + + • $\theta^{(t)}$ represents the model parameters (means, covariances, and weights) at iteration $t$, + • $\|\cdot\|_2$ is the Euclidean (L2) norm, + • $\epsilon$ is a small positive number. + +3. Responsibility Convergence + +This criterion checks whether the soft assignments or responsibilities (posterior probabilities of cluster membership) have stabilized across iterations. If the change in responsibilities between iterations is smaller than a threshold, the algorithm stops. + +Convergence criterion: + +$$\max_{i,k} \left|\gamma_{ik}^{(t+1)} - \gamma_{ik}^{(t)}\right| < \epsilon$$ + +Where: + + • $\gamma_{ik}^{(t)}$ is the responsibility of data point $i$ for cluster $k$ at iteration $t$, + • $\epsilon$ is a small positive number. + +4. Maximum Number of Iterations + +The EM algorithm is typically capped at a maximum number of iterations to avoid long runtimes in cases where the log-likelihood or parameters converge very slowly or never fully stabilize. + +Criterion: + +$$t \geq T_{\max}$$ + +Where: + + • $T_{\max}$ is a predefined limit (e.g., 100 or 500 iterations). + +Typical Setup in Practice: + + • The most commonly used criterion is log-likelihood convergence, combined with a maximum number of iterations as a safeguard. + • A typical tolerance value for the log-likelihood difference is $10^{-4}$ or $10^{-6}$, depending on the precision needed. + +In summary, the EM algorithm usually stops when the log-likelihood improvement between iterations falls below a small threshold or when the number of iterations exceeds a predefined limit. 
\ No newline at end of file diff --git a/ds701_book/09-GMM-EM.qmd b/ds701_book/09-GMM-EM.qmd new file mode 100644 index 00000000..82a934c5 --- /dev/null +++ b/ds701_book/09-GMM-EM.qmd @@ -0,0 +1,213 @@ +--- +title: "GMM EM Algorithm" +jupyter: python3 +--- + +## A GMM Example + +Imagine you’re running a coffee shop, and you have data on your customers’ preferences +for coffee. + +Each customer likes a different blend of beans, which you can represent +as a point in two dimensions: + +* sweetness (x-axis) and +* acidity (y-axis). + +Your goal is to identify three most popular blends (clusters) from a pile of +customer reviews that provide noisy measurements of these two characteristics. + +This data could be generated synthetically by sampling from three Gaussian +distributions, each representing a different coffee blend that your customers +might like. The task for the students would be to uncover these hidden coffee +blends using a GMM. + +For an intuitive way to explain Expectation-Maximization (EM): + +You can describe it as a two-step process that alternates between two roles: + +1. Expectation Step (E-step): The model takes a guess about the likelihood that each customer belongs to each blend. At this point, it might not be sure, so it assigns probabilities (soft assignments) based on how close the customers’ preferences are to the different blends. +2. Maximization Step (M-step): The model then updates its guess about the actual parameters of the coffee blends—essentially adjusting the mean, variance, and proportion of customers for each blend, based on the soft assignments from the previous step. + +The EM algorithm is like refining a recipe: each time you taste-test (E-step) and then tweak the ingredients (M-step), the blend becomes more representative of what customers want. + +Let’s continue with the Python code for generating synthetic data representing the coffee preferences of your customers. 
After generating the data, we’ll implement the EM algorithm step-by-step. + +## Step 1: Generate synthetic data + +```{python} +import numpy as np +import matplotlib.pyplot as plt + +# Set seed for reproducibility +np.random.seed(42) + +# Means and covariances for three Gaussian distributions (coffee blends) +means = np.array([[2, 3], [8, 7], [5, 10]]) # sweetness and acidity means +covariances = [np.array([[1, 0.5], [0.5, 1]]), # covariance matrix for blend 1 + np.array([[1, -0.3], [-0.3, 1]]), # covariance matrix for blend 2 + np.array([[1, 0], [0, 1]])] # covariance matrix for blend 3 + +# Number of points in each cluster (representing customers) +points_per_cluster = 100 + +# Generate points from each Gaussian distribution +X1 = np.random.multivariate_normal(means[0], covariances[0], points_per_cluster) +X2 = np.random.multivariate_normal(means[1], covariances[1], points_per_cluster) +X3 = np.random.multivariate_normal(means[2], covariances[2], points_per_cluster) + +# Combine all points into one dataset +X = np.vstack((X1, X2, X3)) +``` + + +## Plot the synthetic dataset + +```{python} +# Plot the synthetic dataset +plt.scatter(X[:, 0], X[:, 1], s=30, color='b', label="Customers' coffee preferences") +plt.title('Synthetic Coffee Preferences Dataset') +plt.xlabel('Sweetness') +plt.ylabel('Acidity') +plt.legend() +plt.show() +``` + +## Step 2: Implement the EM Algorithm + +Now that we have the data, we’ll implement the EM algorithm for a Gaussian Mixture Model. The algorithm involves two steps: + +1. Expectation (E-step): Estimate the probability that each data point belongs to each cluster based on current parameters (mean, covariance, and mixture weights). +2. Maximization (M-step): Update the parameters (means, covariances, and mixture weights) based on the probabilities from the E-step. 
from scipy.stats import multivariate_normal


def initialize_params(X, n_clusters, seed=42):
    """
    Build the starting parameters for EM.

    Means are `n_clusters` distinct rows of X picked at random, covariances
    are identity matrices and the mixture weights are uniform.

    Parameters
    ----------
    X : np.ndarray of shape (n_samples, n_features)
    n_clusters : int
    seed : int, default 42
        Seed of the private generator used to pick the initial means.

    Returns
    -------
    means : np.ndarray of shape (n_clusters, n_features)
    covariances : list of n_clusters arrays of shape (n_features, n_features)
    weights : np.ndarray of shape (n_clusters,)
    """
    n_samples, n_features = X.shape

    # A private RandomState gives the same draw that
    # `np.random.seed(42); np.random.choice(...)` gave, but without resetting
    # the global NumPy random state behind the caller's back.
    rng = np.random.RandomState(seed)
    means = X[rng.choice(n_samples, n_clusters, replace=False)]

    covariances = [np.eye(n_features) for _ in range(n_clusters)]
    weights = np.ones(n_clusters) / n_clusters

    return means, covariances, weights


def _weighted_densities(X, means, covariances, weights):
    """Return the (n_samples, n_clusters) array of weights[k] * N(X[i] | means[k], covariances[k])."""
    densities = np.zeros((X.shape[0], len(means)))
    for k in range(len(means)):
        densities[:, k] = weights[k] * multivariate_normal.pdf(X, means[k], covariances[k])
    return densities


def expectation_step(X, means, covariances, weights):
    """
    E-step: posterior probability that each point belongs to each cluster.

    Returns
    -------
    responsibilities : np.ndarray of shape (n_samples, n_clusters)
        Weighted component densities, normalised so that every row sums to 1.
        There is no guard for a row whose densities are all zero.
    """
    responsibilities = _weighted_densities(X, means, covariances, weights)
    responsibilities /= responsibilities.sum(axis=1, keepdims=True)
    return responsibilities


def maximization_step(X, responsibilities):
    """
    M-step: re-estimate means, covariances and weights from the responsibilities.

    Parameters
    ----------
    X : np.ndarray of shape (n_samples, n_features)
    responsibilities : np.ndarray of shape (n_samples, n_clusters)

    Returns
    -------
    means : np.ndarray of shape (n_clusters, n_features)
    covariances : list of n_clusters arrays of shape (n_features, n_features)
    weights : np.ndarray of shape (n_clusters,)
    """
    n_samples, n_clusters = responsibilities.shape
    n_features = X.shape[1]

    means = np.zeros((n_clusters, n_features))
    covariances = []
    weights = np.zeros(n_clusters)

    for k in range(n_clusters):
        resp_k = responsibilities[:, k]

        # Effective number of points assigned to cluster k
        Nk = resp_k.sum()

        # Responsibility-weighted mean
        means[k] = resp_k @ X / Nk

        # Responsibility-weighted scatter matrix:
        # sum_i r_ik * d_i d_i^T  ==  (r_k * D)^T @ D  with D = X - mean_k,
        # which replaces the per-sample Python loop by one matrix product.
        centered = X - means[k]
        covariances.append((resp_k[:, np.newaxis] * centered).T @ centered / Nk)

        # Mixture proportion
        weights[k] = Nk / n_samples

    return means, covariances, weights


def log_likelihood(X, means, covariances, weights):
    """Return sum_i log( sum_k weights[k] * N(X[i] | means[k], covariances[k]) )."""
    mixture_density = _weighted_densities(X, means, covariances, weights).sum(axis=1)
    return np.log(mixture_density).sum()


def em_algorithm(X, n_clusters, n_iters=100, tol=1e-4):
    """
    Fit a Gaussian mixture to X with Expectation-Maximization.

    Iterates E-step / M-step until the log-likelihood changes by less than
    `tol` between two consecutive iterations or `n_iters` iterations have run.

    Returns
    -------
    means, covariances, weights
        The fitted parameters (same layout as `maximization_step`).
    responsibilities : np.ndarray of shape (n_samples, n_clusters)
        Soft assignments evaluated at the returned parameters.
    log_likelihoods : list of float
        Log-likelihood after every completed iteration.
    """
    means, covariances, weights = initialize_params(X, n_clusters)

    log_likelihoods = []

    for _ in range(n_iters):
        # E-step
        responsibilities = expectation_step(X, means, covariances, weights)

        # M-step
        means, covariances, weights = maximization_step(X, responsibilities)

        log_likelihoods.append(log_likelihood(X, means, covariances, weights))

        # Check for convergence
        if len(log_likelihoods) > 1 and np.abs(log_likelihoods[-1] - log_likelihoods[-2]) < tol:
            break

    # The loop leaves `responsibilities` one M-step behind the parameters
    # (and undefined when n_iters == 0); recompute so the returned soft
    # assignments match the returned parameters.
    responsibilities = expectation_step(X, means, covariances, weights)

    return means, covariances, weights, responsibilities, log_likelihoods
Expectation Step: The algorithm calculates the soft assignments (responsibilities) for each point to each cluster. +3. Maximization Step: The algorithm updates the parameters (means, covariances, and weights) to maximize the likelihood given the responsibilities. +4. Convergence: The algorithm stops when the log-likelihood improvement is below a certain threshold. + +This code should provide a clear step-by-step implementation of the EM algorithm, and the final plot will show the clusters found by the algorithm. + +Let me know if you need further clarifications! \ No newline at end of file diff --git a/ds701_book/14ex-decision-tree-iris-dataset.qmd b/ds701_book/14ex-decision-tree-iris-dataset.qmd new file mode 100644 index 00000000..7fb55f76 --- /dev/null +++ b/ds701_book/14ex-decision-tree-iris-dataset.qmd @@ -0,0 +1,53 @@ +--- +title: "14.1 Decision Trees on the Iris Dataset" +--- + +## Iris Data Set Example + +Let's look at the classic Iris data set which consists of 150 samples representing 3 types of irises: + +1. Setosa, +2. Versicolor, and +3. Virginica + +The features for each sample are the petal and sepal length and width in cm. 
+ +``` {python} +from sklearn.datasets import load_iris +from sklearn import tree +iris = load_iris() +X, y = iris.data, iris.target +clf = tree.DecisionTreeClassifier() +clf = clf.fit(X, y) +tree.plot_tree(clf, + filled=True, + max_depth=1, + impurity=False, + class_names=iris.target_names, + feature_names=iris.feature_names) +``` + +``` {.python} +# Render a PDF file of the tree +import graphviz +dot_data = tree.export_graphviz(clf, out_file=None) +graph = graphviz.Source(dot_data) +graph.render("iris") +``` + +``` {.python} +# Render a PNG file of the tree +graph.render("iris", format="png") +``` + +``` {python} +import graphviz + +dot_data = tree.export_graphviz(clf, out_file=None, + feature_names=iris.feature_names, + class_names=iris.target_names, + filled=True, rounded=True, + special_characters=True) +graph = graphviz.Source(dot_data) +graph +``` \ No newline at end of file diff --git a/ds701_book/15-Curse-of-Dimensionality.qmd b/ds701_book/15-Curse-of-Dimensionality.qmd new file mode 100644 index 00000000..cf7923e7 --- /dev/null +++ b/ds701_book/15-Curse-of-Dimensionality.qmd @@ -0,0 +1,388 @@ +--- +jupyter: python3 +--- + +Open In Colab + +From [notebook 8.4](https://github.com/udlbook/udlbook/blob/main/Notebooks/Chap08/8_4_High_Dimensional_Spaces.ipynb) + +# **Notebook 8.4: High-dimensional spaces** + +This notebook investigates the strange properties of high-dimensional spaces as discussed in the notes at the end of chapter 8. + +Work through the cells below, running each cell in turn. In various places you will see the words "TO DO". Follow the instructions at these places and make predictions about what is going to happen or write code to complete the functions. + +Contact me at udlbookmail@gmail.com if you find any mistakes or have any suggestions. + +```{python} +import numpy as np +import matplotlib.pyplot as plt +import scipy.special as sci +``` + +# How close are points in high dimensions? 
def distance_ratio(x):
    """
    Ratio of the largest to the smallest pairwise Euclidean distance.

    Parameters
    ----------
    x : np.ndarray of shape (n_dim, n_data)
        One data point per column.

    Returns
    -------
    float
        largest_dist / smallest_dist over all pairs of distinct columns.
        Both extremes are also printed.

    Raises
    ------
    ValueError
        If x holds fewer than two data points.
    """
    if x.shape[1] < 2:
        raise ValueError("distance_ratio needs at least two data points (columns)")

    ### BEGIN SOLUTION
    # pdist expects one observation per row, hence the transpose. It returns
    # every unordered pair exactly once and never pairs a point with itself,
    # so no explicit i != j check is needed and no distance is computed twice.
    pairwise = distance.pdist(x.T, metric='euclidean')
    smallest_dist = pairwise.min()
    largest_dist = pairwise.max()
    ### END SOLUTION

    print(f"smallest_dist = {smallest_dist}, largest_dist = {largest_dist}")
    # Calculate the ratio and return
    dist_ratio = largest_dist / smallest_dist
    return dist_ratio
def volume_of_hypersphere(diameter, dimensions):
    """
    Volume of a hypersphere (n-ball) of the given diameter.

    Implements V_n(R) = pi^(n/2) / Gamma(n/2 + 1) * R^n with R = diameter / 2
    and n = dimensions (Problem 8.7 of the book; see also
    https://en.wikipedia.org/wiki/Volume_of_an_n-ball#Formulas).
    """
    pi = np.pi

    ### BEGIN SOLUTION
    half_dim = dimensions / 2
    r = diameter / 2
    numerator = pi ** half_dim
    denominator = sci.gamma(half_dim + 1)  # Euler's gamma function
    volume = numerator / denominator * (r ** dimensions)
    ### END SOLUTION

    return volume
def get_prop_of_volume_in_outer_1_percent(dimension):
    """
    Fraction of a hypersphere's volume that lies in its outer 1% shell.

    Compares a hypersphere of diameter 1.0 with the concentric one of
    diameter 0.99 (both via `volume_of_hypersphere`) and returns
    (outer - inner) / outer for the given number of dimensions.
    """
    ### BEGIN SOLUTION
    whole = volume_of_hypersphere(1.0, dimension)
    core = volume_of_hypersphere(0.99, dimension)
    shell_fraction = (whole - core) / whole
    ### END SOLUTION

    return shell_fraction
see that by the time we get to 300 dimensions most of the volume is in the outer 1 percent.

from scipy.spatial import distance


def distance_ratio(x, metric='euclidean'):
    """
    Ratio of the largest to the smallest pairwise distance between data points.

    Parameters
    ----------
    x : np.ndarray of shape (n_dim, n_data)
        One data point per column, the layout used by x_2D ... x_1000D.
    metric : {'euclidean', 'manhattan', 'cosine'}, default 'euclidean'

    Returns
    -------
    float
        largest_dist / smallest_dist over all pairs of distinct points.
        Both extremes are also printed.

    Raises
    ------
    ValueError
        If `metric` is not supported or x holds fewer than two data points.
    """
    # Map the names used in this notebook to scipy's pdist metric names.
    pdist_names = {
        'euclidean': 'euclidean',
        'manhattan': 'cityblock',
        'cosine': 'cosine',
    }
    if metric not in pdist_names:
        raise ValueError(f"Metric {metric} not supported")
    if x.shape[1] < 2:
        raise ValueError("distance_ratio needs at least two data points (columns)")

    # Data points are the COLUMNS of x, so transpose before handing them to
    # pdist (which expects one observation per row). pdist evaluates every
    # unordered pair exactly once, avoiding redundant calculations.
    pairwise = distance.pdist(x.T, metric=pdist_names[metric])
    smallest_dist = pairwise.min()
    largest_dist = pairwise.max()

    print(f"smallest_dist = {smallest_dist}, largest_dist = {largest_dist}")

    # Calculate the ratio and return
    dist_ratio = largest_dist / smallest_dist
    return dist_ratio
+ +```{python} +dist_ratio_2d = distance_ratio(x_2D) +print('Ratio of largest to smallest distance 2D: %3.3f'%(dist_ratio_2d)) + +dist_ratio_3d = distance_ratio(x_3D) +print('Ratio of largest to smallest distance 3D: %3.3f'%(dist_ratio_3d)) + +dist_ratio_100d = distance_ratio(x_100D) +print('Ratio of largest to smallest distance 100D: %3.3f'%(dist_ratio_100d)) + +dist_ratio_1000d = distance_ratio(x_1000D) +print('Ratio of largest to smallest distance 1000D: %3.3f'%(dist_ratio_1000d)) +``` + +--- + +```{python} +plt.scatter([2,3,100,1000], [dist_ratio_2d, dist_ratio_3d, dist_ratio_100d, dist_ratio_1000d]) +plt.plot([2,3,100,1000], [dist_ratio_2d, dist_ratio_3d, dist_ratio_100d, dist_ratio_1000d], '--', color='lightgray') +plt.xscale('log') +plt.title('Euclidean Distance ratio') +plt.xlabel('Dimension') +plt.ylabel('Distance ratio') +plt.show() +``` + + +--- +```{python} +euc_dist_ratios = [] +euc_dist_ratios.append(distance_ratio(x_2D)) +print('Ratio of largest to smallest distance 2D: %3.3f'%(euc_dist_ratios[0])) + +euc_dist_ratios.append(distance_ratio(x_3D)) +print('Ratio of largest to smallest distance 3D: %3.3f'%(euc_dist_ratios[1])) + +euc_dist_ratios.append(distance_ratio(x_100D)) +print('Ratio of largest to smallest distance 100D: %3.3f'%(euc_dist_ratios[2])) + +euc_dist_ratios.append(distance_ratio(x_1000D)) +print('Ratio of largest to smallest distance 1000D: %3.3f'%(euc_dist_ratios[3])) +``` + +```{python} +man_dist_ratios = [] +man_dist_ratios.append(distance_ratio(x_2D, metric='manhattan')) +print('Ratio of largest to smallest distance 2D: %3.3f'%(man_dist_ratios[0])) + +man_dist_ratios.append(distance_ratio(x_3D, metric='manhattan')) +print('Ratio of largest to smallest distance 3D: %3.3f'%(man_dist_ratios[1])) + +man_dist_ratios.append(distance_ratio(x_100D, metric='manhattan')) +print('Ratio of largest to smallest distance 100D: %3.3f'%(man_dist_ratios[2])) + +man_dist_ratios.append(distance_ratio(x_1000D, metric='manhattan')) +print('Ratio of 
largest to smallest distance 1000D: %3.3f'%(man_dist_ratios[3])) +``` + +```{python} +cos_dist_ratios = [] +cos_dist_ratios.append(distance_ratio(x_2D, metric='cosine')) +print('Ratio of largest to smallest distance 2D: %3.3f'%(cos_dist_ratios[0])) + +cos_dist_ratios.append(distance_ratio(x_3D, metric='cosine')) +print('Ratio of largest to smallest distance 3D: %3.3f'%(cos_dist_ratios[1])) + +cos_dist_ratios.append(distance_ratio(x_100D, metric='cosine')) +print('Ratio of largest to smallest distance 100D: %3.3f'%(cos_dist_ratios[2])) + +cos_dist_ratios.append(distance_ratio(x_1000D, metric='cosine')) +print('Ratio of largest to smallest distance 1000D: %3.3f'%(cos_dist_ratios[3])) +``` +--- + +```{python} +import matplotlib.pyplot as plt + +plt.scatter([2,3,100,1000], euc_dist_ratios, label='Euclidean') +plt.plot([2,3,100,1000], euc_dist_ratios, '--', color='lightgray') + +plt.scatter([2,3,100,1000], man_dist_ratios, label='Manhattan', color='orange') +plt.plot([2,3,100,1000], man_dist_ratios, '--', color='orange') + +plt.xscale('log') +plt.legend() +plt.title('Distance ratio') +plt.xlabel('Dimension') +plt.ylabel('Distance ratio') +plt.show() +``` diff --git a/ds701_book/20-RecSys-GPTo1.qmd b/ds701_book/20-RecSys-GPTo1.qmd new file mode 100644 index 00000000..bb193d1f --- /dev/null +++ b/ds701_book/20-RecSys-GPTo1.qmd @@ -0,0 +1,290 @@ +--- +title: "20-Recommender-Systems" +--- + +# Introduction to Recommender Systems + +## Motivating Examples + +• Netflix: Suggesting new movies or TV shows based on user’s watch history and ratings. +• Amazon: Recommending products based on user’s browsing and purchase history. +• YouTube: Suggesting videos a user might like based on their viewing history. +• Spotify: Creating playlists and recommending songs based on user’s music preferences. + +Discussion Prompt: Think about the recommender systems you encounter daily (e.g., social media feeds, shopping recommendations). How do they influence your decision-making? 
+ +## Why Recommender Systems Matter + +• Information Overload: With vast amounts of content (products, movies, articles), recommender systems help users find what is relevant. +• Personalization: Tailor user experiences to individual preferences. +• Business Value: Increase engagement and sales. For instance, a large percentage of Netflix’s watched content is driven by recommendations. + +Question for Class: Can you name some other domains where recommender systems might be useful besides entertainment and shopping? + +## Goals of This Lecture + +1. Understand the basic types of recommender systems. +2. Learn about content-based filtering and collaborative filtering. +3. Explore example algorithms like Matrix Factorization and SVD. +4. Work through an example dataset. +5. Discuss advanced and hybrid approaches. +6. Encourage critical thinking on how to evaluate and improve recommenders. + +# Basic Approaches to Recommender Systems + +## Content-Based Filtering: Introduction + +• Definition: Recommend items similar to those a user has liked in the past. +• How It Works: Uses item features and user profiles. If a user liked a particular item, the system finds items with similar features. +• Example: If a user likes sci-fi movies, recommend other sci-fi movies. + +Example Dataset: Imagine a dataset with movies and their genres. If a user likes “Inception” (Sci-Fi/Thriller), the system might recommend “The Matrix” (Sci-Fi) based on overlapping genres. + +## Content-Based Filtering: Steps + +1. Profile each item using features (genres, actors, author, etc.). +2. Profile the user based on items they’ve liked. +3. Recommend items whose features match the user’s profile. + +Example: +• User’s watched and liked movies: “Inception” (Sci-Fi, Thriller), “Interstellar” (Sci-Fi, Drama). +• Derived user profile: Genre preference - Sci-Fi, Thriller. +• Recommend items with similar genres like “Matrix” (Sci-Fi) or “Shutter Island” (Thriller). 
+ +Discussion Prompt: What are some limitations of content-based filtering? + +## Limitations of Content-Based Filtering + +• Feature Engineering: Requires carefully identified and extracted features. +• Lack of Diversity: Tends to recommend items similar to those already liked, possibly leading to a “filter bubble.” +• Cold Start Problem (Item): For new items with no features or feedback, it’s challenging to make recommendations. + +# Collaborative Filtering: Introduction + +## What is Collaborative Filtering? + +• Definition: Recommending items based on the past behavior of users who are similar to the target user. +• Key Idea: Similar users have similar preferences. If user A and user B have a high similarity in terms of preferences, items liked by user B might be recommended to user A. + +Example: In a movie recommendation scenario: + +• User A likes “Interstellar” and “Inception.” +• User B likes “Interstellar” and “The Martian.” +• Since both users like “Interstellar,” the system might recommend “Inception” to user B and “The Martian” to user A. + +## Collaborative Filtering Approaches + +1. User-Based Collaborative Filtering: + * Finds users similar to the target user and recommends items liked by those similar users. +2. Item-Based Collaborative Filtering: + * Finds items that are similar to the items the user likes, then recommends those similar items. + +Discussion Prompt: Which approach might work better in a situation with millions of users and thousands of items? + +## User-Based Collaborative Filtering + +* How It Works: + 1. Calculate similarity between users based on ratings or behavior. + 2. Select top N similar users. + 3. Aggregate items from those similar users and recommend them. + +Example: + +* Suppose we have user ratings for movies. For a target user who has rated some movies, we find other users with similar rating patterns. Items highly rated by these similar users are recommended. 
+ +Class Exercise: Using a small movie ratings matrix, try to identify which users are similar and which movies would be recommended. + +## Item-Based Collaborative Filtering + +* How It Works: + 1. Calculate similarity between items based on user ratings. + 2. For an item the user has shown interest in, find similar items. + 3. Recommend items that are similar to the ones the user likes. + +Example: +* If a user has rated “Inception” highly, the system looks at other items that have been rated similarly by other users who liked “Inception.” + +Discussion Prompt: What might be the advantages or disadvantages of item-based vs. user-based collaborative filtering? + +## Similarity Measures + +* Common similarity measures used in collaborative filtering: + * Cosine Similarity + * Pearson Correlation + * Jaccard Similarity (for implicit feedback) + +Example: +* If we have vectors of user ratings for two items, we can compute the cosine similarity. This gives us how similar the items are in terms of user ratings patterns. + +Class Exercise: Given two rating vectors [5, 4, 0, 1] and [4, 5, 0, 2], compute the cosine similarity. + +# Advanced Collaborative Filtering Techniques + +## Matrix Factorization: Introduction + +* Definition: A technique that decomposes the user-item interaction matrix into lower-dimensional latent factors. +* Motivation: Exploit underlying structure in user-item interactions. +* Famous Example: Netflix Prize used matrix factorization methods for recommendation. + +## Matrix Factorization: How It Works + +* The user-item matrix R (where R_{u,i} is the rating of user u for item i) is approximated by two latent factor matrices P and Q: +* R ≈ P * Q^T +* P is a |U| x k matrix, representing k latent factors for each user. +* Q is a |I| x k matrix, representing k latent factors for each item. +* The ratings are predicted by the dot product of these latent factor vectors. 
+ +Example: + +* If we have 1000 users and 500 movies, we might factor it into 20 latent factors. Each user and movie is represented in this 20-dimensional space, capturing aspects like genre preferences. + +## Singular Value Decomposition (SVD) + +* Definition: A factorization of a matrix into three matrices: R = U * Σ * V^T. +* In the context of recommender systems, SVD can be used to identify latent factors capturing underlying preferences. +* Truncated SVD: For recommendation, we often use a truncated version to reduce complexity. + +Class Exercise: + +* If the user-item matrix is: + +| User\Item | Item1 | Item2 | Item3 | +| ------- | ------- | ------- | ------- | +| User1     | 5     | 0     | 4     | +| User2     | 4     | 0     | 5     | +| User3     | 1     | 5     | 0     | + +Compute the truncated SVD for this small matrix. + +## Implementing Matrix Factorization + +* Optimization Problem: Minimize the squared error between actual ratings and predicted ratings. + +$$ +\min_{P, Q} \sum_{(u,i) \in R} (R_{u,i} - P_u^T Q_i)^2 + \text{regularization terms} +$$ + +* Stochastic Gradient Descent (SGD): Commonly used method to learn P and Q. +* Regularization: Important to avoid overfitting. + +Discussion Prompt: Why might matrix factorization perform better than user-based or item-based collaborative filtering in some cases? + +# Practical Example with a Dataset + +## Example Dataset: MovieLens + +* MovieLens: A commonly used dataset for building and testing recommender systems. +* Dataset Details: +    * Users, Movies, and Ratings from 1 to 5. +    * Also includes movie metadata like genres. + +Plan: +* We’ll explore how to apply both content-based and collaborative filtering on this dataset. + +Class Discussion: If you were to recommend a movie to a user who only watched “Star Wars”, how would you proceed using each method? + +## Applying Content-Based Filtering to MovieLens + +* Step 1: Represent movies by their genres (and possibly actors, directors). 
+* Step 2: If a user liked a set of movies, compute the average feature vector of those movies. +* Step 3: Recommend movies whose feature vectors are most similar to the user’s profile vector. + +Example: +* If the user liked “The Empire Strikes Back” (Sci-Fi, Adventure), we recommend other Sci-Fi, Adventure movies like “Interstellar” or “Indiana Jones”. + +## Applying Collaborative Filtering to MovieLens + +* User-Based Approach: + 1. Find users similar to the target user based on ratings. + 2. Recommend movies highly rated by these similar users. +* Item-Based Approach: +1. Find movies similar to the movies the user has rated highly. +2. Recommend these similar movies. + +Example: +* If the user rated “Star Wars” and “The Matrix” highly, the system finds other movies that are commonly liked by users who liked these two movies, such as “Inception” or “Lord of the Rings.” + +Class Exercise: Use the user-item matrix from MovieLens to predict a missing rating using item-based collaborative filtering. + +## Evaluating Recommender Systems + +* Accuracy Metrics: + * Mean Absolute Error (MAE) + * Root Mean Square Error (RMSE) +* Rank-based Metrics: + * Precision@k + * Recall@k +* Mean Average Precision (MAP) +* Normalized Discounted Cumulative Gain (nDCG) + +Discussion Prompt: Besides accuracy, what other factors might be important when evaluating recommender systems? + +## Practical Concerns + +* Scalability: Large datasets with millions of users and items require efficient algorithms. +* Sparsity: Often user-item interaction matrices are very sparse. +* Cold Start: Difficulty making recommendations for new users (User Cold Start) or new items (Item Cold Start) with no history. + +Class Discussion: How can we tackle the cold start problem? + +# Advanced and Hybrid Approaches + +## Hybrid Recommender Systems + +* Definition: Combine content-based and collaborative filtering methods. +* Motivation: Overcome limitations of each approach. 
+* Content-based can handle new items (solves cold start for items). +* Collaborative filtering can learn more nuanced user preferences. + +Example: +* A hybrid approach may start by recommending items similar to those the user liked before (content-based), and refine recommendations by observing user feedback (collaborative filtering). + +Discussion Prompt: Can you think of ways to combine the two methods effectively? + +## Neural Network-based Approaches + +* Autoencoders: Used to learn latent representations of user-item interactions. +* Neural Collaborative Filtering: Deep neural networks to model complex user-item interactions. +* Graph Neural Networks: Model relationships in user-item graphs. + +Example: +* Using an autoencoder to predict missing ratings from known ratings by compressing user preferences into a latent space. + +Class Discussion: What might be the advantages of using neural networks over traditional matrix factorization? + +## Graph-based Approaches + +* Social Recommender Systems: Use relationships in a social graph. +* Trust-based Recommendations: If user A trusts user B, we can use user B’s preferences to inform recommendations for user A. + +Example: +* If your friend likes a certain product, the system might be more likely to recommend it to you in a social shopping platform. + +Discussion Prompt: How can network or graph analyses be integrated into recommender systems? + +# Summary and Wrap-up + +## Key Takeaways + +* Content-Based Filtering: Recommends items similar to those the user liked before. +* Collaborative Filtering: Uses preferences of similar users to make recommendations. +* Matrix Factorization: Finds latent factors in user-item interactions. +* Hybrid Approaches: Combine multiple methods to overcome individual limitations. + +## Final Thoughts + +* Recommender systems are crucial in helping users navigate large volumes of information and product choices. 
+* The choice of method depends on data availability, domain, and specific objectives (e.g., accuracy, diversity, or serendipity). +* Ongoing research in deep learning and graph-based methods is pushing the field forward. + +Question for Class: Reflect on the methods covered. Which method (or combination of methods) do you think would be most effective for a streaming music service and why? + +## Further Reading + +* Books: + * “Recommender Systems: The Textbook” by Charu C. Aggarwal. +* Papers: + * “Matrix Factorization Techniques for Recommender Systems” by Yehuda Koren, Robert Bell, and Chris Volinsky. + * “Deep Neural Networks for YouTube Recommendations” by Paul Covington, Jay Adams, Emre Sargin. + +Class Discussion: Any questions about the topics covered today or how these methods can be applied to real-world datasets? diff --git a/ds701_book/20-Recommender-Systems-notes.md b/ds701_book/20-Recommender-Systems-notes.md new file mode 100644 index 00000000..5f4b5c82 --- /dev/null +++ b/ds701_book/20-Recommender-Systems-notes.md @@ -0,0 +1,7 @@ +# Notes on Recommender Systems + +https://engineering.roku.com/personalised-retrieval-for-typed-text-queries + +Also look at Spotify playlist... + + diff --git a/ds701_book/20a-RecSys-with-Deep-Learning.qmd b/ds701_book/20a-RecSys-with-Deep-Learning.qmd new file mode 100644 index 00000000..3a9397e0 --- /dev/null +++ b/ds701_book/20a-RecSys-with-Deep-Learning.qmd @@ -0,0 +1,351 @@ +--- +title: Recommendation Systems Part II -- Deep Learning Based +bibliography: references.bib +jupyter: python3 +nocite: | + @ricci2022recommender +--- + +# Deep Learning for Recommender Systems + +Based on [@zhang2022deep]. + +## Introduction +- **Deep Learning in Recommender Systems**: + - Revolutionized AI applications across fields like computer vision and NLP. + - Reduces feature engineering effort and supports diverse data (e.g., text, images). + - Enhances tasks such as cold-start problems, temporal dynamics, and explainability. 
+ +## Key Techniques +1. **Multi-layer Perceptrons (MLPs)**: + - Flexible, hierarchical networks for feature interaction. + - Universal approximators (@fig-dl-recsys-mlp). +2. **Convolutional Neural Networks (CNNs)**: + - Efficiently capture spatial patterns in grid-like data (@fig-dl-recsys-cnn). +3. **Recurrent Neural Networks (RNNs)**: + - Models sequential data with memory states (@fig-dl-recsys-rnn). +4. **Graph Neural Networks (GNNs)**: + - Handles graph-structured data like social and knowledge graphs. +5. **Autoencoders and GANs**: + - For representation learning and data generation. + +![Multi-layer Perceptrons](figs/RecSys-figs/dl-recsys-mlp.png){width=30% fig-align="center" #fig-dl-recsys-mlp} + +![Convolutional Neural Networks](figs/RecSys-figs/dl-recsys-cnn.png){width=30% fig-align="center" #fig-dl-recsys-cnn} + +![Recurrent Neural Networks](figs/RecSys-figs/dl-recsys-rnn.png){width=30% fig-align="center" #fig-dl-recsys-rnn} + +--- + +## Challenges in Recommender Systems +### Interaction Modeling +- Captures relationships in sparse user-item matrices. +- Approaches: + - **NeuMF**: Replaces dot product with MLPs. + - **Outer Product + CNNs**: Higher-order correlations (Figure 6). + +--- + +### User Modeling +1. **Temporal Dynamics**: + - Sequence-aware recommendations using RNNs, CNNs, and attention mechanisms (Figure 8). +2. **Diverse Interests**: + - Models multiple user preferences via clustering and disentanglement. + +--- + +## Content Representation Learning +1. **Text Features**: + - Leverages reviews and descriptions using CNNs, RNNs, and attention mechanisms. +2. **Image Features**: + - Integrates CNN-extracted visual data for applications like fashion and social media. +3. **Video/Audio Features**: + - Processes multimedia content for personalized music and video recommendations. + +--- + +## Advanced Applications +### Graph-Structured Data +- Incorporates graphs like user-item networks for collaborative filtering (Figure 10). 
+- Examples: + - **Session-based Recommendations**: Sequence learning from click patterns. + - **Knowledge Graphs**: Path-based reasoning for explainability. + +### Cold-Start Recommendations +- Tackles sparse data with side information and meta-learning. +- GNN-based approaches predict embeddings for new users/items. + +--- + +## Beyond Accuracy +### Explainability +- Enhances transparency and trust: + - Attention mechanisms highlight critical features. + - Knowledge graph paths provide reasoning for recommendations. + +### Robustness +- Defends against adversarial attacks with perturbation-based training. + +--- + +## Applications of Deep Learning in Recommendation +1. **E-commerce**: + - Amazon, eBay, Alibaba use deep learning for personalized shopping. +2. **Entertainment**: + - YouTube employs candidate generation and ranking modules. +3. **News**: + - Self-attention mechanisms capture user preferences across multiple views. +4. **Point-of-Interest**: + - Combines visual and textual data for location-based recommendations. + +--- + +## Conclusion +- **Deep learning transforms recommender systems**: + - Combines memorization and generalization. + - Supports diverse data types and complex challenges. +- Future directions include improving scalability and real-time adaptability. + +This presentation references key figures and tables from the document and is paced for a 15-minute delivery. Let me know if further adjustments are needed! + + +# Wide and Deep Learning for Recommender Systems + +Based on [@cheng2016wide]. + +## Introduction +- **Problem**: Balancing memorization and generalization in recommender systems. + - **Memorization**: Learns frequent co-occurrences of features for relevant recommendations. + - **Generalization**: Predicts unseen feature combinations for diverse recommendations. +- **Wide & Deep Learning Framework**: + - Combines linear models (memorization) and neural networks (generalization). 
+ - Evaluated on Google Play, with over 1 billion users and 1 million apps. +- **Key Results**: + - Significant improvements in app acquisitions. + - Open-sourced implementation in TensorFlow. + +--- + +## Recommender System Overview +- Workflow (Figure 2): + - **Query**: User and contextual features (e.g., demographics, app usage). + - **Retrieval**: Filters 100 items based on relevance. + - **Ranking**: Scores and ranks items using Wide & Deep Learning. +- Challenges: + - High throughput with low latency (e.g., scoring over 10 million apps per second). + +--- + +## Wide & Deep Learning Framework +### Wide Component +- Linear model (Figure 1, left): + - Cross-product transformations capture interactions (e.g., "gender=female" AND "language=en"). + - Effective for memorization but limited in generalization. + +### Deep Component +- Feed-forward neural network (Figure 1, right): + - Converts sparse categorical features into dense embeddings. + - Layers compute activations using ReLU functions. + - Learns complex, nonlinear feature interactions. + +--- + +### Joint Training +- Combines wide and deep components (Figure 1, center): + - Jointly optimized via a shared logistic loss function. + - Uses: + - **FTRL** optimizer for wide part. + - **AdaGrad** for deep part. +- Distinction: + - Joint training integrates components during training, unlike ensembles. + +--- + +## System Implementation +### Data Generation +- **Training Data**: + - Generated from user-app interactions. + - Labels: 1 for app installs, 0 otherwise. +- **Feature Engineering**: + - Maps categorical strings to integer IDs (vocabulary generation). + - Normalizes continuous features using quantile-based scaling. + +### Model Training +- Structure (Figure 4): + - Cross-product transformations for wide component. + - Dense embeddings (32 dimensions each) for deep component. + - Three ReLU layers process embeddings and continuous features. 
+- **Warm-Starting**: + - Retrains models incrementally using weights from previous models. + +--- + +### Model Serving +- Scoring: + - Scores candidate apps using forward inference on Wide & Deep models. +- Optimization: + - Multithreading reduces latency from 31 ms to 14 ms (Table 2). + +--- + +## Experiment Results +### App Acquisitions +- A/B Testing (Table 1): + - **Wide-only**: Baseline model. + - **Deep-only**: +2.9% acquisition rate. + - **Wide & Deep**: +3.9% acquisition rate over baseline. +- Insights: + - Joint training enables exploratory recommendations for new user responses. + +### Serving Performance +- At peak, servers score over 10 million apps/second. +- Optimized serving reduces latency significantly. + +--- + +## Related Work +- **Factorization Machines**: + - Generalize linear models but lack nonlinear interaction modeling. +- **Collaborative Deep Learning**: + - Combines deep learning with collaborative filtering. +- Wide & Deep innovates by integrating linear models with deep networks. + +--- + +## Conclusion +- **Key Contributions**: + - Combines memorization (wide) and generalization (deep) in a single model. + - Scalable and effective for massive datasets (e.g., Google Play). +- **Impact**: + - Open-source implementation facilitates adoption in diverse applications. + +This structure references critical figures (Figure 1, 2, 4) and tables (Table 1, 2) and is timed for a detailed explanation within 25 minutes. Let me know if adjustments are needed! + + +# Deep Learning Recommender Model + +Besides the Collaborative Filtering and Matrix Factorization models, another popular approach to building recommender systems is to use Deep Learning. + +We'll look at the Deep Learning Recommender Model (DLRM) proposed by Facebook in 2019 [@naumov2019deep]. + +## Introduction + +- **Key Features**: + - Embeddings for categorical data. + - Multi-layer perceptrons (MLPs) for dense data processing. 
+ - Combines statistical techniques like matrix factorization and factorization machines. + +## DLRM Architecture + + +:::: {.columns} +::: {.column width="50%"} + +- Components (@fig-dlrm-model): + 1. **Embeddings**: Dense representations for categorical data. + 2. **Bottom MLP**: Transforms dense continuous features. + 3. **Feature Interaction**: Dot-product of embeddings and dense features. + 4. **Top MLP**: Processes interactions and outputs probabilities. + +::: +::: {.column width="50%"} + +![DLRM Architecture](figs/RecSys-figs/dlrm-model.png){width=80% fig-align="center" #fig-dlrm-model} + +::: +:::: + +## Embeddings and Feature Interactions +1. **Embeddings**: + - Maps categorical inputs to latent factor space. + - Multi-hot vectors allow weighted combinations (Equation 2). + +```{python} +import torch +import torch.nn as nn + +# Example embedding matrix: 5 embeddings, each of dimension 3 +embedding_matrix = nn.EmbeddingBag(num_embeddings=5, embedding_dim=3, mode='mean') + +# Input: Indices into the embedding matrix +input_indices = torch.tensor([1, 2, 3, 4]) # Flat list of indices +offsets = torch.tensor([0, 2]) # Start new bag at position 0 and 2 in input_indices + +# Forward pass +output = embedding_matrix(input_indices, offsets) + +print("Embedding Matrix:\n", embedding_matrix.weight) +print("Output:\n", output) +``` + +## 2. **Feature Interaction**: + - Second-order interactions modeled via dot-products. + - Mimics Factorization Machines for efficiency (Equation 4). + + + +--- + +## Model Training and Parallelism +- **Training Challenges**: + - Large embeddings exceed single-device memory. + - Requires efficient parallelization of computations. +- **Parallelism Strategy**: + - **Model Parallelism**: Distributes embeddings across devices. + - **Data Parallelism**: Replicates MLPs for concurrent mini-batch processing. + - Butterfly shuffle for all-to-all communication (Figure 2). + +--- + +## Data Handling +1. 
**Random and Synthetic Data**: + - Facilitates system testing and preserves data privacy. + - Techniques for generating synthetic categorical data (Figure 3). +2. **Public Datasets**: + - Criteo AI Labs Ad Kaggle Dataset. + - Used for evaluating click-through rate (CTR) prediction models. + +--- + +## Experiments: Accuracy and Performance +1. **Accuracy**: + - Evaluated on Criteo dataset (Figure 5). + - Compared with Deep & Cross Network (DCN). + - DLRM shows superior training and validation accuracy with both SGD and Adagrad. +2. **Performance Profiling**: + - Tested on Big Basin AI platform (Figure 4). + - GPU significantly outperforms CPU, particularly in MLP computations (Figure 6). + +--- + +## Comparison with Prior Models +- DLRM vs Other Networks: + - Simplified interactions reduce dimensionality. + - Focuses on second-order interactions for computational efficiency. + - Outperforms alternatives like Wide & Deep, DeepFM, and xDeepFM. + +--- + +## Conclusion +- **Key Takeaways**: + - DLRM effectively combines embeddings, MLPs, and interaction layers for personalization tasks. + - Offers a scalable solution for large-scale recommendation systems. + - Open-source implementation fosters further research and system design. +- **Future Directions**: + - Optimization of communication primitives. + - Exploring higher-order interactions with minimal computational costs. + + +# Recap and References + + +## Recap + + +## References + +::: {#refs} +::: + + + diff --git a/ds701_book/21-ring-lattice-shortest-path-proof.qmd b/ds701_book/21-ring-lattice-shortest-path-proof.qmd new file mode 100644 index 00000000..c0f68527 --- /dev/null +++ b/ds701_book/21-ring-lattice-shortest-path-proof.qmd @@ -0,0 +1,39 @@ +From prompt to GPT-4o: + +Certainly! The simplification in step 5 arises from considering the distribution of distances in a ring lattice and how they contribute to the average shortest path length. + +### Detailed Explanation: + +1. 
**Understanding the Distance Distribution**: + - In a ring lattice, each node is connected to its nearest neighbors, and the distance between any two nodes $i$ and $j$ is given by: + +$$ +d(i, j) = \min(|i-j|, n - |i-j|) +$$ + + - This formula accounts for the circular nature of the lattice, where you can travel either clockwise or counterclockwise to reach another node. + +2. **Distance Contribution**: + + - For a node $i$, the distance to another node $j$ is determined by the smaller of the two possible paths (clockwise or counterclockwise). + - As you move further from node $i$, the distance initially increases linearly until you reach the halfway point around the circle, after which it starts decreasing symmetrically. + +3. **Summing Over All Pairs**: + + - The total distance for all pairs of nodes is calculated by summing the distances for each pair and then dividing by the number of pairs: + $$ + L = \frac{1}{n(n-1)} \sum_{i=0}^{n-1} \sum_{j=0, j \neq i}^{n-1} d(i, j) + $$ + - For each node $i$, the distances to other nodes range from 1 to $n/2$ (assuming $n$ is even for simplicity), and then symmetrically back to 1. + +4. **Approximation for Large $n$**: + + - For large $n$, the average distance from a node to all other nodes can be approximated by considering that, on average, you traverse about a quarter of the way around the circle to reach another node. + - This is because the average distance is roughly the midpoint of the maximum distance, which is $n/2$. Therefore, the average distance is approximately $n/4$. + + For skip connections of 2, then the average path length is $n/8$, or 1/2 of n/4. + +5. **Conclusion**: + - The approximation $L \approx n/4$ reflects the fact that in a ring lattice, the average path length grows linearly with the number of nodes $n$, but at a slower rate than the maximum possible distance, due to the circular structure and symmetry. 
+ +This simplification is particularly useful for large $n$ where the exact calculation becomes cumbersome, and the approximation provides a good estimate of the average shortest path length. \ No newline at end of file diff --git a/ds701_book/23-25-NNs-outline.md b/ds701_book/23-25-NNs-outline.md new file mode 100644 index 00000000..9ba1cc1d --- /dev/null +++ b/ds701_book/23-25-NNs-outline.md @@ -0,0 +1,29 @@ +# NN Outline + +## The "Unreasonable" Effectiveness of Deep Neural Networks + +UDL examples of all the types of models... + +> Note: should we add more generative AI examples? chatgpt? NotebookLM? Suno.AI? + +### Emergent behavior in Pre-Trained LLMs + +Can we expand on this phenomenon a little bit? + +Is GPT scaling laws article relevant? + + +### Theory Sometimes Follows Invention + +Still relevant? Think so.... + +## Loss Functions for Model Fitting + +Relates a bit to models previously discussed... + +discusses simple convex loss functions with global minimum vs non-convex +loss functions without many local minima + +## Gradient Descent Intuitively + + diff --git a/ds701_book/23-NN-I-Gradient-Descent-save.qmd b/ds701_book/23-NN-I-Gradient-Descent-save.qmd new file mode 100644 index 00000000..6f0da79c --- /dev/null +++ b/ds701_book/23-NN-I-Gradient-Descent-save.qmd @@ -0,0 +1,1017 @@ +--- +title: Neural Networks I -- Gradient Descent +jupyter: python3 +--- + +## Introduction + +[![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tools4ds/DS701-Course-Notes/blob/main/ds701_book/jupyter_notebooks/23-NN-I-Gradient-Descent.ipynb) + +```{python} +#| code-fold: true +import numpy as np +import scipy as sp +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns +import matplotlib as mp +import sklearn +import networkx as nx +from IPython.display import Image, HTML + +import laUtilities as ut + +%matplotlib inline +``` + + +::: {.callout-note} +The content builds upon + +* Andrej Karpathy's 
excellent [video](https://youtu.be/VMj-3S1tku0?si=9HKPIq36EnHektSm) on building _micrograd_ and +* _Understanding Deep Learning_ [book preprint](https://udlbook.github.io/udlbook/) by Simon Prince +* as well as many other sources cited below. +::: + + +## The "Unreasonable" Effectiveness of Deep Neural Networks + +Deep Neural Networks have been effective in many applications. + +![](figs/NN-figs/IntroModels.svg){width="75%"} + +![](figs/NN-figs/IntroModels2a.svg){width="75%"} + +[Understanding Deep Learning, Simon J.D. Prince, MIT Press, 2023](http://udlbook.com) + +## Emergent Behavior in Pre-Trained Large Language Models + +![Emergence](./figs/NN-figs/EmergentAbilitiesFig2.png) + +[Emergent Abilities of Large Language Models.](https://arxiv.org/abs/2206.07682) J. Wei et al., Oct. 26, 2022. + +## Theory Sometimes Follows Invention + +| Invention | Theory | +| --------- | ------ | +| Telescope (1608) | Optics (1650-1700) | +| Steam Engine (1695-1715) | Thermodynamics (1824...) | +| Electromagnetism (1820) | Electrodynamics (1821) | +| Sailboat (??) | Aerodynamics (1757), Hydrodynamics (1738) | +| Airplane (1885-1905) | Wing Theory (1907-1918) | +| Computer (1941-1945) | Computer Science (1950-1960) | +| Teletype (1906) | Information Theory (1948) | + +* But then when theory is developed it can more quickly improve invention +* The same can be said for Neural Networks. The theory to make them work is well understood. The theory of why they work is still developing. +* We'll balance theory and application + +--- +[The Power and Limits of Deep Learning](https://learning.acm.org/techtalks/powerandlimitsdl), Yann LeCun, March 2019. + +Underlying all these techniques is the idea of applying optimization techniques +to minimize some kind of "loss" function.
+ +## Loss Functions for Model Fitting + +Most of the machine learning we have studied this semester is based on the idea +that we have a model that is _parameterized_, and our goal is to find good +settings for the parameters. + +We have seen example after example of this problem. + +* In $k$-means, our goal was to find $k$ cluster centroids, so that the $k$-means + objective was minimized. +* In linear regression, our goal was to find a parameter vector $\beta$ so that the + sum of squared errors $\Vert \mathbf{y} - \hat{\mathbf{y}}\Vert_2^2$ was minimized. +* In the support vector machine, our goal was to find a parameter vector $\theta$ + so that classification error was minimized. + +And similarly we'll want to find good parameter settings in neural networks. + +It's time now to talk about how, in general, one can find "good settings" for the +parameters in problems like these. + +What allows us to unify our approach to many such problems is the following: + +First, we start by defining an error function, generally called a __loss__ +function, to describe how well our method is doing. + +And second, we choose loss functions that are __differentiable__ with respect to +the parameters. + +These two requirements mean that we can think of the parameter tuning problem +using surfaces like these: + +![](figs/L23-convex_cost_function.jpeg){width="75%"} + +Imagine that the $x$ and $y$ axes in these pictures represent parameter settings. +That is, we have two parameters to set, corresponding to the values of $x$ and $y$. + +For each $(x, y)$ setting, the $z$-axis shows the value of the loss function. + +What we want to do is find the minimum of a surface, corresponding to the +parameter settings that minimize loss. + +Notice the difference between the two kinds of surfaces. + +The surface on the left corresponds to a __strictly convex__ loss function. +If we find a local minimum of this function, it is a global minimum.
+ +The surface on the right corresponds to a __non-convex__ loss function. +There are local minima that are not globally minimal. + +Both kinds of loss functions arise in machine learning. + +For example, convex loss functions arise in + +* Linear regression +* Logistic regression + +While non-convex loss functions arise in + +* $k$-means +* Gaussian Mixture Modeling +* and of course neural networks + +## Gradient Descent Intuitively + +The intuition of gradient descent is the following. + +Imagine you are lost in the mountains, and it is foggy out. You want to find a +valley. But since it is foggy, you can only see the local area around you. + +![](figs/L23-fog-in-the-mountains.jpeg){width="75%"} + + + +The natural thing to do is: + +1. Look around you 360 degrees. +2. Observe in which direction the ground is sloping downward most steeply. +3. Take a few steps in that direction. +4. Repeat the process ... until the ground seems to be level. + +The key to this intuitive idea is formalizing the idea of "direction of steepest +descent." + +This is where the differentiability of the loss function comes into play. + +As long as the loss function is _locally_ differentiable, we can define the +direction of steepest descent (really, ascent). + +That direction is called the __gradient.__ + +## Derivatives on Single Variable Functions + +```{python} +#| code-fold: true +import math +import numpy as np +import matplotlib.pyplot as plt +import ipywidgets as widgets +%matplotlib inline +``` + +We'll build up to concept of gradient by starting with derivatives on single +variable functions. + +Let's start with a simple quadratic function. + +$$ +f(x) = 3x^2 - 4x +5 +$$ + +Which we can write in python as well. + +```{python} +#| code-fold: true +def f(x): + return 3*x**2 - 4*x + 5 +``` + +And we can plot it. 
+ +```{python} +#| code-fold: true +import numpy as np + +xs = np.arange(-5, 5, 0.25) +ys = f(xs) +plt.plot(xs, ys); +``` + +Let's assume for a minute that this is our loss function that we are minimizing. + + +__Question__ + +What do we know about where the minimum is in terms of the slope of the curve? + +__Answer__ + +It is necessary but _not sufficient_ that the slope be zero. + +__Question__ + +How do we calculate the slope? + +We take the derivative, denoted + +$$ +\frac{d f(x)}{dx} \hspace{10pt} \textrm{Leibniz' notation} +$$ + +or + +$$ +f'(x) \hspace{10pt} \textrm{Lagrange's notation} +$$ + +You may see both notations. The nice thing about Leibniz' notation is that it is +easy to express _partial derivatives_ when we get to multivariate differentiation, +which we'll get to shortly. + +We can take the derivative of $f(x)$ + +$$ +f(x) = 3x^2 - 4x +5 +$$ + +By definition of the [derivative](https://en.wikipedia.org/wiki/Derivative), the +function $f(x)$ is differentiable at $x$ if + +$$ +\lim_{h\to 0} \frac{f(x+h)-f(x)}{h} +$$ + +exists at $x$. When it does, the value of that limit is the derivative of $f$ at $x$.
+ +```{python} +#| code-fold: true +import matplotlib.pyplot as plt +import numpy as np +import ipywidgets as widgets +from IPython.display import display + +# Define the function f(x) +def f(x): + return 3 * x ** 2 - 4 * x + 5 + +# Define the derivative f'(x) +def df(x): + return 6 * x - 4 + +# Function to plot f(x) and its tangent line at x = x_value +def plot_with_tangents(x_value, h_value): + # Generate x values for the function + x = np.linspace(-5, 5, 400) + y = f(x) + + #h_value = 10**(-e_value) + + # Compute the slope and function value at x = x_value + slope_at_x_value = df(x_value) + limit_at_x_value = (f(x_value + h_value) - f(x_value)) / h_value + f_at_x_value = f(x_value) + f_at_x_plus_h_value = f(x_value + h_value) + + # Generate x and y values for the tangent line near x = x_value + x_tangent = np.linspace(x_value - 2, x_value + 2, 400) + y_tangent = f_at_x_value + slope_at_x_value * (x_tangent - x_value) + y_limit_tangent = f_at_x_value + limit_at_x_value * (x_tangent - x_value) + + # Create the plot + plt.figure(figsize=(10, 6)) + plt.plot(x, y, label='f(x) = 3x^2 - 4x + 5') + plt.plot(x_tangent, y_tangent, linestyle='--', label=f'Asymptotic slope of {df(x_value):.2f} at x = {x_value:.2f}') + plt.plot(x_tangent, y_limit_tangent, linestyle='-.', label=f'Asymptotic limit of {limit_at_x_value:.2f} at x = {x_value:.2f}, h = {h_value:.3f}') + plt.scatter([x_value], [f_at_x_value], color='red') # point of tangency + plt.scatter([x_value+h_value], [f_at_x_plus_h_value], color='red') # point of tangency + plt.title('Plot of the function f(x) = 3x^2 - 4x + 5') + plt.xlabel('x') + plt.ylabel('f(x)') + plt.grid(True) + plt.legend() + plt.show() + +# Create an interactive widget +widgets.interact(plot_with_tangents, x_value=widgets.FloatSlider(value=-2, min=-5, max=5, step=0.1), h_value=widgets.FloatSlider(value=1, min=.001, max=2, step=.001)); +#widgets.interact(plot_with_tangents, h_value=widgets.FloatSlider(value=1, min=1, max=10, step=1)); +``` + +We use 
the rules of derivatives. See for example the derivative +[rules for basic functions](https://en.wikipedia.org/wiki/Derivative#Rules_for_basic_functions), +e.g. + +$$ +\frac{d}{dx} x^a = ax^{a-1}, + \quad \textrm{e.g.} \quad \frac{d}{dx} 3x^2 = 6x + \quad \textrm{,} \quad \frac{d}{dx} 6x = 6 + \quad \textrm{,} \quad \frac{d}{dx} 6 = 0 +$$ + +so + +$$ +\frac{d f(x)}{dx} = 6x - 4 +$$ + +```{python} +#| code-fold: true +# define the derivative of f as df +def df(x): + return 6*x - 4 +``` + +We can solve for where $\frac{d}{dx} f(x) = 0$ + +$$ +6x - 4 = 0 +$$ + +```{python} +#| code-fold: true +# Evaluate df and f for x where df = 0 +x_zero = 2/3 + +# Evaluate df +df(x_zero) +``` + +```{python} +#| code-fold: true +# And f at that value is +f(x_zero) +``` + +Which we can add to the plot of $f(x)$ to see if it indeed is at the minimum. + +```{python} +#| code-fold: true +xs = np.arange(-5, 5, 0.25) +ys = f(xs) +plt.plot(xs, ys) + +# Mark the stationary point (x_zero, f(x_zero)), where df = 0 +plt.plot([x_zero], [f(x_zero)], 'o') + +# Show the plot +plt.show() +``` + +Now as Wikipedia [states](https://en.wikipedia.org/wiki/Derivative), + +> The derivative of a function of a single variable at a chosen input value, when +it exists, is the slope of the tangent line to the graph of the function at that point. + +## Slope of a Function + +We can explore the tangent at different x-values.
+ +```{python} +#| code-fold: true +import matplotlib.pyplot as plt +import numpy as np +import ipywidgets as widgets +from IPython.display import display + +# Define the function f(x) +def f(x): + return 3 * x ** 2 - 4 * x + 5 + +# Define the derivative f'(x) +def df(x): + return 6 * x - 4 + +# Function to plot f(x) and its tangent line at x = x_value +def plot_with_tangent(x_value): + # Generate x values for the function + x = np.linspace(-5, 5, 400) + y = f(x) + + # Compute the slope and function value at x = x_value + slope_at_x_value = df(x_value) + f_at_x_value = f(x_value) + + # Generate x and y values for the tangent line near x = x_value + x_tangent = np.linspace(x_value - 2, x_value + 2, 400) + y_tangent = f_at_x_value + slope_at_x_value * (x_tangent - x_value) + + # Create the plot + plt.figure(figsize=(10, 6)) + plt.plot(x, y, label='$f(x) = 3x^2 - 4x + 5$') + plt.plot(x_tangent, y_tangent, linestyle='--', label=f'Asymptotic slope of {df(x_value):.2f} at x = {x_value:.2f}') + plt.scatter([x_value], [f_at_x_value], color='red') # point of tangency + plt.title('Plot of the function $f(x) = 3x^2 - 4x + 5$') + plt.xlabel('$x$') + plt.ylabel('$f(x)$') + plt.grid(True) + plt.legend() + plt.show() + +# Create an interactive widget +widgets.interact(plot_with_tangent, x_value=widgets.FloatSlider(value=-2, min=-5, max=5, step=0.1)); +``` + +## Slope Shows Influence of $x$ on $f$ + +__Important Note:__ + +* if the slope is negative, then by increasing $x$, we will decrease $f(x)$. +* And if the slope is positive, then decreasing $x$ will decrease $f(x)$. + +## Interpretation of Slope + +Let's illustrate with this function $f(x)$ a useful way to interpret the slope. + +In the graph above, with $x=-2$, we see the slope, call it $m$, is -16. What that +means is that when we change the value of $x$, the impact on the output will +roughly be _amplified_ by $m$, or -16 when $x=-2$.
+ +Put another way, the slope (equivalently the derivative) of a function $f(x)$ at +an input $x$ indicates how sensitive the output is to changes in the input. + +> This will be key to understanding how we have to tweak the weights of our model +> to minimize our loss function. + +## Gradient Descent on a Linear Regression Model + +Now, in two or more dimensions there are many directions that will descend, +but we want to pick the direction of steepest descent. We'll formalize that idea. + +As long as the loss function is _locally_ differentiable, we can define the +direction of steepest descent. + +That direction is given by the _negative_ of the __gradient.__ + +The gradient is a generalization of the slope of a line. + +Let's say we have a loss function $\mathcal{L}(\mathbf{w})$. + +The components of $\mathbf{w}\in\mathbb{R}^n$ are the parameters we want to optimize. + +Just a reminder that $\mathbf{w} \in \mathbb{R}^n$ denotes an $n$-dimensional vector. + +For linear regression, the loss function could be squared loss: + +$$ +\mathcal{L}(\mathbf{w}) = \Vert\mathbf{y} - \hat{\mathbf{y}}\Vert^2 +$$ + +where $\hat{\mathbf{y}}$ is our estimate, i.e., $\hat{\mathbf{y}} = X\mathbf{w}$ so that + +$$ +\mathcal{L}(\mathbf{w}) = \Vert\mathbf{y} - X\mathbf{w}\Vert^2 +$$ + +To find the gradient, we take the partial derivative of our loss function with respect to each parameter: + +$$ +\frac{\partial \mathcal{L}}{\partial w_i} +$$ + +and collect all the partial derivatives into a vector of the same shape as $\mathbf{w}$: + +$$ +\nabla_\mathbf{w}\mathcal{L} = \begin{bmatrix} + \frac{\partial \mathcal{L}}{\partial w_1}\\ + \frac{\partial \mathcal{L}}{\partial w_2}\\ + \vdots \\ + \frac{\partial \mathcal{L}}{\partial w_n} + \end{bmatrix} +$$ + +When you see the notation $\nabla_\mathbf{w}\mathcal{L},$ think of it as the +derivative with respect to the vector $\mathbf{w}$. + +The _nabla_ symbol, $\nabla$, denotes the _vector differential operator_ called _del_.
+ +It turns out that if we are going to take a small step of unit length, then the +gradient is the direction that maximizes the change in the loss function. + + + +![](figs/L23-gradient-of-convex.png){width="60%"} + +As you can see from the above figure, in general the gradient varies depending on +where you are in the parameter space. + +So we write: + +$$ +\nabla_\mathbf{w}\mathcal{L}(\mathbf{w}) = \begin{bmatrix} + \frac{\partial \mathcal{L}}{\partial w_1}(\mathbf{w})\\ + \frac{\partial \mathcal{L}}{\partial w_2}(\mathbf{w})\\ + \vdots \\ + \frac{\partial \mathcal{L}}{\partial w_n}(\mathbf{w}) + \end{bmatrix} +$$ + +Each time we seek to improve our parameter estimates $\mathbf{w}$, we will take +a step in the negative direction of the gradient. + +... "negative direction" because the gradient specifies the direction of maximum +increase -- and we want to decrease the loss function. + +How big a step should we take? + +For step size, will use a scalar value, here denoted by the greek letter "eta", +$\eta$, which we call the __learning rate.__ + +The learning rate is a hyperparameter that needs to be tuned for a given problem, +or even can be modified adaptively as the algorithm progresses as we will see later. + +Now we can write the __gradient descent__ algorithm formally: + +1. Start with an initial parameter estimate $\mathbf{w}^0$. +2. Update: $\mathbf{w}^{n+1} = \mathbf{w}^n - \eta \nabla_\mathbf{w}\mathcal{L}(\mathbf{w}^n)$ +3. If not converged, go to step 2. + +How do we know if we are "converged"? 
+ +Typically we stop + +* after a certain number of iterations, or +* the loss has not improved by a fixed amount -- _early stopping_ + +## Example: Linear Regression + +```{python} +#| code-fold: true +import numpy as np +import scipy as sp +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns +import matplotlib as mp +import sklearn +import networkx as nx +from IPython.display import Image, HTML + +import laUtilities as ut + +%matplotlib inline +``` + +Let's say we have this dataset. + +```{python} +#| code-fold: true +def centerAxes(ax): + ax.spines['left'].set_position('zero') + ax.spines['right'].set_color('none') + ax.spines['bottom'].set_position('zero') + ax.spines['top'].set_color('none') + ax.xaxis.set_ticks_position('bottom') + ax.yaxis.set_ticks_position('left') + bounds = np.array([ax.axes.get_xlim(), ax.axes.get_ylim()]) + ax.plot(bounds[0][0],bounds[1][0],'') + ax.plot(bounds[0][1],bounds[1][1],'') + +n = 10 +beta = np.array([1., 0.5]) +ax = plt.figure(figsize = (7, 7)).add_subplot() +centerAxes(ax) +np.random.seed(1) +xlin = -10.0 + 20.0 * np.random.random(n) +y = beta[0] + (beta[1] * xlin) + np.random.randn(n) +ax.plot(xlin, y, 'ro', markersize = 10); +``` + +Let's fit a least-squares line to this data. + +The loss function for this problem is the least-squares error: + +$$ +\mathcal{L}(\mathbf{\beta}) = \Vert\mathbf{y} - X\mathbf{\beta}\Vert^2 +$$ + +Of course, we know how to solve this problem using the normal equations, but let's do it using gradient descent instead. + +Here is the line we'd like to find: + +```{python} +#| code-fold: true +ax = plt.figure(figsize = (7, 7)).add_subplot() +centerAxes(ax) +ax.plot(xlin, y, 'ro', markersize = 10) +ax.plot(xlin, beta[0] + beta[1] * xlin, 'b-') +plt.text(-9, 3, r'$y = \beta_0 + \beta_1x$', size=20); +``` + +There are $n = 10$ data points, whose $x$ and $y$ values are stored in `xlin` and `y`. 
+ +First, let's create our $X$ (design) matrix, and include a column of ones to model the intercept: + +```{python} +X = np.column_stack([np.ones((n, 1)), xlin]) +``` + +Now, let's visualize the loss function $\mathcal{L}(\mathbf{\beta}) = \Vert \mathbf{y}-X\mathbf{\beta}\Vert^2.$ + +```{python} +#| code-fold: true +fig = ut.three_d_figure((23, 1), '', + -12, 12, -4, 4, -1, 2000, + figsize = (7, 7)) +qf = np.array(X.T @ X) +fig.ax.view_init(azim = 60, elev = 22) +fig.plotGeneralQF(X.T @ X, -2 * (y.T @ X), y.T @ y, alpha = 0.5) +fig.ax.set_zlabel('$\mathcal{L}$') +fig.ax.set_xlabel(r'$\beta_0$') +fig.ax.set_ylabel(r'$\beta_1$') +fig.set_title(r'$\Vert \mathbf{y}-X\mathbf{\beta}\Vert^2$', '', + number_fig = False, size = 18) +# fig.save(); +``` + +We won't take you through computing the gradient for this problem (you can find it in the online text). + +We will just tell you that the gradient for a least squares problem is, up to a constant factor of 2 that can be absorbed into the learning rate: + +$$ +\nabla_\beta \mathcal{L}(\mathbf{\beta}) \propto X^T X \beta - X^T\mathbf{y} +$$ + +::: {.callout-note} +For those interested in a little more insight into what these plots are showing, +here is the derivation. + +We start from the rule that $\Vert \mathbf{v}\Vert = \sqrt{\mathbf{v}^T\mathbf{v}}$. + +Applying this rule to our loss function: + +$$ +\mathcal{L}(\mathbf{\beta}) = \Vert \mathbf{y} - X\mathbf{\beta} \Vert^2 = \beta^T X^T X \beta - 2\mathbf{\beta}^TX^T\mathbf{y} + \mathbf{y}^T\mathbf{y} +$$ + +The first term, $\beta^T X^T X \beta$, is a quadratic form, and it is what makes +this surface curved. As long as $X$ has independent columns, $X^TX$ is positive +definite, so the overall shape is a paraboloid opening upward, and the surface +has a unique minimum point. + +To find the gradient, we can use standard calculus rules for derivatives involving +vectors.
The rules are not complicated, but the bottom line is that in this case, +you can almost use the same rules you would if $\beta$ were a scalar: + +$$ +\nabla_\beta \mathcal{L}(\mathbf{\beta}) = 2X^T X \beta - 2X^T\mathbf{y} +$$ + +And by the way -- since we've computed the derivative as a function of $\beta$, instead of using gradient descent, we could simply solve for the point where the gradient is zero. This is the optimal point which we know must exist: + +$$ +\nabla_\beta \mathcal{L}(\mathbf{\beta}) = 0 +$$ + +$$ +2X^T X \beta - 2X^T\mathbf{y} = 0 +$$ + +$$ +X^T X \beta = X^T\mathbf{y} +$$ + +Which of course, are the normal equations for this linear system. +::: + +So here is our code for gradient descent: + +```{python} +def loss(X, y, beta): + return np.linalg.norm(y - X @ beta) ** 2 # squared Euclidean norm of the residual y - X @ beta + +def gradient(X, y, beta): + return X.T @ X @ beta - X.T @ y # NOTE: half of the derived gradient 2 X^T X beta - 2 X^T y; the factor of 2 is effectively folded into eta + +def gradient_descent(X, y, beta_hat, eta, nsteps = 1000): + losses = [loss(X, y, beta_hat)] + betas = [beta_hat] + # losses[0] and betas[0] record the starting point, so both lists end up with nsteps + 1 entries + for step in range(nsteps): + # + # the gradient step + new_beta_hat = beta_hat - eta * gradient(X, y, beta_hat) + beta_hat = new_beta_hat + # + # accumulate statistics + losses.append(loss(X, y, new_beta_hat)) + betas.append(new_beta_hat) + + return np.array(betas), np.array(losses) +``` + +We'll start at an arbitrary point, say, $(-8, -3.2)$. + +That is, $\beta_0 = -8$, and $\beta_1 = -3.2$. + +```{python} +beta_start = np.array([-8, -3.2]) +eta = 0.002 +betas, losses = gradient_descent(X, y, beta_start, eta) +``` + +What happens to our loss function per GD iteration? + +```{python} +#| code-fold: true +plt.plot(np.log(losses), '.-') +plt.ylabel(r'$\log\mathcal{L}$', size = 14) +plt.xlabel('Iteration', size = 14) +plt.title('Improvement in Loss Per Iteration of GD', size = 16); +``` + +And how do the parameter values $\beta$ evolve?
+ +```{python} +#| code-fold: true +plt.plot(betas[:, 0], betas[:, 1], '.-') +plt.xlabel(r'$\beta_0$', size = 14) +plt.ylabel(r'$\beta_1$', size = 14) +plt.title(r'Evolution of $\beta$', size = 16); +``` + +Notice that the improvement in loss decreases over time. Initially the gradient +is steep and loss improves fast, while later on the gradient is shallow and loss +doesn't improve much per step. + +Now remember that in reality we are like the person who is trying to find their +way down the mountain, in the fog. + +In general we cannot "see" the entire loss function surface. + +Nonetheless, since we know what the loss surface looks like in this case, we can +visualize the algorithm "moving" on that surface. + +This visualization combines the last two plots into a single view. + +```{.python} +#| code-fold: true +%matplotlib inline +# set up view +import matplotlib.animation as animation +mp.rcParams['animation.html'] = 'jshtml' + +anim_frames = np.array(list(range(10)) + [2 * x for x in range(5, 25)] + [5 * x for x in range(10, 100)]) + +fig = ut.three_d_figure((23, 1), 'z = 3 x1^2 + 7 x2 ^2', + -12, 12, -4, 4, -1, 2000, + figsize = (7, 7)) +plt.close() +fig.ax.view_init(azim = 60, elev = 22) +qf = np.array(X.T @ X) +fig.plotGeneralQF(X.T @ X, -2 * (y.T @ X), y.T @ y, alpha = 0.5) +fig.ax.set_zlabel('$\mathcal{L}$') +fig.ax.set_xlabel(r'$\beta_0$') +fig.ax.set_ylabel(r'$\beta_1$') +fig.set_title(r'$\Vert \mathbf{y}-X\mathbf{\beta}\Vert^2$', '', + number_fig = False, size = 18) +# +def anim(frame): + fig.ax.plot(betas[:frame, 0], betas[:frame, 1], 'o-', zs = losses[:frame], c = 'k', markersize = 5) + # fig.canvas.draw() +# +# create the animation +animation.FuncAnimation(fig.fig, anim, + frames = anim_frames, + fargs = None, + interval = 1, + repeat = False) +``` + +We can also see how evolution of the parameters translate to the line fitting to the data. 
+ +```{.python} +#| code-fold: true +fig, ax = plt.subplots(figsize = (7, 7)) +plt.close() +centerAxes(ax) +ax.plot(xlin, y, 'ro', markersize = 10) +fit_line = ax.plot([], []) + +# +#to get additional args to animate: +#def animate(angle, *fargs): +# fargs[0].view_init(azim=angle) +def animate(frame): + fit_line[0].set_data(xlin, betas[frame, 0] + betas[frame, 1] * xlin) + fig.canvas.draw() +# +# create the animation +animation.FuncAnimation(fig, animate, + frames = anim_frames, + fargs=None, + interval=100, + repeat=False) +``` + +## Challenges in Gradient Descent + +Gradient Descent is a very general algorithm, one that can be applied to a huge +array of problem types. + +However, there are a variety of issues that arise in using gradient descent in +practice. + +## Learning Rate + +Setting the learning rate can be a challenge. + +Previously we had set the learning rate $\eta = 0.002$. + +Let set it a little higher and see what happens: $\eta = 0.0065.$ + +```{python} +beta_start = np.array([-8, -2]) +eta = 0.0065 +betas, losses = gradient_descent(X, y, beta_start, eta, nsteps = 100) +``` + +```{python} +#| code-fold: true +plt.plot(np.log(losses), '.-') +plt.ylabel(r'$\log\mathcal{L}$', size = 14) +plt.xlabel('Iteration', size = 14) +plt.title('Improvement in Loss Per Iteration of GD', size = 16); +``` + +```{python} +#| code-fold: true +plt.plot(betas[:, 0], betas[:, 1], '.-') +plt.xlabel(r'$\beta_0$', size = 14) +plt.ylabel(r'$\beta_1$', size = 14) +plt.title(r'Evolution of $\beta$', size = 16); +``` + +This is a total disaster. What is going on? 
+ +It is helpful to look at the progress of the algorithm using the loss surface: + +```{python} +#| code-fold: true +%matplotlib inline +fig = ut.three_d_figure((23, 1), '', + -12, 2, -4, 4, -1, 2000, + figsize = (7, 7)) +qf = np.array(X.T @ X) +fig.ax.view_init(azim = 142, elev = 58) +fig.plotGeneralQF(X.T @ X, -2 * (y.T @ X), y.T @ y, alpha = 0.5) +fig.ax.set_zlabel('$\mathcal{L}$') +fig.ax.set_xlabel(r'$\beta_0$') +fig.ax.set_ylabel(r'$\beta_1$') +fig.set_title(r'$\Vert \mathbf{y}-X\mathbf{\beta}\Vert^2$', '', + number_fig = False, size = 18) +nplot = 18 +fig.ax.plot(betas[:nplot, 0], betas[:nplot, 1], 'o-', zs = losses[:nplot], markersize = 5); +# +``` + +We can see what is going on more clearly here. + +What is happening is that because the steps are __too large,__ each step +overshoots the local minimum. + +The next step then lands on a portion of the surface that is steeper ... and in the +opposite direction. + +And so the process diverges. + +> For an interesting comparison, try setting $\eta = 0.0055$ and observe the +> evolution of $\beta$. + +Hence it is important to decrease the step size when divergence appears. + +Unfortunately, on a complicated loss surface, a given step size may diverge in +one location or starting point, but not in another. + +## Complex Loss Surfaces + +The loss surface for linear regression is the best possible kind: it is strictly +convex, so it has a single global minimum. + +For neural networks, the loss surface is more complex. + +In general, the larger the neural network, the more complex the loss surface. + +And deep neural networks, especially transformers, have billions of parameters. + +Here's a visualization of the loss surface for the 56 layer neural network +[VGG-56](http://arxiv.org/abs/1409.1556), from +[Visualizing the Loss Landscape of Neural Networks](https://www.cs.umd.edu/~tomg/projects/landscapes/).
+ + + +![](figs/L23-complex-landscape.png){width="40%"} + +For a fun exploration, see https://losslandscape.com/explorer. + +## Recap + +So far we applied gradient descent on a simple linear regression model. + +As we'll soon see, deep neural networks are much more complicated multi-stage +models, with millions or billions of parameters to differentiate. + +Fortunately, the _Chain Rule_ from calculus gives us a relatively simple and +scalable algorithm, called _Back Propagation_, that solves this problem. + +## Neuron and Neural Networks + +Now let's switch gears a bit to define an _artificial neuron_. For better or worse +it is named after and loosely modeled on a biological neuron. + + + +![](figs/NN-figs/neuron.png){width="75%"} + + +From [cs231n](https://cs231n.github.io/neural-networks-1/) + +* The dendrites carry impulses from other neurons of different distances. +* Once the collective firing rate of the impulses exceed a certain threshold, + the neuron fires its own pulse through the axon to other neurons + +There are companies trying to mimic this impulse (i.e. spiking) based neuron in +silicon -- so called _neuromorphic computing_. + +See for example +[Neuromorphic Computing](https://en.wikipedia.org/wiki/Neuromorphic_engineering) +or [Spiking Neural Network](https://en.wikipedia.org/wiki/Spiking_neural_network) + +Some examples of companies and projects are Intel's +[Loihi](https://www.intel.com/content/www/us/en/research/neuromorphic-computing-loihi-2-technology-brief.html) +and startups such as GrAI Matter Labs [VIP processor](https://www.graimatterlabs.ai/product). 
+ +## Artificial Neuron + + + + +![](figs/NN-figs/neuron_model.jpeg){width="75%"} + +From [cs231n](https://cs231n.github.io/neural-networks-1/) + +The more common artifical neuron + +* collects one or more inputs, +* each multiplied by a unique weight +* sums the weighted inputs +* adds a bias +* then finally usually applies a nonlinear activation function + +## Multi-Layer Perceptron (MLP) or Fully Connected Network (FCN) + +![](figs/NN-figs/neural_net2.jpeg){width="75%"} + +From [cs231n](https://cs231n.github.io/convolutional-networks/) + +Multiple artificial neurons can be acting on the same inputs, in what we call +a _layer_, and we can have more than one _layer_ until we produce one or more +outputs. + +The example above shows a network with _3 inputs_, two layers of neurons, each +with 4 neurons, followed by one layer that produces a single value output. + +E.g. a binary classifier. + +Activation function is typically some nonlinear function that compresses the input +in some way. Historically, it's been the sigmoid and $\tanh()$ functions. See for +example [Hyperbolic Functions](https://en.wikipedia.org/wiki/Hyperbolic_functions#Tanh). + +```{python} +#| code-fold: true +plt.plot(np.arange(-5,5,0.2), np.tanh(np.arange(-5,5,0.2))) +plt.title('tanh(x)') +plt.xlabel('x') +plt.ylabel('f(x)') +plt.grid() +``` + +A more common activation function these days and that is more efficient to implement is the _Rectified Linear Unit_ or _ReLU_. + +$$ +\textrm{ReLU}(x) = \mathrm{max}(0, x) +$$ + +```{python} +#| code-fold: true +plt.plot(np.arange(-5,5,0.2), np.maximum(0,np.arange(-5,5,0.2))) +plt.title('ReLU(x)') +plt.xlabel('x') +plt.ylabel('f(x)') +plt.grid() +``` + +There are many other variations. 
See for example +[PyTorch Non-linear Activations](https://pytorch.org/docs/stable/nn.html#non-linear-activations-weighted-sum-nonlinearity) + +## Next Lecture + +* We'll build out a Value class +* Visualize our compute graph +* Implement Backpropagation +* Build out our neural network +* Train and evaluate it +* Recreate and match it in PyTorch + diff --git a/ds701_book/24-NN-Outline.md b/ds701_book/24-NN-Outline.md new file mode 100644 index 00000000..f0c8c1eb --- /dev/null +++ b/ds701_book/24-NN-Outline.md @@ -0,0 +1,49 @@ +# NN II and III Outline + +## NN I Recap + +* Applications of NNs +* Intuition on loss functions for model fitting -- convex and nonconvex +* Gradient descent intuition +* Derivatives refresher (slope, gradient) +* Gradient descent on linear regression model +* Challenges in gradient descent -- learning rate... +* Complex loss surfaces of NNs +* biological and artificial neurons +* first glimpse at a MLP/FCN + + +## NN II Outline + +* Artificial Neuron -- picture +* Neuron -- scalar equation +* Neuron -- vector equation + +Individual weights are $\omega$ and matrices of weights are $\Omega$. +Each neuron has its own set of weights for each input. + +$\beta$ is the vector of biases. Each neuron has its own bias. + +* Shallow network -- 1 hidden layer with 4 neurons, single regression output + * Show picture, + * then show matrix equation + * define loss function... + +* define as $loss(h_1(a_0(h_0(x))))$ + * linear part of hidden -- $h_0(x)$ + * activation function $a_0(.)$ + * linear part of output $h_1(.)$ + * loss function loss(.), say is just MSE + +We want to update each parameter by the negative of the partial derivative of the loss with respect to it + +* write the equations in terms of matrix values + +* then we want the partial derivative of each to update the parameters + +* Introduce the chain rule + +* show a single neuron with single input +* show as compute graph + +* show torchviz with numbers?
diff --git a/ds701_book/26-TimeSeries-Notes.qmd b/ds701_book/26-TimeSeries-Notes.qmd new file mode 100644 index 00000000..60c84aec --- /dev/null +++ b/ds701_book/26-TimeSeries-Notes.qmd @@ -0,0 +1,279 @@ +--- +title: TS Notes +jupyter: python3 +--- + + +(GPT-4o, personal communication, Nov. 20, 2024) + +Classical decomposition is primarily used for breaking down a time series into its constituent components: trend, seasonality, and residuals. While it doesn't directly provide a forecasting method, you can use the decomposed components to make forecasts. Here's a general approach to forecasting using classical decomposition: + +1. **Decompose the Time Series**: Break down the time series into trend, seasonal, and residual components. + +2. **Forecast the Trend Component**: Use a suitable method (e.g., linear regression, moving average) to forecast the trend component. + +3. **Forecast the Seasonal Component**: Assume the seasonal component repeats itself and use the last observed seasonal pattern for future periods. + +4. **Combine the Forecasts**: Add the forecasted trend and seasonal components to get the final forecast. The residual component is often assumed to be zero for forecasting purposes.
# Forecast the Air Passengers series from a classical additive decomposition:
# extrapolate the trend linearly, repeat the last seasonal cycle, and treat the
# residual component as zero.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

HORIZON = 24  # number of future periods to forecast
PERIOD = 12   # observations per seasonal cycle (the code below assumes monthly data)

# Load the dataset
data = pd.read_csv('data/air_passengers_1949_1960.csv', index_col='Date', parse_dates=True)
ts = data['Number of Passengers']

# Decompose the time series
decomposition = seasonal_decompose(ts, model='additive', period=PERIOD)
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid  # not used in the forecast (assumed to be zero)

# Forecast the trend component using a simple linear extrapolation of the last
# two valid trend values.
trend_valid = trend.dropna()
last_trend = trend_valid.iloc[-1]
slope = last_trend - trend_valid.iloc[-2]

# The trend estimate has NaNs at both ends, so its last valid value sits `gap`
# periods *before* the final observation. The first forecast period is therefore
# gap + 1 steps past that value, not 1 step.
gap = len(trend) - 1 - np.flatnonzero(trend.notna().to_numpy())[-1]
steps_ahead = gap + np.arange(1, HORIZON + 1)
trend_forecast = last_trend + steps_ahead * slope

# Forecast the seasonal component by repeating the last observed seasonal cycle.
# The cycle ends at the last observation, so tiling it continues the pattern in
# phase with the forecast periods.
last_cycle = seasonal.iloc[-PERIOD:].to_numpy()
n_cycles = -(-HORIZON // PERIOD)  # ceiling division
seasonal_forecast = np.tile(last_cycle, n_cycles)[:HORIZON]

# Continue the index at the frequency of the observed data so the forecast lies
# on the same calendar grid (the deprecated 'M' alias would stamp month-ends).
forecast_index = pd.date_range(ts.index[-1], periods=HORIZON + 1,
                               freq=pd.infer_freq(ts.index))[1:]

# Combine the forecasts
forecast = pd.Series(trend_forecast + seasonal_forecast, index=forecast_index, name='Forecast')

# Plot the original data and the forecast
plt.figure(figsize=(10, 6))
plt.plot(ts, label='Observed')
plt.plot(forecast, label='Forecast', color='red')
plt.xlabel('Date')
plt.ylabel('Number of Passengers')
plt.legend()
plt.title('Forecast using Classical Decomposition')
plt.show()
# Fit a moving-average model, MA(q), to the Air Passengers series with
# statsmodels and inspect the residuals.
import os  # needed for os.path.join below; it was previously used without being imported

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.arima.model import ARIMA

# Load the Air Passengers dataset from the local CSV file; the 'Date' column is
# parsed and used as the index.
data = pd.read_csv(os.path.join('data', 'air_passengers_1949_1960.csv'),
                   index_col='Date', parse_dates=True)
ts = data['Number of Passengers']  # model the single passenger-count series explicitly

# Visualize the data
plt.figure(figsize=(10, 6))
plt.plot(ts, label='Number of Passengers')
plt.title('Air Passengers Data')
plt.xlabel('Date')
plt.ylabel('Number of Passengers')
plt.legend()
plt.show()

# Fit an MA model
# We use ARIMA with order (0, 0, q) to specify an MA(q) model:
# no autoregressive terms (p = 0) and no differencing (d = 0).
ma_order = 1  # You can experiment with different orders
model = ARIMA(ts, order=(0, 0, ma_order))
model_fit = model.fit()

# Print the model summary
print(model_fit.summary())

# Plot the residuals; ideally they resemble white noise
residuals = model_fit.resid
plt.figure(figsize=(10, 6))
plt.plot(residuals, label='Residuals')
plt.title('Residuals of the MA Model')
plt.xlabel('Date')
plt.ylabel('Residuals')
plt.legend()
plt.show()

# Plot the ACF of the residuals to check for remaining autocorrelation
plot_acf(residuals, lags=20)
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the Air Passengers dataset
data = pd.read_csv('data/air_passengers_1949_1960.csv', index_col='Date', parse_dates=True)
ts = data['Number of Passengers']


def moving_average_forecast(series, window_size):
    """Return the trailing mean of the previous ``window_size`` values.

    Element ``t`` of the result is the mean of ``series[t - window_size:t]``,
    i.e. it uses only values strictly before position ``t``. Positions that do
    not have ``window_size`` predecessors are NaN.

    Parameters
    ----------
    series : array-like of numbers
        Input values, in time order.
    window_size : int
        Number of past values to average; must be at least 1.

    Returns
    -------
    numpy.ndarray
        Float array with exactly ``len(series)`` elements.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")
    values = np.asarray(series, dtype=float)
    # Start from all-NaN so the output always has the same length as the input,
    # even when window_size exceeds len(series).
    forecast = np.full(len(values), np.nan)
    for t in range(window_size, len(values)):
        forecast[t] = values[t - window_size:t].mean()
    return forecast


# Calculate the "residuals" used in these notes: the change from the previous
# observation. The first entry is NaN because there is no previous observation.
# NOTE(review): this is a rolling mean of past differences, not an MA(q) model
# estimated on forecast errors as statsmodels' ARIMA does -- confirm the wording.
residuals = ts - ts.shift(1)
residuals_valid = residuals.dropna()

# Fit the MA(1) model
window_size = 1  # MA(1) model
# Keep the dates attached: residuals_valid is one element shorter than ts, so a
# bare array could not be added to ts.shift(1) (length mismatch).
ma_forecast = pd.Series(
    moving_average_forecast(residuals_valid, window_size),
    index=residuals_valid.index,
)

# Add the forecasted residuals to the previous observation to get the fitted
# values; pandas aligns the two series on their dates.
fitted_values = ts.shift(1) + ma_forecast

# Plot the original data and the fitted values
plt.figure(figsize=(10, 6))
plt.plot(ts, label='Observed')
plt.plot(fitted_values, label='Fitted', color='red')
plt.title('MA(1) Model Fitting')
plt.xlabel('Date')
plt.ylabel('Number of Passengers')
plt.legend()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the Air Passengers dataset
data = pd.read_csv('data/air_passengers_1949_1960.csv', index_col='Date', parse_dates=True)
ts = data['Number of Passengers']


def moving_average_forecast(series, ma_order):
    """Return the trailing mean of the previous ``ma_order`` values.

    Element ``t`` of the result is the mean of ``series[t - ma_order:t]``,
    i.e. it uses only values strictly before position ``t``. Positions that do
    not have ``ma_order`` predecessors are NaN.

    Parameters
    ----------
    series : array-like of numbers
        Input values, in time order.
    ma_order : int
        Number of past values to average; must be at least 1.

    Returns
    -------
    numpy.ndarray
        Float array with exactly ``len(series)`` elements.

    Raises
    ------
    ValueError
        If ``ma_order`` is smaller than 1.
    """
    if ma_order < 1:
        raise ValueError(f"ma_order must be >= 1, got {ma_order}")
    values = np.asarray(series, dtype=float)
    # Start from all-NaN so the output always has the same length as the input,
    # even when ma_order exceeds len(series).
    forecast = np.full(len(values), np.nan)
    for t in range(ma_order, len(values)):
        forecast[t] = values[t - ma_order:t].mean()
    return forecast


# Calculate the "residuals" used in these notes: the change from the previous
# observation. The first entry is NaN because there is no previous observation.
residuals = ts - ts.shift(1)
residuals_valid = residuals.dropna()

# Specify the MA order
ma_order = 3  # You can change this to experiment with different orders

# Fit the MA model
# Keep the dates attached instead of padding a bare array: residuals_valid is
# one element shorter than ts (the leading NaN was dropped), so index alignment
# is what puts each averaged change on the date it predicts.
ma_forecast = pd.Series(
    moving_average_forecast(residuals_valid, ma_order),
    index=residuals_valid.index,
)

# Add the forecasted residuals to the previous observation to get the fitted
# values; pandas aligns the two series on their dates.
fitted_values = ts.shift(1) + ma_forecast

# Plot the original data and the fitted values
plt.figure(figsize=(10, 6))
plt.plot(ts, label='Observed')
plt.plot(fitted_values, label='Fitted', color='red')
plt.title(f'MA({ma_order}) Model Fitting')
plt.xlabel('Date')
plt.ylabel('Number of Passengers')
plt.legend()
plt.show()
+ +**Column Descriptions** + + +| Column | Description | +| ---- | ----------- | +| date | time year-month-day hour:minute:second | +| Appliances | energy use in Wh | +| lights | energy use of light fixtures in the house in Wh | +| T1 | Temperature in kitchen area, in Celsius | +| RH_1 | Humidity in kitchen area, in % | +| T2 | Temperature in living room area, in Celsius | +| RH_2 | Humidity in living room area, in % | +| T3 | Temperature in laundry room area | +| RH_3 | Humidity in laundry room area, in % | +| T4 | Temperature in office room, in Celsius | +| RH_4 | Humidity in office room, in % | +| T5 | Temperature in bathroom, in Celsius | +| RH_5 | Humidity in bathroom, in % | +| T6 | Temperature outside the building (north side), in Celsius | +| RH_6 | Humidity outside the building (north side), in % | +| T7 | Temperature in ironing room , in Celsius | +| RH_7 | Humidity in ironing room, in % | +| T8 | Temperature in teenager room 2, in Celsius | +| RH_8 | Humidity in teenager room 2, in % | +| T9 | Temperature in parents room, in Celsius | +| RH_9 | Humidity in parents room, in % | +| To | Temperature outside (from Chievres weather station), in Celsius | +| Pressure | (from Chievres weather station), in mm Hg | +| RH_out | Humidity outside (from Chievres weather station), in % | +| Wind speed | (from Chievres weather station), in m/s | +| Visibility | (from Chievres weather station), in km | +| Tdewpoint | (from Chievres weather station), °C | +| rv1 | Random variable 1, nondimensional | +| rv2 | Random variable 2, nondimensional | + +
# Prepare data for LSTM
class TimeSeriesDataset(Dataset):
    """Sliding-window dataset for one-step-ahead forecasting.

    Item ``i`` is the pair ``(X, y)``: ``X`` holds the ``seq_length``
    consecutive observations starting at position ``i`` and ``y`` is the
    observation that immediately follows them. Both are returned as
    ``torch.float32`` tensors.

    Parameters
    ----------
    data : array-like
        Observations in time order. A single-column array of shape ``(N, 1)``
        (the shape scikit-learn scalers return) is flattened to ``(N,)`` so that
        ``X`` has shape ``(seq_length,)`` and ``y`` is a scalar; the training
        and evaluation loops in this document add the trailing feature/target
        axis themselves with ``unsqueeze(-1)``. Arrays with more than one
        column are kept as they are.
    seq_length : int
        Number of observations in each input window; must be at least 1.

    Raises
    ------
    ValueError
        If ``seq_length`` is smaller than 1.
    """

    def __init__(self, data, seq_length):
        if seq_length < 1:
            raise ValueError(f"seq_length must be >= 1, got {seq_length}")
        values = np.asarray(data)
        if values.ndim == 2 and values.shape[1] == 1:
            # Drop the singleton feature axis; leaving it in makes the later
            # unsqueeze(-1) produce a 4-D batch, which nn.LSTM does not accept.
            values = values[:, 0]
        self.data = values
        self.seq_length = seq_length

    def __len__(self):
        # Never negative: a series shorter than one window yields no samples.
        return max(len(self.data) - self.seq_length, 0)

    def __getitem__(self, index):
        n_samples = len(self)
        if index < 0:
            index += n_samples  # support Python-style negative indexing
        if not 0 <= index < n_samples:
            raise IndexError(f"index out of range for dataset of length {n_samples}")
        stop = index + self.seq_length
        X = self.data[index:stop]
        y = self.data[stop]
        return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)
# Define the LSTM model
class LSTMModel(nn.Module):
    """Single-layer LSTM followed by a linear read-out of the last time step.

    Parameters
    ----------
    input_size : int
        Number of features per time step.
    hidden_size : int
        Size of the LSTM hidden state.
    output_size : int
        Number of values predicted per sequence.
    """

    def __init__(self, input_size=1, hidden_size=50, output_size=1):
        super().__init__()
        # batch_first=True: batched input is laid out as (batch, seq, feature).
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Predict from the hidden output at the final time step.

        ``x`` is either a batch of shape ``(batch, seq, input_size)``, giving a
        result of shape ``(batch, output_size)``, or a single unbatched
        sequence of shape ``(seq, input_size)``, giving ``(output_size,)``.
        """
        x, _ = self.lstm(x)
        # The ellipsis selects the last time step for batched and unbatched
        # input alike; a fixed `[:, -1, :]` would fail on 2-D LSTM output.
        x = self.fc(x[..., -1, :])  # Use the output of the last time step
        return x
+++ b/ds701_book/ch11.ipynb @@ -0,0 +1,2710 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chapter 11: Time Series -- Python for Data Analysis, 3rd Ed." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From [Python for Data Analysis, 3rd Ed.](https://wesmckinney.com/book/time-series) \n", + "and accompanying [notebook](https://github.com/wesm/pydata-book/blob/3rd-edition/ch11.ipynb). " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "np.random.seed(12345)\n", + "import matplotlib.pyplot as plt\n", + "plt.rc(\"figure\", figsize=(10, 6))\n", + "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n", + "pd.options.display.max_columns = 20\n", + "pd.options.display.max_rows = 20\n", + "pd.options.display.max_colwidth = 80\n", + "np.set_printoptions(precision=4, suppress=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Time series data is an important form of structured data in many different fields, such as finance, economics, ecology, neuroscience, and physics. Anything that is recorded repeatedly at many points in time forms a time series. Many time series are fixed frequency, which is to say that data points occur at regular intervals according to some rule, such as every 15 seconds, every 5 minutes, or once per month. Time series can also be irregular without a fixed unit of time or offset between units. How you mark and refer to time series data depends on the application, and you may have one of the following:\n", + "\n", + "**Timestamps**
\n", + "Specific instants in time.\n", + "\n", + "**Fixed periods**
\n", + "Such as the whole month of January 2017, or the whole year 2020.\n", + "\n", + "**Intervals of time**
\n", + "Indicated by a start and end timestamp. Periods can be thought of as special cases of intervals.\n", + "\n", + "**Experiment or elapsed time**
\n", + "Each timestamp is a measure of time relative to a particular start time (e.g., the diameter of a cookie baking each second since being placed in the oven), starting from 0.\n", + "\n", + "In this chapter, I am mainly concerned with time series in the first three categories, though many of the techniques can be applied to experimental time series where the index may be an integer or floating-point number indicating elapsed time from the start of the experiment. The simplest kind of time series is indexed by timestamp.\n", + "\n", + ">pandas also supports indexes based on timedeltas, which can be a useful way of\n", + "> representing experiment or elapsed time. We do not explore timedelta indexes\n", + "> in this book, but you can learn more in the\n", + "> [pandas documentation](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-span-index).\n", + "\n", + "pandas provides many built-in time series tools and algorithms. You can\n", + "efficiently work with large time series, and slice and dice, aggregate, and\n", + "resample irregular- and fixed-frequency time series. Some of these tools are\n", + "useful for financial and economics applications, but you could certainly use them\n", + "to analyze server log data, too.\n", + "\n", + "As with the rest of the chapters, we start by importing NumPy and pandas:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 11.1 Date and Time Data Types and Tools" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Python standard library includes data types for date and time data, as well\n", + "as calendar-related functionality. \n", + "\n", + "The `datetime`, `time`, and `calendar` modules are the main places to start. 
The\n", + "`datetime.datetime` type, or simply `datetime`, is widely used:" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "datetime.datetime(2024, 11, 18, 16, 38, 16, 975792)" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from datetime import datetime\n", + "now = datetime.now()\n", + "now" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2024, 11, 18)" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "now.year, now.month, now.day" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`datetime` stores both the date and time down to the microsecond. `datetime.timedelta`,\n", + "or simply `timedelta`, represents the temporal difference between two `datetime` objects:" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "datetime.timedelta(days=926, seconds=56700)" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15)\n", + "delta" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "926" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "delta.days" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "56700" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "delta.seconds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can add (or subtract) a timedelta or 
multiple thereof to a datetime object to yield a new shifted object:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "datetime.datetime(2010, 12, 14, 0, 0)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from datetime import timedelta\n", + "start = datetime(2011, 1, 7)\n", + "start + timedelta(12)\n", + "start - 2 * timedelta(12)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Table 11.1 summarizes the data types in the datetime module. While this chapter is mainly concerned with the data types in pandas and higher-level time series manipulation, you may encounter the datetime-based types in many other places in Python in the wild.\n", + "\n", + "Table 11.1: Types in the datetime module\n", + "\n", + "| Type | Description |\n", + "| ---- | ----------- |\n", + "| date | Store calendar date (year, month, day) using the Gregorian calendar |\n", + "| time | Store time of day as hours, minutes, seconds, and microseconds |\n", + "| datetime | Store both date and time |\n", + "| timedelta | The difference between two datetime values (as days, seconds, and microseconds) |\n", + "| tzinfo | Base type for storing time zone information |\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### Converting Between String and Datetime\n", + "\n", + "You can format datetime objects and pandas Timestamp objects, which I’ll introduce later, as strings using str or the strftime method, passing a format specification:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'2011-01-03'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stamp = datetime(2011, 1, 3)\n", + "str(stamp)\n", + "stamp.strftime(\"%Y-%m-%d\")" + ] + }, + { + "cell_type": 
"code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[datetime.datetime(2011, 7, 6, 0, 0), datetime.datetime(2011, 8, 6, 0, 0)]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "value = \"2011-01-03\"\n", + "datetime.strptime(value, \"%Y-%m-%d\")\n", + "datestrs = [\"7/6/2011\", \"8/6/2011\"]\n", + "[datetime.strptime(x, \"%m/%d/%Y\") for x in datestrs]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00'], dtype='datetime64[ns]', freq=None)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "datestrs = [\"2011-07-06 12:00:00\", \"2011-08-06 00:00:00\"]\n", + "pd.to_datetime(datestrs)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([False, False, True])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "idx = pd.to_datetime(datestrs + [None])\n", + "idx\n", + "idx[2]\n", + "pd.isna(idx)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2011-01-02 -0.204708\n", + "2011-01-05 0.478943\n", + "2011-01-07 -0.519439\n", + "2011-01-08 -0.555730\n", + "2011-01-10 1.965781\n", + "2011-01-12 1.393406\n", + "dtype: float64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dates = [datetime(2011, 1, 2), datetime(2011, 1, 5),\n", + " datetime(2011, 1, 7), datetime(2011, 1, 8),\n", + " datetime(2011, 1, 10), datetime(2011, 1, 12)]\n", + "ts = pd.Series(np.random.standard_normal(6), index=dates)\n", + "ts" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',\n", + " '2011-01-10', '2011-01-12'],\n", + " dtype='datetime64[ns]', freq=None)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ts.index" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2011-01-02 -0.409415\n", + "2011-01-05 NaN\n", + "2011-01-07 -1.038877\n", + "2011-01-08 NaN\n", + "2011-01-10 3.931561\n", + "2011-01-12 NaN\n", + "dtype: float64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ts + ts[::2]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ColoradoTexasNew YorkOhio
2001-05-02-0.0060450.490094-0.277186-0.707213
2001-05-09-0.5601072.7355270.9273351.513906
2001-05-160.5386001.2737680.667876-0.969206
2001-05-231.676091-0.8176490.0501881.951312
2001-05-303.2603830.9633011.201206-1.852001
\n", + "" + ], + "text/plain": [ + " Colorado Texas New York Ohio\n", + "2001-05-02 -0.006045 0.490094 -0.277186 -0.707213\n", + "2001-05-09 -0.560107 2.735527 0.927335 1.513906\n", + "2001-05-16 0.538600 1.273768 0.667876 -0.969206\n", + "2001-05-23 1.676091 -0.817649 0.050188 1.951312\n", + "2001-05-30 3.260383 0.963301 1.201206 -1.852001" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dates = pd.date_range(\"2000-01-01\", periods=100, freq=\"W-WED\")\n", + "long_df = pd.DataFrame(np.random.standard_normal((100, 4)),\n", + " index=dates,\n", + " columns=[\"Colorado\", \"Texas\",\n", + " \"New York\", \"Ohio\"])\n", + "long_df.loc[\"2001-05\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2000-01-01 0\n", + "2000-01-02 1\n", + "2000-01-02 2\n", + "2000-01-02 3\n", + "2000-01-03 4\n", + "dtype: int64" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dates = pd.DatetimeIndex([\"2000-01-01\", \"2000-01-02\", \"2000-01-02\",\n", + " \"2000-01-02\", \"2000-01-03\"])\n", + "dup_ts = pd.Series(np.arange(5), index=dates)\n", + "dup_ts" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dup_ts.index.is_unique" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2000-01-02 1\n", + "2000-01-02 2\n", + "2000-01-02 3\n", + "dtype: int64" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dup_ts[\"2000-01-03\"] # not duplicated\n", + "dup_ts[\"2000-01-02\"] # duplicated" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2000-01-01 1\n", + "2000-01-02 3\n", + "2000-01-03 1\n", + "dtype: int64" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grouped = dup_ts.groupby(level=0)\n", + "grouped.mean()\n", + "grouped.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ts\n", + "resampler = ts.resample(\"D\")\n", + "resampler" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',\n", + " '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',\n", + " '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',\n", + " '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',\n", + " '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20',\n", + " '2012-04-21', '2012-04-22', '2012-04-23', '2012-04-24',\n", + " '2012-04-25', '2012-04-26', '2012-04-27', '2012-04-28',\n", + " '2012-04-29', '2012-04-30', '2012-05-01', '2012-05-02',\n", + " '2012-05-03', '2012-05-04', '2012-05-05', '2012-05-06',\n", + " '2012-05-07', '2012-05-08', '2012-05-09', '2012-05-10',\n", + " '2012-05-11', '2012-05-12', '2012-05-13', '2012-05-14',\n", + " '2012-05-15', '2012-05-16', '2012-05-17', '2012-05-18',\n", + " '2012-05-19', '2012-05-20', '2012-05-21', '2012-05-22',\n", + " '2012-05-23', '2012-05-24', '2012-05-25', '2012-05-26',\n", + " '2012-05-27', '2012-05-28', '2012-05-29', '2012-05-30',\n", + " '2012-05-31', '2012-06-01'],\n", + " dtype='datetime64[ns]', freq='D')" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "index = pd.date_range(\"2012-04-01\", \"2012-06-01\")\n", + "index" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatetimeIndex(['2012-05-13', '2012-05-14', '2012-05-15', '2012-05-16',\n", + " '2012-05-17', '2012-05-18', '2012-05-19', '2012-05-20',\n", + " '2012-05-21', '2012-05-22', '2012-05-23', '2012-05-24',\n", + " '2012-05-25', '2012-05-26', '2012-05-27', '2012-05-28',\n", + " '2012-05-29', '2012-05-30', '2012-05-31', '2012-06-01'],\n", + " dtype='datetime64[ns]', freq='D')" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.date_range(start=\"2012-04-01\", periods=20)\n", + "pd.date_range(end=\"2012-06-01\", periods=20)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/ly/jkydg4dj2vs93b_ds7yp5t7r0000gn/T/ipykernel_24968/3742551278.py:1: FutureWarning: 'BM' is deprecated and will be removed in a future version, please use 'BME' instead.\n", + " pd.date_range(\"2000-01-01\", \"2000-12-01\", freq=\"BM\")\n" + ] + }, + { + "data": { + "text/plain": [ + "DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', '2000-04-28',\n", + " '2000-05-31', '2000-06-30', '2000-07-31', '2000-08-31',\n", + " '2000-09-29', '2000-10-31', '2000-11-30'],\n", + " dtype='datetime64[ns]', freq='BME')" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.date_range(\"2000-01-01\", \"2000-12-01\", freq=\"BM\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatetimeIndex(['2012-05-02 12:56:31', '2012-05-03 12:56:31',\n", + " '2012-05-04 12:56:31', '2012-05-05 12:56:31',\n", + " '2012-05-06 12:56:31'],\n", + " dtype='datetime64[ns]', freq='D')" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "pd.date_range(\"2012-05-02 12:56:31\", periods=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatetimeIndex(['2012-05-02', '2012-05-03', '2012-05-04', '2012-05-05',\n", + " '2012-05-06'],\n", + " dtype='datetime64[ns]', freq='D')" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.date_range(\"2012-05-02 12:56:31\", periods=5, normalize=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pandas.tseries.offsets import Hour, Minute\n", + "hour = Hour()\n", + "hour" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<4 * Hours>" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "four_hours = Hour(4)\n", + "four_hours" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/ly/jkydg4dj2vs93b_ds7yp5t7r0000gn/T/ipykernel_24968/3449897904.py:1: FutureWarning: 'H' is deprecated and will be removed in a future version, please use 'h' instead.\n", + " pd.date_range(\"2000-01-01\", \"2000-01-03 23:59\", freq=\"4H\")\n" + ] + }, + { + "data": { + "text/plain": [ + "DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',\n", + " '2000-01-01 08:00:00', '2000-01-01 12:00:00',\n", + " '2000-01-01 16:00:00', '2000-01-01 20:00:00',\n", + " '2000-01-02 00:00:00', '2000-01-02 04:00:00',\n", + " '2000-01-02 08:00:00', '2000-01-02 12:00:00',\n", + " '2000-01-02 16:00:00', '2000-01-02 20:00:00',\n", + " '2000-01-03 00:00:00', '2000-01-03 04:00:00',\n", + " 
'2000-01-03 08:00:00', '2000-01-03 12:00:00',\n", + " '2000-01-03 16:00:00', '2000-01-03 20:00:00'],\n", + " dtype='datetime64[ns]', freq='4h')" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.date_range(\"2000-01-01\", \"2000-01-03 23:59\", freq=\"4H\")" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<150 * Minutes>" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Hour(2) + Minute(30)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00',\n", + " '2000-01-01 03:00:00', '2000-01-01 04:30:00',\n", + " '2000-01-01 06:00:00', '2000-01-01 07:30:00',\n", + " '2000-01-01 09:00:00', '2000-01-01 10:30:00',\n", + " '2000-01-01 12:00:00', '2000-01-01 13:30:00'],\n", + " dtype='datetime64[ns]', freq='90min')" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.date_range(\"2000-01-01\", periods=10, freq=\"1h30min\")" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Timestamp('2012-01-20 00:00:00'),\n", + " Timestamp('2012-02-17 00:00:00'),\n", + " Timestamp('2012-03-16 00:00:00'),\n", + " Timestamp('2012-04-20 00:00:00'),\n", + " Timestamp('2012-05-18 00:00:00'),\n", + " Timestamp('2012-06-15 00:00:00'),\n", + " Timestamp('2012-07-20 00:00:00'),\n", + " Timestamp('2012-08-17 00:00:00')]" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "monthly_dates = pd.date_range(\"2012-01-01\", \"2012-09-01\", freq=\"WOM-3FRI\")\n", + "list(monthly_dates)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + 
"outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/ly/jkydg4dj2vs93b_ds7yp5t7r0000gn/T/ipykernel_24968/2994308253.py:2: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.\n", + " index=pd.date_range(\"2000-01-01\", periods=4, freq=\"M\"))\n" + ] + }, + { + "data": { + "text/plain": [ + "2000-01-31 -0.117388\n", + "2000-02-29 -0.517795\n", + "2000-03-31 NaN\n", + "2000-04-30 NaN\n", + "Freq: ME, dtype: float64" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ts = pd.Series(np.random.standard_normal(4),\n", + " index=pd.date_range(\"2000-01-01\", periods=4, freq=\"M\"))\n", + "ts\n", + "ts.shift(2)\n", + "ts.shift(-2)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/ly/jkydg4dj2vs93b_ds7yp5t7r0000gn/T/ipykernel_24968/903147437.py:1: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.\n", + " ts.shift(2, freq=\"M\")\n" + ] + }, + { + "data": { + "text/plain": [ + "2000-03-31 -0.066748\n", + "2000-04-30 0.838639\n", + "2000-05-31 -0.117388\n", + "2000-06-30 -0.517795\n", + "Freq: ME, dtype: float64" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ts.shift(2, freq=\"M\")" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/ly/jkydg4dj2vs93b_ds7yp5t7r0000gn/T/ipykernel_24968/4056683127.py:2: FutureWarning: 'T' is deprecated and will be removed in a future version, please use 'min' instead.\n", + " ts.shift(1, freq=\"90T\")\n" + ] + }, + { + "data": { + "text/plain": [ + "2000-01-31 01:30:00 -0.066748\n", + "2000-02-29 01:30:00 0.838639\n", + "2000-03-31 01:30:00 
-0.117388\n", + "2000-04-30 01:30:00 -0.517795\n", + "dtype: float64" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ts.shift(3, freq=\"D\")\n", + "ts.shift(1, freq=\"90T\")" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Timestamp('2011-11-20 00:00:00')" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pandas.tseries.offsets import Day, MonthEnd\n", + "now = datetime(2011, 11, 17)\n", + "now + 3 * Day()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Timestamp('2011-12-31 00:00:00')" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "now + MonthEnd()\n", + "now + MonthEnd(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Timestamp('2011-10-31 00:00:00')" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "offset = MonthEnd()\n", + "offset.rollforward(now)\n", + "offset.rollback(now)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2000-01-31 -0.005833\n", + "2000-02-29 0.015894\n", + "2000-03-31 0.150209\n", + "dtype: float64" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ts = pd.Series(np.random.standard_normal(20),\n", + " index=pd.date_range(\"2000-01-15\", periods=20, freq=\"4D\"))\n", + "ts\n", + "ts.groupby(MonthEnd().rollforward).mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/var/folders/ly/jkydg4dj2vs93b_ds7yp5t7r0000gn/T/ipykernel_24968/3978117481.py:1: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.\n", + " ts.resample(\"M\").mean()\n" + ] + }, + { + "data": { + "text/plain": [ + "2000-01-31 -0.005833\n", + "2000-02-29 0.015894\n", + "2000-03-31 0.150209\n", + "Freq: ME, dtype: float64" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ts.resample(\"M\").mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['US/Eastern', 'US/Hawaii', 'US/Mountain', 'US/Pacific', 'UTC']" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pytz\n", + "pytz.common_timezones[-5:]" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tz = pytz.timezone(\"America/New_York\")\n", + "tz" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2012-03-09 09:30:00 -0.202469\n", + "2012-03-10 09:30:00 0.050718\n", + "2012-03-11 09:30:00 0.639869\n", + "2012-03-12 09:30:00 0.597594\n", + "2012-03-13 09:30:00 -0.797246\n", + "2012-03-14 09:30:00 0.472879\n", + "Freq: D, dtype: float64" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dates = pd.date_range(\"2012-03-09 09:30\", periods=6)\n", + "ts = pd.Series(np.random.standard_normal(len(dates)), index=dates)\n", + "ts" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "None\n" + ] + } + ], + "source": [ + 
"print(ts.index.tz)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatetimeIndex(['2012-03-09 09:30:00+00:00', '2012-03-10 09:30:00+00:00',\n", + " '2012-03-11 09:30:00+00:00', '2012-03-12 09:30:00+00:00',\n", + " '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00',\n", + " '2012-03-15 09:30:00+00:00', '2012-03-16 09:30:00+00:00',\n", + " '2012-03-17 09:30:00+00:00', '2012-03-18 09:30:00+00:00'],\n", + " dtype='datetime64[ns, UTC]', freq='D')" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.date_range(\"2012-03-09 09:30\", periods=10, tz=\"UTC\")" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatetimeIndex(['2012-03-09 09:30:00+00:00', '2012-03-10 09:30:00+00:00',\n", + " '2012-03-11 09:30:00+00:00', '2012-03-12 09:30:00+00:00',\n", + " '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00'],\n", + " dtype='datetime64[ns, UTC]', freq='D')" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ts\n", + "ts_utc = ts.tz_localize(\"UTC\")\n", + "ts_utc\n", + "ts_utc.index" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2012-03-09 04:30:00-05:00 -0.202469\n", + "2012-03-10 04:30:00-05:00 0.050718\n", + "2012-03-11 05:30:00-04:00 0.639869\n", + "2012-03-12 05:30:00-04:00 0.597594\n", + "2012-03-13 05:30:00-04:00 -0.797246\n", + "2012-03-14 05:30:00-04:00 0.472879\n", + "Freq: D, dtype: float64" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ts_utc.tz_convert(\"America/New_York\")" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2012-03-09 
15:30:00+01:00 -0.202469\n", + "2012-03-10 15:30:00+01:00 0.050718\n", + "2012-03-11 14:30:00+01:00 0.639869\n", + "2012-03-12 14:30:00+01:00 0.597594\n", + "2012-03-13 14:30:00+01:00 -0.797246\n", + "2012-03-14 14:30:00+01:00 0.472879\n", + "dtype: float64" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ts_eastern = ts.tz_localize(\"America/New_York\")\n", + "ts_eastern.tz_convert(\"UTC\")\n", + "ts_eastern.tz_convert(\"Europe/Berlin\")" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatetimeIndex(['2012-03-09 09:30:00+08:00', '2012-03-10 09:30:00+08:00',\n", + " '2012-03-11 09:30:00+08:00', '2012-03-12 09:30:00+08:00',\n", + " '2012-03-13 09:30:00+08:00', '2012-03-14 09:30:00+08:00'],\n", + " dtype='datetime64[ns, Asia/Shanghai]', freq=None)" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ts.index.tz_localize(\"Asia/Shanghai\")" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Timestamp('2011-03-11 23:00:00-0500', tz='America/New_York')" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stamp = pd.Timestamp(\"2011-03-12 04:00\")\n", + "stamp_utc = stamp.tz_localize(\"utc\")\n", + "stamp_utc.tz_convert(\"America/New_York\")" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Timestamp('2011-03-12 04:00:00+0300', tz='Europe/Moscow')" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stamp_moscow = pd.Timestamp(\"2011-03-12 04:00\", tz=\"Europe/Moscow\")\n", + "stamp_moscow" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + 
"data": { + "text/plain": [ + "1299902400000000000" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stamp_utc.value\n", + "stamp_utc.tz_convert(\"America/New_York\").value" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Timestamp('2012-03-11 03:30:00-0400', tz='US/Eastern')" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stamp = pd.Timestamp(\"2012-03-11 01:30\", tz=\"US/Eastern\")\n", + "stamp\n", + "stamp + Hour()" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Timestamp('2012-11-04 01:30:00-0500', tz='US/Eastern')" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stamp = pd.Timestamp(\"2012-11-04 00:30\", tz=\"US/Eastern\")\n", + "stamp\n", + "stamp + 2 * Hour()" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatetimeIndex(['2012-03-07 09:30:00+00:00', '2012-03-08 09:30:00+00:00',\n", + " '2012-03-09 09:30:00+00:00', '2012-03-12 09:30:00+00:00',\n", + " '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00',\n", + " '2012-03-15 09:30:00+00:00'],\n", + " dtype='datetime64[ns, UTC]', freq=None)" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dates = pd.date_range(\"2012-03-07 09:30\", periods=10, freq=\"B\")\n", + "ts = pd.Series(np.random.standard_normal(len(dates)), index=dates)\n", + "ts\n", + "ts1 = ts[:7].tz_localize(\"Europe/London\")\n", + "ts2 = ts1[2:].tz_convert(\"Europe/Moscow\")\n", + "result = ts1 + ts2\n", + "result.index" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "/var/folders/ly/jkydg4dj2vs93b_ds7yp5t7r0000gn/T/ipykernel_24968/520127536.py:1: FutureWarning: 'A-DEC' is deprecated and will be removed in a future version, please use 'Y-DEC' instead.\n", + " p = pd.Period(\"2011\", freq=\"A-DEC\")\n" + ] + }, + { + "data": { + "text/plain": [ + "Period('2011', 'Y-DEC')" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p = pd.Period(\"2011\", freq=\"A-DEC\")\n", + "p" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Period('2009', 'Y-DEC')" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p + 5\n", + "p - 2" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/ly/jkydg4dj2vs93b_ds7yp5t7r0000gn/T/ipykernel_24968/3816979827.py:1: FutureWarning: 'A-DEC' is deprecated and will be removed in a future version, please use 'Y-DEC' instead.\n", + " pd.Period(\"2014\", freq=\"A-DEC\") - p\n" + ] + }, + { + "data": { + "text/plain": [ + "<3 * YearEnds: month=12>" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.Period(\"2014\", freq=\"A-DEC\") - p" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PeriodIndex(['2000-01', '2000-02', '2000-03', '2000-04', '2000-05', '2000-06'], dtype='period[M]')" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "periods = pd.period_range(\"2000-01-01\", \"2000-06-30\", freq=\"M\")\n", + "periods" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2000-01 
-0.514551\n", + "2000-02 -0.559782\n", + "2000-03 -0.783408\n", + "2000-04 -1.797685\n", + "2000-05 -0.172670\n", + "2000-06 0.680215\n", + "Freq: M, dtype: float64" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.Series(np.random.standard_normal(6), index=periods)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PeriodIndex(['2001Q3', '2002Q2', '2003Q1'], dtype='period[Q-DEC]')" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "values = [\"2001Q3\", \"2002Q2\", \"2003Q1\"]\n", + "index = pd.PeriodIndex(values, freq=\"Q-DEC\")\n", + "index" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/ly/jkydg4dj2vs93b_ds7yp5t7r0000gn/T/ipykernel_24968/3519524340.py:1: FutureWarning: 'A-DEC' is deprecated and will be removed in a future version, please use 'Y-DEC' instead.\n", + " p = pd.Period(\"2011\", freq=\"A-DEC\")\n" + ] + }, + { + "data": { + "text/plain": [ + "Period('2011-12', 'M')" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p = pd.Period(\"2011\", freq=\"A-DEC\")\n", + "p\n", + "p.asfreq(\"M\", how=\"start\")\n", + "p.asfreq(\"M\", how=\"end\")\n", + "p.asfreq(\"M\")" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/ly/jkydg4dj2vs93b_ds7yp5t7r0000gn/T/ipykernel_24968/1547579311.py:1: FutureWarning: 'A-JUN' is deprecated and will be removed in a future version, please use 'Y-JUN' instead.\n", + " p = pd.Period(\"2011\", freq=\"A-JUN\")\n" + ] + }, + { + "data": { + "text/plain": [ + "Period('2011-06', 'M')" + ] + }, + "execution_count": 69, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p = pd.Period(\"2011\", freq=\"A-JUN\")\n", + "p\n", + "p.asfreq(\"M\", how=\"start\")\n", + "p.asfreq(\"M\", how=\"end\")" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/ly/jkydg4dj2vs93b_ds7yp5t7r0000gn/T/ipykernel_24968/2271097004.py:2: FutureWarning: 'A-JUN' is deprecated and will be removed in a future version, please use 'Y-JUN' instead.\n", + " p.asfreq(\"A-JUN\")\n" + ] + }, + { + "data": { + "text/plain": [ + "Period('2012', 'Y-JUN')" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p = pd.Period(\"Aug-2011\", \"M\")\n", + "p.asfreq(\"A-JUN\")" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/ly/jkydg4dj2vs93b_ds7yp5t7r0000gn/T/ipykernel_24968/3858577534.py:1: FutureWarning: 'A-DEC' is deprecated and will be removed in a future version, please use 'Y-DEC' instead.\n", + " periods = pd.period_range(\"2006\", \"2009\", freq=\"A-DEC\")\n" + ] + }, + { + "data": { + "text/plain": [ + "2006-01 1.607578\n", + "2007-01 0.200381\n", + "2008-01 -0.834068\n", + "2009-01 -0.302988\n", + "Freq: M, dtype: float64" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "periods = pd.period_range(\"2006\", \"2009\", freq=\"A-DEC\")\n", + "ts = pd.Series(np.random.standard_normal(len(periods)), index=periods)\n", + "ts\n", + "ts.asfreq(\"M\", how=\"start\")" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/ly/jkydg4dj2vs93b_ds7yp5t7r0000gn/T/ipykernel_24968/2984082239.py:1: FutureWarning: PeriodDtype[B] is 
deprecated and will be removed in a future version. Use a DatetimeIndex with freq='B' instead\n", + " ts.asfreq(\"B\", how=\"end\")\n" + ] + }, + { + "data": { + "text/plain": [ + "2006-12-29 1.607578\n", + "2007-12-31 0.200381\n", + "2008-12-31 -0.834068\n", + "2009-12-31 -0.302988\n", + "Freq: B, dtype: float64" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ts.asfreq(\"B\", how=\"end\")" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Period('2012Q4', 'Q-JAN')" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p = pd.Period(\"2012Q4\", freq=\"Q-JAN\")\n", + "p" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Period('2012-01-31', 'D')" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p.asfreq(\"D\", how=\"start\")\n", + "p.asfreq(\"D\", how=\"end\")" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/ly/jkydg4dj2vs93b_ds7yp5t7r0000gn/T/ipykernel_24968/1108768030.py:1: FutureWarning: Period with BDay freq is deprecated and will be removed in a future version. 
Use a DatetimeIndex with BDay freq instead.\n", + " p4pm = (p.asfreq(\"B\", how=\"end\") - 1).asfreq(\"T\", how=\"start\") + 16 * 60\n", + "/var/folders/ly/jkydg4dj2vs93b_ds7yp5t7r0000gn/T/ipykernel_24968/1108768030.py:1: FutureWarning: 'T' is deprecated and will be removed in a future version, please use 'min' instead.\n", + " p4pm = (p.asfreq(\"B\", how=\"end\") - 1).asfreq(\"T\", how=\"start\") + 16 * 60\n" + ] + }, + { + "data": { + "text/plain": [ + "Timestamp('2012-01-30 16:00:00')" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p4pm = (p.asfreq(\"B\", how=\"end\") - 1).asfreq(\"T\", how=\"start\") + 16 * 60\n", + "p4pm\n", + "p4pm.to_timestamp()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/ly/jkydg4dj2vs93b_ds7yp5t7r0000gn/T/ipykernel_24968/142431650.py:4: FutureWarning: PeriodDtype[B] is deprecated and will be removed in a future version. 
Use a DatetimeIndex with freq='B' instead\n", + " new_periods = (periods.asfreq(\"B\", \"end\") - 1).asfreq(\"H\", \"start\") + 16\n", + "/var/folders/ly/jkydg4dj2vs93b_ds7yp5t7r0000gn/T/ipykernel_24968/142431650.py:4: FutureWarning: 'H' is deprecated and will be removed in a future version, please use 'h' instead.\n", + " new_periods = (periods.asfreq(\"B\", \"end\") - 1).asfreq(\"H\", \"start\") + 16\n" + ] + }, + { + "data": { + "text/plain": [ + "2010-10-28 16:00:00 0\n", + "2011-01-28 16:00:00 1\n", + "2011-04-28 16:00:00 2\n", + "2011-07-28 16:00:00 3\n", + "2011-10-28 16:00:00 4\n", + "2012-01-30 16:00:00 5\n", + "dtype: int64" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "periods = pd.period_range(\"2011Q3\", \"2012Q4\", freq=\"Q-JAN\")\n", + "ts = pd.Series(np.arange(len(periods)), index=periods)\n", + "ts\n", + "new_periods = (periods.asfreq(\"B\", \"end\") - 1).asfreq(\"H\", \"start\") + 16\n", + "ts.index = new_periods.to_timestamp()\n", + "ts" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/ly/jkydg4dj2vs93b_ds7yp5t7r0000gn/T/ipykernel_24968/2638414699.py:1: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.\n", + " dates = pd.date_range(\"2000-01-01\", periods=3, freq=\"M\")\n" + ] + }, + { + "data": { + "text/plain": [ + "2000-01 1.663261\n", + "2000-02 -0.996206\n", + "2000-03 1.521760\n", + "Freq: M, dtype: float64" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dates = pd.date_range(\"2000-01-01\", periods=3, freq=\"M\")\n", + "ts = pd.Series(np.random.standard_normal(3), index=dates)\n", + "ts\n", + "pts = ts.to_period()\n", + "pts" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "2000-01 0.244175\n", + "2000-01 0.423331\n", + "2000-01 -0.654040\n", + "2000-02 2.089154\n", + "2000-02 -0.060220\n", + "2000-02 -0.167933\n", + "Freq: M, dtype: float64" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dates = pd.date_range(\"2000-01-29\", periods=6)\n", + "ts2 = pd.Series(np.random.standard_normal(6), index=dates)\n", + "ts2\n", + "ts2.to_period(\"M\")" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2000-01-29 23:59:59.999999999 0.244175\n", + "2000-01-30 23:59:59.999999999 0.423331\n", + "2000-01-31 23:59:59.999999999 -0.654040\n", + "2000-02-01 23:59:59.999999999 2.089154\n", + "2000-02-02 23:59:59.999999999 -0.060220\n", + "2000-02-03 23:59:59.999999999 -0.167933\n", + "Freq: D, dtype: float64" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pts = ts2.to_period()\n", + "pts\n", + "pts.to_timestamp(how=\"end\")" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: 'examples/macrodata.csv'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[80], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mexamples/macrodata.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m data\u001b[38;5;241m.\u001b[39mhead(\u001b[38;5;241m5\u001b[39m)\n\u001b[1;32m 3\u001b[0m 
data[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myear\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n", + "File \u001b[0;32m~/Source/courses/tools4ds/DS701-Course-Notes/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1026\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 1013\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 1014\u001b[0m dialect,\n\u001b[1;32m 1015\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1022\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 1023\u001b[0m )\n\u001b[1;32m 1024\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m-> 1026\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Source/courses/tools4ds/DS701-Course-Notes/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:620\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 617\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 619\u001b[0m \u001b[38;5;66;03m# Create the 
parser.\u001b[39;00m\n\u001b[0;32m--> 620\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 623\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", + "File \u001b[0;32m~/Source/courses/tools4ds/DS701-Course-Notes/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1620\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1617\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1619\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1620\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Source/courses/tools4ds/DS701-Course-Notes/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1880\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1878\u001b[0m 
\u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1879\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1880\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1881\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1882\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1883\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1884\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1885\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1886\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1887\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1888\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1889\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1890\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m 
\u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1891\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n", + "File \u001b[0;32m~/Source/courses/tools4ds/DS701-Course-Notes/.venv/lib/python3.12/site-packages/pandas/io/common.py:873\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 868\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 869\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 870\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 871\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 872\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 874\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 875\u001b[0m \u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 876\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 877\u001b[0m \u001b[43m 
\u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 878\u001b[0m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 879\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'examples/macrodata.csv'" + ] + } + ], + "source": [ + "data = pd.read_csv(\"examples/macrodata.csv\")\n", + "data.head(5)\n", + "data[\"year\"]\n", + "data[\"quarter\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "index = pd.PeriodIndex(year=data[\"year\"], quarter=data[\"quarter\"],\n", + " freq=\"Q-DEC\")\n", + "index\n", + "data.index = index\n", + "data[\"infl\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "dates = pd.date_range(\"2000-01-01\", periods=100)\n", + "ts = pd.Series(np.random.standard_normal(len(dates)), index=dates)\n", + "ts\n", + "ts.resample(\"M\").mean()\n", + "ts.resample(\"M\", kind=\"period\").mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [], + "source": [ + "dates = pd.date_range(\"2000-01-01\", periods=12, freq=\"T\")\n", + "ts = pd.Series(np.arange(len(dates)), index=dates)\n", + "ts" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [], + "source": [ + "ts.resample(\"5min\").sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], 
+ "source": [ + "ts.resample(\"5min\", closed=\"right\").sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "ts.resample(\"5min\", closed=\"right\", label=\"right\").sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "from pandas.tseries.frequencies import to_offset\n", + "result = ts.resample(\"5min\", closed=\"right\", label=\"right\").sum()\n", + "result.index = result.index + to_offset(\"-1s\")\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "ts = pd.Series(np.random.permutation(np.arange(len(dates))), index=dates)\n", + "ts.resample(\"5min\").ohlc()" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "frame = pd.DataFrame(np.random.standard_normal((2, 4)),\n", + " index=pd.date_range(\"2000-01-01\", periods=2,\n", + " freq=\"W-WED\"),\n", + " columns=[\"Colorado\", \"Texas\", \"New York\", \"Ohio\"])\n", + "frame" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "df_daily = frame.resample(\"D\").asfreq()\n", + "df_daily" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [], + "source": [ + "frame.resample(\"D\").ffill()" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], + "source": [ + "frame.resample(\"D\").ffill(limit=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [], + "source": [ + "frame.resample(\"W-THU\").ffill()" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [], + "source": [ + "frame = pd.DataFrame(np.random.standard_normal((24, 4)),\n", + " index=pd.period_range(\"1-2000\", \"12-2001\",\n", + " freq=\"M\"),\n", + " columns=[\"Colorado\", 
\"Texas\", \"New York\", \"Ohio\"])\n", + "frame.head()\n", + "annual_frame = frame.resample(\"A-DEC\").mean()\n", + "annual_frame" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "# Q-DEC: Quarterly, year ending in December\n", + "annual_frame.resample(\"Q-DEC\").ffill()\n", + "annual_frame.resample(\"Q-DEC\", convention=\"end\").asfreq()" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "annual_frame.resample(\"Q-MAR\").ffill()" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [], + "source": [ + "N = 15\n", + "times = pd.date_range(\"2017-05-20 00:00\", freq=\"1min\", periods=N)\n", + "df = pd.DataFrame({\"time\": times,\n", + " \"value\": np.arange(N)})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "df.set_index(\"time\").resample(\"5min\").count()" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "df2 = pd.DataFrame({\"time\": times.repeat(3),\n", + " \"key\": np.tile([\"a\", \"b\", \"c\"], N),\n", + " \"value\": np.arange(N * 3.)})\n", + "df2.head(7)" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "time_key = pd.Grouper(freq=\"5min\")" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [], + "source": [ + "resampled = (df2.set_index(\"time\")\n", + " .groupby([\"key\", time_key])\n", + " .sum())\n", + "resampled\n", + "resampled.reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "close_px_all = pd.read_csv(\"examples/stock_px.csv\",\n", + " parse_dates=True, index_col=0)\n", + "close_px = close_px_all[[\"AAPL\", \"MSFT\", \"XOM\"]]\n", + "close_px = 
close_px.resample(\"B\").ffill()" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "close_px[\"AAPL\"].plot()\n", + "close_px[\"AAPL\"].rolling(250).mean().plot()" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure()\n", + "std250 = close_px[\"AAPL\"].pct_change().rolling(250, min_periods=10).std()\n", + "std250[5:12]\n", + "std250.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [], + "source": [ + "expanding_mean = std250.expanding().mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [], + "source": [ + "plt.style.use('grayscale')\n", + "close_px.rolling(60).mean().plot(logy=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [], + "source": [ + "close_px.rolling(\"20D\").mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [], + "source": [ + "aapl_px = close_px[\"AAPL\"][\"2006\":\"2007\"]\n", + "\n", + "ma30 = aapl_px.rolling(30, min_periods=20).mean()\n", + "ewma30 = aapl_px.ewm(span=30).mean()\n", + "\n", + "aapl_px.plot(style=\"k-\", label=\"Price\")\n", + "ma30.plot(style=\"k--\", label=\"Simple Moving Avg\")\n", + "ewma30.plot(style=\"k-\", label=\"EW MA\")\n", + "plt.legend()" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [], + "source": [ + "spx_px = close_px_all[\"SPX\"]\n", + "spx_rets = 
spx_px.pct_change()\n", + "returns = close_px.pct_change()" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [], + "source": [ + "corr = returns[\"AAPL\"].rolling(125, min_periods=100).corr(spx_rets)\n", + "corr.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": {}, + "outputs": [], + "source": [ + "corr = returns.rolling(125, min_periods=100).corr(spx_rets)\n", + "corr.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import percentileofscore\n", + "def score_at_2percent(x):\n", + " return percentileofscore(x, 0.02)\n", + "\n", + "result = returns[\"AAPL\"].rolling(250).apply(score_at_2percent)\n", + "result.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [], + "source": [ + "pd.options.display.max_rows = PREVIOUS_MAX_ROWS" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/ds701_book/complete-guide-on-time-series-analysis-in-python.ipynb b/ds701_book/complete-guide-on-time-series-analysis-in-python.ipynb new file mode 100644 index 00000000..e1c21464 --- /dev/null +++ 
b/ds701_book/complete-guide-on-time-series-analysis-in-python.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"markdown","metadata":{},"source":["\n","# **Complete Guide on Time Series Analysis in Python**\n","\n","From https://www.kaggle.com/code/prashant111/complete-guide-on-time-series-analysis-in-python/notebook.\n","\n","The dataset used is from https://www.kaggle.com/datasets/chirag19/air-passengers\n","and exhibits seasonality as well as long term trends.\n","\n","Hello friends,\n","\n","\n","As the name implies, this notebook is all about **Time Series Analysis**. A time series is a series of data points recorded at different time-intervals. The time series analysis means analyzing the time series data using various statistical tools and techniques. \n","\n","So, let's get started."]},{"cell_type":"markdown","metadata":{"trusted":true},"source":["### **I hope you find this notebook useful and your UPVOTES keep me motivated.**"]},{"cell_type":"markdown","metadata":{},"source":["\n","# **Table of Contents**\n","\n","\n","1.\t[Introduction to Time Series Analysis](#1)\n","2.\t[Types of data](#2)\n","3.\t[Time Series terminology](#3)\n","4.\t[Time Series Analysis](#4)\n","5.\t[Visualize the Time Series](#5)\n","6.\t[Patterns in a Time Series](#6)\n","7.\t[Additive and Multiplicative Time Series](#7)\n","8.\t[Decomposition of a Time Series](#8)\n","9.\t[Stationary and Non-Stationary Time Series](#9)\n","10.\t[How to make a time series stationary](#10)\n","11.\t[How to test for stationarity](#11)\n"," - 11.1\t[Augmented Dickey Fuller test (ADF Test)](#11.1)\n"," - 11.2\t[Kwiatkowski-Phillips-Schmidt-Shin – KPSS test (trend stationary)](#11.2)\n"," - 11.3\t[Philips Perron test (PP Test)](#11.3)\n","12.\t[Difference between white noise and a stationary series](#12)\n","13.\t[Detrend a Time Series](#13)\n","14.\t[Deseasonalize a Time Series](#14)\n","15.\t[How to test for seasonality of a time series](#15)\n","16.\t[Autocorrelation and Partial Autocorrelation 
Functions](#16)\n","17.\t[Computation of Partial Autocorrelation Function](#17)\n","18.\t[Lag Plots](#18)\n","19.\t[Granger Causality Test](#19)\n","20.\t[Smoothening a Time Series](#20)\n","21.\t[References](#21)\n"]},{"cell_type":"markdown","metadata":{},"source":["# **1. Introduction to Time-Series Analysis** \n","\n","\n","[Table of Contents](#0.1)\n","\n","\n","\n","- **Time-series** data is a series of data points or observations recorded at different or regular time intervals. In general, a time series is a sequence of data points taken at equally spaced time intervals. The frequency of recorded data points may be hourly, daily, weekly, monthly, quarterly or annually.\n","\n","\n","- **Time-Series Forecasting** is the process of using a statistical model to predict future values of a time-series based on past results.\n","\n","\n","- A time series analysis encompasses statistical methods for analyzing time series data. These methods enable us to extract meaningful statistics, patterns and other characteristics of the data. Time series are visualized with the help of line charts. So, time series analysis involves understanding inherent aspects of the time series data so that we can create meaningful and accurate forecasts.\n","\n","\n","- Time series analysis is used in statistics, finance and business applications. A very common example of time series data is the daily closing value of a stock index like NASDAQ or Dow Jones. Other common applications of time series are sales and demand forecasting, weather forecasting, econometrics, signal processing, pattern recognition and earthquake prediction.\n","\n","\n","\n","### **Components of a Time-Series**\n","\n","\n","- **Trend** - The trend shows a general direction of the time series data over a long period of time.
A trend can be increasing (upward), decreasing (downward), or horizontal (stationary).\n","\n","\n","- **Seasonality** - The seasonality component exhibits a trend that repeats with respect to timing, direction, and magnitude. One example is an increase in water consumption in summer due to hot weather conditions.\n","\n","\n","- **Cyclical Component** - These are the trends with no set repetition over a particular period of time. A cycle refers to the period of ups and downs, booms and slumps of a time series, mostly observed in business cycles. These cycles do not exhibit a seasonal variation but generally occur over a time period of 3 to 12 years depending on the nature of the time series.\n","\n","\n","- **Irregular Variation** - These are the fluctuations in the time series data which become evident when trend and cyclical variations are removed. These variations are unpredictable, erratic, and may or may not be random.\n","\n","\n","- **ETS Decomposition** - ETS Decomposition is used to separate different components of a time series. The term ETS stands for Error, Trend and Seasonality.\n","\n","\n","- In this notebook, I conduct time series analysis of monthly airline passenger counts over time."]},{"cell_type":"markdown","metadata":{},"source":["# **2. Types of data** \n","\n","[Table of Contents](#0.1)\n","\n","\n","As stated above, the time series analysis is the statistical analysis of the time series data. A time series data means that data is recorded at different time periods or intervals. The time series data may be of three types:-\n","\n","\n","1 **Time series data** - The observations of the values of a variable recorded at different points in time are called time series data. \n","\n","\n","2 **Cross sectional data** - It is the data of one or more variables recorded at the same point in time.\n","\n","\n","3 **Pooled data**- It is the combination of time series data and cross sectional data.\n"]},{"cell_type":"markdown","metadata":{},"source":["# **3.
Time Series terminology** \n","\n","[Table of Contents](#0.1)\n","\n","\n","There are various terms and concepts in time series that we should know. These are as follows:-\n","\n","1\t**Dependence**- It refers to the association of two observations of the same variable at prior time periods.\n","\n","\n","2\t**Stationarity**- It shows the mean value of the series that remains constant over the time period. If past effects accumulate and the values increase towards infinity then stationarity is not met.\n","\n","\n","3\t**Differencing**- Differencing is used to make the series stationary and to control the auto-correlations. There may be some cases in time series analyses where we do not require differencing and over-differenced series can produce wrong estimates.\n","\n","\n","4\t**Specification** - It may involve the testing of the linear or non-linear relationships of dependent variables by using time series models such as ARIMA models. \n","\n","\n","5\t**Exponential Smoothing** - Exponential smoothing in time series analysis predicts the next period's value based on past and current values. It involves averaging of data such that the non-systematic components of each individual case or observation cancel each other out. The exponential smoothing method is used for short-term prediction.\n","\n","\n","\n","6\t**Curve fitting** - Curve fitting regression in time series analysis is used when data is in a non-linear relationship.\n","\n","\n","7\t**ARIMA** - ARIMA stands for Auto Regressive Integrated Moving Average.\n"]},{"cell_type":"markdown","metadata":{},"source":["# **4.
Time Series Analysis** \n","\n","[Table of Contents](#0.1)\n"]},{"cell_type":"markdown","metadata":{},"source":["## **4.1 Basic set up** \n","\n","[Table of Contents](#0.1)"]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Downloading from https://www.kaggle.com/api/v1/datasets/download/chirag19/air-passengers?dataset_version_number=1...\n"]},{"name":"stderr","output_type":"stream","text":["100%|██████████| 764/764 [00:00<00:00, 955kB/s]"]},{"name":"stdout","output_type":"stream","text":["Extracting files...\n","Path to dataset files: /Users/tomg/.cache/kagglehub/datasets/chirag19/air-passengers/versions/1\n"]},{"name":"stderr","output_type":"stream","text":["\n"]}],"source":["import kagglehub\n","\n","# Download latest version\n","path = kagglehub.dataset_download(\"chirag19/air-passengers\")\n","\n","print(\"Path to dataset files:\", path)"]},{"cell_type":"code","execution_count":3,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","trusted":true},"outputs":[],"source":["# This Python 3 environment comes with many helpful analytics libraries installed\n","# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n","# For example, here's several helpful packages to load\n","\n","\n","import numpy as np # linear algebra\n","import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\n","\n","\n","import matplotlib as mpl\n","import matplotlib.pyplot as plt # data visualization\n","import seaborn as sns # statistical data visualization\n","\n","\n","# Input data files are available in the read-only \"../input/\" directory\n","# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n","\n","import os\n","for dirname, _, filenames in os.walk('~/.cache/kagglehub/datasets/chirag19/air-passengers/versions/1'):\n"," for filename in filenames:\n"," print(os.path.join(dirname, filename))\n","\n","# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n","# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session"]},{"cell_type":"markdown","metadata":{},"source":["## **4.2 Import data** \n","\n","[Table of Contents](#0.1)"]},{"cell_type":"code","execution_count":4,"metadata":{"trusted":true},"outputs":[{"data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
Month#Passengers
01949-01112
11949-02118
21949-03132
31949-04129
41949-05121
\n","
"],"text/plain":[" Month #Passengers\n","0 1949-01 112\n","1 1949-02 118\n","2 1949-03 132\n","3 1949-04 129\n","4 1949-05 121"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["path = '~/.cache/kagglehub/datasets/chirag19/air-passengers/versions/1/AirPassengers.csv'\n","\n","df = pd.read_csv(path)\n","\n","df.head()"]},{"cell_type":"markdown","metadata":{},"source":["- We should rename the column names."]},{"cell_type":"code","execution_count":6,"metadata":{"trusted":true},"outputs":[{"data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
DateNumber of Passengers
01949-01112
11949-02118
21949-03132
31949-04129
41949-05121
\n","
"],"text/plain":[" Date Number of Passengers\n","0 1949-01 112\n","1 1949-02 118\n","2 1949-03 132\n","3 1949-04 129\n","4 1949-05 121"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["df.columns = ['Date','Number of Passengers']\n","\n","df.head()"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["DataFrame written to air_passengers_1949_1960.csv\n"]}],"source":["# Write the dataframe to a local file\n","output_path = 'air_passengers_1949_1960.csv'\n","df.to_csv(output_path, index=False)\n","print(f\"DataFrame written to {output_path}\")\n"]},{"cell_type":"markdown","metadata":{},"source":["# **5. Visualize the Time Series** \n","\n","\n","[Table of Contents](#0.1)\n"]},{"cell_type":"code","execution_count":9,"metadata":{"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["def plot_df(df, x, y, title=\"\", xlabel='Date', ylabel='Number of Passengers', dpi=100):\n"," plt.figure(figsize=(15,4), dpi=dpi)\n"," plt.plot(x, y, color='tab:red')\n"," plt.gca().set(title=title, xlabel=xlabel, ylabel=ylabel)\n"," plt.show()\n"," \n","\n","plot_df(df, x=df['Date'], y=df['Number of Passengers'], title='Number of US Airline passengers from 1949 to 1960')"]},{"cell_type":"markdown","metadata":{},"source":["- Since all the values are positive, we can show this on both sides of the Y axis to emphasize the growth."]},{"cell_type":"code","execution_count":10,"metadata":{"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["x = df['Date'].values\n","y1 = df['Number of Passengers'].values\n","\n","# Plot\n","fig, ax = plt.subplots(1, 1, figsize=(16,5), dpi= 120)\n","plt.fill_between(x, y1=y1, y2=-y1, alpha=0.5, linewidth=2, color='seagreen')\n","plt.ylim(-800, 800)\n","plt.title('Air Passengers (Two Side View)', fontsize=16)\n","plt.hlines(y=0, xmin=np.min(df['Date']), xmax=np.max(df['Date']), linewidth=.5)\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["- It can be seen that its a monthly time series and follows a certain repetitive pattern every year. So, we can plot each year as a separate line in the same plot. This let us compare the year wise patterns side-by-side."]},{"cell_type":"markdown","metadata":{},"source":["# **6. Patterns in a Time Series** \n","\n","\n","[Table of Contents](#0.1)\n","\n","\n","- Any time series visualization may consist of the following components: **Base Level + Trend + Seasonality + Error**.\n","\n","\n","### **Trend**\n","\n","- A **trend** is observed when there is an increasing or decreasing slope observed in the time series. \n","\n","\n","### **Seasonality**\n","\n","- A **seasonality** is observed when there is a distinct repeated pattern observed between regular intervals due to seasonal factors. It could be because of the month of the year, the day of the month, weekdays or even time of the day.\n","\n","\n","However, It is not mandatory that all time series must have a trend and/or seasonality. A time series may not have a distinct trend but have a seasonality and vice-versa.\n"]},{"cell_type":"code","execution_count":11,"metadata":{"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["def plot_df(df, x, y, title=\"\", xlabel='Date', ylabel='Number of Passengers', dpi=100):\n"," plt.figure(figsize=(15,4), dpi=dpi)\n"," plt.plot(x, y, color='blue')\n"," plt.gca().set(title=title, xlabel=xlabel, ylabel=ylabel)\n"," plt.show()\n"," \n","\n","plot_df(df, x=df['Date'], y=df['Number of Passengers'], title='Trend and Seasonality')"]},{"cell_type":"markdown","metadata":{},"source":["### **Cyclic behaviour**\n","\n","- Another important thing to consider is the **cyclic behaviour**. It happens when the rise and fall pattern in the series does not happen in fixed calendar-based intervals. We should not confuse 'cyclic' effect with 'seasonal' effect.\n","\n","- If the patterns are not of fixed calendar based frequencies, then it is cyclic. Because, unlike the seasonality, cyclic effects are typically influenced by the business and other socio-economic factors."]},{"cell_type":"markdown","metadata":{},"source":["# **7. Additive and Multiplicative Time Series** \n","\n","\n","[Table of Contents](#0.1)\n","\n","\n","- We may have different combinations of trends and seasonality. Depending on the nature of the trends and seasonality, a time series can be modeled as an additive or multiplicative time series. Each observation in the series can be expressed as either a sum or a product of the components.\n","\n","\n","### **Additive time series:**\n","\n","Value = Base Level + Trend + Seasonality + Error\n","\n","\n","### **Multiplicative Time Series:**\n","\n","Value = Base Level x Trend x Seasonality x Error"]},{"cell_type":"markdown","metadata":{},"source":["# **8. 
Decomposition of a Time Series** \n","\n","\n","[Table of Contents](#0.1)\n","\n","\n","- Decomposition of a time series can be performed by considering the series as an additive or multiplicative combination of the base level, trend, seasonal index and the residual term.\n","\n","\n","- The seasonal_decompose in statsmodels implements this conveniently."]},{"cell_type":"code","execution_count":12,"metadata":{"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"},{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["from statsmodels.tsa.seasonal import seasonal_decompose\n","from dateutil.parser import parse\n","\n","\n","# Multiplicative Decomposition \n","multiplicative_decomposition = seasonal_decompose(df['Number of Passengers'], model='multiplicative', period=30)\n","\n","# Additive Decomposition\n","additive_decomposition = seasonal_decompose(df['Number of Passengers'], model='additive', period=30)\n","\n","# Plot\n","plt.rcParams.update({'figure.figsize': (16,12)})\n","multiplicative_decomposition.plot().suptitle('Multiplicative Decomposition', fontsize=16)\n","plt.tight_layout(rect=[0, 0.03, 1, 0.95])\n","\n","additive_decomposition.plot().suptitle('Additive Decomposition', fontsize=16)\n","plt.tight_layout(rect=[0, 0.03, 1, 0.95])\n","\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["- If we look at the residuals of the additive decomposition closely, it has some pattern left over. \n","\n","- The multiplicative decomposition, looks quite random which is good. So ideally, multiplicative decomposition should be preferred for this particular series."]},{"cell_type":"markdown","metadata":{},"source":["# **9. Stationary and Non-Stationary Time Series** \n","\n","\n","[Table of Contents](#0.1)\n","\n","\n","- Now, we wil discuss **Stationary and Non-Stationary Time Series**. **Stationarity** is a property of a time series. A stationary series is one where the values of the series is not a function of time. So, the values are independent of time.\n","\n","\n","- Hence the statistical properties of the series like mean, variance and autocorrelation are constant over time. 
Autocorrelation of the series is nothing but the correlation of the series with its previous values.\n","\n","\n","- A stationary time series is independent of seasonal effects as well.\n","\n","\n","- Now, we will plot some examples of stationary and non-stationary time series for clarity."]},{"cell_type":"markdown","metadata":{},"source":["![Stationary and Non-Stationary Time Series](https://www.machinelearningplus.com/wp-content/uploads/2019/02/stationary-and-non-stationary-time-series-865x569.png?ezimgfmt=ng:webp/ngcb1)\n","\n","image source : https://www.machinelearningplus.com/wp-content/uploads/2019/02/stationary-and-non-stationary-time-series-865x569.png?ezimgfmt=ng:webp/ngcb1"]},{"cell_type":"markdown","metadata":{},"source":["- We can often convert a non-stationary time series into a stationary one by applying a suitable transformation. Most statistical forecasting methods are designed to work on a stationary time series. The first step in the forecasting process is typically to do some transformation to convert a non-stationary series to stationary."]},{"cell_type":"markdown","metadata":{},"source":["\n","\n","# **10. How to make a time series stationary?** \n","\n","\n","[Table of Contents](#0.1)\n","\n","\n","- We can apply some sort of transformation to make the time-series stationary. These transformations may include:\n","\n","\n","1. Differencing the Series (once or more)\n","2. Take the log of the series\n","3. Take the nth root of the series\n","4. Combination of the above\n","\n","\n","- The most commonly used and convenient method to stationarize the series is by differencing the series at least once until it becomes approximately stationary."]},{"cell_type":"markdown","metadata":{},"source":["## **10.1 Introduction to Differencing** \n","\n","\n","[Table of Contents](#0.1)\n","\n","\n","- If $Y_t$ is the value at time $t$, then the first difference of $Y$ is $Y_t - Y_{t-1}$. 
In simpler terms, differencing the series is nothing but subtracting the previous value from the current value.\n","\n","\n","- If the first difference doesn’t make a series stationary, we can go for the second differencing and so on.\n","\n","\n"," - For example, consider the following series: [1, 5, 2, 12, 20]\n","\n","\n"," - First differencing gives: [5-1, 2-5, 12-2, 20-12] = [4, -3, 10, 8]\n","\n","\n"," - Second differencing gives: [-3-4, 10-(-3), 8-10] = [-7, 13, -2]"]},{"cell_type":"markdown","metadata":{},"source":["## **10.2 Reasons to convert a non-stationary series into a stationary one before forecasting** \n","\n","\n","[Table of Contents](#0.1)\n","\n","\n","There are reasons why we want to convert a non-stationary series into a stationary one. These are given below:\n","\n","\n","- Forecasting a stationary series is relatively easy and the forecasts are more reliable.\n","\n","\n","- An important reason is that autoregressive forecasting models are essentially linear regression models that utilize the lag(s) of the series itself as predictors.\n","\n","\n","- We know that linear regression works best if the predictors (X variables) are not correlated with each other. So, stationarizing the series solves this problem since it removes any persistent autocorrelation, thereby making the predictors (lags of the series) in the forecasting models nearly independent."]},{"cell_type":"markdown","metadata":{},"source":["# **11. How to test for stationarity?** \n","\n","\n","[Table of Contents](#0.1)\n","\n","\n","- The stationarity of a series can be checked by looking at the plot of the series.\n","\n","\n","- Another method is to split the series into 2 or more contiguous parts and compute summary statistics like the mean, variance and the autocorrelation. If the stats are quite different, then the series is not likely to be stationary.\n","\n","\n","- There are several quantitative methods we can use to determine if a given series is stationary or not. 
This can be done using statistical tests called [Unit Root Tests](https://en.wikipedia.org/wiki/Unit_root). These tests check whether a time series is non-stationary and possesses a unit root. \n","\n","\n","- There are multiple implementations of Unit Root tests like:\n","\n","\n","**1. Augmented Dickey Fuller test (ADF Test)**\n","\n","**2. Kwiatkowski-Phillips-Schmidt-Shin – KPSS test (trend stationary)**\n","\n","**3. Phillips Perron test (PP Test)**\n","\n","\n"]},{"cell_type":"markdown","metadata":{},"source":["## **11.1 Augmented Dickey Fuller test (ADF Test)** \n","\n","\n","[Table of Contents](#0.1)\n","\n","\n","- The **Augmented Dickey Fuller test (ADF Test)** is the most commonly used test to detect stationarity. Here, we assume that the null hypothesis is that the time series possesses a unit root and is non-stationary. Then, we collect evidence to support or reject the null hypothesis. So, if we find that the p-value in the ADF test is less than the significance level (0.05), we reject the null hypothesis and treat the series as stationary.\n","\n","\n","- Feel free to check the following links to learn more about the ADF Test.\n","\n","\n","https://en.wikipedia.org/wiki/Augmented_Dickey%E2%80%93Fuller_test\n","\n","https://www.machinelearningplus.com/time-series/augmented-dickey-fuller-test/\n","\n","https://machinelearningmastery.com/time-series-data-stationary-python/\n","\n","http://www.insightsbot.com/augmented-dickey-fuller-test-in-python/\n","\n","https://nwfsc-timeseries.github.io/atsa-labs/sec-boxjenkins-aug-dickey-fuller.html\n","\n","https://www.statisticshowto.com/adf-augmented-dickey-fuller-test/\n","\n"]},{"cell_type":"markdown","metadata":{},"source":["\n","## **11.2 Kwiatkowski-Phillips-Schmidt-Shin – KPSS test (trend stationary)** \n","\n","\n","[Table of Contents](#0.1)\n","\n","\n","- The KPSS test, on the other hand, is used to test for trend stationarity. 
The null hypothesis and the p-value interpretation are just the opposite of those of the ADF test: the KPSS null hypothesis is that the series is trend-stationary, so a p-value below the significance level is evidence that the series is not stationary.\n","\n","- Interested readers can learn more about the KPSS test from the links below:\n","\n","\n","https://en.wikipedia.org/wiki/KPSS_test\n","\n","https://www.machinelearningplus.com/time-series/kpss-test-for-stationarity/\n","\n","https://www.statisticshowto.com/kpss-test/\n","\n","https://nwfsc-timeseries.github.io/atsa-labs/sec-boxjenkins-kpss.html\n","\n"]},{"cell_type":"markdown","metadata":{},"source":["## **11.3 Phillips Perron test (PP Test)** \n","\n","\n","[Table of Contents](#0.1)\n","\n","\n","- The Phillips Perron or PP test is a [unit root test](https://en.wikipedia.org/wiki/Unit_root). It is used in [time series analysis](https://en.wikipedia.org/wiki/Time_series) to test the [null hypothesis](https://en.wikipedia.org/wiki/Null_hypothesis) that a time series is integrated of order 1. It is built on the ADF test discussed above.\n","\n","\n","- For more information on the PP test, please visit the following links:\n","\n","\n","https://en.wikipedia.org/wiki/Phillips%E2%80%93Perron_test\n","\n","https://www.mathworks.com/help/econ/pptest.html\n","\n","https://people.bath.ac.uk/hssjrh/Phillips%20Perron.pdf\n","\n","https://www.stata.com/manuals13/tspperron.pdf"]},{"cell_type":"markdown","metadata":{},"source":["# **12. Difference between white noise and a stationary series** \n","\n","\n","[Table of Contents](#0.1)\n","\n","\n","- Like a stationary series, the white noise is also not a function of time. So, its mean and variance do not change over time. But the difference is that the white noise is completely random with a mean of 0. 
In white noise there is no pattern.\n","\n","- Mathematically, a sequence of completely random numbers with mean zero is a white noise."]},{"cell_type":"code","execution_count":13,"metadata":{"trusted":true},"outputs":[{"data":{"text/plain":[""]},"execution_count":13,"metadata":{},"output_type":"execute_result"},{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["rand_numbers = np.random.randn(1000)\n","pd.Series(rand_numbers).plot(title='Random White Noise', color='b')"]},{"cell_type":"markdown","metadata":{},"source":["# **13. Detrend a Time Series** \n","\n","\n","[Table of Contents](#0.1)\n","\n","\n","- Detrending a time series means to remove the trend component from the time series. There are multiple approaches of doing this as listed below:\n","\n","\n","1. Subtract the line of best fit from the time series. The line of best fit may be obtained from a linear regression model with the time steps as the predictor. For more complex trends, we may want to use quadratic terms (x^2) in the model.\n","\n","2. We subtract the trend component obtained from time series decomposition.\n","\n","3. Subtract the mean.\n","\n","4. Apply a filter like Baxter-King filter(statsmodels.tsa.filters.bkfilter) or the Hodrick-Prescott Filter (statsmodels.tsa.filters.hpfilter) to remove the moving average trend lines or the cyclical components.\n","\n","\n","Now, we will implement the first two methods to detrend a time series."]},{"cell_type":"code","execution_count":14,"metadata":{"trusted":true},"outputs":[{"data":{"text/plain":["Text(0.5, 1.0, 'Air Passengers detrended by subtracting the least squares fit')"]},"execution_count":14,"metadata":{},"output_type":"execute_result"},{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["# Using scipy: Subtract the line of best fit\n","from scipy import signal\n","detrended = signal.detrend(df['Number of Passengers'].values)\n","plt.plot(detrended)\n","plt.title('Air Passengers detrended by subtracting the least squares fit', fontsize=16)"]},{"cell_type":"code","execution_count":15,"metadata":{"trusted":true},"outputs":[{"data":{"text/plain":["Text(0.5, 1.0, 'Air Passengers detrended by subtracting the trend component')"]},"execution_count":15,"metadata":{},"output_type":"execute_result"},{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["# Using statmodels: Subtracting the Trend Component\n","from statsmodels.tsa.seasonal import seasonal_decompose\n","result_mul = seasonal_decompose(df['Number of Passengers'], model='multiplicative', period=30)\n","detrended = df['Number of Passengers'].values - result_mul.trend\n","plt.plot(detrended)\n","plt.title('Air Passengers detrended by subtracting the trend component', fontsize=16)"]},{"cell_type":"markdown","metadata":{},"source":["# **14. Deseasonalize a Time Series** \n","\n","\n","[Table of Contents](#0.1)\n","\n","\n","There are multiple approaches to deseasonalize a time series. These approaches are listed below:\n","\n","\n","- 1. Take a moving average with length as the seasonal window. This will smoothen in series in the process.\n","\n","- 2. Seasonal difference the series (subtract the value of previous season from the current value).\n","\n","- 3. Divide the series by the seasonal index obtained from STL decomposition.\n","\n","\n","\n","If dividing by the seasonal index does not work well, we will take a log of the series and then do the deseasonalizing. We will later restore to the original scale by taking an exponential.\n"]},{"cell_type":"code","execution_count":16,"metadata":{"trusted":true},"outputs":[{"data":{"text/plain":["[]"]},"execution_count":16,"metadata":{},"output_type":"execute_result"},{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["# Subtracting the Trend Component\n","\n","\n","# Time Series Decomposition\n","result_mul = seasonal_decompose(df['Number of Passengers'], model='multiplicative', period=30)\n","\n","\n","# Deseasonalize\n","deseasonalized = df['Number of Passengers'].values / result_mul.seasonal\n","\n","\n","# Plot\n","plt.plot(deseasonalized)\n","plt.title('Air Passengers Deseasonalized', fontsize=16)\n","plt.plot()"]},{"cell_type":"markdown","metadata":{},"source":["# **15. How to test for seasonality of a time series?** \n","\n","\n","[Table of Contents](#0.1)\n","\n","\n","\n","The common way to test for seasonality of a time series is to plot the series and check for repeatable patterns in fixed time intervals. So, the types of seasonality is determined by the clock or the calendar.\n","\n","\n","1. Hour of day\n","2. Day of month\n","3. Weekly\n","4. Monthly\n","5. Yearly\n","\n","However, if we want a more definitive inspection of the seasonality, use the **Autocorrelation Function (ACF) plot**. 
There is a strong seasonal pattern, the ACF plot usually reveals definitive repeated spikes at the multiples of the seasonal window."]},{"cell_type":"code","execution_count":17,"metadata":{"trusted":true},"outputs":[{"data":{"text/plain":[""]},"execution_count":17,"metadata":{},"output_type":"execute_result"},{"data":{"image/png":"iVBORw0KGgoAAAANSUhEUgAABBEAAAJ/CAYAAAAu1Ut7AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/TGe4hAAAACXBIWXMAABJ0AAASdAHeZh94AACreElEQVR4nOzdd3RU1d7G8e/MpCekQAgkIRB6CxBaQpNipYigUryCiA0V+2u79oJ6r3Ltig2lKIiCKIqKihRpEnoNnVBSSO99Zt4/BqIxCSSkTMrzWcslOefsc37bbEPyZJ+9DVar1YqIiIiIiIiIyAUY7V2AiIiIiIiIiNQNChFEREREREREpFwUIoiIiIiIiIhIuShEEBEREREREZFyUYggIiIiIiIiIuWiEEFEREREREREykUhgoiIiIiIiIiUi0IEERERERERESkXhQgiIiIiIiIiUi4KEURERERERESkXBQiiIiIiIiIiEi5KEQQERERERERkXKpdyFCZmYmzz33HMOHD6dx48YYDAbmzp1b7vapqalMmzaNpk2b4u7uzrBhw9i+fXup137//ff06tULFxcXWrZsyXPPPUdhYWEV9URERERERESkdql3IUJiYiIvvvgikZGR9OjRo0JtLRYLo0aNYuHChdx777289tprxMfHM3ToUA4fPlzs2p9//pmxY8fi7e3Nu+++y9ixY3nppZe47777qrI7IiIiIiIiIrWGwWq1Wu1dRFXKy8sjJSWF5s2bs3XrVvr27cucOXOYOnXqBdt+/fXXTJw4kcWLFzNu3DgAEhIS6NChAyNGjGDhwoVF13bt2hVHR0e2bt2Kg4MDAE8//TSvvPIK+/fvp1OnTtXSPxERERERERF7qXczEZydnWnevPlFtV2yZAnNmjXjuuuuKzrWtGlTJkyYwLJly8jLywNg//797N+/n2nTphUFCADTp0/HarWyZMmSynVCREREREREpBZyuPAlDceOHTvo1asXRmPxbCUsLIyPP/6YQ4cO0a1bN3bs2AFAnz59il0XEBBAixYtis6XJT4+noSEhGLH0tPTi+7v7OxcBb0RERERERERKVteXh6nTp1iyJAheHt7l6uNQoS/iY2NZfDgwSWO+/v7AxATE0O3bt2IjY0tdvyf18bExJz3ObNmzeKFF16ogopFREREREREKue7775jzJgx5bpWIcLf5OTklDoLwMXFpej83/9d1rXp6ennfc706dMZP358sWP79+9nwoQJLFy4kDZt2lxU/VJ35OTksGfPHrp164arq6u9yxE70TgQ0DgQG40DAY0DsdE4EKi5cXDs2DFuvPFGgoKCyt1GIcLfuLq6Fq178He5ublF5//+77KuvdAn2c/PDz8/v1LPde/ena5du1aobql70tPTSUtLo1evXnh6etq7HLETjQMBjQOx0TgQ0DgQG40DgZobBx4eHkDpvyAvS71bWLEy/P39i15V+LtzxwICAoqu+/vxf1577joRERERERGR+kQhwt+Ehoayfft2LBZLseObN2/Gzc2NDh06FF0HsHXr1mLXxcTEcPr06aLzIiIiIiIiIvVJgw0RYmNjOXDgAAUFBUXHxo0bx5kzZ1i6dGnRscTERBYvXszo0aOLpnh07dqVTp068fHHH2M2m4uu/eCDDzAYDIwbN67mOiI
iIiIiIiJSQ+rlmgjvvfceqampRbsk/PDDD5w+fRqA++67Dy8vL5544gnmzZvH8ePHCQ4OBmwhQr9+/bjlllvYv38/vr6+zJo1C7PZXGI3hZkzZ3LNNddw5ZVXcsMNN7B3717ee+89br/9djp37lyj/RURERERERGpCfUyRPjf//7HiRMnij5eunRp0eyCyZMn4+XlVWo7k8nETz/9xKOPPso777xDTk4Offv2Ze7cuXTs2LHYtVdffTVLly7lhRde4L777qNp06Y8+eSTPPvss9XXMRERERERERE7qpchQlRU1AWvmTt3LnPnzi1x3MfHh9mzZzN79uwL3mPs2LGMHTu24gWKiIiIiIiI1EENdk0EEREREREREakYhQgiIiIiIiIiUi4KEURERERERESkXBQiiIiIiIiIiEi5KESoZbLzC+1dgoiIiIiIiEipFCLUMpuPJdu7BBEREREREZFSKUSoZTYcSbR3CSIiIiIiIiKlUohQy0QcTya3wGzvMkRERERERERKUIhQy+QUmFl7KMHeZYiIiIiIiIiUoBChFlqxN87eJYiIiIiIiIiUoBChFloZeYb8Qou9yxAREREREREpRiFCLZSRW8iGo1pgUURERERERGoXhQi1jKPJ9ilZsUevNIiIiIiIiEjtohChlundyhuAX/fHUWjWKw0iIiIiIiJSeyhEqGUGtfMFICW7gIjjyXauRkREREREROQvChFqmf5tfXEwGgBYsU+vNIiIiIiIiEjtoRChlvF0daR/2yaAbatHi8Vq54pEREREREREbBQi1ELDQ5oDEJ+Rx45TKXauRkRERERERMRGIUItdGWX5hhsbzTws3ZpEBERERERkVpCIUIt1LSRM32DGwPw8944rFa90iAiIiIiIiL2pxChlhre1fZKQ3RqDnuj0+1cjYiIiIiIiIhChFrr3LoIAD/vjbVjJSIiIiIiIiI2ChFqqQBvV3oEeQO2XRr0SoOIiIiIiIjYm0KEWmzE2dkIxxKzOByfaedqREREREREpKFTiFCLjfj7Kw3apUFERERERETsTCFCLdaqiTud/T0BrYsgIiIiIiIi9qcQoZYbeXY2woG4DI7EZ9i5GhEREREREWnIFCLUclf3CCj68w+7NBtBRERERERE7EchQi3X2tedkEDbKw3Ld8dolwYRERERERGxG4UIdcDV3W2zEY4mZBEZq1caRERERERExD4UItQBo7r5F/15+e4YO1YiIiIiIiIiDZlChDogqLEboUHeACzfHatXGkRERERERMQuFCLUEVd3t81GOJmczZ7oNDtXIyIiIiIiIg2RQoQ6YlT3v7/SoF0aREREREREpOYpRKgj/L1cCQtuDMDyXTFYLHqlQURERERERGqWQoQ65OoettkIMWm57DiVYudqREREREREpKFRiFCHjAjxx2iw/fmHXXqlQURERERERGqWQoQ6pGkjZ/q1aQLAT3tiMeuVBhEREREREalBChHqmKu7BwAQn5HHlqhkO1cjIiIiIiIiDYlChDpmeEhzTGffaVi+O8bO1YiIiIiIiEhDohChjmns7sTAdr4A/LwnjkKzxc4ViYiIiIiISEOhEKEOGt3dtktDUlY+m44l2bkaERERERERaSgUItRBV3ZtjpPJ9qlbrl0aREREREREpIYoRKiDvFwdGdzB9krDin1x5BfqlQYRERERERGpfgoR6qhzuzSk5RSw5mC8nasRERERERGRhkAhQh11eZdmeDg7ADBvU5R9ixEREREREZEGQSFCHeXh7MD4Pi0A2HAkiYNxGXauSEREREREROo7hQh12NQBwRgMtj/P3XjcvsWIiIiIiIhIvacQoQ5r1cSdyzr5AbB0ezQpWfl2rkhERERERETqM4UIddwtA1sDkFdo4cstJ+1cjYiIiIiIiNRnChHquAFtm9CxWSMAPt90ggKztnsUERERERGR6qEQoY4zGAxMHRgMQGxaLr/si7NvQSIiIiIiIlJvKUSoB8aGBuL
t5gjAnA1R9i1GRERERERE6i2FCPWAq5OJG8NaArDtRAq7TqXatyARERERERGplxQi1BM39W+FyWjb73HOBm33KCIiIiIiIlWvXoYIeXl5PP744wQEBODq6kp4eDi//fbbBdsFBwdjMBhK/ad9+/bFri3ruv/+97/V1a3z8vdyZURIcwB+3BNLfHquXeoQERERERGR+svB3gVUh6lTp7JkyRIefPBB2rdvz9y5cxk5ciSrV69m0KBBZbZ76623yMzMLHbsxIkTPP3001x55ZUlrr/iiiuYMmVKsWM9e/asmk5chFsGtmb57lgKzFa++PME/3dlR7vVUpViUnNYsu00V3RpRmd/T3uXIyIiIiIi0mDVuxAhIiKCRYsWMXPmTB555BEApkyZQkhICI899hgbN24ss+3YsWNLHHvppZcAmDRpUolzHTp0YPLkyVVTeBXo1dKbHi282HU6jQWbTzJ9WDtcHE32LqtSVuyN5bElu0nPLWTh5pOseXRone+TiIiIiIhIXVXvXmdYsmQJJpOJadOmFR1zcXHhtttuY9OmTZw6dapC91u4cCGtW7dmwIABpZ7PyckhN7d2vDpgMBi4ZWBrAJKy8vlhV4ydK7p4uQVmnv5uD3d9sZ303EIA4tJzWby1Yp8/ERERERERqTr1LkTYsWMHHTp0wNOz+LT3sLAwAHbu3Fmhe0VGRnLjjTeWen7u3Lm4u7vj6upKly5dWLhw4UXXXVVGdvPHr5EzAB/9cYwCs8XOFVXcoTMZjHlvA1/8eRIATxeHoj59sOYo+YV1r08iIiIiIiL1Qb17nSE2NhZ/f/8Sx88di4kp/2/nFyxYAJT+KsOAAQOYMGECrVu3JiYmhvfff59JkyaRlpbG3Xfffd77xsfHk5CQUOzYkSNHAMjMzCQ9Pb3cNZZmUt8A3lx1nCPxmXzw+wGm9mtRqfvVFKvVyjc743jtt2Pkng0KQlt48t8xHdl0PJUXfjpMTFouX2w4zLieJT/HdUlWVlaxf0vDpHEgoHEgNhoHAhoHYqNxIFBz4+CfawKWh8FqtVqroRa7adu2LR07duSnn34qdvzYsWO0bduWN998kwcffPCC97FYLLRs2RI/Pz+2b99+wevz8/Pp3bs3p0+fJiYmBldX1zKvff7553nhhRdKPffOO+/QsmXLCz7vfAot8OouE/G5BpyMVp4KNePtXKlbVjuLFRYeMbIl0TY5xoCVKwKtDA+yYDLY+vTSDhMp+QYaO1t5OtSMqd7NoxEREREREak5J0+e5P7772fv3r107dq1XG3q3UwEV1dX8vLyShw/t27B+X64/7u1a9cSHR3NQw89VK7rnZycuPfee7nrrrvYtm3beXeBmD59OuPHjy927MiRI4wdO5awsDA6d+5crmeeT6M2KUz7ci/5FgMbc5ozc3jl71mdvt9zhi1/HgKgqYcTr1zTkfBg72LXpPjE8tKKIyTnGcj07czYHs3tUGnVyMrKIiIigrCwMNzd3e1djtiJxoGAxoHYaBwIaByIjcaBQM2Ng8jIyAq3qXchgr+/P9HR0SWOx8bGAhAQEFCu+yxYsACj0ci//vWvcj87KCgIgOTk5PNe5+fnh5+fX6nnPDw8SqzncDGu7OHJqH1J/Lg7ll8iE7lpQD6D2vtW+r7VITOvkHfWnADAr5EzPz9wCU08Sk6duGmQO7M3niYuPZdP/4zmxgHtcKjj0xHc3d2r5PMtdZvGgYDGgdhoHAhoHIiNxoFA9Y8DDw+PCrep2z+BlSI0NJRDhw6VWFdg8+bNRecvJC8vj2+++YahQ4eWO3QA2ysTAE2bNi1/wdXomVFdcHeybYf47Pd7ySs027mi0s1afYT4DNvskceHdyo1QABwdjBx99C2AJxIyuaH3XV39wkREREREZG6qN6FCOPGjcNsNvPxxx8XHcvLy2POnDmEh4cXzRY4efIkBw4cKPUeP/30E6mpqaUuqAiUWBQRICMjg7feegtfX1969+5dBT2pvOZ
eLjx4eQcAjiVk8en643auqKSTSdnMXmerq0eQN9f2DDzv9RP7BtH07E4N7646gtlSr5b0EBERERERqdXq3esM4eHhjB8/nieeeIL4+HjatWvHvHnziIqK4tNPPy26bsqUKaxdu5bS1pVcsGABzs7OXH/99aU+4/333+e7775j9OjRtGzZktjYWD777DNOnjzJ559/jpOTU7X1r6KmDgxm8bZTHDqTybu/H2FMaCCB3uVbF6ImvPzTfvLPbkP5/OguGI2G817v4mjizsFteOnHSI4lZPHjnliu6VH+2SIiIiIiIiJy8erdTASA+fPn8+CDD/L5559z//33U1BQwPLlyxk8ePAF26anp/Pjjz8yatQovLy8Sr1m4MCB+Pn5MXv2bO655x7efPNNOnbsyMqVK8ucvWAvjiYjL44JASCnwMyLP+yzc0V/2XAkkV/2nQHgul6B9GzpU652k8Jb4ethC2re/f0wFs1GEBERERERqRH1biYCgIuLCzNnzmTmzJllXrNmzZpSj3t6epKTk3Pe+19xxRVcccUVlSmxRvVr04SxoQF8tzOGX/adYfXBeIZ1LH1hx5pSaLbwwtlAw83JxOPDO5W7rauTiTsuacN/fj7A4fhMVuyLY2Q3/+oqVURERERERM6qlzMRpKQnR3WmkbMtM3r++33kFth3kcWFESc5dCYTgHuGtaOZp0uF2k/u1wofN0cA3tFsBBERERERkRqhEKGB8Gvkwv9daVtk8URSNv/9ufRFJWtCSlY+r/96CICgxq7cNqh1he/h7uzA7Ze0AeBAXAZrDsVXaY0iIiIiIiJSkkKEBuSmfq0IDfIGYO7GKH7bf8Yudby58hBpOQUAPDWyCy6Opou6z5T+rXA7u4Xl0u3RVVafiIiIiIiIlE4hQgPiYDLy7r96Fr3W8OiSXcSmnX/9h6p2MC6DL/48AcDAdk24qmuzi75XIxdHruxia78y8gyZeYVVUqOIiIiIiIiUTiFCAxPU2I1XrusGQGp2AQ8s2om5htYTyC+08OiSXVisYDTAs1d3xWA4/5aOFzImNBCA3AILv+6Lq4oyRUREREREpAwKERqg0T0CuKFvEAARx5N5d9XhGnnuqysOsPt0GgC3DWpNx+aNKn3PQe19aexu2+7xu50xlb6fiIiIiIiIlE0hQgP13OiutPPzAGy7G2w+llStz1u5/wyfrj8OQLdALx65qmOV3NfRZGTU2e0dNxxJJCEjr0ruKyIiIiIiIiUpRGigXJ1MvHdjT5wdjFis8MCinaRk5VfLs2JSc3hkyS4APJwdzj734hZTLM3YngEAmC1Wftyt2QgiIiIiIiLVRSFCA9apuSdPX90FgLj0XB5dshurtWrXRyg0W3hg0Q5Ss227MbxyXTdaNXGv0mf0aulDCx9XQK80iIiIiIiIVCeFCA3c5PCWDO/aHLDtcPDh2mNVev+3fz/MlqgUAG7oG8Q1PQKq9P4ABoOBMaG2++48lcqJpKwqf4aIiIiIiIgoRGjwDAYDr17fnUBv22/yX11xgCe/3UN+oaXS995wJJH3Vh8BoEMzD54b3bXS9yzLuV0aAJZpNoKIiIiIiEi1UIggeLk58tFNvfH1cAZg4eaT3PjJn8Rn5F70PRMy8njwq51YreDiaOS9G3vh6lR16yD8U4dmjejs7wnAdzujq/y1DBEREREREVGIIGeFBHrxw30D6RHkDcDWEymMfnc9O06mVPheadkFPLBoR9FOCc+P7kqHZpXfzvFCxp59peFYQhb7YtKr/XkiIiIiIiINjUIEKeLv5cpX0/oxvncLAM6k5zHxoz/5esupct/jl31xXP7mWjYetW0ZObpHABP7BlVLvf80ukcABoPtz8t2RtfIM0VERERERBoShQhSjIujidfGdefFMV1xMBrIN1t47JvdPLF0N4fOZJT5mkBSZh73LtzOnZ9vK5qBMKRDU/5zXTcM536yr2YB3q6EBTcG4PtdMZgteqVBRERERESkKjnYuwC
pfQwGA1P6B9OxWSOmL9hOUlY+X0ac4suIUwQ3ceOKLs24smtzerX0wWiw/cD+/Pf7SDm7jaOniwPPju7K9b0CayxAOGdMaCCbjydzJj2PzceTGNDWt0afLyIiIiIiUp8pRJAyhbdpwg/3DeLBRTuJiEoGICopm0/WHeeTdcdp4u5EyyZu7DiZWtTmqq7NmDEmBD9PF7vUPLJbc577fi8FZivLdsQoRBAREREREalCep1BzivA25Wv7+rPmkeG8tTIzvQN9iladyApK78oQGji7sT7N/biw8m97RYgAHi7OTGkgx8AP+2NJa/QbLdaRERERERE6hvNRJByCfZ1547BbbhjcBsSM/NYFRnPr/vj2BKVwmWd/Hj66i40dneyd5kAjO0ZwMrIM2TkFrL6QALDQ5rbuyQREREREZF6QSGCVJivhzMT+gYxoYZ2Xaioyzo1w93JRFa+mWU7oxUiiIiIiIiIVBG9ziD1jquTiavOBge/R8aTkpVv54pERERERETqB4UIUi+N722bJZFvtvDtjmg7VyMiIiIiIlI/KESQeqlfm8YEN3ED4Kstp7BarXauSEREREREpO5TiCD1ksFgYHwf22yEg2cy2HU6zc4ViYiIiIiI1H0KEaTeGte7Bcaz21F+teWUfYsRERERERGpBxQiSL3VzNOFSzv5AfDDrhiy8wvtXJGIiIiIiEjdphBB6rUJZ19pyMwr5MfdsXauRkREREREpG5TiCD12rBOfvh6OAN6pUFERERERKSyFCJIveZoMjKudwsAtp5I4Uh8pp0rEhERERERqbsUIki9N6FPi6I/L96q2QgiIiIiIiIXSyGC1HttmnoQ1roxAN9sP02B2WLnikREREREROomhQjSIEw8u8BiYmY+v0fG27kaERERERGRukkhgjQII7v508jZAYCvtpy0czUiIiIiIiJ1k0IEaRBcnUxcExoAwNpDCcSl5dq5IhERERERkbpHIYI0GBP72l5psFhhyTYtsCgiIiIiIlJRChGkwegW6EWn5o0A+GrrKSwWq50rEhERERERqVsUIkiDYTAYuOHsbIRTyTlsOJpo54pERERERETqFoUI0qCM7RmIi6Nt2M9afdTO1YiIiIiIiNQtChGkQfF2c2JSeCsANh1LYktUsp0rEhERERERqTsUIkiDc+fgNjg52Ib+O78ftnM1IiIiIiIidYdCBGlw/DxditZGWHc4kZ2nUu1bkIiIiIiISB2hEEEapLuGtMXRZADgXc1GEBERERERKReFCNIgBXi7Mq63bTbC7wfi2RudZueKREREREREaj+FCNJgTR/aFpPRNhvhvVVH7FyNiIiIiIhI7acQQRqsoMZuXNszEIAV++I4EJdu54pERERERERqN4UI0qDdM6wdZycjaDaCiIiIiIjIBShEkAatta871/QIAODHPbEcic+0c0UiIiIiIiK1l0IEafDuvbQdBgNYrTBrtWYjiIiIiIiIlEUhgjR47fwaMTLEH4Blu2KISsyyc0UiIiIiIiK1k0IEEWyzEQDMFivvazaCiIiIiIhIqRQiiACd/T25skszAJZsP82uU6n2LagKmC1W9pxOIyffbO9SRERERESknlCIIHLW4yM64WQyYrXC09/txWyx2rukixYZm851szYw+r31jPtwI/mFFnuXJCIiIiIi9YBCBJGz2jb14M4hbQDYE53GF3+esHNFFZdbYOZ/vxxk9Lvr2XU6DYB9Mel8su6YnSsTEREREZH6QCGCyN/cM6wdLRu7AfC/Xw4Sn55r54rKL+J4MiPfWcd7q49QaLFiMhrw9XAC4N1VhzmVnG3nCkVEREREpK5TiCDyNy6OJl4Y0xWAjLxCZvwYaeeKLiw9t4Cnvt3DhI82cSzBtrNESKAny+4ZyPs39gIgt8DCc9/vw2qtu69oiIiIiIiI/SlEEPmHYR39GNmtOQA/7Iph3eEEO1dUtozcAsa+t4EFm08C4Oxg5IkRnfhu+kBCAr0Ib9OE63u
1AGDVgXh+2XfGnuWKiIiIiEgdVy9DhLy8PB5//HECAgJwdXUlPDyc33777YLtnn/+eQwGQ4l/XFxcSr3+008/pXPnzri4uNC+fXvefffdqu6K2MmzV3fF3clk+/OyfeQW1M4dDl5aHsmxRNvsgwFtm/DLg4O5c0hbHEx//a/95MhOeLk6AvDCD/vIyiu0S60iIiIiIlL31csQYerUqbzxxhtMmjSJt99+G5PJxMiRI1m/fn252n/wwQd8/vnnRf/MmTOnxDUfffQRt99+O127duXdd9+lf//+3H///bz66qtV3R2xg+ZeLjx0RQcAjidm8eHao3auqKRVB87w1dZTAAxq58sXt4UT7Ote4romHs78e0QnAGLTcnn798M1WqeIiIiIiNQfDvYuoKpFRESwaNEiZs6cySOPPALAlClTCAkJ4bHHHmPjxo0XvMe4cePw9fUt83xOTg5PPfUUo0aNYsmSJQDccccdWCwWZsyYwbRp0/Dx8amaDondTB0QzDfbo4mMTWfWmqOMDQ0s9Yd0e0jNzuff3+wBoJGzA6+O647RaCjz+ol9gli89RTbT6by6frjXNcrkE7NPWuqXBERERERqSfq3UyEJUuWYDKZmDZtWtExFxcXbrvtNjZt2sSpU6cueA+r1Up6enqZi9CtXr2apKQkpk+fXuz4PffcQ1ZWFj/++GPlOiG1goPJyMvXhmAwQH6hhWeW7a01CxM+//0+4jPyAHhmdBcCvV3Pe73RaOClsd0wGQ2YLVae/nYvFkvt6IuIiIiIiNQd9W4mwo4dO+jQoQOensV/yxoWFgbAzp07CQoKOu892rRpQ2ZmJu7u7owdO5bXX3+dZs2aFXsGQJ8+fYq16927N0ajkR07djB58uQy7x8fH09CQvHF+o4cOQJAZmYm6enpF+il1JR23iau79GcJTvjWHc4kfnrD3Ntj+aVvm9WVlaxf1fEygOJfLczBoDB7RpzVXvPco2ZFh5wY58APo+IZuuJFOavP8x1oZXvi1y8yowDqT80DgQ0DsRG40BA40BsamocZGZmVrhNvQsRYmNj8ff3L3H83LGYmJgy2/r4+HDvvffSv39/nJ2dWbduHe+//z4RERFs3bq1KJiIjY3FZDLh5+dXrL2TkxNNmjQ57zMAZs2axQsvvFDquYiICOLi4s7bXmpWLwf4xdFERoGBl34+RPbpSALcqubeERERFbo+owD+u9MEGHAzWbncK541a+LL3T4E8HIykZZv4LVfD+EYH4mHY8VqlqpX0XEg9ZPGgYDGgdhoHAhoHIhNdY+DkydPVrhNvQsRcnJycHZ2LnH83A4LOTk5ZbZ94IEHin18/fXXExYWxqRJk5g1axb//ve/i+7h5ORU6j1cXFzO+wyA6dOnM378+GLHjhw5wtixYwkLC6Nz587nbS81z7d9Cnd+uZcCi4GvTjVi0a09cTu7e8PFyMrKIiIigrCwMNzdy7fOgtVq5f+WRpJZmATAs1d3YmRXvwu0KskUmMAj3x4gu9DASefW3DMkuML3kKpxMeNA6h+NAwGNA7HROBDQOBCbmhoHkZGRFW5T70IEV1dX8vLyShzPzc0tOl8RN954Iw8//DArV64sChFcXV3Jz88v9frc3NwLPsPPz6/ELIZzPDw8SryKIfZ3ZQ9PHkzI582Vh4hKzuE/K6N4a2IoBkPZixmWh7u7e7k/38t2RvP7QVuAMLJbcyb2a3tRz78+rBGfb4ll1+k0lu6O5+ERITg51LvlUeqUiowDqb80DgQ0DsRG40BA40BsqnsceHh4VLhNvfvJwd/fn9jY2BLHzx0LCAio8D2DgoJITk4u9gyz2Ux8fPFp5Pn5+SQlJV3UM6T2u/fSdgxqZ9u1Y9nOGBZGVHzqz8U6k57Ls8v2AdDE3YkZY0IuOsAwGAzc1D8YgISMPH7Zp9dnRERERESkfOpdiBAaGsqhQ4dKLDS3efPmovMVYbVaiYqKomnTpsWeAbB169Zi127duhWLxVL
hZ0jdYDIaeOuGUPwa2V6XeeGH/eyNTqv252bkFnDH/K2k5RQA8PK13WjiUfKVnYq4urs/3m62xRA+33Si0jWKiIiIiEjDUO9ChHHjxmE2m/n444+LjuXl5TFnzhzCw8OLdmY4efIkBw4cKNb2nzsmAHzwwQckJCQwfPjwomOXXnopjRs35oMPPihxrZubG6NGjarKLkkt4uvhzLv/6onJaCC/0MI9C7eTnltQbc/LLTBzx/yt7D5tCysm9glieEjld1RwcTQxsY/t/4WIqGQOxGlHEBERERERubB6tyZCeHg448eP54knniA+Pp527doxb948oqKi+PTTT4uumzJlCmvXrsVqtRYda9WqFRMnTqRbt264uLiwfv16Fi1aRGhoKHfeeWfRda6ursyYMYN77rmH8ePHc9VVV7Fu3Tq++OILXn75ZRo3blyjfZaaFd6mCQ9f2YHXVhzkRFI2jy/ZzaxJvSq9PsI/FZot3LtwB38es71Kc3lnP166NqTK7j+5Xys+XncMq9U2G+Hla7tV2b1FRERERKR+qnchAsD8+fN55pln+Pzzz0lJSaF79+4sX76cwYMHn7fdpEmT2LhxI9988w25ubm0atWKxx57jKeeego3t+J7+k2fPh1HR0def/11vv/+e4KCgnjzzTdL7PAg9dNdg9uy5Xgyqw8m8PPeOD7+4xh3DmlbZfe3WKw8/s0eVkaeASCsdWPeu7EXjqaqmzwU1NiNYR39WHUgnm93RPP4iE54umi/RxERERERKVu9DBFcXFyYOXMmM2fOLPOaNWvWlDj2ySefVOg5d9xxB3fccUdFy5N6wGg08MaEUEa9s46YtFz+8/MBolNzeObqLpX+Qd9qtTLjx/18s/00ACGBnsy+uQ8ujhe/pWRZburXilUH4snON7N022mmDmxd5c8QEREREZH6o96tiSBSU3zcnfjopj40cXcCYP6mE0yavZnEzJJbjFbEu6uOMGdDFABtmroz75awapshMKRDU1o2ts2y+fzPE8Ve7xEREREREfknhQgildCthRc/3DeIboFeAEQcT2b0u+vZc7riuzZYrVY+XX+cN347BECAlwuf3xZe6Z0YzsdoNDC5X0sAjiZkseloUrU9S0RERERE6j6FCCKVFODtyuK7+nNdz0AAYtNyuf7DjXyz7XS57xGfkctdX2xjxvL9ADR2d+Lz28MJ9Hatlpr/bkKfIJwdbF8K5mu7RxEREREROQ+FCCJVwMXRxOsTevDs1V2Ktn98ePEunlu2lzPpuWW2s1qtLNsZzZVv/sEv+2yLKDZxd2L+rWG0bepRI7V7uzlxTY8AAH6LPENsWk6NPFdEREREROoehQgiVcRgMHDroNbMvzUMHzfbGgbzNp2g339+58ZP/uSrLSdJyy4ouj4tHx78JpIHFu0k9ezxq7v78+tDgwk5+3pETbmpfysAzBYrX24+WaPPFhERERGRuqNe7s4gYk8D2/ny/b2DuHfhdnadTsNqhY1Hk9h4NIlnvtvH0I5N6eTnyuydJrLNtjUIfD2cmDEmhBHd/O1Sc/cW3vQI8mbXqVQWRpzi3kvb4+SgjFFERERERIrTTwki1SCosRvf3TOQ7+4ZyC0Dg/E9uzhivtnCr/vP8M6aKLLNBgBG9wjg14eG2C1AOGdKP9tshMTMPFbsi7NrLSIiIiIiUjspRBCpJgaDgdAgb54b3ZXNT17GF7eFM753Cxo52yYAeThaeeO6zrz7r540PrtNpD2N6u5fVMf8jVH2LUZERERERGolvc4gUgNMRgOD2vsyqL0vM8aGsPFgDPEHt3N5J197l1bExdHExL5BfLDmKFtPpLDtRAq9W/nYuywREREREalFNBNBpIa5OJro09IL11oY4U0dEIyTyfZl4cO1R+1cjYiIiIiI1DYKEUSkSDNPF67vHQjAb/vPcPhMhp0rEhERERGR2kQhgogUM21wWwy2NR/5cO0x+xYjIiIiIiK1ikIEESmmta87I0KaA7B
sZzTRqTl2rkhERERERGoLhQgiUsJdQ9oCUGixMnudZiOIiIiIiIiNQgQRKaF7C28GtbPtHLEo4hTJWfl2rkhERERERGoDhQgiUqq7h9pmI+QUmJm3Mcq+xYiIiIiISK2gEEFESjWgbRO6t/ACYN6mKLLzC+1ckYiIiIiI2JtCBBEplcFg4O6zayOkZhewKOKUnSsSERERERF7U4ggImW6smtzWvu6AzB73THyCy12rkhEREREROxJIYKIlMlkNHDn4DYAxKTl8v2uGDtXJCIiIiIi9qQQQUTO69pegfg1cgbgw7VHsVisdq5IRERERETsRSGCiJyXs4OJ2y9pDcCR+Ex+3htn54pERERERMReFCKIyAXdGN4KbzdHAF75KZKcfLOdKxIREREREXtQiCAiF+Th7MAjV3YEIDo1hw/WHrVzRSIiIiIiYg8KEUSkXP4V1pKuAZ6AbW2Ek0nZdq5IRERERERqmkIEESkXk9HAi2O6ApBfaGHGj/vtXJGIiIiIiNQ0hQgiUm69WzXmul6BAPy2/wxrDsbbuSIREREREalJChFEpEL+PaITHs4OALzww37yCrXIooiIiIhIQ6EQQUQqxK+RCw9e3h6A44lZfLY+yr4FiYiIiIhIjVGIICIVdvOAYNr7eQDw7qrDxKbl2LkiERERERGpCQoRRKTCHE1GXrjGtshidr6ZV346YOeKRERERESkJihEEJGLMqCdL6O6+QPww64Y/jyWZOeKRERERESkuilEEJGL9uSozrg6mgB4+ru95BZokUURERERkfpMIYKIXLRAb1fuvbQdAEfiM/nvz3qtQURERESkPlOIICKVcufgNvRs6Q3A3I1RrD4Yb9+CKslqtRKTmkN+ocXepYiIiIiI1DoKEUSkUhxMRt6e2BN3J9trDY8u3k1iZp6dq6o4q9XK2kMJTPhoEwP+u4rxH23S6xkiIiIiIv+gEEFEKq1lEzdeHBMCQGJmHo8v2Y3VarVzVeVjtVpZuf8MY2dt5ObPItgSlQLArlOpvP7rQTtXJyIiIiJSuyhEEJEqcV2vQEZ1t+3W8PuBeBZsPmnnis7PYrHy855YRr2zntvnb2XXqVQA3JxMtPBxBWD2+uNsPJpoxypFRERERGoXhQgiUiUMBgOvjO2Gv5cLAC/9uJ8j8Rl2rqp02fmF3PDxn9y9YDv7Y9MBaOTswH2XtmPD45cy79YwXByNWK3wyNe7SM8tsHPFIiIiIiK1g0IEEakyXm6OvDEhFIMBcgss3P/lTvIKa9e6AmaLlfu/3EFEVDIA3m6OPHxFB9b/+1IevrIjPu5OtG3qwVOjugAQk5bL88v22bNkEREREZFaQyGCiFSp/m2bcNeQtgDsj03njV8P2bmi4mYs38/KSNsOEpe092X945dy32Xt8XJ1LHbd5PCWDOnQFIClO6L5cXdsjdcqIiIiIlLbKEQQkSr30OUdCAn0BODjdcdYdeCMnSuy+Wz9ceZujAKgU/NGvD+pFx7ODqVeazAYmDmuO95utnDhyW/3EJeWW1OlioiIiIjUSgoRRKTKOTkYefuGnrg6mrBa4Z4FO9h9OtWuNf2yL44ZP+4HwK+RM59N7Yuni+N52/h5uvCfa7sBkJZTwKNLdtWZXSdERERERKqDQgQRqRZtm3rw9g2hGA2QU2Dm1rlbOJmUbZdadp5K5YFFO7BabbsvfDa1LwHeruVqO6KbP9f3agHAusOJfP7nieosVURERESkVlOIICLV5squzXlhTAgAiZn53DwngqTMvBqt4VRyNrfP20JugQWjAd6/sRchgV4Vusdz13Qh8Gzo8PKPkRxNyKyOUkVEREREaj2FCCJSrW7q14q7h9oWWjyemMVt87aSk18zOzakZuczdU4EiZn5ALw4JoRhnfwqfB9PF0den9ADgwHyCi28tfJwVZcqIiIiIlInKEQQkWr32FUdubZnIGB7teC+L3dgtlTv2gLxGbnc8PGfHE3IAuDOwW2Y3K/VRd+vX5s
mXNMjAIAfd8cQlZhVJXWKiIiIiNQlChFEpNoZDAZevb47g9r5ArAy8gzPfb+32hYpPJWczfgPN3EgLgOAa3oE8PjwTpW+77kZFRYrfPTH0UrfT0RERESkrlGIICI1wsnByAeTe9GpeSMAvvjzJG/8dghLFc9IOBCXzvUfbOTE2UUc/xUWxJsTQzEaDZW+d6fmnlze2fY6xJJtp7Xlo4iIiIg0OAoRRKTGNHJxZN6tYQR4uQDw7qoj3DF/K6nZ+VVy/20nUpjw4SbiM2yLN949tC2vXNsNUxUECOdMH9YOgAKzldnrjlXZfUVERERE6gKFCCJSo5p5ujD/tnBaNnYD4PcD8Yx6Zz3bT6ZU6r5rDyUwefZm0nMLAXhyZCceH94Jg6HqAgSAXi196NemMQALI06SklU1AYiIiIiISF2gEEFEalw7Pw+W3z+I4V2bAxCdmsOEDzcxe92xCq+TYLVa+WbbaW6ft4WcAjNGA7x2fXemDW5bHaUDcM/Z2QjZ+WbmboyqtueIiIiIiNQ2ChFExC48XRz5YHIvnhvdBUeTgUKLlZd+jOTOz7eRll1Qrnv8eSyJ6z7YyMOLd1FgtuJkMjJrUi8m9A2q1toHtfOlewsvAOZujCIzr7BanyciIiIiUlsoRBARuzEYDNwysDWL7xpAoLcrAL/uP8Pwt//gPz9HsvFIInmF5hLt9sWkcfNnEdzw8Z/sOJkKgLebI3Nu6cvwEP8aqXv6UNtshLScAhZuPlHtzxQRERERqQ0c7F2AiEhokDc/3j+IRxbvYmVkPLFpuXy09hgfrT2Gq6OJ/m2bMLi9LyGBXszfdILvd8UUtXVxNHLrwNbcOaQtXq6ONVbzlV2a0c7PgyPxmcxed5wp/YNxcTTV2PNFREREROyhXs5EyMvL4/HHHycgIABXV1fCw8P57bffLthu6dKlTJw4kTZt2uDm5kbHjh15+OGHSU1NLXFtcHAwBoOhxD933XVXNfRIpP7zdnPikyl9eGlsCD2CvDm3HmJOgZlVB+J5/of9jPtwU1GAYDIamBTekrWPDuOx4Z1qNEAAMBoN3D3Etu5CfEYe32w/XaPPFxERERGxh3o5E2Hq1KksWbKEBx98kPbt2zN37lxGjhzJ6tWrGTRoUJntpk2bRkBAAJMnT6Zly5bs2bOH9957j59++ont27fj6upa7PrQ0FAefvjhYsc6dOhQLX0SaQgMBgOT+7Vicr9WpGTls/5IIn8cSuCPwwmcSc8ruu7q7v48fGVHWvu627FauCY0gDd+O0R0ag4frj3KxD5BOJjqZTYrIiIiIgLUwxAhIiKCRYsWMXPmTB555BEApkyZQkhICI899hgbN24ss+2SJUsYOnRosWO9e/fm5ptvZsGCBdx+++3FzgUGBjJ58uQq74OIgI+7E6N7BDC6RwBWq5VDZzLZE51GF39PugR42rs8ABxNRu4c0oZnl+3jVHIOP+6JZUxooL3LEhERERGpNvXuV2ZLlizBZDIxbdq0omMuLi7cdtttbNq0iVOnTpXZ9p8BAsC1114LQGRkZKlt8vPzycrKqlzRInJeBoOBjs0bMa53i1oTIJwzoU8Qvh5OALy/+ghmS8W2qBQRERERqUvq3UyEHTt20KFDBzw9i/+gERYWBsDOnTsJCir/9m9xcXEA+Pr6lji3atUq3NzcMJvNtGrVioceeogHHnjggveMj48nISGh2LEjR44AkJmZSXp6ernrk7rpXPCkAKp+mNw3gLdWR3HoTCYLNx5hTPdm5WqncSCgcSA2GgcCGgdio3EgUHPjIDMzs8Jt6l2IEBsbi79/yS3ezh2LiYkpce58Xn31VUwmE+PGjSt2vHv37gwaNIiOHTuSlJTE3LlzefDBB4mJieHVV1897z1nzZrFCy+8UOq5iIiIouBC6r+IiAh7lyBVINAMXo4m0goMvP7rQVwT9uNYgXleGgcCGgdio3EgoHEgNhoHAtU/Dk6ePFnhNvUuRMjJycHZ2bnEcRc
Xl6Lz5bVw4UI+/fRTHnvsMdq3b1/s3Pfff1/s41tuuYURI0bwxhtvcN9999GiRYsy7zt9+nTGjx9f7NiRI0cYO3YsYWFhdO7cudw1St2UlZVFREQEYWFhuLvbd3FAqRqZvnG88NNhUvMNxLi3Z2q/sr8GnKNxIKBxIDYaBwIaB2KjcSBQc+OgrNf2z6fehQiurq7k5eWVOJ6bm1t0vjzWrVvHbbfdxlVXXcXLL798wesNBgMPPfQQv/zyC2vWrDnvgot+fn74+fmVes7Dw6PEqxhSf7m7u+vzXU/cNNCDL7bEcDQhi083nebmQe3xcivftpMaBwIaB2KjcSCgcSA2GgcC1T8OPDw8Ktym3i2s6O/vT2xsbInj544FBARc8B67du3immuuISQkhCVLluDgUL6s5dxaC8nJyRWoWETqAweTkceHdwIgLaeAWWuP2LkiEREREZGqV+9ChNDQUA4dOlRiccLNmzcXnT+fo0ePMnz4cPz8/Pjpp58qlMwcO3YMgKZNm1asaBGpF67o0ow+rXwAmLMhipjU8r8+JSIiIiJSF9S7EGHcuHGYzWY+/vjjomN5eXnMmTOH8PDwotkCJ0+e5MCBA8XaxsXFceWVV2I0Gvnll1/KDAOSk5Mxm83FjhUUFPDf//4XJycnhg0bVsW9EpG6wGAw8MRI22yE/EILb/x2yM4ViYiIiIhUrXq3JkJ4eDjjx4/niSeeID4+nnbt2jFv3jyioqL49NNPi66bMmUKa9euxWr9a0/34cOHc+zYMR577DHWr1/P+vXri841a9aMK664ArAtqvjSSy8xbtw4WrduTXJyMgsXLmTv3r288sorNG/evOY6LCK1Su9WjbmySzN+3X+Gb7af5vZLWtOpud5nFBEREZH6od6FCADz58/nmWee4fPPPyclJYXu3buzfPlyBg8efN52u3btAuC1114rcW7IkCFFIUK3bt3o0qULX3zxBQkJCTg5OREaGsrXX39dYtcFEWl4HhvekZWRZ7BY4bUVB/lsal97lyQiIiIiUiXqZYjg4uLCzJkzmTlzZpnXrFmzpsSxv89KOJ/evXuX2OJRROScdn6NmNg3iC8jTrHqQDx/HkuiX5sm9i5LRERERKTSqiREyMjI4MSJE6SkpJT6g/iFZgDIX7777ju2bNlS5vnmzZszfPjwoo/j4uJYsWLFBe87derUYh/PnTv3gm2GDx9e7NWMFStWEBcXd942oaGhxRav3LlzJzt37jxvm4bYJ7PZTEpKComJiZhMpqLjdblPZWmoffIvMOJgaEyh1cDD89ex5M5w/P39i86vWLGC6OjoUsdBbe3TP9WHz9M/2atP2dnZLF68uNRx8Hd1qU/18fNUnX0aNGhQsY/rQ5/q4+epuvt07vsDg8HANddcU3S8LvfpnPr0eTqnuvpU2veJdb1PpVGfbMrqU1k/L0DV9ik6OvqC7f6pUiFCUlIS9957L998802JhQbB9pt9g8FQ6jkpXUxMDIWFheW+Pjc3lxMnTlT4OeVpk5ubW+zjuLi4C7YLDg4u9nFqamqF62tIfcrKyjpvu7rYpwtpSH3qYspnd6E/0bmO/Lwvnlv/FiLExcVx+vRpoOxxUBv79M97/11d/Tz9895/V1N9MpvNReOhImpzn+rj56k6+9S3b/HXnupDn+rj56mm+pSQkFDiuXW9T6U9V32yKavN378/qC99+ue9/059svlnny7088K5515sn+Lj4yvcrlIhwh133MEPP/zA/fffzyWXXIKPj09lbidAQEAAgYGBZZ7/56KNLi4utGrVqsLPKU8bFxeX8z67NN7e3iU+vtCzGmKfziWLPj4+5/3NY13qU3k1pD41Mxs4csxCttnIB3+eYcIlhXg4OxQ9+0LjoDb26Z/3Pt+zS6M+lX5fk8lEixYtLjgT4Z9qc5/q4+epOvvk7Ox83meXprb3qT5+nqq7T+f+XvjnDmF1uU9l3Vd9+ss/25T2/UFd71Np1Cebsvp0vu8
Tq7JPDg4VjwQM1vIuBFAKDw8Ppk+fXupChFIx+/btIyQkhL1799K1a1d7lyPVLD09ndWrVzNs2DA8PbVyf3329dZTPLZkNwB3XNKap0Z1KTqncSCgcSA2GgcCGgdio3EgUHPj4GJ+DjVW5oFubm4lpl+IiMhfxvVqQZ9Wtllan22I4kBcup0rEhERERG5eJUKESZPnsy3335bVbWIiNQ7RqOBGWNDMBkNmC1Wnv52LxbLRU8AExERERGxq0qtiTBu3DjWrl3L8OHDmTZtGkFBQaW+z9mrV6/KPEZEpE7r7O/JLQOCmb3+OFtPpPDN9tOM7xNk77JERERERCqsUiHC37cj+u2330qc1+4MIiI2D17RgR92x3AmPY///HyAK7o0q9xUMBERERERO6hUiDBnzpyqqkNEpF7zcHbg2au7cs/C7SRn5fPaLwf592UVX0VXRERERMSeKhUi3HzzzVVVh4hIvTeyW3Muae/LusOJfBlxkpGdtC2uiIiIiNQtVTabNjMzk8jISCIjI8nMzKyq24qI1BsGg4EXx4Tg5GDEaoWXVhxBayyKiIiISF1S6RBhy5YtDBs2DB8fH0JCQggJCcHHx4dLL72UrVu3VkWNIiL1Rmtfd+4e0haAA2eyWB9nsHNFIiIiIiLlV6nXGTZv3szQoUNxcnLi9ttvp3PnzgBERkby5ZdfMnjwYNasWUNYWFiVFCsiUh/cPbQt3+2M5kRSNstPGrktNZcunp72LktERERE5IIqFSI89dRTBAYGsn79epo3b17s3PPPP8/AgQN56qmnSt25QUSkoXJxNDFjTAhTPosgz2Lg6R8OsvjuppiMmpUgIiIiIrVbpWciPPvssyUCBIBmzZoxbdo0ZsyYUZlHNDjfffcdW7ZssXcZUs3MZjMpKSkkJiZiMpnsXY7YSR8vN7amubP9VDp3vvk1g5rk2LsksQN9PRDQOBAbjQMBjQOxqalxEB0dXeE2lQoRjEYjhYWFZZ43m80YjdoJvSJiYmLO+99U6pesrCx7lyB21Mlq5JChC+lWF1YluOGefoLGRgUJDZW+HghoHIiNxoGAxoHYVPc4iI+Pr3CbSoUIAwYM4P333+fGG2+kVavi+52fPHmSWbNmMXDgwMo8osEJCAggMDDQ3mVINTuXLPr4+ChhbsDMZjNXxceyJC0YC0Y20ZE7glJwVPbaoOjrgYDGgdhoHAhoHIhNTY0DB4eKRwIGq9V60RuM7dixg8GDB1NYWMi1115Lhw4dADh48CDLli3DwcGBdevW0aNHj4t9RIOxb98+QkJC2Lt3L127drV3OVLN0tPTWb16NcOGDcNTC+o1WOfGQaSpDR+uPwnAHZe05qlRXexcmdQkfT0Q0DgQG40DAY0DsampcXAxP4dWaiZCz5492bx5M0899RTff/892dnZALi5uTF8+HBeeuklunTRN8MiIudzx8AgNkWlset0GrPXH2dYJz8GtPW1d1kiIiIiIiVUetJsly5d+Pbbb0lPTyc2NpbY2FjS09NZunSpAgQRkXJwNBl5Y2IoLo5GrFZ45OtdpOcW2LssEREREZESquzNW6PRSLNmzWjWrJkWUxQRqaC2TT14amRnAGLScnn++312rkhEREREpKQKvc7w4osvYjAYeOqppzAajbz44osXbGMwGHjmmWcuukARkYZicr9W/BYZzx+HEli6PZqBbX25vncLe5clIiIiIlKkQiHC888/j8Fg4PHHH8fJyYnnn3/+gm0UIoiIlI/BYGDmuO5c9dYfpGYX8O+lu/H3dqkT6yMciEtn1YF4+gY3pm9wY3uXIyIiIiLVpEIhgsViOe/HIiJSOc08Xfhwcm9u+nQzBWYrd36+jaV3D6B9s0b2Lq2EpMw8lu2M4Zvtp9kXkw6Ak8nIl9P60buVj52rExEREZHqoMULRERqmX5tmjBznG1r3IzcQqbO2UJ8Rq6dq7LJL7SwYm8st8/bSvgrv/Pi8v1FAQJAvtnCXV9sIy6tdtQ
rIiIiIlWrUiGCyWRi4cKFZZ7/6quvMJlMlXmEiEiDNLZnII9c2QGA6NQcbp+3lez8QrvWdCo5m6vfXcddX2xnZeQZCi1WAFo2duPBy9vz6FUdAUjIyOPOz7eSW2C2Z7kiIiIiUg0q9DrDP1mt1vOeN5vNGAyGyjxCRKTBumdYO04mZ/P11tPsPp3G/V/u5KObemMy1vzX1W0nUpg2fytJWfkAeDg7MKqbP9f3bkHfYJ+ir/UxqTks2HySXafTeHLpHl6f0EN/D4iIiIjUI5V+naGsbw7T09P55Zdf8PWt/QuCiYjURgaDgZev7cYl7W1fR1dGnmHG8v0XDHCr2g+7YvjXJ38WBQh3Dm7Dlqcu59Vx3Qlr3bjY3wPPje5K2NmFFZfuiObT9cdrtFYRERERqV4VDhFeeOEFTCYTJpMJg8HA5MmTiz7++z8+Pj58/vnn3HDDDdVRt4hIg+BoMvL+pF50am5bWHHuxig+WXesRp5ttVp5b9Vh7vtyB/mFFkxGA/+5rhtPjOyMq1Ppr6o5ORiZNbkXAV4uALzyUyR/HEqokXpFREREpPpV+HWGsLAwpk+fjtVqZdasWVxxxRV06NCh2DUGgwF3d3d69+7NddddV2XFiog0RJ4ujnw2tS9j399AfEYer/x0gOSsAh67qiPGanq1Ib/QwhNL9/DN9tMANHJ24IPJvRnU/sKzy3w9nPl4Sh/GfbiR3AIL9y7czvf3DiLY171aahURERGRmlPhEGHEiBGMGDECgKysLO666y7Cw8OrvDAREflLgLcrc27py5RPI0jKyufDtUeJTs1h5rjuuDhW7QK2CRl53Pfldv48lgxA4Nlnd6jANpMhgV68Nq4H93+5g/TcQm6fv5Vvpw+gkYtjldYqIiIiIjWrUmsizJkzRwGCiEgN6RrgxbfTB9Kmqe03+j/siuGmTzeTcnatgsqyWq0s3nqKy99YWxQghAZ58909AysUIJxzTY8A7hrSFoAj8Zm8/uuhKqlTREREROynUrsznHP69Gl27NhBWloaFoulxPkpU6ZUxWNERBq8lk3cWHr3AKbN30ZEVDJbolK4/oONzL0ljJZN3C76vieTsnny2z2sP5JYdGxMaACvXl+5mQ6PXtWRHSdT2Hw8mS/+PMFN/VvRtqnHRd9PREREROyrUiFCbm4uN998M9988w0WiwWDwVC0avjfV+tWiCAiUnW83ZyYf1sYjy7ZzQ+7YjiWmMW1szYw++Y+9GzpU6F7mS1W5mw4zuu/HiKnwAyAXyNnXhwTwvCQ5pWu1WQ08OzoLlz97noKLVb+81Mks2/uW+n7ioiIiIh9VOp1hieffJKlS5fy8ssvs2bNGqxWK/PmzePXX39lxIgR9OjRg127dlVVrSIicpaLo4m3J4Zy91Db6wJJWfnc8PGfPLZkFxHHky+4DWSh2cKmo0lcN2sDL/0YWRQg/CssiN/+b0iVBAjndA3wYnzvFgCsjIxnw99mO4iIiIhI3VKpmQhLlizhlltu4fHHHycpKQmAwMBALr30Ui6//HIuvfRS3n//fT744IMqKVZERP5iNBp4fHgngnzceGbZXvIKLXy99TRfbz1NqyZujOvVgut6tyDQ2xWAmNQc/jiUwNpDCaw/kkhGbmHRvYKbuPHKdd0Y0PbCuy9cjIev7Mjy3bFk55t56cdIlt83CFM17SwhIiIiItWnUiFCfHw8YWFhALi62r5JzcrKKjp//fXX8+KLLypEEBGpRjeGt6RDMw8+WXeM3yPjKbRYOZGUzeu/HeKNlYcIC25MSnY+h85klmhrMhq445I2PHh5+yrf5eHvmnm6cNeQtrzx2yEiY9P5ZttpJvQNqrbniYiIiEj1qFSI0KxZs6IZCG5ubvj4+HDw4EFGjx4NQHp6Orm5uZWvUkREzqtPcGP6BDcmKTOPZTtjWLLtNPtj07FaYfPx5GLX+rg5MrhDU4Z0aMol7ZvStJFzjdR4xyVt+DLiJLFpucz89SCjuvvj7lw
l6/uKiIiISA2p1Hdv4eHhrF+/nscffxyA0aNHM3PmTPz9/bFYLLz55pv069evSgoVEZELa+LhzK2DWnProNbsi0njm23RrDpwhiYezgw5GxyEBHrZ5VUCVycTjw3vyENf7SIhI4+P1h7l/67sWON1iIiIiMjFq1SIcP/997N48WLy8vJwdnZmxowZbNq0iZtuugmAtm3b8s4771RJoSIiUjFdA7zoGuDFs6O72LuUImN6BDJnQxS7T6fx8bpj3BDWkoCzazaIiIiISO1Xqd0ZBg0axNtvv42zs20qbFBQEJGRkezYsYPdu3cTGRlJx476LZOIiNgYjQaeHmULNXILLMz85aCdKxIRERGRiqhUiFDqDY1GevToQUhICA4OetdVRESKC2vdmBFnt5D8dkc0u06l2rcgERERESm3Cv2U/8cff1zUQwYPHnxR7UREpH7694hOrIw8Q4HZyovL97P4zv4YteWjiIiISK1XoRBh6NChGAzl/ybParViMBgwm80VLkxEROqvVk3cmTogmE/WHWfbiRQ+WXeMO4e0tXdZIiIiInIBFQoRVq9eXV11iIhIA/Pg5R34bf8ZopKy+d+vBxnU3peuAV72LktEREREzqNCIcKQIUOqqw4REWlg3J0deOuGnlz/wUYKzFYeXLSTH+4bhIujyd6liYiIiEgZqmxhxdjYWHbt2kVWVlZV3VJEROq50CBvHrisPQCH4zP5788H7FyRiIiIiJxPpUOEZcuW0alTJ1q0aEGvXr3YvHkzAImJifTs2ZPvvvuuso8QEZF6bPrQtvRu5QPA3I1RrDkYb+eKRERERKQslQoRfvjhB6677jp8fX157rnnsFqtRed8fX0JDAxkzpw5lS5SRETqLweTkTcnhOLuZHuN4dElu0nOyrdzVSIiIiJSmkqFCC+++CKDBw9m/fr13HPPPSXO9+/fnx07dlTmESIi0gC0bOLG89d0BSAhI49/f7O7WDAtIiIiIrVDpUKEvXv3MmHChDLPN2vWjPh4TUsVEZELG9e7BSO7NQfg1/1n+HrrKTtXJCIiIiL/VKkQwc3N7bwLKR47dowmTZpU5hEiItJAGAwGXh7bjWaezgC88MN+jidqsV4RERGR2qRSIcKwYcOYN28ehYWFJc7FxcXxySefcOWVV1bmESIi0oD4uDvx+vhQALLzzdz9xTay80v+HSMiIiIi9lGpEOGll17i9OnT9O3bl48++giDwcAvv/zC008/Tbdu3bBarTz33HNVVauIiDQAg9r7cteQtgAciMvgsSVaH0FERESktqhUiNCpUyc2bNhAkyZNeOaZZ7BarcycOZNXXnmFbt26sW7dOoKDg6uoVBERaSgeubIDg9r5ArB8dyyz1x23c0UiIiIiApUIEQoKCti9ezeenp6sXLmSxMRENm/ezKZNmzhz5gyrVq2ic+fOVVlrueXl5fH4448TEBCAq6sr4eHh/Pbbb+VqGx0dzYQJE/D29sbT05MxY8Zw7NixUq/99NNP6dy5My4uLrRv35533323KrshItJgOZiMvPuvngR6uwLwn58j2Xgk0c5ViYiIiMhFhwhGo5HevXuzdOlSAHx8fOjbty/h4eE0bdq0ygq8GFOnTuWNN95g0qRJvP3225hMJkaOHMn69evP2y4zM5Nhw4axdu1annzySV544QV27NjBkCFDSEpKKnbtRx99xO23307Xrl1599136d+/P/fffz+vvvpqdXZNRKTB8HF34qObeuPsYMRihXu/3EF0ao69yxIRERFp0C46RDCZTLRq1Yq8vLyqrKfSIiIiWLRoEf/5z3+YOXMm06ZNY9WqVbRq1YrHHnvsvG1nzZrF4cOHWb58OY899hgPPfQQv/76K7Gxsbz++utF1+Xk5PDUU08xatQolixZwh133MH8+fOZNGkSM2bMICUlpbq7KSLSIIQEevGf67oBkJyVz12fbyO3wGznqkREREQarkqtiXDffffx8ccfk5ycXFX1VNqSJUs
wmUxMmzat6JiLiwu33XYbmzZt4tSpsvcdX7JkCX379qVv375Fxzp16sRll13G119/XXRs9erVJCUlMX369GLt77nnHrKysvjxxx+rsEciIg3bdb1acHP/VgDsiU7j6e/2aqFFERERETtxqExjs9mMs7Mzbdu2Zdy4cQQHB+Pq6lrsGoPBwEMPPVSpIitix44ddOjQAU9Pz2LHw8LCANi5cydBQUEl2lksFnbv3s2tt95a4lxYWBi//vorGRkZNGrUiB07dgDQp0+fYtf17t0bo9HIjh07mDx5cpk1xsfHk5CQUOzYkSNHALjppptwd3cvR0+lLjObzaSnp+Pp6YnJZLJ3OWInGgflZzUYcer6L/K9gliy7TQrv/6URnE77F1WldA4ENA4EBuNAwGNA7GpiXFQ6OxFfKMOFW5XqRDhkUceKfrzp59+Wuo1NR0ixMbG4u/vX+L4uWMxMTGltktOTiYvL++CbTt27EhsbCwmkwk/P79i1zk5OdGkSZMyn3HOrFmzeOGFF0o9dy6gEBGR4ozbd+N/89s4NGpCSuvLObh1HblRO+1dloiIiEidYnL3wWvADXj0uIqC5OgKt69UiHD8eO3bcisnJwdnZ+cSx11cXIrOl9UOKFfbnJwcnJycSr2Pi4tLmc84Z/r06YwfP77YsSNHjjB27Fh69uypmQgNgBJmAY2Di5F37Efiu92IwehA8+ufxm/PFzhl1+1dGzQOBDQOxEbjQEDjQGyqYxxYTM6kt+hHpn8frCbHi77PRYcIOTk5vP322wwbNozRo0dfdAFVzdXVtdTFHnNzc4vOl9UOKFdbV1dX8vPzS71Pbm5umc84x8/Pr8QshnM+//xzunbtet72Uvelp6ezevVqhg0bVuLVG2k4NA4uzvLdMdy7cAdWBxdMQ+/lm3sG4ufpYu+yLprGgYDGgdhoHAhoHIhNVY6DtOwCFkSc4MM1R8nILSw6PrhDU667sgnXflax+110iODq6spHH31Ely5dLvYW1cLf35/o6JJTMmJjYwEICAgotV3jxo1xdnYuuu58bf39/TGbzcTHxxcLA/Lz80lKSirzGSIiUnlXdw/gZHI2r604SExaLrfN28pXd/bDzalSk+tERERE6o38QgtrDsbz7Y5ofo+MJ99sKTrXq6U3jw3vRL82Tdi3b1+F712p77h69+7N3r17K3OLKhcaGsrq1auLpn6cs3nz5qLzpTEajXTr1o2tW7eWOLd582batGlDo0aNit1j69atjBw5sui6rVu3YrFYynyGiIhUjbuHtOVkUjaLtpxiT3Qa93+5g49u6oPJaLB3aSIiIiJ2YbVa2XEqlW+3R7N8dwwp2QXFznds1ohHrurI5Z39MBgu/numSm3x+NZbb7Fo0SJmz55NYWHhhRvUgHHjxmE2m/n444+LjuXl5TFnzhzCw8OLdmY4efIkBw4cKNF2y5YtxYKEgwcPsmrVqmJrGFx66aU0btyYDz74oFj7Dz74ADc3N0aNGlUdXRMRkbMMBgMzxoZwSXtfAFZGxjNj+X47VyUiIiJiH38cSmDE2+u4btZGPv/zRFGA4OZk4rpegXxxWzg/PXAJV3RpVqkAASo5E2Hq1KkYjUbuvPNO7r//fgIDA0vd4nHXrl2VKrIiwsPDGT9+PE888QTx8fG0a9eOefPmERUVVWwHiSlTprB27dpie41Pnz6dTz75hFGjRvHII4/g6OjIG2+8QbNmzXj44YeLrnN1dWXGjBncc889jB8/nquuuop169bxxRdf8PLLL9O4ceMa66+ISEPlaDIya1Ivxn+4iQNxGczdGEXLxm7cOqi1vUsTERERqRFHEzJ55cdIfj8QX3TMaIBB7ZtyXc9AruzarMpf+azU3Ro3bkyTJk3o2LFjVdVTJebPn88zzzzD559/TkpKCt27d2f58uUMHjz4vO0aNWrEmjVreOihh3jppZewWCwMHTqUN998k6ZNmxa7dvr06Tg6OvL666/z/fffExQUxJt
vvskDDzxQnV0TEZG/aeTiyGdT+zL2/Q3EZ+Qx48f9+Hu5MKJbye16RUREROqLtOwC3ll1mHkboyi02H4x7u5k4q4hbZnYN6haF52uVIiwZs2aKiqjarm4uDBz5kxmzpxZ5jVl1d6iRQsWL15crufccccd3HHHHRdTooiIVJEAb1c+m9qXCR9tIjvfzP2LdjDb2YEhHZpeuLGIiIhIHWK2WFkYcZI3fj1Y9MqCwQATegfx8FUd8GtU/TtWVWpNBBERkdogJNCLDyf3xslkpMBs5c7PtxJxPNneZYmIiIhUmbi0XP718Z88893eogAhLLgxP9w7iFfHda+RAAEqORMBwGw288UXX/Djjz9y4sQJAFq1asXVV1/NpEmTMJlMlS5SRETkQgZ3aMo7/+rJPQu3k1tg4da5W1h4RzjdW3jbuzQRERGRSll7KIGHvtpJclY+AC18XHlyZGdGhDSv9EKJFVWpmQhpaWkMHDiQW2+9lV9//ZWCggIKCgr47bffuOWWWxg0aBDp6elVVauIiMh5DQ9pzsxx3QHIzCvk5s8iOHQmw85ViYiIiFycQouV11Yc4ObPIooChIl9gvjtoSGM7OZf4wECVDJEeOqpp9i2bRvvvvsuCQkJbN++ne3btxMfH897773H1q1beeqpp6qqVhERkQu6rlcLZozpCkBKdgGTZ2/mRFKWnav6S6HZwsG4DJZuP82M5fu5a9Fefos2FNstSERERCQ1D277Yjez1hwFbNs1vjmxB6+O646rk/1m/FfqdYZvv/2W6dOnM3369GLHHR0dufvuu4mMjGTJkiW8++67lSpSRESkIm7qH0xmnplXVxwgPiOPSbM3s/iu/vh7uV64cRWzWq2sPZTAir1x7I9N50BcBvmFln9cZaL95mgeuNKrxusTERGR2mf90WRe220iq9A2s79T80a8d2Mv2vl52LmySoYISUlJ593esVOnTiQna2ErERGpeXcPbUtmXgHvrz7K6ZQcJn70Jx9O7k2XAM8aq2HzsSRm/nKQrSdSSj3vaDLgaDKSnW/mzVXHae/vw0htTykiItKgrdgby71f78Nitb2q8K+wIJ4b3RUXx9qx3mClQoR27drx/fffl5iJcM73339P27ZtK/MIERGRi/bIlR3JzC1k3qYTnEzO5tpZG5gxNoQJfYKq9bl7o9OY+ctB1h5KKDrm6miiewsvugR40jXAiy7+nrTz82B31BkmzdlGntnAQ1/txN/LhZ4tfaq1PhEREamdNhxJ5P4vd2KxgqPRyoujO/Gv/u3sXVYxlQoRpk+fzr333svIkSN58MEH6dChAwAHDx7knXfe4bfffuO9996rkkJFREQqymAw8Pw1XWnm5cL/fjlIXqGFx5bsZltUCi+MqfpE/2hCJm/8eogf98QWHXNxNDJ1QGvuGtIGbzenEm06+LlzSwcLnxwwkVdo4Y75W/l2+kCCGrtVaW0iIiJSu+08lcod87eSb7bgYDRwW0czo7r62busEiodIsTHx/Pf//6XX375pdg5R0dHnn32We6+++5KFSgiIlIZBoOB6UPbERrkzf1f7iAxM5+vtp5iT3QaH0zuRasm7pV+RmRsOh+sOcry3TFYzq6P6GA0cENYEPdd2p5mnufft7mzt5Unr2rHjBVHSMzM55a5W/jmrgF4uTlWujYRERGp/Q6fyWDqnAiy880YDPDKNR1xPrPX3mWVqlIhAsDzzz/Pvffey8qVKzlx4gQArVq14vLLL8fX17fSBYqIiFSFAW19+fH+S7hv4Q4iopLZH5vO1e+u5/XxPbiya/OLuufWqGRmrTnKqgPxRccMBhgbGsiDl7evUEAxvpc/Z7ItfPzHMY7EZ3L3gm3MvSUMJ4dKbaQkIiIitdyp5Gxu+jSC1OwCAF4aG8Lwzj6sPmPnwspQ6RABwNfXlxtuuKEqbiUiIlJtmnm6sOCOcGb+cpCP/zhGRm4h0z7fRq+W3ozrHcTVPfzxdDn/b//P7bYwa/VRIqL
+WjzYaICruwdwz7B2dGze6KLq+/fwTpxMymbFvjg2Hk3iqW/38Nq47nbZA1pERESqX0JGHjd9upm49FwAHr2qI5PCW5Genm7nyspWqRBh5cqVrFq1ildeeaXU80899RSXXXYZl156aWUeIyIiUmUcTUaeHNmZXi19eHTxLjLyCtl+MpXtJ1N54Yd9DA9pzrjeLRjQ1heT0UBiZh67T6ey+3Ta2X9SSczML7qfk8nIuD4tuHNwm0q/GmE0GnhzYiixn/zJrlOpLN52mq4Bnkwd2Lqy3RYREZFaJi2ngJs/iyAqKRuAaYPbMH1o7d+YoFIhwowZM2jZsmWZ56Ojo3nppZcUIoiISK0zPKQ53Vp4sXDzCb7ZFk1cei55hRaW7Yxh2c4Y/L1cMBoMRKfmlNre3cnE5H6tuG1Qa/wusOZBRbg6mZg9pQ9j399AdGoOr644yGWdm2mhRRERkXokJ9/M7fO2sD/WNuNgQp8WPDGiU52YfVipFy337NlDeHh4mef79u3L7t27K/MIERGRahPo7cqjV3Viw78vZf6tYVzTI6BoDYLYtNwSAUJrX3fGhAbw/OgubPz3ZTwxsnOVBgjnNG3kzGvjugOQU2DmyW/3YLVaq/w5IiIiUvMKzBbuWbidLVEpAAzv2pxXru1WJwIEqORMhLy8PPLz8897Pjs7uzKPEBERqXYmo4HBHZoyuENT0nIKWL47hl/3ncHV0UT3IC96tPAmJMCrRndLGNjOlwl9WvD11tOsO5zI0u3RXN+7RY09X0RERKqexWLlkcW7ihZlHtiuCW//KxQHU91ZSLlSIUJISAjffvst//d//1finNVqZenSpXTp0qUyjxAREalRXq6OTApvxaTwVvYuhadGdmH1wQQSMvKY8eN+hnRsiq+Hs73LEhERkYtgtVp5/od9LNsZA0CPFl58dFMfnB1Mdq6sYioVd9x3331s2LCB8ePHs2fPHgoLCyksLGT37t2MHz+eTZs2cd9991VVrSIiIg2Kl5sjL17TFYDU7AKe/36fnSsSERGRi/XmysPM33QCgPZ+Hsy9JQwP5yrZMLFGVariyZMnc/ToUWbMmMHSpUsxGm2ZhMViwWAw8PTTT3PzzTdXSaEiIiIN0Yhu/lzVtRm/7DvD8t2xjA09w+Vdmtm7LBEREamAz9Yf553fDwO2NZk+vy0cH3cnO1d1cSodezz33HNMnjyZb7/9lmPHjgHQtm1bxo4dS9u2tX97ChERkdruxTEhbDyaREZuIU9/t5ewNo3xdKm59RlERETk4n2z7TQvLt8PgK+HE1/cHk5zr6pfmLmmVMncibZt2/LII49Uxa1ERETkH5p5uvDUyM78e+ke4tJzefXnA7x8bTd7lyUiIiLnkV9o4Y3fDvHRH0cBaOTswLxbw2jt627nyiqnSkKE48eP8/PPP3PihO39juDgYIYPH07r1q2r4vYiIiIN3sS+QSzbGcOmY0ks2HySa3oEEN6mib3LEhERkVJEJWZx/6Id7D6dBoCbk4lPp/ala4CXnSurvEqHCA8//DBvv/02Foul2HGj0ciDDz7I//73v8o+QkREpMEzGAz857puXPXWH+QVWnj8m90su3cQXq56rUFERKS2sFqtLN0ezbPL9pKVbwagi78n7/yrJ+38POxcXdWo1O4Mr7/+Om+++SbXXXcdmzZtIjU1ldTUVDZt2sS4ceN48803efPNN6uqVhERkQYt2Nedh6/sAEBUUjb3LtxOodlygVYiIiJSE9JzC3jwq508vHhXUYBw26DWfHvPgHoTIEAlQ4RPPvmEa665hq+//prw8HA8PT3x9PQkPDycRYsWMXr0aD766KOqqlVERKTBu31QG648uzvDusOJzDi7UJOIiIjYh9VqZc3BeEa9s45lO2MA2wKKc2/pyzNXd8HZwWTnCqtWpUKEqKgorrrqqjLPX3XVVURFRVXmESIiIvI3RqOBNyeG0sXfE4B5m07w+aYo+xYlIiLSQEUcT2biR38
ydc4WTiXnADC4Q1N+fmAwQzv62bm66lGpNRH8/PzYtWtXmed37dpF06ZNK/MIERER+Qd3Zwdm39yHMe9vICEjj+d/2E+wrzuXtNffuSIiIjVhz+k0/vfrQdYeSig65u5k4qErOnDrwNYYjQY7Vle9KjUTYfz48cyePZv//ve/ZGVlFR3Pysri1VdfZfbs2UycOLHSRYqIiEhxAd6ufDKlD84ORswWK9MXbOdIfKa9yxIREanXjsRnMH3BNka/t74oQHB2MDJtcBvWPX4pt1/Spl4HCFDJmQgzZsxg586dPPnkkzz77LMEBAQAEBMTQ2FhIcOGDePFF1+skkJFRESkuNAgb/43vgf3fbmDjNxCbpu3he+mD8TH3cnepYmIiNQrp5KzeWvlYb7dcRqL1XbMwWjghrAg7h3WnuZeLvYtsAZVKkRwc3Pj999/Z9myZfz888+cOHECgOHDhzNy5EhGjx6NwVC/UxgRERF7Gt0jgCPxmbz9+2FOJGVz94JtzL81HCeHSk02FBERESA+PZd3Vx1h0ZaTFJht6YHBANeGBvLg5R1o2cTNzhXWvEqFCOeMGTOGMWPGVMWtREREpIIevLw9RxMyWb47lj+PJfPvpbt5fXwPBfkiIiIXKSUrnw/XHmXepihyC/7aTnl41+b835Ud6NCskR2rs69K/ZrCZDKxcOHCMs9/9dVXmEz1azsLERGR2sZgMPC/8T3oEeQNwNLt0cz85aB9ixIREamDjsRn8uIP+xn82mo++uNYUYAwuENTvr93IB/e1LtBBwhQyZkIVqv1vOfNZrN+CyIiIlIDXBxNfHpzH8Z9sJGopGxmrTlKM08Xbh4QbO/SREREarW8QjMr9saxYPNJIo4nFzvXN9iHR67sSHibJnaqrvap9OsMZYUE6enp/PLLL/j6+lb2ESIiIlIOvh7OzLs1jOs/2EhiZj7P/7CPpo2cGdnN396liYiI1DpH4jP5eusplmw7TXJWfrFzA9o24Y5L2jC0Y1P9YvwfKhwivPDCC0U7LhgMBiZPnszkyZNLvdZqtXL//fdXrkIREREpt1ZN3JkzNYwbPt5EVr6ZB7/aSRN3J/0GRUREGjyr1cru02n8si+OX/efKbE1so+bI+P7BHFD3yDaNPWwU5W1X4VDhLCwMKZPn47VamXWrFlcccUVdOjQodg1BoMBd3d3evfuzXXXXVdlxYqIiMiFdWvhxQeTe3Pr3C3kF1q4ff5WFt/Vn07NPe1dmoiISI0yW6z8eSzJFhzsO0Ncem6Ja8JbN+bG8JYMD2mOs4PW9LuQCocII0aMYMSIEQBkZWVx1113ER4eXuWFiYiIyMUb3KEpr43rzv99vYuM3EKmfraFpdMHEODtau/SREREql18Ri5fbznFlxGniE7NKXG+R5A3V3ZpxoiQ5pp1UEGVWhNhzpw5VVWHnPXdd9+xZcsWe5ch1cxsNpOSkkJiYqJ2MGnANA4Eqn8cXN7UlZUJHsSl53LNG78ytWUqHg7nXxhZap6+HghoHIiNxsHFs1ohKtuRrakuRGY4Y+GvtQyMWAl2K6BTozw6eeTj6ZgAJw7zxwn4w441l6WmxkF0dHSF21QqRJg/f365rpsyZUplHtOgxMTEUFhYaO8ypIZkZWXZuwSpBTQOBKpvHLSwQhdTEPvNzUjMd+CTo+6McD6Iq0F/19RG+nogoHEgNhoH5VdoNXDY7EtkoR9p1uIz7hobsunoEE9rUwrOVjOkQ0o6pNip1oqq7nEQHx9f4TaVChGmTp1a5rm/r2CpEKH8AgICCAwMtHcZUs3OJYs+Pj5KmBswjQOBmhkHLa3wQ1wOO9JcSbO6stISws0tU2mkGQm1hr4eCGgciI3GQfnlW2BbqisbklzJNP/138pksNK1UR59fXJo4VKIweAOuNuv0ItQU+PAwaHikUClQoTjx4+XOGY2m4mKimLWrFmcPHmSefPmVeYRDc7YsWPp2rWrvcuQapaens7q1asZNmwYnp5a6Ky
h0jgQqLlxMNVi5Ymle/hq6ykS8x1Ylh7MwjvC8WvkUm3PlPLT1wMBjQOx0Ti4sOz8Qhb8eZKP/zhGYmZe0fGWjd2Y3K8l43sH4ePuZMcKK6+mxsG+fft4+umnK9SmUiFCq1atSj3epk0bLr30UkaNGsV7773H+++/X5nHiIiISCUZjQb+c103DAZYtOUUR+Iz+dfHf/LltH4KEkREpE5Izc7nqy2n+PiPYyRl5Rcdb+Przn2XtWN09wAcTEY7VtgwVCpEuJCrr76aZ555RiGCiIhILWA0Gnjl2m6ALUg4mpClIEFERGq1zLxCVu4/ww+7YvjjcAIF5r9exWvb1J37L2vP1d0DMBkN57mLVKVqDRGOHj1KXl7ehS8UERGRGnEuSDAY4MuIvwUJd/TDz1NBgoiI2F9ugZk1B+P5YVcsvx84Q26Bpdj59n4e3HdZe0Z181d4YAeVChH++KP0zTBSU1P5448/eOeddxg7dmxlHiEiIiJVzGg08PJY24yEc0HC+I828cVt4QQ1drNzdSIi0lBl5BYwf9MJZq87Rkp2QbFz3m6OjAjxZ3R3f/q1aYJR4YHdVCpEGDp0aLFdGM6xWq2YTCbGjx/Pu+++W5lHiIiISDU4FyQYDQYWbD7JiaRsxn24kS9uC6d9s0b2Lk9ERBqQtJwC5m6I4rMNx0nL+Ss8cHcycWXX5ozu4c+gdk1xctB6B7VBpUKE1atXlzhmMBjw8fGhVatWeHp6kpJSV3bgFBERaViMRgMvjQ3Bw8WBj9Ye40x6HhM+2sS8W8Po3sLb3uWVUGi2cCwxi/0x6Rw8k0EXf09G9wiwd1kiInKRUrLy+WzDceZuiCIjr7DoeKfmjbh7aFuu7NIcVydtc1nbVCpEGDJkSKnH8/Ly+P7771mwYAErVqwgNze3Mo8RERGRamIwGHhiRGe8XB15bcVBUrILuPGTzXwypQ/92zaxa22xaTn8HhnP/th09sWkcyA2nbzC4u/FbjqWxPOju+q3UyIidYjZYmX2umO88/thsvLNRcdDAj2579L2XNG5mV5XqMWqbGFFq9XK77//zoIFC/j2229JT0+nadOm3HjjjVX1CBEREakm04e2o5GLI88u20tmXiE3z4lg1o29uLxLsxqv5fCZDD5ce4xlO6MptFjPe+3CzSc5fCaDDyb3xtfDuYYqFBGRi3UkPoNHFu9m56nUomM9WnjxwOXtGdbRr9TX5aV2qXSIsG3bNhYsWMCiRYuIi4vDYDBwww03cO+999KvXz8NAhERkTripn6t8HRx4OGvd5FfaOHOL7bxyrUhTOgTVCN/n287kcwHa46xMvJMsePuTiY6+3vSNcCTrgFedAnwxMvVkXu/3MGuU6lsiUrhmnfX8/GUPoQEelV7nSIiUnFmi5VP1h3jjd8OkX92VllrX3eeG92FIR2a6ufGOuSiQoRjx46xYMECFixYwOHDhwkMDGTSpEmEhYUxceJErr/+evr371/VtYqIiEg1GxMaiIezA9MXbCev0MLj3+zhqy2neGpUF3q38qny51ksVtYciufDNceIiEouOm4wwPCuzbljcBtCW3iXOq31q2n9ePLbPSzdHk1MWi7Xf7CR18Z1Z0xoYJXXKSIiF++fsw8MBrhtYGseuaojLo5a86CuqXCI0L9/fyIiIvD19WXcuHHMnj2bQYMGAXD06NEqL7Ch+e6779iyZUuZ55s3b87w4cOLPo6Li2PFihUXvO/UqVOLfTx37twLthk+fDjNmzcv+njFihXExcWdt01oaCihoaFFH+/cuZOdO3eet01D7JPZbCYlJYXExERMpr++cNblPpVFfbIprU/R0dGljoNz6mKf6uPnqSb6lJ2dzeLFi0sdB39Xk32ad2sY0xdsJzkrn+0nU7n+g410aZTL5U2zaOxkKdGmop+nHLOBHWkuHDL7EZWUXXTcZLDSwzOXAU1y8C1MYPeqPewuo08ujiZeH98DU1oMS45ayCu08MCinXz960Yua5r
F33OH2j72zn0vdY7+f7JpaH069/2BwWDgmmuuKTpel/t0Tn36PJ1TXX0q7fvEutonqxU2JruyKtEds9X2RbmJUyFjmmdwe5/exQKEutKn86nKz1NZPy9A1fYpOjr6gu3+qcIhwubNm2ndujVvvPEGo0aNwsGhypZVECAmJobCwsILX3hWbm4uJ06cqPBzytPmnwtixsXFXbBdcHBwsY9TU1MrXF9D6lNWVtZ529XFPl2I+mQTFxfH6dOngbLHQV3sU338PNVEn8xmc9F4qIjq7NPQ0FBWPzKUD9YcZfa6oxRaYH+GCwcynOhsiqeHYyzOBnOZ9yjr85RscSWy0I+jZh/MmABbgODh7MCk8JakRnyLW34BWbFQ1lfIv/fJYDDQ1TGeNKcU1uS3IR8HNiS7EZ+WzUDHKM7Njq3tY69v377FPtb/TzYNtU8JCQklnlvX+1Tac9Unm7La/P37g7rYJ4sVNha04rDZ4+xRKyEOZ+hpjMaaYCU3t3h4Whf6VJ57/11V9OlCPy+ce+7F9ik+Pr7C7SqcALz33nssXLiQa6+9lsaNG3P99ddzww03MHTo0Ao/XEoKCAggMLDsaZh/T7YAXFxcaNWqVYWfU542Li4u5312aby9vUt8fKFnNcQ+nUsWfXx8zvubx7rUp/JSn/569oXGQV3s04WoT6Xf12Qy0aJFiwvORPin6u6Tl6sj/x7RiV6emby79gR70l2wYGSfuTlHrH60c8+njXsBrd3yad7cr8RzW7VqRYEFYnMdiM51JDLDmZN5jsWua+3rzk39WnF97xZ4uToyN37zRfVpANApP40vT3uRkO/AYbMvzb1duaxpdrE+/b2PtWnsOTsXXxRS/z/99XFD6tO5vxeaNm1a4rl1tU9l3Vd9+ss/25T2/UFd61NAUCu+ifHkcK7ta5u3o5nrA9IJcjUBLYvufb5nl6Y2fZ5KU5V9Ot/3iVXZp4uZFGCwWq3nX/a4DMePH2fBggUsXLiQAwcO0Lx5c4YNG8aiRYtYsmQJ11577cXctsHat28fISEh7N27l65du9q7HKlm6enprF69mmHDhuHp6WnvcsRONA4E6tY42H06lZd/jGTz8eQS54IauzKwrS/92zYhr8DCztOp7DqVyoG4DMz/2GHBaIBLOzVjSv9WDGrnW6XbeCVk5HH9Bxs5mWwLD2aM6cpN/YOr7P7VpS6NA6k+GgcCdX8cZOQWcMf8rfx5zPZ3RWd/T+bd2he/Ri4XaCl/V1Pj4GJ+Dr3odxFat27N008/zdNPP120Q8NXX32F1Wpl+vTp/Pzzz1xzzTVcfvnlJRIZERERqXu6t/Bm0bR+rDoQz9Lt0Ww8mkhKdgEAp5JzWJR8ikVbTpXZvmkjZ67v1YJJ4S0JauxWLTU2beTMvFvDGPfBRpKy8nn2+300beTM8BD/anmeiIj8JSEjj6lzItgXkw5AWOvGzL65D54ujhdoKXVJlSxo0Lt3b3r37s3//vc/Vq1axRdffMFXX33F7NmzcXNzIzMzsyoeIyIiInZmMBi4rHMzLuvcDIvFyv7YdDYcSWT9kUS2RCWTW2BbdNHNyUS3QC9Cg7zpcfafAC+XGtnCq7WvO59N7csNH/9JToGZ+xft5IvbnAlr3bjany0i0lCdSs7mpk83Fy2We3nnZrx3Y0/tvlAPGav0ZkYjl19+OXPnzuXMmTN8+eWXXHbZZVX5iAtKTU1l2rRpNG3aFHd3d4YNG8b27dsv2M5isTB37lyuueYagoKCcHd3JyQkhJdeeqnEAhlg+yaqtH/++9//Vke3REREah2j0UBIoBd3DmnL57eFs+u5K1k6fQC/PDiYPc9fxVd39ueJkZ0Z2c2fQG/XGt0DvEeQNx9M7oWD0UB+oYXb523h0JmMGnu+iEhDsi8mjes/2FgUIIzr3YIPJ/dSgFBPVdvWCi4uLkycOJGJEydW1yNKsFgsjBo1il27dvHoo4/i6+vLrFmzGDp0KNu2baN9+/Zlts3Ozua
WW26hX79+3HXXXfj5+bFp0yaee+45fv/9d1atWlXim58rrriCKVOmFDvWs2fPaumbiIhIbefsYKJXSx97l1FkaEc//nt9dx5ZvIv03EJu/iyCb+4eQIC3q71LExGpF6xWK/M2RvHKzwfIL7TNRJs2uA1PjOhUo8Gx1Kx6tT/jkiVL2LhxI4sXL2bcuHEATJgwgQ4dOvDcc8+xcOHCMts6OTmxYcMGBgwYUHTsjjvuIDg4uChIuPzyy4u16dChA5MnT66ezoiIiEiljevdgviMXF5bcZDYtFwmf7qZOVP70qqJu71LExGp05Iy83hsyW5+P2DbItBogH+P6MS0wW3tXJlUtyp9ncHelixZQrNmzbjuuuuKjjVt2pQJEyawbNky8vLyymzr5ORULEA459wuE5GRkaW2y8nJKfV1BxEREakd7h7Slpv727a+OpaQxTXvbWDd4QQ7VyUiUndtOJLIiLfXFQUIgd6ufHVnfwUIDUS9momwY8cOevXqhdFYPBsJCwvj448/5tChQ3Tr1q1C94yLiwPA19e3xLm5c+cya9YsrFYrnTt35umnn+bGG2+84D3j4+NJSCj+zcuRI0cAyMzMJD09vUI1St2TlZVV7N/SMGkcCGgc1JSHhgZhtJqZ8+dp0nIKuPmzCP7v0tbcFBZYK6bcahwIaByITW0eBwVmC+//cYI5m05zbvPeKzr58tyIdni6OujnmCpUU+PgYjZBqFchQmxsLIMHDy5x3N/ftq1TTExMhUOE1157DU9PT0aMGFHs+IABA5gwYQKtW7cmJiaG999/n0mTJpGWlsbdd9993nvOmjWLF154odRzERERRcGF1H8RERH2LkFqAY0DAY2DmhBqgCntDXx5xEiB1cD/fj/Oml1HmdjGglMtWftL40BA40BsatM4yDXDjkQDf8QZicm2Ba9ORivXBVvo5x3Htj/180t1qe5xcPLkyQq3qbUhgsViIT8/v1zXOjs7YzAYyMnJwdnZucR5FxcXwPbqQUW88sorrFy5klmzZuHt7V3s3IYNG4p9fOutt9K7d2+efPJJpk6diqtr2Ys2TZ8+nfHjxxc7duTIEcaOHUtYWBidO3euUJ1S92RlZREREUFYWBju7novt6HSOBDQOKhpw4CRcZk8uGQ/cel5bE00kuPoyZvXd6G5Z8nvIWqKxoGAxoHY1JZxYLVa2RWdwbe74lixP4Gcs1v4AnT0c+e1sZ1o7etmt/rqu5oaB2W9tn8+tTZE+OOPPxg2bFi5ro2MjKRTp064urqWuu7BuTULzveD/T999dVXPP3009x2220XnFkAtjUV7r33Xu666y62bdvGoEGDyrzWz88PPz+/Us95eHjg6elZ7jqlbnN3d9fnWzQOBNA4qEn9PD1Zfn8Tpn+xnYioZPbFZnLj3F28869QBrQt+fpiTdI4ENA4EBt7jAOr1crRhExWHYjn662nORJffKp7M09n/hXWkruGtNX2jTWkuseBh4dHhdvU2hChU6dOzJkzp1zXnntdwd/fn9jY2BLnzx0LCAgo1/1+++03pkyZwqhRo/jwww/LWTEEBQUBkJycXO42IiIiUvN8PZz54vZwXly+jy/+PEliZh6TZm/m7iFteeiKDjia6tXa0yIipSowW9gbncbWqBQiopLZGpVMSnZBsWscjAYu7eTHDWFBDG7fFAd9fWzwam2I0Lx5c6ZOnVqhNqGhoaxbtw6LxVJsccXNmzfj5uZGhw4dLniPzZs3c+2119KnTx++/vprHBzK/5/o2LFjgG1HCBEREandnByMvDS2G90CvXju+33kFliYteYoG44k8vYNPQn21XRyEalf0nML2H4ihW0nUtgalcKOUynk/u01hb9r4+vOxL5BXNerBU0b2e91L6l9am2IcDHGjRvHkiVLWLp0KePGjQMgMTGRxYsXM3r06GLrJRw9ehSAtm3/2oYkMjKSUaNGERwczPLly8t8/SEhIaFEUJCRkcFbb72Fr68vvXv3ruq
uiYiISDWZ2LclvVv5cN+XO4mMTWfX6TRGvbOOF8aEcH2v2rF7g4hIRVmtVk6n5NgCgxPJbI1K4eCZDKzW0q/393Khb3Bj+rZuTN9gHzo2a6Svf1Kqehci9OvXj1tuuYX9+/fj6+vLrFmzMJvNJXZDuOyyywCIiooCbCHAVVddRUpKCo8++ig//vhjsevbtm1L//79AXj//ff57rvvGD16NC1btiQ2NpbPPvuMkydP8vnnn+Pk5FT9nRUREZEq086vEd/dM4DXVhzk0/XHyco388jiXaw5GM/L13bDy9XR3iWKSANmtVrJK7SQnW/GycGIq6MJk7H4D/iFZguRsRlsiUouCg7OpJdcLw7AYIAOfo3oHexDWHBj+gT70MJHiyRK+dSrEMFkMvHTTz/x6KOP8s4775CTk0Pfvn2ZO3cuHTt2PG/bpKQkTp06BcC///3vEudvvvnmohBh4MCBbNy4kdmzZ5OUlIS7uzthYWF89tlnXHrppVXfMREREal2zg4mnrm6C5e09+WRxbtJzMxj+e5YdpxM5a0bQukb3NjeJV4Uq9XK8cQs1h1OJCIqmeaeLtw2qDUB3uVfcFpEqpfFYuXgmQy2RiWz8XA8B0+ZeP/odrILLGTmFZKZW0ihpfgUAicHI25OJtwcTbg4mYhLyyU731zq/V0cjfRo4U2fYB/6BDemV0sfhaNy0epViADg4+PD7NmzmT179nmvOzcD4Zzg4GCsZc3t+YcrrriCK6644mJLFBERkVpsaEc/Vjx4CY8u3sXqgwlEp+Yw8aNN3Htpe+6/tF2dWFQsPbeAjUcS+eNwIn8cSuB0SvFtrj/fdIIbwoKYPrQdzb1c7FSlSMOVV2hmz+k0IqKS2XLcNnMgPbfwb1cYICPrvPfIL7SQX2ghlYIS53w9nOnTyoc+wT70buVD1wAvnBxq/9cuqRvqXYggIiIiUlm+Hs58NrUv8zZG8crPB8gvtPDO74fZcCSRtyaGEtS4dk773XkqlQ/WHGFlZDxmS8lfjvi4OZKSXUC+2cL8TSdYtOUUN4a1ZPrQtvh5KkwQqS7puQVnFzNMZsvxFHaeTiW/sPQFDX1cHWjimE9Q86b4uLvg4eKAu7MDHs4OuDqaKDDbXmvILTCTnW8mp8BMTr4ZT1dHerfyoU8rH1o1cdN6BlJtFCKIiIiIlMJgMDB1YGvC2zTh/i93cDg+k20nUhj59jpeujaEMaGB9i4RsL2usOFIErPWHGHj0aRi55wcjIS3bszg9k0Z3KEpHZp5sPNUKm+uPMwfhxLIL7Qwd2MUX0acZFJ4Kx64rD1ebpriLFJZVquVXafTWL4rho1HkzgQl04puR4AQY1d6dvqrwUNmzpbWLNmDcOGdcHT07NmCxcpB4UIIiIiIufR2d+TH+4bxCs/RTJ/0wky8gp5YNFO1h5M4PkxXfF0sc8P3RaLlV/3xzFrzVF2n04rOu5oMjAmNJCru/sT3roJrk6mYu16tvRh/q1hbDuRwlsrD7HucCJ5hRY+23CcX/bF8fYNofSpo+s/iNjb4TMZLNsZww+7YziRlF3ivMEAHZs1Iqx1Y/oE20IDf6/i65Okp6fXVLkiF0UhgoiIiMgFuDiaeHFMCIPbN+Wxb3aTnJXP0h3RrDuSyNOjOnNNj4AamzqclVfI0u2nmbMximMJf70z7epo4l9hLbn9kvItmti7lQ+f3xbOlqhk/vfLQTYfT7at//Dxn/zfFR24a0jbEqu/i0hJ8Rm5LNl2mu93xnAgLqPYOYMBegZ5E96mCWHBjenVSgsaSt2nEEFERESknC7v0owVLS7h4cW7WHc4kYSMPB5YtJOvt55ixpgQ2jT1qLZnRyVmMX/TCRZvPUVG3l8LsHm5OnLzgGCmDgimsXvFt5nuG9yYRdP62dZ/+OkA+WYLM385yMajibw5MRS/RlorQaQ0B+LSmb3uON/vjCHfXHx9gx4tvBjdI4Cruwdo8VKpdxQiiIiIiFSAn6cL828N49sd0bz
8YyRJWflsOJLE8LfWcdfQtkwf2hYXR9OFb1QOVquVdYcTmbcxilUH4/n7RlItfFyZOiCYf4W1xN25ct/SnVv/oU9wY+7/cgfHErPYcCSJkW+v4/UJoQzp0LSSPRGpH6xWK38cTmT2umOsO5xY7Fx7Pw+u6RHA6B4BBPu626lCkeqnEEFERESkggwGA9f1asFlnZrx6i8H+DLiJPlm2w4Oy3ZG88Bl7enXpkm5XisozZH4DL7dEc2ynTEltmcc2K4JUwe05tJOflX+ukFIoBc/3DeIZ5btZen2aBIz87n5swimDgjmnmHtaNrIuUqfJ1JXFJotfLczho//OMqhM5lFx40GuKprc26/pDW9WvpoRwRpEBQiiIiIiFwkLzdHXrm2G+N7t+Cpb/eyPzadE0nZ/N/XuwDw93KhV0sferWy7dXexd+zzL3a0/Jh3ubTrIhMYl9M8YXVXB1NXNcrkJsHBNOhWaNq7ZO7swNvTAhlYFtfnlm2l+x8c9EODjf0DWLakLYEXmQ4IlLXWK1Wft1/hpm/HORI/F/hgZuTiQl9grh1YGtaNqmdW76KVBeFCCIiIiKV1LOlD9/fO5D5m07w5spDZOTa1iyITcvlxz2x/LgnFrD91tLF0YSzgxFnBxPOjkacHYxYLRaOJJiwcrzYfUODvBkbGsC1PVvU+NaL1/duQc+W3vz7mz1ERCWTV2hh3qYTLNh8kmt7BnL30LbVugaE1H1xabnkF1rw93bB0VR6eFabbT6WxKsrDrD9ZGrRsWaeztwysDX/CmupBRKlwVKIICIiIlIFHExGbh3Umkn9WrI3Op3tJ1LYdiKFbSdTSMjIA8Bihex8M9n5ZqDgH3ewTYNu7evOmNAAxoQG0trO71W3aerBV3f2I+J4Mu+tPsK6w4kUWqws3naaJdtPM7KbP7cPak1okLemcQuxaTn8eSyJTUeT+PNYMieTbVscmowGArxdaNnYjZaN3Qhq7EZwE3f6t2mCz0UsBlrdDsSl89qKg6w6EF90zNvNkXuGtuOm/q2qbM0TkbpKIYKIiIhIFXJ2MNH77OsLd2CbDn06JYftJ1M4fCaT3AIzeYUW8grN5BdayCu0kJWbjyErkTuv6kX/jjW3XWR5GAwGwts0IbxNE3adSmXWmiP8su8MViv8uDuWH3fH0i3Qiyn9WzG6R0C9+wErO7+QA3EZ+Lg52T3UqW0ycgtYdziRPw4l8OexJKKSsku9zmyxcio5h1PJOWwgqei4yWggvHVjhoc058ouze26i0Gh2cLqgwl8teUkvx/4axFTF0cjtw5szZ1D2mrmgchZChFEREREqpHBYCDo7G9fy5Kens7q1asJCWhUqwKEf+oR5M1HN/Xh0JkMZq0+wvLdsRRarOyJTuPRJbt5+adIJvYNYnJ4q/P2t7bKKzRzIDaD3dFp7D6Vyu7TaRyOz8By9gfKHi28uK5XC67pEVArf4NeE44lZLLqQDyrDsQTcTyZQou1xDVuTib6BjemX5smeLs5cio5m5PJ2UX/Tsm2zcIxW6xsPJrExqNJPLtsHz1bejO8a3P6BDfGy9URT1cHPF0cqzWYOpGUxddbT7F462niz84YAlvAMaFPEA9e3p5mntqiUeTvFCKIiIiISIV0aNaIt27oyRMjO7Nw80kWRpwkISOP1OwCPlp7jI//OMYVnZtx76Xt6N7C297lnldMag6/7T/Db/vPEHE8mXzz/7d339FVVHv/xz8nvZFeSOgtBAJIzQ9QqQpiREHpNrgKS6rXq3j1WriI8KjY0QAKio9gI6L4iILoBUVFAoJYiIYWICEJJUBIL2d+f8Scy/EEnEjq4f1aKyuy954ze8jXWZMPs2es5x27O+2Mdqed0ePr9mhg+3Dd1KOpBrYPP+/DMp3FwRN5ejvpsD77JbPSuw283V3Vs2WQ+rQJUe/WIercJOCCz0DIKSzRz+ln9NkvWdrwS6YyzhRKknYdPq1d5zx/oIKHm4v8vdzl7+WmRt7l3/2
93cvbfg8agnw8FOrnobBGngpr5KlQP0+78KGwpEzHzxbpRG6Rjp8tUlZOodb/kqlv9p2025evh6uu7xqlO65orbbhPPMDqAwhAgAAAP6SCH8v3XN1tKYPbKsNv2Tqza2HlJSaLcOQPtuTpc/2ZKl/dJhmDW6rHi2C63q6ksqXlyRnnNXGPVn6bE+mw5swKvh7ualL00B1aRqg2KgA/ZZ1Vmt2pintVIFKygzb8QX5uGtKvzb62xUt5enmPEs5yqyGvkjO0pvfHdKWvScc+psGeWtwTLgGxoSrd+uQKt0t4O/lrr5tQtW3TajmDO+oH9POaMMvmVr/c6YOnMhzGF9catWJ3PIAoCoaebkpwNtdZ/JLdLao9IJjuzcP1LhezRXfJVK+nvyKBFwI/4cAAADgoni4uWj4ZVEaflmUkjNy9Po3B7VmZ7pKrYa+TDmuL1OOq2+bEM0c1E69WwfX+pINwyhfcrHux/I3ZaSdKnAYExngpcEdwtWrZbC6NA1UyxAfu3nGK1J/H9xOSanZWrMzTZ/8lKncolKdyi/Rk+t/1dtJh/Wva2M0NLZxvV6S8mdO5Bbp3e1H9Na2w0o/bf/31KtlkK7qEKFBMeFqG+5XLcdpsVh0WbNAXdYsULOHttf+43k6cipfOQUlyiks/f17iXIKSnW20LEtp6DkvHePnC0stb0ppTKBPu66sVtTjYtrVuOvTgWcCSECAAAAqk2HSH89Neoy3X1VtJZs3q93tx9RcZnVtva9Z4sgjYtrriGxEfL3qrkH1RmGoZ/Tc/TxT0f1yU8ZOpLtGBzENG6kIR0jdHXHxurUxP9Pfyl2cbGod+vyW/bnXt9JG37J1Mub9mnvsVwdzs7XXSt3qnfrYD16Xaw6RvnX1KHViF+OntHyrw/q/3YfVUnZf59z0MjTTTf1aKpbereo8dv7LRaL2ob7VXk/hSVlOpVfrONni2xfFcsWTheU2C11CPX773KH8EaecmuAr54E6hohAgAAAKpdk0BvzRvRSTMGtdXSLw/oraRDKiyxasehU9px6JQ81rioX3SYhl8WqcEdIuRXDbeQn8gtUtLBbH134KQ2/3bc9orBc/VsEaRhnSM1pGPERT380dvDVSO6NdF1XSL1VtJhPbsxRafzS/TdgWzFL9qicb2a6Z6roxXeqP4+lM9qNbTpt2NatuWgth6wfzZATONGurVPC43o2qTe397v5e6qyABvRQZ41/VUgEtC/T4jAAAAoEGL8PfSo8M7atrANlq25aDe23FE2XnFKi6z6vPkLH2enCVPNxcNiglX37ahahLopcb+3ooK9FKAt3uldwdYrYbOFpbqdEGxfkw7o20HT+q7A9nadyy30jn0aBGk+M6RGta5cbX/ounm6qLb+rTU9ZdF6fnP9+rN7w6pzGro7aQjemf7EXVpEqAr2oXqirZh6tEiqF48hLGguEyJO9P0+tcH7Z5B4GKRhnWK1MTLW6pni6AGvSwDQM0hRAAAAECNC/Xz1APDYnTfkGh9u/+kPv7xqNb/nKmcwlIVlVr16c+Z+vTnTLttfDxc1TjAS6G+nsotKlVOYYnOFJQot6hUhuObBW08XF3UtXmghnSM0LWdIxUVWPP/Qh3o46F/Xx+rW3o31+PrkrX5t+MyjP++0eHlTfvl4+Gq/9cqWJe3DVVsVIDaRfgp1M+zxucmlS/v2HXktN7/Pk3/t/uocs55VoCfp5vG9Wqm2/u2bJCv5gRQuwgRAAAAUGvcXMuXMfSLDtPjIzrr633H9fHuDG3ck+XwBP384jIdOJ6nA8cdn9h/Lk83F3VvHqT/1zpYvVuHqGuzwCq9LaA6tQ1vpBWT4rQjNVufJx/Tlr3HbW+AyC8u06bfjmvTb8dt44N9PdQ23E/twv3UPMBdeTlS39Lzv2ayqjLOFGjNznS9vzPN4e+xSaC3Jl3eUmN7NVOjGnw+BQDnQogAAACAOuHh5qJBMREaFBOh0jKrjp0tUsaZQmWcKVDG6UId/f1
7dn6xGnmWv67P39td/l5u5d+93dUyxFeXNQuod69X7NkyWD1bBuuBYTE6mVukb/af1Nd7j+vrvSd09EyhbVx2XrGSDmYr6WD27y1uWvzrt7qsaaB6tAxSrxbB6tEiSEG+Hn+6z9yiUqWeyNOBE3k6eDxPOw5l6+t9J+zu2nCxSP2jwzSmZzNd3TGCBwsCqDJCBAAAANQ5N1cXRQV6/770IKiup1OtQvw8df1lUbr+sigZhqGjZwq1N+us9mblau+xs0rJytW+Y7nK/f1OjJIyw/YAyqU6IElqFeorfy83ubu6yMPNxfbdw9VFJ3KLdPBEno6dLTrvHKIj/HRT96Ya2a2Jwv3r78MeAdR/hAgAAABALbFYLGoS6K0mgd4a0D7c1m4YhlLSjuvtjd+p2L+ZfszI1Z6jObL+fhfBwRMXXtJRmagAL13dMUI39Wiqzk0CeFAigGpBiAAAAADUMYvFosgAL3ULMTRwYBv5+/srt6hUuw6f0vbUU/o1I0dFpVaVlFlV/Pv3olKrisus8vdyV+swX7UO9VWrUD+1CvVVy1Af+XhwqQ+g+nFmAQAAAOohP083XdkuTFe2C6vrqQCADU9SAQAAAAAAphAiAAAAAAAAUwgRAAAAAACAKYQIAAAAAADAFEIEAAAAAABgCiECAAAAAAAwhRABAAAAAACYQogAAAAAAABMIUQAAAAAAACmECIAAAAAAABTCBEAAAAAAIAphAgAAAAAAMAUQgQAAAAAAGAKIQIAAAAAADCFEAEAAAAAAJhCiAAAAAAAAEwhRAAAAAAAAKYQIgAAAAAAAFMIEQAAAAAAgCmECAAAAAAAwBRCBAAAAAAAYAohAgAAAAAAMIUQAQAAAAAAmEKIAAAAAAAATCFEAAAAAAAAphAiAAAAAAAAUwgRAAAAAACAKYQIAAAAAADAFEIEAAAAAABgCiECAAAAAAAwxSlDhNOnT2vKlCkKCwuTr6+vBg4cqJ07d5raduLEibJYLA5fMTExDmOtVqueeuoptWrVSl5eXurSpYvefvvt6j4cAAAAAADqBbe6nkB1s1qtio+P1+7duzV79myFhoYqISFBAwYM0Pfff6927dr96Wd4enpq2bJldm0BAQEO4x566CE98cQTmjx5snr16qW1a9dqwoQJslgsGjduXLUdEwAAAAAA9YHThQiJiYn69ttvtXr1ao0aNUqSNGbMGEVHR2vOnDl66623/vQz3NzcdMstt1xwTHp6up555hlNnz5dL730kiTpzjvvVP/+/TV79myNHj1arq6uF39AAAAAAADUE063nCExMVERERG68cYbbW1hYWEaM2aM1q5dq6KiIlOfU1ZWppycnPP2r127ViUlJZo2bZqtzWKxaOrUqUpLS9PWrVv/+kEAAAAAAFAPOd2dCLt27VL37t3l4mKfj8TFxemVV15RSkqKOnfufMHPyM/Pl7+/v/Lz8xUUFKTx48frySeflJ+fn91+fH191aFDB4f9VPRfccUVlX7+sWPHdPz4cbu2ffv2SZJyc3MvGF7AOeTl5dl9x6WJOoBEHaAcdQCJOkA56gBS7dVBbm5ulbdxuhAhIyND/fr1c2iPjIyUJB09evSCIUJkZKTuv/9+de/eXVarVevXr1dCQoJ2796tzZs3y83NzbafiIgIWSyW8+7nfBISEjR37txK+5KSkpSZmXnhg4TTSEpKquspoB6gDiBRByhHHUCiDlCOOoBU83Vw+PDhKm9Tr0MEq9Wq4uJiU2M9PT1lsVhUUFAgT09Ph34vLy9JUkFBwQU/53/+53/s/jxu3DhFR0froYceUmJiou2BiRezn2nTpmn06NF2bfv27dOIESMUFxfncHcDnE9eXp6SkpIUFxcnX1/fup4O6gh1AIk6QDnqABJ1gHLUAaTaq4Pk5OQqb1OvQ4SvvvpKAwcONDU2OTlZMTEx8vb2rvS5B4WFhZIkb2/vKs/jnnvu0SOPPKLPP//cFiJczH7Cw8MVHh5eaZ+fn5/
8/f2rPEc0TL6+vvy8QR1AEnWActQBJOoA5agDSDVfB+cu2TerXocIMTExev31102NrVhGEBkZqYyMDIf+iraoqKgqz8Pb21shISHKzs6229+mTZtkGIbdkoaL2Q8AAAAAAPVZvQ4RGjdurIkTJ1Zpm65du2rLli2yWq12D1fctm2bfHx8FB0dXeV5nD17VidOnFBYWJjdfpYtW6bk5GR17NjRbj8V/QAAAAAAOBOne8XjqFGjlJWVpTVr1tjaTpw4odWrV2v48OF2zzHYv3+/9u/fb/tzYWGhzp496/CZ8+bNk2EYuuaaa2xtN9xwg9zd3ZWQkGBrMwxDS5YsUZMmTdS3b9/qPjQAAAAAAOpUvb4T4a8YNWqUevfurUmTJmnPnj0KDQ1VQkKCysrKHN6IMHjwYElSamqqJCkzM1PdunXT+PHjFRMTI0nasGGDPvnkE11zzTW64YYbbNs2bdpUf//737Vw4UKVlJSoV69e+vDDD7VlyxatWrVKrq6utXPAAAAAAADUEqcLEVxdXfXJJ59o9uzZevHFF1VQUKBevXppxYoVat++/QW3DQwM1HXXXaeNGzfqjTfeUFlZmdq2basFCxbovvvus1seIUlPPPGEgoKCtHTpUq1YsULt2rXTypUrNWHChJo8RAAAAAAA6oTThQiSFBQUpGXLlmnZsmUXHFdxB0KFwMBAvfnmm6b34+LiogcffFAPPvjgX5kmAAAAAAANitM9EwEAAAAAANQMQgQAAAAAAGAKIQIAAAAAADCFEAEAAAAAAJhCiAAAAAAAAEwhRAAAAAAAAKYQIgAAAAAAAFMIEQAAAAAAgCmECAAAAAAAwBRCBAAAAAAAYAohAgAAAAAAMIUQAQAAAAAAmEKIAAAAAAAATCFEAAAAAAAAphAiAAAAAAAAUwgRAAAAAACAKYQIAAAAAADAFEIEAAAAAABgCiECAAAAAAAwhRABAAAAAACYQogAAAAAAABMIUQAAAAAAACmECIAAAAAAABTCBEAAAAAAIAphAgAAAAAAMAUQgQAAAAAAGAKIQIAAAAAADCFEAEAAAAAAJhCiAAAAAAAAEwhRAAAAAAAAKYQIgAAAAAAAFMIEQAAAAAAgCmECAAAAAAAwBRCBAAAAAAAYAohAgAAAAAAMIUQAQAAAAAAmEKIAAAAAAAATCFEAAAAAAAAphAiAAAAAAAAUwgRAAAAAACAKYQIAAAAAADAFEIEAAAAAABgCiECAAAAAAAwhRABAAAAAACYQogAAAAAAABMIUQAAAAAAACmECIAAAAAAABTCBEAAAAAAIAphAgAAAAAAMAUQgQAAAAAAGAKIQIAAAAAADCFEAEAAAAAAJhCiAAAAAAAAEwhRAAAAAAAAKYQIgAAAAAAAFOcMkQ4ffq0pkyZorCwMPn6+mrgwIHauXOnqW0tFst5v66++mrbuNTU1POOe+edd2rq0AAAAAAAqDNudT2B6ma1WhUfH6/du3dr9uzZCg0NVUJCggYMGKDvv/9e7dq1u+D2b775pkPbjh079MILL2jIkCEOfePHj9e1115r19anT5+LOwgAAAAAAOohpwsREhMT9e2332r16tUaNWqUJGnMmDGKjo7WnDlz9NZbb11w+1tuucWhbfPmzbJYLBo/frxDX/fu3SvdBgAAAAAAZ+N0yxkSExMVERGhG2+80dYWFhamMWPGaO3atSoqKqrS5xUVFen9999X//791bRp00rH5OXlqbi4+KLmDQAAAABAfed0dyLs2rVL3bt3l4uLfT4SFxenV155RSkpKercubPpz/vkk090+vRp3XzzzZX2z507V7Nnz5bFYlGPHj00f/78Spc9nOvYsWM6fvy4Xdu+ffskSbm5ucrJyTE9PzRMeXl5dt9xaaIOIFEHKEcdQKIOUI46gFR7dZCbm1vlbZwuRMjIyFC/fv0c2iMjIyVJR48erVKIsGrVKnl6etqWRlRwcXHRkCFDNHLkSDVp0kQHDhzQs88+q2HDhum
jjz5SfHz8eT8zISFBc+fOrbQvKSlJmZmZpueHhi0pKamup4B6gDqARB2gHHUAiTpAOeoAUs3XweHDh6u8Tb0OEaxWq+llAp6enrJYLCooKJCnp6dDv5eXlySpoKDA9P5zcnK0bt06XXvttQoMDLTra968uTZs2GDXduutt6pjx4669957LxgiTJs2TaNHj7Zr27dvn0aMGKG4uDh16NDB9BzRMOXl5SkpKUlxcXHy9fWt6+mgjlAHkKgDlKMOIFEHKEcdQKq9OkhOTq7yNvU6RPjqq680cOBAU2OTk5MVExMjb2/vSp97UFhYKEny9vY2vf/3339fhYWF513K8EfBwcGaNGmSnnjiCaWlpZ33GQrh4eEKDw+vtM/Pz0/+/v6m54iGzdfXl583qANIog5QjjqARB2gHHUAqebrwM/Pr8rb1OsQISYmRq+//rqpsRXLFSIjI5WRkeHQX9EWFRVlev+rVq1SQECArrvuOtPbNGvWTJKUnZ193hABAAAAAICGqF6HCI0bN9bEiROrtE3Xrl21ZcsWWa1Wu4crbtu2TT4+PoqOjjb1ORkZGdq0aZMmTpxY6fKI8zlw4ICk8jdCAAAAAADgTJzuFY+jRo1SVlaW1qxZY2s7ceKEVq9ereHDh9sFAvv379f+/fsr/Zx33nlHVqv1vEsZ/vh2BUlKT0/Xa6+9pi5dutjujAAAAAAAwFnU6zsR/opRo0apd+/emjRpkvbs2aPQ0FAlJCSorKzM4Y0IgwcPliSlpqY6fM6qVasUFRWlAQMGVLqf+++/X/v379fgwYMVFRWl1NRULV26VHl5eXrhhReq+7AAAAAAAKhzThciuLq66pNPPtHs2bP14osvqqCgQL169dKKFSvUvn17U5/x22+/6fvvv9c//vEPuyUR5xoyZIiWLFmil19+WadOnVJgYKD69eunhx9+WN27d6/OQwIAAAAAoF5wuhBBkoKCgrRs2TItW7bsguMquwNBktq3by/DMC647fjx4zV+/Pi/OkUAAAAAABocp3smAgAAAAAAqBmECAAAAAAAwBRCBAAAAAAAYAohAgAAAAAAMIUQAQAAAAAAmEKIAAAAAAAATCFEAAAAAAAAphAiAAAAAAAAUwgRAAAAAACAKYQIAAAAAADAFEIEAAAAAABgCiECAAAAAAAwhRABAAAAAACYQogAAAAAAABMIUQAAAAAAACmECIAAAAAAABTCBEAAAAAAIAphAgAAAAAAMAUQgQAAAAAAGAKIQIAAAAAADCFEAEAAAAAAJhCiAAAAAAAAEwhRAAAAAAAAKYQIgAAAAAAAFMIEQAAAAAAgCmECAAAAAAAwBRCBAAAAAAAYAohAgAAAAAAMIUQAQAAAAAAmEKIAAAAAAAATCFEAAAAAAAAphAiAAAAAAAAUwgRAAAAAACAKYQIAAAAAADAFEIEAAAAAABgCiECAAAAAAAwhRABAAAAAACYQogAAAAAAABMIUQAAAAAAACmECIAAAAAAABTCBEAAAAAAIAphAgAAAAAAMAUQgQAAAAAAGAKIQIAAAAAADCFEAEAAAAAAJhCiAAAAAAAAEwhRAAAAAAAAKYQIgAAAAAAAFMIEQAAAAAAgCmECAAAAAAAwBRCBAAAAAAAYAohAgAAAAAAMIUQAQAAAAAAmEKIAAAAAAAATCFEAAAAAAAApjhViJCRkaEHHnhAAwcOVKNGjWSxWLR58+YqfUZ6errGjBmjwMBA+fv764YbbtCBAwcqHbt8+XJ16NBBXl5eateunRYtWlQNRwEAAAAAQP3kVCHCb7/9pieffFLp6enq3LlzlbfPzc3VwIED9eWXX+pf//qX5s6dq127dql///46efKk3dilS5fqzjvvVGxsrBYtWqQ+ffpo1qxZevLJJ6vrcAAAAAAAqFfc6noC1alHjx46efKkgoODlZiYqNGjR1dp+4SEBO3du1dJSUnq1auXJGnYsGHq1KmTnnnmGS1YsECSVFBQoIceekjx8fFKTEy
UJE2ePFlWq1Xz5s3TlClTFBQUVL0HBwAAAABAHXOqOxEaNWqk4ODgv7x9YmKievXqZQsQJCkmJkaDBw/We++9Z2vbtGmTTp48qWnTptltP336dOXl5WndunV/eQ4AAAAAANRXThUiXAyr1aoff/xRPXv2dOiLi4vT/v37dfbsWUnSrl27JMlhbI8ePeTi4mLrBwAAAADAmTjVcoaLkZ2draKiIkVGRjr0VbQdPXpU7du3V0ZGhlxdXRUeHm43zsPDQyEhITp69OgF93Xs2DEdP37crm3Pnj2SpB9//FG5ubkXcyhoAAoKCnT48GHt3LlT3t7edT0d1BHqABJ1gHLUASTqAOWoA0i1VwcVLxEoKioyvU29DRGsVquKi4tNjfX09JTFYrmo/RUUFNg+64+8vLzsxhQUFMjDw6PSz/Hy8rKNO5+EhATNnTu30r4JEyaYnjMAAAAAABfryJEj6t69u6mx9TZE+OqrrzRw4EBTY5OTkxUTE3NR+6tIdypLYAoLC+3GeHt7nzfgKCws/NOkaNq0aQ4PfczJyVFKSoo6d+5caZAB57Jv3z6NGDFCH374odq2bVvX00EdoQ4gUQcoRx1Aog5QjjqAVHt1UFRUpCNHjqh///6mt6m3IUJMTIxef/11U2MrW4JQVcHBwfL09FRGRoZDX0VbVFSUbX9lZWU6duyY3ZKG4uJinTx50jbufMLDwx2WQkhSnz59LuYQ0AC1bdtWsbGxdT0N1DHqABJ1gHLUASTqAOWoA0i1Uwdm70CoUG9DhMaNG2vixIm1tj8XFxd17txZO3bscOjbtm2bWrdurUaNGkmSunbtKknasWOHrr32Wtu4HTt2yGq12voBAAAAAHAml+zbGQ4fPqxff/3Vrm3UqFHavn27XZDw22+/6T//+Y/d8oNBgwYpODhYixcvttt+8eLF8vHxUXx8fM1OHgAAAACAOlBv70T4qx5//HFJ0i+//CJJevPNN/X1119Lkh5++GHbuNtuu01ffvmlDMOwtU2bNk2vvvqq4uPjdd9998nd3V3PPvusIiIidO+999rGeXt7a968eZo+fbpGjx6toUOHasuWLVq5cqXmz5+v4ODg2jhUAAAAAABqldOFCI888ojdn1977TXbf58bIlSmUaNG2rx5s+655x49/vjjslqtGjBggJ577jmFhYXZjZ02bZrc3d31zDPP6KOPPlKzZs303HPP6e67766+g4HTCgsL05w5cxzqCpcW6gASdYBy1AEk6gDlqANI9bsOLMa5/xQPAAAAAABwHpfsMxEAAAAAAEDVECIAAAAAAABTCBEAAAAAAIAphAgAAAAAAMAUQgQAAAAAAGAKIQJQzbZv364ZM2YoNjZWvr6+at68ucaMGaOUlBSHscnJybrmmmvk5+en4OBg3XrrrTp+/HgdzBq1Yf78+bJYLOrUqZND37fffqsrrrhCPj4+aty4sWbNmqXc3Nw6mCVqws6dO3X99dcrODhYPj4+6tSpk1588UW7MdSAc9u7d6/GjRunpk2bysfHRzExMXrssceUn59vN446cB65ubmaM2eOrrnmGgUHB8tisWjFihWVjjV7PWC1WvXUU0+pVatW8vLyUpcuXfT222/X8JHgYpipA6vVqhUrVuj6669Xs2bN5Ovrq06dOunxxx9XYWFhpZ+7fPlydejQQV5eXmrXrp0WLVpUC0eDv6oq54MKJSUl6tixoywWi55++mmH/ro8H7jVyl6AS8iTTz6pb775RqNHj1aXLl2UmZmpl156Sd27d9d3331n+wUyLS1N/fr1U0BAgBYsWKDc3Fw9/fTT+umnn5SUlCQPD486PhJUp7S0NC1YsEC+vr4OfT/88IMGDx6sDh066Nlnn1VaWpqefvpp7d27V59++mkdzBbV6bPPPtPw4cPVrVs3PfLII/Lz89P+/fuVlpZmG0MNOLcjR44oLi5OAQEBmjFjhoKDg7V161bNmTNH33//vdauXSuJOnA2J06c0GOPPab
mzZvrsssu0+bNmysdV5XrgYceekhPPPGEJk+erF69emnt2rWaMGGCLBaLxo0bV0tHhqowUwf5+fmaNGmSevfurbvuukvh4eG2c8QXX3yh//znP7JYLLbxS5cu1V133aWbbrpJ//jHP7RlyxbNmjVL+fn5+uc//1mLRwezzJ4PzrVo0SIdPnz4vP11ej4wAFSrb775xigqKrJrS0lJMTw9PY2bb77Z1jZ16lTD29vbOHTokK1t48aNhiRj6dKltTZf1I6xY8cagwYNMvr372/Exsba9Q0bNsyIjIw0zpw5Y2t79dVXDUnGhg0banuqqEZnzpwxIiIijJEjRxplZWXnHUcNOLf58+cbkoyff/7Zrv22224zJBnZ2dmGYVAHzqawsNDIyMgwDMMwtm/fbkgyXn/9dYdxZq8H0tLSDHd3d2P69Om2NqvValx55ZVG06ZNjdLS0po7GPxlZuqgqKjI+Oabbxy2nTt3riHJ2Lhxo60tPz/fCAkJMeLj4+3G3nzzzYavr6/tfIL6xez5oEJWVpYREBBgPPbYY4YkY+HChXb9dX0+YDkDUM369u3rcBdBu3btFBsbq+TkZFvb+++/r+uuu07Nmze3tV111VWKjo7We++9V2vzRc376quvlJiYqOeff96hLycnRxs3btQtt9wif39/W/ttt90mPz8/aqGBe+utt5SVlaX58+fLxcVFeXl5slqtdmOoAeeXk5MjSYqIiLBrj4yMlIuLizw8PKgDJ+Tp6anGjRv/6Tiz1wNr165VSUmJpk2bZmuzWCyaOnWq0tLStHXr1uo9AFQLM3Xg4eGhvn37OrSPHDlSkuyuHzdt2qSTJ0/a1YEkTZ8+XXl5eVq3bl01zBrVzez5oMIDDzyg9u3b65Zbbqm0v67PB4QIQC0wDENZWVkKDQ2VJKWnp+vYsWPq2bOnw9i4uDjt2rWrtqeIGlJWVqaZM2fqzjvvVOfOnR36f/rpJ5WWljrUgoeHh7p27UotNHCff/65/P39lZ6ervbt28vPz0/+/v6aOnWqbZ0rNeD8BgwYIEm644479MMPP+jIkSN69913tXjxYs2aNUu+vr7UwSWqKtcDu3btkq+vrzp06OAwrqIfziUzM1OSbNeP0n9/zn+smR49esjFxYU6cAJJSUl644039Pzzz9stYzlXXZ8PCBGAWrBq1Sqlp6dr7NixkqSMjAxJ5f8K9UeRkZHKzs5WUVFRrc4RNWPJkiU6dOiQ5s2bV2n/n9XC0aNHa3R+qFl79+5VaWmpbrjhBg0dOlTvv/++/va3v2nJkiWaNGmSJGrgUnDNNddo3rx52rhxo7p166bmzZtr3Lhxmjlzpp577jlJ1MGlqirXAxkZGYqIiHD4paJiW2rE+Tz11FPy9/fXsGHDbG0ZGRlydXVVeHi43VgPDw+FhIRQBw2cYRiaOXOmxo4dqz59+px3XF2fD3iwIlDDfv31V02fPl19+vTR7bffLkkqKCiQVH5r0x95eXnZxlTWj4bj5MmTevTRR/XII48oLCys0jF/VgsV/WiYcnNzlZ+fr7vuusv2NoYbb7xRxcXFWrp0qR577DFq4BLRsmVL9evXTzfddJNCQkK0bt06LViwQI0bN9aMGTOog0tUVa4HznddcO44OI8FCxbo888/V0JCggIDA23tBQUF5334NueKhm/FihX66aeflJiYeMFxdX0+IEQAalBmZqbi4+MVEBCgxMREubq6SpK8vb0lqdK7DSpuca4Yg4br4YcfVnBwsGbOnHneMX9WC9RBw1bx8xs/frxd+4QJE7R06VJt3bpVPj4+kqgBZ/bOO+9oypQpSklJUdOmTSWVh0lWq1X//Oc/NX78eM4Fl6iqXA94e3tz3XCJePfdd/Xwww/rjjvu0NSpU+36vL29VVxcXOl2nCsatpycHD344IOaPXu2mjVrdsGxdX0+YDkDUEPOnDmjYcOG6fTp01q/fr2ioqJsfRW3GlXcxniujIwMBQcHcxdCA7d3716
98sormjVrlo4eParU1FSlpqaqsLBQJSUlSk1NVXZ29p/Wwrl1g4an4uf3xwfqVdyGeurUKWrgEpCQkKBu3brZAoQK119/vfLz87Vr1y7q4BJVleuByMhIZWZmyjAMh3GSqBEnsXHjRt12222Kj4/XkiVLHPojIyNVVlamY8eO2bUXFxfr5MmT1EED9vTTT6u4uFhjx461XTdWvA761KlTSk1NtQVIdX0+IEQAakBhYaGGDx+ulJQUffzxx+rYsaNdf5MmTRQWFqYdO3Y4bJuUlKSuXbvW0kxRU9LT02W1WjVr1iy1atXK9rVt2zalpKSoVatWeuyxx9SpUye5ubk51EJxcbF++OEHaqGB69Gjh6TyejhXxVrFsLAwauASkJWVpbKyMof2kpISSVJpaSl1cImqyvVA165dlZ+fb/ekfknatm2brR8N27Zt2zRy5Ej17NlT7733ntzcHG8ar/g5/7FmduzYIavVSh00YIcPH9apU6cUGxtru2688sorJZUvb2nVqpX27Nkjqe7PB4QIQDUrKyvT2LFjtXXrVq1evfq8D0W56aab9PHHH+vIkSO2ti+++EIpKSkaPXp0bU0XNaRTp0764IMPHL5iY2PVvHlzffDBB7rjjjsUEBCgq666SitXrtTZs2dt27/55pvKzc2lFhq4MWPGSJKWL19u175s2TK5ublpwIAB1MAlIDo6Wrt27VJKSopd+9tvvy0XFxd16dKFOriEmb0euOGGG+Tu7q6EhARbm2EYWrJkiZo0aVLpKwLRcCQnJys+Pl4tW7bUxx9/fN7b0QcNGqTg4GAtXrzYrn3x4sXy8fFRfHx8bUwXNWDWrFkO141Lly6VJE2cOFEffPCBWrVqJanuzwcW44/3QAC4KH//+9/1wgsvaPjw4bZfIM5V8b7XI0eOqFu3bgoMDNTdd9+t3NxcLVy4UE2bNtX27dtZzuCkBgwYoBMnTujnn3+2te3cuVN9+/ZVx44dNWXKFKWlpemZZ55Rv379tGHDhjqcLarDHXfcoddee01jxoxR//79tXnzZq1evVoPPvigFixYIIkacHZfffWVBg0apJCQEM2YMUMhISH6+OOP9emnn+rOO+/Uq6++Kok6cEYvvfSSTp8+raNHj2rx4sW68cYb1a1bN0nSzJkzFRAQUKXrgfvvv18LFy7UlClT1KtXL3344Ydat26dVq1apQkTJtTVYeJP/FkduLi4KDY2Vunp6VqwYIGaNGlit32bNm3s/lEqISFB06dP16hRozR06FBt2bJF//u//6v58+frX//6V60eG8wzcz74o9TUVLVq1UoLFy7UfffdZ9dXp+cDA0C16t+/vyHpvF/n+vnnn40hQ4YYPj4+RmBgoHHzzTcbmZmZdTRz1Ib+/fsbsbGxDu1btmwx+vbta3h5eRlhYWHG9OnTjZycnDqYIapbcXGx8e9//9to0aKF4e7ubrRt29Z47rnnHMZRA85t27ZtxrBhw4zGjRsb7u7uRnR0tDF//nyjpKTEbhx14FxatGhx3uuBgwcP2saZvR4oKyszFixYYLRo0cLw8PAwYmNjjZUrV9biEeGv+LM6OHjw4AWvHW+//XaHz3zllVeM9u3bGx4eHkabNm2M5557zrBarbV/cDDN7PngXBW1sXDhQoe+ujwfcCcCAAAAAAAwhWciAAAAAAAAUwgRAAAAAACAKYQIAAAAAADAFEIEAAAAAABgCiECAAAAAAAwhRABAAAAAACYQogAAAAAAABMIUQAAAAAAACmECIAAAAAAABTCBEAAAAAAIAphAgAAAAAAMAUQgQAAFCvrFixQhaLRTt27KjrqQAAgD8gRAAAAAAAAKYQIgAAAAAAAFMIEQAAQINSXFysRx99VD169FBAQIB8fX115ZVXatOmTQ5jT548qVtvvVX+/v4KDAzU7bffrt27d8tisWjFihW1P3kAABo4t7qeAAAAQFXk5ORo2bJlGj9+vCZPnqyzZ89q+fLlGjp0qJKSktS
1a1dJktVq1fDhw5WUlKSpU6cqJiZGa9eu1e233163BwAAQANGiAAAABqUoKAgpaamysPDw9Y2efJkxcTEaNGiRVq+fLkk6cMPP9TWrVv1/PPP6+6775YkTZ06VVdffXWdzBsAAGfAcgYAANCguLq62gIEq9Wq7OxslZaWqmfPntq5c6dt3Pr16+Xu7q7Jkyfb2lxcXDR9+vRanzMAAM6CEAEAADQ4b7zxhrp06SIvLy+FhIQoLCxM69at05kzZ2xjDh06pMjISPn4+Nht27Zt29qeLgAAToMQAQAANCgrV67UxIkT1aZNGy1fvlzr16/Xxo0bNWjQIFmt1rqeHgAATo1nIgAAgAYlMTFRrVu31po1a2SxWGztc+bMsRvXokULbdq0Sfn5+XZ3I+zbt6/W5goAgLPhTgQAANCguLq6SpIMw7C1bdu2TVu3brUbN3ToUJWUlOjVV1+1tVmtVr388su1M1EAAJwQdyIAAIB66bXXXtP69esd2gcMGKA1a9Zo5MiRio+P18GDB7VkyRJ17NhRubm5tnEjRoxQXFyc7r33Xu3bt08xMTH66KOPlJ2dLUl2dzEAAABzCBEAAEC9tHjx4krbDx8+rNzcXC1dulQbNmxQx44dtXLlSq1evVqbN2+2jXN1ddW6det0991364033pCLi4tGjhypOXPm6PLLL5eXl1ctHQkAAM7DYpx7LyAAAICT+/DDDzVy5Eh9/fXXuvzyy+t6OgAANCiECAAAwGkVFBTI29vb9ueysjINGTJEO3bsUGZmpl0fAAD4cyxnAAAATmvmzJkqKChQnz59VFRUpDVr1ujbb7/VggULCBAAAPgLuBMBAAA4rbfeekvPPPOM9u3bp8LCQrVt21ZTp07VjBkz6npqAAA0SIQIAAAAAADAFJe6ngAAAAAAAGgYCBEAAAAAAIAphAgAAAAAAMAUQgQAAAAAAGAKIQIAAAAAADCFEAEAAAAAAJhCiAAAAAAAAEwhRAAAAAAAAKYQIgAAAAAAAFMIEQAAAAAAgCn/HwjBXypCu7naAAAAAElFTkSuQmCC","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["# Test for seasonality\n","from pandas.plotting import autocorrelation_plot\n","\n","# Draw Plot\n","plt.rcParams.update({'figure.figsize':(10,6), 'figure.dpi':120})\n","autocorrelation_plot(df['Number of Passengers'].tolist())"]},{"cell_type":"markdown","metadata":{},"source":["Alternately, if we want a statistical test, the [CHTest](https://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.CHTest.html#pmdarima.arima.CHTest) can determine if seasonal differencing is required to stationarize the series."]},{"cell_type":"markdown","metadata":{},"source":["# **16. Autocorrelation and Partial Autocorrelation Functions** \n","\n","\n","[Table of Contents](#0.1)\n","\n","\n","- **Autocorrelation** is simply the correlation of a series with its own lags. If a series is significantly autocorrelated, that means, the previous values of the series (lags) may be helpful in predicting the current value.\n","\n","\n","- **Partial Autocorrelation** also conveys similar information but it conveys the pure correlation of a series and its lag, excluding the correlation contributions from the intermediate lags."]},{"cell_type":"code","execution_count":18,"metadata":{"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"execution_count":18,"metadata":{},"output_type":"execute_result"},{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["from statsmodels.tsa.stattools import acf, pacf\n","from statsmodels.graphics.tsaplots import plot_acf, plot_pacf\n","\n","# Draw Plot\n","fig, axes = plt.subplots(1,2,figsize=(16,3), dpi= 100)\n","plot_acf(df['Number of Passengers'].tolist(), lags=50, ax=axes[0])\n","plot_pacf(df['Number of Passengers'].tolist(), lags=50, ax=axes[1])"]},{"cell_type":"markdown","metadata":{},"source":["# **17. Computation of Partial Autocorrelation Function** \n","\n","\n","[Table of Contents](#0.1)\n","\n","\n","- The partial autocorrelation function of lag (k) of a series is the coefficient of that lag in the autoregression equation of Y. The autoregressive equation of Y is nothing but the linear regression of Y with its own lags as predictors.\n","\n","\n","- For example, if **Y_t** is the current series and **Y_t-1** is the lag 1 of Y, then the partial autocorrelation of **lag 3 (Y_t-3)** is the coefficient $\\alpha_3$ of Y_t-3 in the following equation:"]},{"cell_type":"markdown","metadata":{},"source":["![Partial Autocorrelation Function](https://www.machinelearningplus.com/wp-content/uploads/2019/02/12_5_Autoregression_Equation-min.png?ezimgfmt=ng:webp/ngcb1)\n","\n","image source : https://www.machinelearningplus.com/wp-content/uploads/2019/02/12_5_Autoregression_Equation-min.png?ezimgfmt=ng:webp/ngcb1"]},{"cell_type":"markdown","metadata":{},"source":["# **18. Lag Plots** \n","\n","\n","[Table of Contents](#0.1)\n","\n","\n","- A **Lag plot** is a scatter plot of a time series against a lag of itself. It is normally used to check for autocorrelation. If there is any pattern existing in the series, the series is autocorrelated. If there is no such pattern, the series is likely to be random white noise.\n"]},{"cell_type":"code","execution_count":19,"metadata":{"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["from pandas.plotting import lag_plot\n","plt.rcParams.update({'ytick.left' : False, 'axes.titlepad':10})\n","\n","# Plot\n","fig, axes = plt.subplots(1, 4, figsize=(10,3), sharex=True, sharey=True, dpi=100)\n","for i, ax in enumerate(axes.flatten()[:4]):\n"," lag_plot(df['Number of Passengers'], lag=i+1, ax=ax, c='firebrick')\n"," ax.set_title('Lag ' + str(i+1))\n","\n","fig.suptitle('Lag Plots of Air Passengers', y=1.05) \n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["# **19. Granger Causality Test** \n","\n","\n","[Table of Contents](#0.1)\n","\n","\n","- **Granger causality test** is used to determine if one time series will be useful to forecast another. It is based on the idea that if X causes Y, then the forecast of Y based on previous values of Y AND the previous values of X should outperform the forecast of Y based on previous values of Y alone.\n","\n","\n","- So, **Granger causality test** should not be used to test if a lag of Y causes Y. Instead, it is generally used on exogenous (not Y lag) variables only. It is implemented in the statsmodel package.\n","\n","\n","- It accepts a 2D array with 2 columns as the main argument. The values are in the first column and the predictor (X) is in the second column. The Null hypothesis is that the series in the second column, does not Granger cause the series in the first. If the P-Values are less than a significance level (0.05) then we reject the null hypothesis and conclude that the said lag of X is indeed useful. 
The second argument maxlag says till how many lags of Y should be included in the test."]},{"cell_type":"code","execution_count":20,"metadata":{"trusted":true},"outputs":[{"ename":"FileNotFoundError","evalue":"[Errno 2] No such file or directory: '/kaggle/input/dataset/dataset.txt'","output_type":"error","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)","Cell \u001b[0;32mIn[20], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mstatsmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtsa\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mstattools\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m grangercausalitytests\n\u001b[0;32m----> 2\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m/kaggle/input/dataset/dataset.txt\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdate\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mto_datetime(data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdate\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[1;32m 4\u001b[0m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmonth\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m data\u001b[38;5;241m.\u001b[39mdate\u001b[38;5;241m.\u001b[39mdt\u001b[38;5;241m.\u001b[39mmonth\n","File \u001b[0;32m~/Source/courses/tools4ds/DS701-Course-Notes/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1026\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, 
skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 1013\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 1014\u001b[0m dialect,\n\u001b[1;32m 1015\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1022\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 1023\u001b[0m )\n\u001b[1;32m 1024\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m-> 1026\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n","File \u001b[0;32m~/Source/courses/tools4ds/DS701-Course-Notes/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:620\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 617\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 619\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 620\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize 
\u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 623\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n","File \u001b[0;32m~/Source/courses/tools4ds/DS701-Course-Notes/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1620\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1617\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1619\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1620\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n","File \u001b[0;32m~/Source/courses/tools4ds/DS701-Course-Notes/.venv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1880\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1878\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1879\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1880\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1881\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1882\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1883\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1884\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1885\u001b[0m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1886\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1887\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1888\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1889\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1890\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1891\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n","File \u001b[0;32m~/Source/courses/tools4ds/DS701-Course-Notes/.venv/lib/python3.12/site-packages/pandas/io/common.py:873\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 868\u001b[0m 
\u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 869\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 870\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 871\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 872\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 874\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 875\u001b[0m \u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 876\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 877\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 878\u001b[0m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 879\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, 
ioargs\u001b[38;5;241m.\u001b[39mmode)\n","\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/kaggle/input/dataset/dataset.txt'"]}],"source":["from statsmodels.tsa.stattools import grangercausalitytests\n","data = pd.read_csv('/kaggle/input/dataset/dataset.txt')\n","data['date'] = pd.to_datetime(data['date'])\n","data['month'] = data.date.dt.month\n","grangercausalitytests(data[['value', 'month']], maxlag=2)"]},{"cell_type":"markdown","metadata":{},"source":["- In the above case, the p-values are zero for all tests. So the ‘month’ indeed can be used to forecast the values."]},{"cell_type":"markdown","metadata":{},"source":["# **20. Smoothening a Time Series** \n","\n","\n","[Table of Contents](#0.1)\n","\n","\n","\n","Smoothening of a time series may be useful in the following circumstances:\n","\n","\n","- Reducing the effect of noise in a signal to get a fair approximation of the noise-filtered series.\n","- The smoothed version of the series can be used as a feature to explain the original series itself.\n","- Visualize the underlying trend better.\n","\n","\n","We can smoothen a time series using the following methods:\n","\n","\n","- Take a moving average\n","- Do a LOESS smoothing (Localized Regression)\n","- Do a LOWESS smoothing (Locally Weighted Regression)"]},{"cell_type":"markdown","metadata":{},"source":["## **Moving Average** \n","\n","\n","- **Moving average** is the average of a rolling window of defined width. We must choose the window width wisely, because a large window size will over-smooth the series. For example, a window size equal to the seasonal duration (e.g., 12 for a monthly series) will effectively nullify the seasonal effect.\n"," "]},{"cell_type":"markdown","metadata":{},"source":["## **Localized Regression**\n","\n","\n","- LOESS, short for ‘Localized Regression’, fits multiple regressions in the local neighborhood of each point. 
It is implemented in the statsmodels package, where you can control the degree of smoothing using the `frac` argument, which specifies the fraction of nearby data points that should be considered to fit a regression model."]},{"cell_type":"markdown","metadata":{},"source":["# **21. References** \n","\n","[Table of Contents](#0.1)\n","\n","The concepts and code in this notebook are taken from the following websites:\n","\n","1.\thttps://www.machinelearningplus.com/time-series/time-series-analysis-python/\n","2.\thttps://towardsdatascience.com/an-end-to-end-project-on-time-series-analysis-and-forecasting-with-python-4835e6bf050b\n","3.\thttps://towardsdatascience.com/time-series-analysis-in-python-an-introduction-70d5a5b1d52a\n","4.\thttps://www.digitalocean.com/community/tutorials/a-guide-to-time-series-visualization-with-python-3\n","\n"]},{"cell_type":"markdown","metadata":{"trusted":true},"source":["So, now we have come to the end of this notebook.\n","\n","I hope you find this notebook useful and enjoyable.\n","\n","Your comments and feedback are most welcome.\n","\n","Thank you\n"]},{"cell_type":"markdown","metadata":{},"source":["[Go to Top](#0)"]}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.12.4"}},"nbformat":4,"nbformat_minor":4} diff --git a/ds701_book/data/.gitignore b/ds701_book/data/.gitignore new file mode 100644 index 00000000..e7dd2674 --- /dev/null +++ b/ds701_book/data/.gitignore @@ -0,0 +1,4 @@ +dogs-vs-cats/ +dogs-vs-cats.zip +cifar-10-batches-py/ +cifar-10-python.tar.gz diff --git a/ds701_book/example.txt b/ds701_book/example.txt new file mode 100644 index 00000000..6a092d59 --- /dev/null +++ b/ds701_book/example.txt @@ -0,0 +1,3 @@ +Hello World! +How are you? +I'm fine. OK. 
diff --git a/ds701_book/figs/L06-k-means-on-uniform-data.png b/ds701_book/figs/L06-k-means-on-uniform-data.png new file mode 100644 index 00000000..0524a3d3 Binary files /dev/null and b/ds701_book/figs/L06-k-means-on-uniform-data.png differ diff --git a/ds701_book/figs/L06-kmeans-dataset-comparison.png b/ds701_book/figs/L06-kmeans-dataset-comparison.png new file mode 100644 index 00000000..263832be Binary files /dev/null and b/ds701_book/figs/L06-kmeans-dataset-comparison.png differ diff --git a/ds701_book/figs/L14-gain-ratio.png b/ds701_book/figs/L14-gain-ratio.png new file mode 100644 index 00000000..ed776541 Binary files /dev/null and b/ds701_book/figs/L14-gain-ratio.png differ diff --git a/ds701_book/figs/L14-terrier-savings-logo.webp b/ds701_book/figs/L14-terrier-savings-logo.webp new file mode 100644 index 00000000..fd98a615 Binary files /dev/null and b/ds701_book/figs/L14-terrier-savings-logo.webp differ diff --git a/ds701_book/figs/L17-Daniel-Kahneman--NYT.webp b/ds701_book/figs/L17-Daniel-Kahneman--NYT.webp new file mode 100644 index 00000000..cce618cc Binary files /dev/null and b/ds701_book/figs/L17-Daniel-Kahneman--NYT.webp differ diff --git a/ds701_book/figs/L17-Israeli-Airforce-Pirate.webp b/ds701_book/figs/L17-Israeli-Airforce-Pirate.webp new file mode 100644 index 00000000..2cd50205 Binary files /dev/null and b/ds701_book/figs/L17-Israeli-Airforce-Pirate.webp differ diff --git a/ds701_book/figs/RecSys-figs/dl-recsys-cnn.png b/ds701_book/figs/RecSys-figs/dl-recsys-cnn.png new file mode 100644 index 00000000..24360e65 Binary files /dev/null and b/ds701_book/figs/RecSys-figs/dl-recsys-cnn.png differ diff --git a/ds701_book/figs/RecSys-figs/dl-recsys-mlp.png b/ds701_book/figs/RecSys-figs/dl-recsys-mlp.png new file mode 100644 index 00000000..bf1fb67b Binary files /dev/null and b/ds701_book/figs/RecSys-figs/dl-recsys-mlp.png differ diff --git a/ds701_book/figs/RecSys-figs/dl-recsys-rnn.png b/ds701_book/figs/RecSys-figs/dl-recsys-rnn.png new file mode 
100644 index 00000000..8a082252 Binary files /dev/null and b/ds701_book/figs/RecSys-figs/dl-recsys-rnn.png differ diff --git a/ds701_book/figs/ada_lovelace.webp b/ds701_book/figs/ada_lovelace.webp new file mode 100644 index 00000000..a3ffe520 Binary files /dev/null and b/ds701_book/figs/ada_lovelace.webp differ diff --git a/ds701_book/figs/decision_tree.png b/ds701_book/figs/decision_tree.png index d8b5de25..d2f0a10b 100644 Binary files a/ds701_book/figs/decision_tree.png and b/ds701_book/figs/decision_tree.png differ diff --git a/ds701_book/how-random-forests-really-work.ipynb b/ds701_book/how-random-forests-really-work.ipynb new file mode 100644 index 00000000..06828e40 --- /dev/null +++ b/ds701_book/how-random-forests-really-work.ipynb @@ -0,0 +1,3724 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "06edaf34", + "metadata": { + "papermill": { + "duration": 0.088578, + "end_time": "2022-05-23T23:52:03.886920", + "exception": false, + "start_time": "2022-05-23T23:52:03.798342", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Introduction" + ] + }, + { + "cell_type": "markdown", + "id": "f058b15f", + "metadata": { + "papermill": { + "duration": 0.075585, + "end_time": "2022-05-23T23:52:04.041562", + "exception": false, + "start_time": "2022-05-23T23:52:03.965977", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Previously I've shown how to create a [linear model and neural net from scratch](https://www.kaggle.com/code/jhoward/linear-model-and-neural-net-from-scratch), and used it to create a solid submission to Kaggle's [Titanic](https://www.kaggle.com/competitions/titanic/) competition. 
However, for *tabular* data (i.e data that looks like spreadsheet or database tables, such as the data for the Titanic competition) it's more common to see good results by using ensembles of decision trees, such as Random Forests and Gradient Boosting Machines.\n", + "\n", + "In this notebook, we're going to learn all about Random Forests, by building one from scratch, and using it to submit to the Titanic competition! That might sound like a pretty big stretch, but I think you'll be surprised to discover how straightforward it actually is.\n", + "\n", + "We'll start by importing the basic set of libraries we normally need for data science work, and setting numpy to use our display space more efficiently:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d64452b3", + "metadata": { + "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", + "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", + "execution": { + "iopub.execute_input": "2022-05-23T23:52:04.194966Z", + "iopub.status.busy": "2022-05-23T23:52:04.194414Z", + "iopub.status.idle": "2022-05-23T23:52:04.615833Z", + "shell.execute_reply": "2022-05-23T23:52:04.614881Z" + }, + "papermill": { + "duration": 0.501423, + "end_time": "2022-05-23T23:52:04.618761", + "exception": false, + "start_time": "2022-05-23T23:52:04.117338", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from fastai.imports import *\n", + "np.set_printoptions(linewidth=130)" + ] + }, + { + "cell_type": "markdown", + "id": "963b3840", + "metadata": { + "papermill": { + "duration": 0.076015, + "end_time": "2022-05-23T23:52:04.770832", + "exception": false, + "start_time": "2022-05-23T23:52:04.694817", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Data preprocessing" + ] + }, + { + "cell_type": "markdown", + "id": "c582e910", + "metadata": { + "papermill": { + "duration": 0.07709, + "end_time": "2022-05-23T23:52:04.924481", + "exception": false, + "start_time": 
"2022-05-23T23:52:04.847391", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We'll create `DataFrame`s from the CSV files just like we did in the \"*linear model and neural net from scratch*\" notebook, and do much the same preprocessing (so go back and check that out if you're not already familiar with the dataset):" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8c16f049", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:05.080498Z", + "iopub.status.busy": "2022-05-23T23:52:05.079868Z", + "iopub.status.idle": "2022-05-23T23:52:05.132162Z", + "shell.execute_reply": "2022-05-23T23:52:05.131303Z" + }, + "papermill": { + "duration": 0.131266, + "end_time": "2022-05-23T23:52:05.134686", + "exception": false, + "start_time": "2022-05-23T23:52:05.003420", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning: Your Kaggle API key is readable by other users on this system! 
To fix this, you can run 'chmod 600 /Users/tomg/.kaggle/kaggle.json'\n", + "titanic.zip: Skipping, found more recently modified local copy (use --force to force download)\n" + ] + } + ], + "source": [ + "import os\n", + "iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')\n", + "\n", + "if iskaggle: path = Path('../input/titanic')\n", + "else:\n", + " import zipfile,kaggle\n", + " path = Path('titanic')\n", + " kaggle.api.competition_download_cli(str(path))\n", + " zipfile.ZipFile(f'{path}.zip').extractall(path)\n", + "\n", + "df = pd.read_csv(path/'train.csv')\n", + "tst_df = pd.read_csv(path/'test.csv')\n", + "modes = df.mode().iloc[0]" + ] + }, + { + "cell_type": "markdown", + "id": "c0a0c35b", + "metadata": { + "papermill": { + "duration": 0.074326, + "end_time": "2022-05-23T23:52:05.283755", + "exception": false, + "start_time": "2022-05-23T23:52:05.209429", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "One difference with Random Forests however is that we don't generally have to create *dummy variables* like we did for non-numeric columns in the linear models and neural network. Instead, we can just convert those fields to *categorical variables*, which internally in Pandas makes a list of all the unique values in the column, and replaces each value with a number. The number is just an index for looking up the value in the list of all unique values." + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "0e222580", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PassengerId 1\n", + "Survived 0.0\n", + "Pclass 3.0\n", + "Name Abbing, Mr. 
Anthony\n", + "Sex male\n", + "Age 24.0\n", + "SibSp 0.0\n", + "Parch 0.0\n", + "Ticket 1601\n", + "Fare 8.05\n", + "Cabin B96 B98\n", + "Embarked S\n", + "Name: 0, dtype: object\n" + ] + } + ], + "source": [ + "print(modes)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1071c1a3", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:05.438310Z", + "iopub.status.busy": "2022-05-23T23:52:05.437730Z", + "iopub.status.idle": "2022-05-23T23:52:05.458998Z", + "shell.execute_reply": "2022-05-23T23:52:05.458205Z" + }, + "papermill": { + "duration": 0.099983, + "end_time": "2022-05-23T23:52:05.461215", + "exception": false, + "start_time": "2022-05-23T23:52:05.361232", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def proc_data(df):\n", + " df['Fare'] = df.Fare.fillna(0)\n", + " df.fillna(modes, inplace=True)\n", + " df['LogFare'] = np.log1p(df['Fare'])\n", + " df['Embarked'] = pd.Categorical(df.Embarked)\n", + " df['Sex'] = pd.Categorical(df.Sex)\n", + "\n", + "proc_data(df)\n", + "proc_data(tst_df)" + ] + }, + { + "cell_type": "markdown", + "id": "64f0196d", + "metadata": { + "papermill": { + "duration": 0.075622, + "end_time": "2022-05-23T23:52:05.612873", + "exception": false, + "start_time": "2022-05-23T23:52:05.537251", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We'll make a list of the continuous, categorical, and dependent variables. Note that we no longer consider `Pclass` a categorical variable. That's because it's *ordered* (i.e 1st, 2nd, and 3rd class have an order), and decision trees, as we'll see, only care about order, not about absolute value." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "98336680", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:05.767333Z", + "iopub.status.busy": "2022-05-23T23:52:05.766770Z", + "iopub.status.idle": "2022-05-23T23:52:05.771007Z", + "shell.execute_reply": "2022-05-23T23:52:05.770353Z" + }, + "papermill": { + "duration": 0.082835, + "end_time": "2022-05-23T23:52:05.772965", + "exception": false, + "start_time": "2022-05-23T23:52:05.690130", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "cats=[\"Sex\",\"Embarked\"]\n", + "conts=['Age', 'SibSp', 'Parch', 'LogFare',\"Pclass\"]\n", + "dep=\"Survived\"" + ] + }, + { + "cell_type": "markdown", + "id": "1d73a083", + "metadata": { + "papermill": { + "duration": 0.074566, + "end_time": "2022-05-23T23:52:05.923051", + "exception": false, + "start_time": "2022-05-23T23:52:05.848485", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Even though we've made the `cats` columns categorical, they are still shown by Pandas as their original values:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d986270f", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:06.075283Z", + "iopub.status.busy": "2022-05-23T23:52:06.074927Z", + "iopub.status.idle": "2022-05-23T23:52:06.084866Z", + "shell.execute_reply": "2022-05-23T23:52:06.084138Z" + }, + "papermill": { + "duration": 0.088602, + "end_time": "2022-05-23T23:52:06.087425", + "exception": false, + "start_time": "2022-05-23T23:52:05.998823", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 male\n", + "1 female\n", + "2 female\n", + "3 female\n", + "4 male\n", + "Name: Sex, dtype: category\n", + "Categories (2, object): ['female', 'male']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.Sex.head()" + ] + }, + { + "cell_type": 
"markdown", + "id": "f3213c9b", + "metadata": { + "papermill": { + "duration": 0.075127, + "end_time": "2022-05-23T23:52:06.239653", + "exception": false, + "start_time": "2022-05-23T23:52:06.164526", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "However behind the scenes they're now stored as integers, with indices that are looked up in the `Categories` list shown in the output above. We can view the stored values by looking in the `cat.codes` attribute:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "52d11e29", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:06.393628Z", + "iopub.status.busy": "2022-05-23T23:52:06.392869Z", + "iopub.status.idle": "2022-05-23T23:52:06.399858Z", + "shell.execute_reply": "2022-05-23T23:52:06.398821Z" + }, + "papermill": { + "duration": 0.085298, + "end_time": "2022-05-23T23:52:06.401867", + "exception": false, + "start_time": "2022-05-23T23:52:06.316569", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "1 0\n", + "2 0\n", + "3 0\n", + "4 1\n", + "dtype: int8" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.Sex.cat.codes.head()" + ] + }, + { + "cell_type": "markdown", + "id": "41672065", + "metadata": { + "papermill": { + "duration": 0.07507, + "end_time": "2022-05-23T23:52:06.552765", + "exception": false, + "start_time": "2022-05-23T23:52:06.477695", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Binary splits" + ] + }, + { + "cell_type": "markdown", + "id": "d9a0be88", + "metadata": { + "papermill": { + "duration": 0.078022, + "end_time": "2022-05-23T23:52:06.707522", + "exception": false, + "start_time": "2022-05-23T23:52:06.629500", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Before we create a Random Forest or Gradient Boosting Machine, we'll first need to learn how to create a *decision tree*, 
from which both of these models are built.\n", + "\n", + "And to create a decision tree, we'll first need to create a *binary split*, since that's what a decision tree is built from.\n", + "\n", + "A binary split is where all rows are placed into one of two groups, based on whether they're above or below some threshold of some column. For example, we could split the rows of our dataset into males and females, by using the threshold `0.5` and the column `Sex` (since the values in the column are `0` for `female` and `1` for `male`). We can use a plot to see how that would split up our data -- we'll use the [Seaborn](https://seaborn.pydata.org/) library, which is a layer on top of [matplotlib](https://matplotlib.org/) that makes some useful charts easier to create, and more aesthetically pleasing by default:" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "55f4f7a9", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:06.861689Z", + "iopub.status.busy": "2022-05-23T23:52:06.861126Z", + "iopub.status.idle": "2022-05-23T23:52:07.884110Z", + "shell.execute_reply": "2022-05-23T23:52:07.883114Z" + }, + "papermill": { + "duration": 1.102355, + "end_time": "2022-05-23T23:52:07.886349", + "exception": false, + "start_time": "2022-05-23T23:52:06.783994", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import seaborn as sns\n", + "\n", + "fig,axs = plt.subplots(1,2, figsize=(11,5))\n", + "sns.barplot(data=df, y=dep, x=\"Sex\", ax=axs[0], hue=\"Sex\", palette=[\"#3374a1\",\"#e1812d\"]).set(title=\"Survival rate\")\n", + "sns.countplot(data=df, x=\"Sex\", ax=axs[1], hue=\"Sex\", palette=[\"#3374a1\",\"#e1812d\"]).set(title=\"Histogram\");" + ] + }, + { + "cell_type": "markdown", + "id": "7e9411ae", + "metadata": { + "papermill": { + "duration": 0.076048, + "end_time": "2022-05-23T23:52:08.041349", + "exception": false, + "start_time": "2022-05-23T23:52:07.965301", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Here we see that (on the left) if we split the data into males and females, we'd have groups that have very different survival rates: >70% for females, and <20% for males. We can also see (on the right) that the split would be reasonably even, with over 300 passengers (out of around 900) in each group.\n", + "\n", + "We could create a very simple \"model\" which simply says that all females survive, and no males do. 
To do so, we'd better first split our data into a training and validation set, to see how accurate this approach turns out to be:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7b21423f", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:08.197669Z", + "iopub.status.busy": "2022-05-23T23:52:08.197358Z", + "iopub.status.idle": "2022-05-23T23:52:08.381950Z", + "shell.execute_reply": "2022-05-23T23:52:08.381258Z" + }, + "papermill": { + "duration": 0.266668, + "end_time": "2022-05-23T23:52:08.384358", + "exception": false, + "start_time": "2022-05-23T23:52:08.117690", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from numpy import random\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "random.seed(42)\n", + "trn_df,val_df = train_test_split(df, test_size=0.25)\n", + "trn_df[cats] = trn_df[cats].apply(lambda x: x.cat.codes)\n", + "val_df[cats] = val_df[cats].apply(lambda x: x.cat.codes)" + ] + }, + { + "cell_type": "markdown", + "id": "1ca3d1b4", + "metadata": { + "papermill": { + "duration": 0.076211, + "end_time": "2022-05-23T23:52:08.538389", + "exception": false, + "start_time": "2022-05-23T23:52:08.462178", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "(In the previous step we also replaced the categorical variables with their integer codes, since some of the models we'll be building in a moment require that.)\n", + "\n", + "Now we can create our independent variables (the `x` variables) and dependent variable (the `y` variable):" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a1bdbf46", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:08.693767Z", + "iopub.status.busy": "2022-05-23T23:52:08.692988Z", + "iopub.status.idle": "2022-05-23T23:52:08.701823Z", + "shell.execute_reply": "2022-05-23T23:52:08.700843Z" + }, + "papermill": { + "duration": 0.08891, + "end_time": 
"2022-05-23T23:52:08.704290", + 
"exception": false, + "start_time": "2022-05-23T23:52:08.615380", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def xs_y(df):\n", + " xs = df[cats+conts].copy()\n", + " return xs,df[dep] if dep in df else None\n", + "\n", + "trn_xs,trn_y = xs_y(trn_df)\n", + "val_xs,val_y = xs_y(val_df)" + ] + }, + { + "cell_type": "markdown", + "id": "7f1503e1", + "metadata": { + "papermill": { + "duration": 0.075886, + "end_time": "2022-05-23T23:52:08.857760", + "exception": false, + "start_time": "2022-05-23T23:52:08.781874", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Here's the predictions for our extremely simple model, where `female` is coded as `0`:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4c717040", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:09.014276Z", + "iopub.status.busy": "2022-05-23T23:52:09.013681Z", + "iopub.status.idle": "2022-05-23T23:52:09.018107Z", + "shell.execute_reply": "2022-05-23T23:52:09.017344Z" + }, + "papermill": { + "duration": 0.085919, + "end_time": "2022-05-23T23:52:09.020308", + "exception": false, + "start_time": "2022-05-23T23:52:08.934389", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "preds = val_xs.Sex==0" + ] + }, + { + "cell_type": "markdown", + "id": "b4a11532", + "metadata": { + "papermill": { + "duration": 0.078424, + "end_time": "2022-05-23T23:52:09.175518", + "exception": false, + "start_time": "2022-05-23T23:52:09.097094", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We'll use mean absolute error to measure how good this model is:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "0c0d9955", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:09.331784Z", + "iopub.status.busy": "2022-05-23T23:52:09.331490Z", + "iopub.status.idle": "2022-05-23T23:52:09.338210Z", + "shell.execute_reply": "2022-05-23T23:52:09.337243Z" + }, + 
"papermill": { + "duration": 0.087547, + "end_time": "2022-05-23T23:52:09.340660", + "exception": false, + "start_time": "2022-05-23T23:52:09.253113", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "np.float64(0.21524663677130046)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.metrics import mean_absolute_error\n", + "mean_absolute_error(val_y, preds)" + ] + }, + { + "cell_type": "markdown", + "id": "f1c9360b", + "metadata": { + "papermill": { + "duration": 0.076347, + "end_time": "2022-05-23T23:52:09.494011", + "exception": false, + "start_time": "2022-05-23T23:52:09.417664", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Alternatively, we could try splitting on a continuous column. We have to use a somewhat different chart to see how this might work -- here's an example of how we could look at `LogFare`:" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "acbdf71a", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:09.650478Z", + "iopub.status.busy": "2022-05-23T23:52:09.650134Z", + "iopub.status.idle": "2022-05-23T23:52:09.932886Z", + "shell.execute_reply": "2022-05-23T23:52:09.931944Z" + }, + "papermill": { + "duration": 0.363754, + "end_time": "2022-05-23T23:52:09.935270", + "exception": false, + "start_time": "2022-05-23T23:52:09.571516", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df_fare = trn_df[trn_df.LogFare>0]\n", + "fig,axs = plt.subplots(1,2, figsize=(11,5))\n", + "sns.boxenplot(data=df_fare, x=dep, y=\"LogFare\", ax=axs[0], hue=dep, palette=[\"#3374a1\",\"#e1812d\"])\n", + "sns.kdeplot(data=df_fare, x=\"LogFare\", ax=axs[1]);" + ] + }, + { + "cell_type": "markdown", + "id": "79a5316d", + "metadata": { + "papermill": { + "duration": 0.080153, + "end_time": "2022-05-23T23:52:10.094741", + "exception": false, + "start_time": "2022-05-23T23:52:10.014588", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "The [boxenplot](https://seaborn.pydata.org/generated/seaborn.boxenplot.html) above shows quantiles of `LogFare` for each group of `Survived==0` and `Survived==1`. It shows that the average `LogFare` for passengers that didn't survive is around `2.5`, and for those that did it's around `3.2`. So it seems that people that paid more for their tickets were more likely to get put on a lifeboat.\n", + "\n", + "Let's create a simple model based on this observation:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "1ba429ff", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:10.253568Z", + "iopub.status.busy": "2022-05-23T23:52:10.253277Z", + "iopub.status.idle": "2022-05-23T23:52:10.258015Z", + "shell.execute_reply": "2022-05-23T23:52:10.257122Z" + }, + "papermill": { + "duration": 0.086971, + "end_time": "2022-05-23T23:52:10.260566", + "exception": false, + "start_time": "2022-05-23T23:52:10.173595", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "preds = val_xs.LogFare>2.7" + ] + }, + { + "cell_type": "markdown", + "id": "f6d760f2", + "metadata": { + "papermill": { + "duration": 0.078029, + "end_time": "2022-05-23T23:52:10.418725", + "exception": false, + "start_time": "2022-05-23T23:52:10.340696", + "status": "completed" + }, + "tags": [] + }, + "source": [ + 
"...and test it out:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "faa47c50", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:10.577363Z", + "iopub.status.busy": "2022-05-23T23:52:10.576760Z", + "iopub.status.idle": "2022-05-23T23:52:10.583341Z", + "shell.execute_reply": "2022-05-23T23:52:10.582517Z" + }, + "papermill": { + "duration": 0.088187, + "end_time": "2022-05-23T23:52:10.585326", + "exception": false, + "start_time": "2022-05-23T23:52:10.497139", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "np.float64(0.336322869955157)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mean_absolute_error(val_y, preds)" + ] + }, + { + "cell_type": "markdown", + "id": "b1fb2616", + "metadata": { + "papermill": { + "duration": 0.078849, + "end_time": "2022-05-23T23:52:10.742895", + "exception": false, + "start_time": "2022-05-23T23:52:10.664046", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "This is quite a bit less accurate than our model that used `Sex` as the single binary split.\n", + "\n", + "Ideally, we'd like some way to try more columns and breakpoints more easily. We could create a function that returns how good our model is, in order to more quickly try out a few different splits. We'll create a `score` function to do this. Instead of returning the mean absolute error, we'll calculate a measure of *impurity* -- that is, how much the binary split creates two groups where the rows in a group are each similar to each other, or dissimilar.\n", + "\n", + "We can measure the similarity of rows inside a group by taking the standard deviation of the dependent variable. If it's higher, then it means the rows are more different to each other. 
We'll then multiply this by the number of rows, since a bigger group has more impact than a smaller group:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "b651bfd0", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:10.902098Z", + "iopub.status.busy": "2022-05-23T23:52:10.901650Z", + "iopub.status.idle": "2022-05-23T23:52:10.906471Z", + "shell.execute_reply": "2022-05-23T23:52:10.905688Z" + }, + "papermill": { + "duration": 0.087027, + "end_time": "2022-05-23T23:52:10.908542", + "exception": false, + "start_time": "2022-05-23T23:52:10.821515", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def _side_score(side, y):\n", + " tot = side.sum()\n", + " if tot<=1: return 0\n", + " return y[side].std()*tot" + ] + }, + { + "cell_type": "markdown", + "id": "2e592ee2", + "metadata": { + "papermill": { + "duration": 0.078184, + "end_time": "2022-05-23T23:52:11.065440", + "exception": false, + "start_time": "2022-05-23T23:52:10.987256", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Now we've got that written, we can calculate the score for a split by adding up the scores for the \"left hand side\" (lhs) and \"right hand side\" (rhs):" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "516a5e14", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:11.227933Z", + "iopub.status.busy": "2022-05-23T23:52:11.227382Z", + "iopub.status.idle": "2022-05-23T23:52:11.231832Z", + "shell.execute_reply": "2022-05-23T23:52:11.231159Z" + }, + "papermill": { + "duration": 0.089432, + "end_time": "2022-05-23T23:52:11.233859", + "exception": false, + "start_time": "2022-05-23T23:52:11.144427", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + " \n", + "def score(col, y, split):\n", + " lhs = col<=split\n", + " return (_side_score(lhs,y) + _side_score(~lhs,y))/len(y)" + ] + }, + { + "cell_type": "markdown", + "id": "9709bc86", 
+ "metadata": { + "papermill": { + "duration": 0.078311, + "end_time": "2022-05-23T23:52:11.393682", + "exception": false, + "start_time": "2022-05-23T23:52:11.315371", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "For instance, here's the impurity score for the split on `Sex`:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "fae1fdbc", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:11.552734Z", + "iopub.status.busy": "2022-05-23T23:52:11.552167Z", + "iopub.status.idle": "2022-05-23T23:52:11.559158Z", + "shell.execute_reply": "2022-05-23T23:52:11.558604Z" + }, + "papermill": { + "duration": 0.088961, + "end_time": "2022-05-23T23:52:11.561023", + "exception": false, + "start_time": "2022-05-23T23:52:11.472062", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "np.float64(0.40787530982063946)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "score(trn_xs[\"Sex\"], trn_y, 0.5)" + ] + }, + { + "cell_type": "markdown", + "id": "364e9f82", + "metadata": { + "papermill": { + "duration": 0.07955, + "end_time": "2022-05-23T23:52:11.720650", + "exception": false, + "start_time": "2022-05-23T23:52:11.641100", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "...and for `LogFare`:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "e6549176", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:11.885615Z", + "iopub.status.busy": "2022-05-23T23:52:11.885055Z", + "iopub.status.idle": "2022-05-23T23:52:11.892268Z", + "shell.execute_reply": "2022-05-23T23:52:11.891417Z" + }, + "papermill": { + "duration": 0.09165, + "end_time": "2022-05-23T23:52:11.894535", + "exception": false, + "start_time": "2022-05-23T23:52:11.802885", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + 
"np.float64(0.47180873952099694)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "score(trn_xs[\"LogFare\"], trn_y, 2.7)" + ] + }, + { + "cell_type": "markdown", + "id": "356238b3", + "metadata": { + "papermill": { + "duration": 0.079081, + "end_time": "2022-05-23T23:52:12.055373", + "exception": false, + "start_time": "2022-05-23T23:52:11.976292", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "As we'd expect from our earlier tests, `Sex` appears to be a better split.\n", + "\n", + "To make it easier to find the best binary split, we can create a simple interactive tool (note that this only works in Kaggle if you click \"Copy and Edit\" in the top right to open the notebook editor):" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "8c928d3b", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:12.216334Z", + "iopub.status.busy": "2022-05-23T23:52:12.215970Z", + "iopub.status.idle": "2022-05-23T23:52:12.265156Z", + "shell.execute_reply": "2022-05-23T23:52:12.264332Z" + }, + "papermill": { + "duration": 0.132469, + "end_time": "2022-05-23T23:52:12.267078", + "exception": false, + "start_time": "2022-05-23T23:52:12.134609", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "cab1dd9c53a64059a45ac29ce03e807f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "interactive(children=(Dropdown(description='nm', options=('Age', 'SibSp', 'Parch', 'LogFare', 'Pclass'), value…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def iscore(nm, split):\n", + " col = trn_xs[nm]\n", + " return score(col, trn_y, split)\n", + "\n", + "from ipywidgets import interact\n", + "interact(nm=conts, split=15.5)(iscore);" + ] + }, + { + "cell_type": "markdown", + "id": "815eb4ff", + "metadata": { + "papermill": { + "duration": 
0.079567, + "end_time": "2022-05-23T23:52:12.429393", + "exception": false, + "start_time": "2022-05-23T23:52:12.349826", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Try selecting different columns and split points using the dropdown and slider above. What splits can you find that increase the purity of the data?\n", + "\n", + "We can do the same thing for the categorical variables:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "7a01bf2f", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:12.590541Z", + "iopub.status.busy": "2022-05-23T23:52:12.589936Z", + "iopub.status.idle": "2022-05-23T23:52:12.629855Z", + "shell.execute_reply": "2022-05-23T23:52:12.629042Z" + }, + "papermill": { + "duration": 0.12315, + "end_time": "2022-05-23T23:52:12.632048", + "exception": false, + "start_time": "2022-05-23T23:52:12.508898", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2194cd3ce3a348ee939d7de2c3c4948c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "interactive(children=(Dropdown(description='nm', options=('Sex', 'Embarked'), value='Sex'), IntSlider(value=2,…" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "interact(nm=cats, split=2)(iscore);" + ] + }, + { + "cell_type": "markdown", + "id": "2ca21219", + "metadata": { + "papermill": { + "duration": 0.083889, + "end_time": "2022-05-23T23:52:12.798280", + "exception": false, + "start_time": "2022-05-23T23:52:12.714391", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "That works well enough, but it's rather slow and fiddly. Perhaps we could get the computer to automatically find the best split point for a column for us? 
For example, to find the best split point for `Age` we'd first need to make a list of all the possible split points (i.e., all the unique values of that field)...:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "e874abe7", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:12.966711Z", + "iopub.status.busy": "2022-05-23T23:52:12.966335Z", + "iopub.status.idle": "2022-05-23T23:52:12.974377Z", + "shell.execute_reply": "2022-05-23T23:52:12.973469Z" + }, + "papermill": { + "duration": 0.094016, + "end_time": "2022-05-23T23:52:12.976340", + "exception": false, + "start_time": "2022-05-23T23:52:12.882324", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 0.42, 0.67, 0.75, 0.83, 0.92, 1. , 2. , 3. , 4. , 5. , 6. , 7. , 8. , 9. , 10. , 11. , 12. ,\n", + " 13. , 14. , 14.5 , 15. , 16. , 17. , 18. , 19. , 20. , 21. , 22. , 23. , 24. , 24.5 , 25. , 26. , 27. ,\n", + " 28. , 28.5 , 29. , 30. , 31. , 32. , 32.5 , 33. , 34. , 34.5 , 35. , 36. , 36.5 , 37. , 38. , 39. , 40. ,\n", + " 40.5 , 41. , 42. , 43. , 44. , 45. , 45.5 , 46. , 47. , 48. , 49. , 50. , 51. , 52. , 53. , 54. , 55. ,\n", + " 55.5 , 56. , 57. , 58. , 59. , 60. , 61. , 62. , 64. , 65. , 70. , 70.5 , 74. , 80. 
])" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nm = \"Age\"\n", + "col = trn_xs[nm]\n", + "unq = col.unique()\n", + "unq.sort()\n", + "unq" + ] + }, + { + "cell_type": "markdown", + "id": "89155c82", + "metadata": { + "papermill": { + "duration": 0.082301, + "end_time": "2022-05-23T23:52:13.139089", + "exception": false, + "start_time": "2022-05-23T23:52:13.056788", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "...and find which index of those values is where `score()` is the lowest:" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "46695da8", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:13.304274Z", + "iopub.status.busy": "2022-05-23T23:52:13.303102Z", + "iopub.status.idle": "2022-05-23T23:52:13.380984Z", + "shell.execute_reply": "2022-05-23T23:52:13.380401Z" + }, + "papermill": { + "duration": 0.162462, + "end_time": "2022-05-23T23:52:13.382996", + "exception": false, + "start_time": "2022-05-23T23:52:13.220534", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "np.float64(6.0)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scores = np.array([score(col, trn_y, o) for o in unq if not np.isnan(o)])\n", + "unq[scores.argmin()]" + ] + }, + { + "cell_type": "markdown", + "id": "a16194f3", + "metadata": { + "papermill": { + "duration": 0.081418, + "end_time": "2022-05-23T23:52:13.547416", + "exception": false, + "start_time": "2022-05-23T23:52:13.465998", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Based on this, it looks like, for instance, that for the `Age` column, `6` is the optimal cutoff according to our training set.\n", + "\n", + "We can write a little function that implements this idea:" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "dbfea30b", + "metadata": { + 
"execution": { + "iopub.execute_input": "2022-05-23T23:52:13.713714Z", + "iopub.status.busy": "2022-05-23T23:52:13.713139Z", + "iopub.status.idle": "2022-05-23T23:52:13.796796Z", + "shell.execute_reply": "2022-05-23T23:52:13.795637Z" + }, + "papermill": { + "duration": 0.169239, + "end_time": "2022-05-23T23:52:13.799146", + "exception": false, + "start_time": "2022-05-23T23:52:13.629907", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(np.float64(6.0), np.float64(0.478316717508991))" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def min_col(df, nm):\n", + " col,y = df[nm],df[dep]\n", + " unq = col.dropna().unique()\n", + " scores = np.array([score(col, y, o) for o in unq if not np.isnan(o)])\n", + " idx = scores.argmin()\n", + " return unq[idx],scores[idx]\n", + "\n", + "min_col(trn_df, \"Age\")" + ] + }, + { + "cell_type": "markdown", + "id": "4982e304", + "metadata": { + "papermill": { + "duration": 0.082152, + "end_time": "2022-05-23T23:52:13.965025", + "exception": false, + "start_time": "2022-05-23T23:52:13.882873", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Let's try all the columns:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "ea0dd416", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:14.132925Z", + "iopub.status.busy": "2022-05-23T23:52:14.132393Z", + "iopub.status.idle": "2022-05-23T23:52:14.415875Z", + "shell.execute_reply": "2022-05-23T23:52:14.414970Z" + }, + "papermill": { + "duration": 0.368983, + "end_time": "2022-05-23T23:52:14.418270", + "exception": false, + "start_time": "2022-05-23T23:52:14.049287", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Sex': (np.int8(0), np.float64(0.40787530982063946)),\n", + " 'Embarked': (np.int8(0), np.float64(0.47883342573147836)),\n", + " 'Age': (np.float64(6.0), 
np.float64(0.478316717508991)),\n", + " 'SibSp': (np.int64(4), np.float64(0.4783740258817434)),\n", + " 'Parch': (np.int64(0), np.float64(0.4805296527841601)),\n", + " 'LogFare': (np.float64(2.4390808375825834), np.float64(0.4620823937736597)),\n", + " 'Pclass': (np.int64(2), np.float64(0.46048261885806596))}" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cols = cats+conts\n", + "{o:min_col(trn_df, o) for o in cols}" + ] + }, + { + "cell_type": "markdown", + "id": "b9efce23", + "metadata": { + "papermill": { + "duration": 0.084939, + "end_time": "2022-05-23T23:52:14.586316", + "exception": false, + "start_time": "2022-05-23T23:52:14.501377", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "According to this, `Sex<=0` is the best split we can use.\n", + "\n", + "We've just re-invented the [OneR](https://link.springer.com/article/10.1023/A:1022631118932) classifier (or at least, a minor variant of it), which was found to be one of the most effective classifiers in real-world datasets, compared to the algorithms in use in 1993. Since it's so simple and surprisingly effective, it makes for a great *baseline* -- that is, a starting point that you can use to compare your more sophisticated models to.\n", + "\n", + "We found earlier that our OneR rule had an error of around `0.215`, so we'll keep that in mind as we try out more sophisticated approaches." 
+ ] + }, + { + "cell_type": "markdown", + "id": "188dce42", + "metadata": { + "papermill": { + "duration": 0.08557, + "end_time": "2022-05-23T23:52:14.754376", + "exception": false, + "start_time": "2022-05-23T23:52:14.668806", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Creating a decision tree" + ] + }, + { + "cell_type": "markdown", + "id": "4b46f3c0", + "metadata": { + "papermill": { + "duration": 0.082087, + "end_time": "2022-05-23T23:52:14.936889", + "exception": false, + "start_time": "2022-05-23T23:52:14.854802", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "How can we improve our OneR classifier, which predicts survival based only on `Sex`?\n", + "\n", + "How about we take each of our two groups, `female` and `male`, and create one more binary split for each of them. That is: find the single best split for females, and the single best split for males. To do this, all we have to do is repeat the previous section's steps, once for males, and once for females.\n", + "\n", + "First, we'll remove `Sex` from the list of possible splits (since we've already used it, and there's only one possible split for that binary column), and create our two groups:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "c763fa2d", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:15.108874Z", + "iopub.status.busy": "2022-05-23T23:52:15.107962Z", + "iopub.status.idle": "2022-05-23T23:52:15.113883Z", + "shell.execute_reply": "2022-05-23T23:52:15.113202Z" + }, + "papermill": { + "duration": 0.096166, + "end_time": "2022-05-23T23:52:15.116020", + "exception": false, + "start_time": "2022-05-23T23:52:15.019854", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "cols.remove(\"Sex\")\n", + "ismale = trn_df.Sex==1\n", + "males,females = trn_df[ismale],trn_df[~ismale]" + ] + }, + { + "cell_type": "markdown", + "id": "6fe0016a", + "metadata": { + "papermill": { + "duration": 
0.083992, + "end_time": "2022-05-23T23:52:15.291509", + "exception": false, + "start_time": "2022-05-23T23:52:15.207517", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Now let's find the single best binary split for males...:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "7e2ddff4", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:15.458872Z", + "iopub.status.busy": "2022-05-23T23:52:15.458593Z", + "iopub.status.idle": "2022-05-23T23:52:15.681368Z", + "shell.execute_reply": "2022-05-23T23:52:15.680415Z" + }, + "papermill": { + "duration": 0.308595, + "end_time": "2022-05-23T23:52:15.683697", + "exception": false, + "start_time": "2022-05-23T23:52:15.375102", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Embarked': (np.int8(0), np.float64(0.3875581870410906)),\n", + " 'Age': (np.float64(6.0), np.float64(0.3739828371010595)),\n", + " 'SibSp': (np.int64(4), np.float64(0.3875864227586273)),\n", + " 'Parch': (np.int64(0), np.float64(0.3874704821461959)),\n", + " 'LogFare': (np.float64(2.803360380906535), np.float64(0.3804856231758151)),\n", + " 'Pclass': (np.int64(1), np.float64(0.38155442004360934))}" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "{o:min_col(males, o) for o in cols}" + ] + }, + { + "cell_type": "markdown", + "id": "9a2311f4", + "metadata": { + "papermill": { + "duration": 0.082834, + "end_time": "2022-05-23T23:52:15.849096", + "exception": false, + "start_time": "2022-05-23T23:52:15.766262", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "...and for females:" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "9bf7b509", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:16.018822Z", + "iopub.status.busy": "2022-05-23T23:52:16.017551Z", + "iopub.status.idle": "2022-05-23T23:52:16.192436Z", + 
"shell.execute_reply": "2022-05-23T23:52:16.191481Z" + }, + "papermill": { + "duration": 0.262921, + "end_time": "2022-05-23T23:52:16.194640", + "exception": false, + "start_time": "2022-05-23T23:52:15.931719", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Embarked': (np.int8(0), np.float64(0.4295252982857327)),\n", + " 'Age': (np.float64(50.0), np.float64(0.4225927658431649)),\n", + " 'SibSp': (np.int64(4), np.float64(0.42319212059713535)),\n", + " 'Parch': (np.int64(3), np.float64(0.4193314500446158)),\n", + " 'LogFare': (np.float64(4.256321678298823), np.float64(0.41350598332911376)),\n", + " 'Pclass': (np.int64(2), np.float64(0.3335388911567601))}" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "{o:min_col(females, o) for o in cols}" + ] + }, + { + "cell_type": "markdown", + "id": "aff7ee6c", + "metadata": { + "papermill": { + "duration": 0.082127, + "end_time": "2022-05-23T23:52:16.360962", + "exception": false, + "start_time": "2022-05-23T23:52:16.278835", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We can see that the best next binary split for males is `Age<=6`, and for females is `Pclass<=2`.\n", + "\n", + "By adding these rules, we have created a *decision tree*, where our model will first check whether `Sex` is female or male, and depending on the result will then check either the above `Age` or `Pclass` rules, as appropriate. 
def draw_tree(t, df, size=10, ratio=0.6, precision=2, **kwargs):
    """Render a fitted decision tree as a Graphviz graph.

    Args:
        t: Fitted tree estimator, passed straight to ``export_graphviz``.
        df: DataFrame whose ``columns`` supply the feature names shown in nodes.
        size: Value written to the graph's ``size`` attribute.
        ratio: Value written to the graph's ``ratio`` attribute.
        precision: Number-formatting precision forwarded to ``export_graphviz``.
        **kwargs: Extra keyword arguments forwarded to ``export_graphviz``.

    Returns:
        A ``graphviz.Source`` wrapping the generated DOT text.
    """
    dot = export_graphviz(t, out_file=None, feature_names=df.columns, filled=True, rounded=True,
                          special_characters=True, rotate=False, precision=precision, **kwargs)
    # Inject the size/ratio graph attributes right after the opening brace of
    # the graph. The target is literal text, so a plain string replace is used
    # rather than a regex (where '{' is a metacharacter); this also keeps the
    # cell independent of `re` having been imported elsewhere.
    dot = dot.replace('Tree {', f'Tree {{ size={size}; ratio={ratio}')
    return graphviz.Source(dot)
"samples = 418\n", + "value = [350, 68]\n", + "\n", + "\n", + "\n", + "2->6\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "draw_tree(m, trn_xs, size=10)" + ] + }, + { + "cell_type": "markdown", + "id": "227b4580", + "metadata": { + "papermill": { + "duration": 0.088127, + "end_time": "2022-05-23T23:52:18.559846", + "exception": false, + "start_time": "2022-05-23T23:52:18.471719", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We can see that it's found exactly the same splits as we did!\n", + "\n", + "In this picture, the more orange nodes have a lower survival rate, and blue have higher survival. Each node shows how many rows (\"*samples*\") match that set of rules, and shows how many perish or survive (\"*values*\"). There's also something called \"*gini*\". That's another measure of impurity, and it's very similar to the `score()` we created earlier. 
def gini(cond):
    """Gini impurity of the dependent variable over the rows selected by `cond`.

    Args:
        cond: Row selector handed to ``df.loc`` (e.g. a boolean mask over ``df``).

    Returns:
        ``1 - p**2 - q**2`` where ``p`` is the mean of the selected ``dep``
        values and ``q`` is the mean of their complements (``1 - value``).
    """
    outcomes = df.loc[cond, dep]
    p = outcomes.mean()
    q = (1 - outcomes).mean()
    return 1 - p**2 - q**2
Strictly speaking, `gini` returns one minus that probability (i.e. the chance that the two results differ), so if the group is all the same the value is `0.0`, and it peaks at `0.5` when the group is split evenly between the two outcomes:" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "4dc6dc0d", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:19.104861Z", + "iopub.status.busy": "2022-05-23T23:52:19.104134Z", + "iopub.status.idle": "2022-05-23T23:52:19.116417Z", + "shell.execute_reply": "2022-05-23T23:52:19.115262Z" + }, + "papermill": { + "duration": 0.103004, + "end_time": "2022-05-23T23:52:19.118690", + "exception": false, + "start_time": "2022-05-23T23:52:19.015686", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(np.float64(0.3828350034484158), np.float64(0.3064437162277842))" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gini(df.Sex=='female'), gini(df.Sex=='male')" + ] + }, + { + "cell_type": "markdown", + "id": "fd8824b5", + "metadata": { + "papermill": { + "duration": 0.084331, + "end_time": "2022-05-23T23:52:19.288293", + "exception": false, + "start_time": "2022-05-23T23:52:19.203962", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Let's see how this model compares to our OneR version:" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "176028cc", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:19.457789Z", + "iopub.status.busy": "2022-05-23T23:52:19.457491Z", + "iopub.status.idle": "2022-05-23T23:52:19.466175Z", + "shell.execute_reply": "2022-05-23T23:52:19.465462Z" + }, + "papermill": { + "duration": 0.095697, + "end_time": "2022-05-23T23:52:19.468099", + "exception": false, + "start_time": "2022-05-23T23:52:19.372402", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "np.float64(0.2242152466367713)" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"mean_absolute_error(val_y, m.predict(val_xs))" + ] + }, + { + "cell_type": "markdown", + "id": "a4ca0406", + "metadata": { + "papermill": { + "duration": 0.083894, + "end_time": "2022-05-23T23:52:19.636103", + "exception": false, + "start_time": "2022-05-23T23:52:19.552209", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "It's a tiny bit worse. Since this is such a small dataset (we've only got around 200 rows in our validation set) this small difference isn't really meaningful. Perhaps we'll see better results if we create a bigger tree:" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "b81bfa65", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:19.807239Z", + "iopub.status.busy": "2022-05-23T23:52:19.806662Z", + "iopub.status.idle": "2022-05-23T23:52:19.855092Z", + "shell.execute_reply": "2022-05-23T23:52:19.854063Z" + }, + "papermill": { + "duration": 0.136732, + "end_time": "2022-05-23T23:52:19.857341", + "exception": false, + "start_time": "2022-05-23T23:52:19.720609", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Tree\n", + "\n", + "\n", + "\n", + "0\n", + "\n", + "Sex ≤ 0.5\n", + "gini = 0.47\n", + "samples = 668\n", + "value = [415, 253]\n", + "\n", + "\n", + "\n", + "1\n", + "\n", + "Pclass ≤ 2.5\n", + "gini = 0.38\n", + "samples = 229\n", + "value = [59, 170]\n", + "\n", + "\n", + "\n", + "0->1\n", + "\n", + "\n", + "True\n", + "\n", + "\n", + "\n", + "8\n", + "\n", + "LogFare ≤ 3.31\n", + "gini = 0.31\n", + "samples = 439\n", + "value = [356, 83]\n", + "\n", + "\n", + "\n", + "0->8\n", + "\n", + "\n", + "False\n", + "\n", + "\n", + "\n", + "2\n", + "\n", + "SibSp ≤ 0.5\n", + "gini = 0.06\n", + "samples = 120\n", + "value = [4, 116]\n", + "\n", + "\n", + "\n", + "1->2\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "5\n", + "\n", + "LogFare ≤ 2.7\n", + "gini = 0.5\n", + "samples = 109\n", + 
"value = [55, 54]\n", + "\n", + "\n", + "\n", + "1->5\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "3\n", + "\n", + "gini = 0.03\n", + "samples = 67\n", + "value = [1, 66]\n", + "\n", + "\n", + "\n", + "2->3\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "4\n", + "\n", + "gini = 0.11\n", + "samples = 53\n", + "value = [3, 50]\n", + "\n", + "\n", + "\n", + "2->4\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "6\n", + "\n", + "gini = 0.49\n", + "samples = 59\n", + "value = [25, 34]\n", + "\n", + "\n", + "\n", + "5->6\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "7\n", + "\n", + "gini = 0.48\n", + "samples = 50\n", + "value = [30, 20]\n", + "\n", + "\n", + "\n", + "5->7\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "9\n", + "\n", + "Age ≤ 20.5\n", + "gini = 0.24\n", + "samples = 320\n", + "value = [275, 45]\n", + "\n", + "\n", + "\n", + "8->9\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "18\n", + "\n", + "SibSp ≤ 0.5\n", + "gini = 0.43\n", + "samples = 119\n", + "value = [81, 38]\n", + "\n", + "\n", + "\n", + "8->18\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "10\n", + "\n", + "gini = 0.43\n", + "samples = 55\n", + "value = [38, 17]\n", + "\n", + "\n", + "\n", + "9->10\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "11\n", + "\n", + "Age ≤ 32.5\n", + "gini = 0.19\n", + "samples = 265\n", + "value = [237, 28]\n", + "\n", + "\n", + "\n", + "9->11\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "12\n", + "\n", + "Age ≤ 24.75\n", + "gini = 0.22\n", + "samples = 181\n", + "value = [158, 23]\n", + "\n", + "\n", + "\n", + "11->12\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "17\n", + "\n", + "gini = 0.11\n", + "samples = 84\n", + "value = [79, 5]\n", + "\n", + "\n", + "\n", + "11->17\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "13\n", + "\n", + "LogFare ≤ 2.18\n", + "gini = 0.16\n", + "samples = 114\n", + "value = [104, 10]\n", + "\n", + "\n", + "\n", + "12->13\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "16\n", + "\n", + "gini = 0.31\n", + "samples = 67\n", + "value = 
[54, 13]\n", + "\n", + "\n", + "\n", + "12->16\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "14\n", + "\n", + "gini = 0.21\n", + "samples = 50\n", + "value = [44, 6]\n", + "\n", + "\n", + "\n", + "13->14\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "15\n", + "\n", + "gini = 0.12\n", + "samples = 64\n", + "value = [60, 4]\n", + "\n", + "\n", + "\n", + "13->15\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "19\n", + "\n", + "gini = 0.48\n", + "samples = 60\n", + "value = [36, 24]\n", + "\n", + "\n", + "\n", + "18->19\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "20\n", + "\n", + "gini = 0.36\n", + "samples = 59\n", + "value = [45, 14]\n", + "\n", + "\n", + "\n", + "18->20\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m = DecisionTreeClassifier(min_samples_leaf=50)\n", + "m.fit(trn_xs, trn_y)\n", + "draw_tree(m, trn_xs, size=12)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "5d82961e", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:20.034875Z", + "iopub.status.busy": "2022-05-23T23:52:20.034551Z", + "iopub.status.idle": "2022-05-23T23:52:20.044976Z", + "shell.execute_reply": "2022-05-23T23:52:20.044315Z" + }, + "papermill": { + "duration": 0.10291, + "end_time": "2022-05-23T23:52:20.046905", + "exception": false, + "start_time": "2022-05-23T23:52:19.943995", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "np.float64(0.18385650224215247)" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mean_absolute_error(val_y, m.predict(val_xs))" + ] + }, + { + "cell_type": "markdown", + "id": "ccfb9c35", + "metadata": { + "papermill": { + "duration": 0.085949, + "end_time": "2022-05-23T23:52:20.220973", + "exception": false, + "start_time": "2022-05-23T23:52:20.135024", + "status": 
"completed" + }, + "tags": [] + }, + "source": [ + "It looks like this is an improvement, although again it's a bit hard to tell with small datasets like this. Let's try submitting it to Kaggle:" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "061feaf1", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:20.396929Z", + "iopub.status.busy": "2022-05-23T23:52:20.396607Z", + "iopub.status.idle": "2022-05-23T23:52:20.418077Z", + "shell.execute_reply": "2022-05-23T23:52:20.417266Z" + }, + "papermill": { + "duration": 0.112139, + "end_time": "2022-05-23T23:52:20.420499", + "exception": false, + "start_time": "2022-05-23T23:52:20.308360", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tst_df[cats] = tst_df[cats].apply(lambda x: x.cat.codes)\n", + "tst_xs,_ = xs_y(tst_df)\n", + "\n", + "def subm(preds, suff):\n", + " tst_df['Survived'] = preds\n", + " sub_df = tst_df[['PassengerId','Survived']]\n", + " sub_df.to_csv(f'sub-{suff}.csv', index=False)\n", + "\n", + "subm(m.predict(tst_xs), 'tree')" + ] + }, + { + "cell_type": "markdown", + "id": "d66f526e", + "metadata": { + "papermill": { + "duration": 0.085412, + "end_time": "2022-05-23T23:52:20.593822", + "exception": false, + "start_time": "2022-05-23T23:52:20.508410", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "When I submitted this, I got a score of 0.765, which isn't as good as our linear models or most of our neural nets, but it's pretty close to those results.\n", + "\n", + "Hopefully you can now see why we didn't really need to create dummy variables, but instead just converted the labels into numbers using some (potentially arbitary) ordering of categories. 
For instance, here's how the first few items of `Embarked` are labeled:" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "44582a9f", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:20.770788Z", + "iopub.status.busy": "2022-05-23T23:52:20.770281Z", + "iopub.status.idle": "2022-05-23T23:52:20.778767Z", + "shell.execute_reply": "2022-05-23T23:52:20.777390Z" + }, + "papermill": { + "duration": 0.102015, + "end_time": "2022-05-23T23:52:20.781428", + "exception": false, + "start_time": "2022-05-23T23:52:20.679413", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 S\n", + "1 C\n", + "2 S\n", + "3 S\n", + "4 S\n", + "Name: Embarked, dtype: category\n", + "Categories (3, object): ['C', 'Q', 'S']" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.Embarked.head()" + ] + }, + { + "cell_type": "markdown", + "id": "dac18715", + "metadata": { + "papermill": { + "duration": 0.087676, + "end_time": "2022-05-23T23:52:20.957609", + "exception": false, + "start_time": "2022-05-23T23:52:20.869933", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "...resulting in these integer codes:" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "70dcfd2b", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:21.134608Z", + "iopub.status.busy": "2022-05-23T23:52:21.133972Z", + "iopub.status.idle": "2022-05-23T23:52:21.141145Z", + "shell.execute_reply": "2022-05-23T23:52:21.140254Z" + }, + "papermill": { + "duration": 0.09824, + "end_time": "2022-05-23T23:52:21.143476", + "exception": false, + "start_time": "2022-05-23T23:52:21.045236", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 2\n", + "1 0\n", + "2 2\n", + "3 2\n", + "4 2\n", + "dtype: int8" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "df.Embarked.cat.codes.head()" + ] + }, + { + "cell_type": "markdown", + "id": "3daae03b", + "metadata": { + "papermill": { + "duration": 0.08905, + "end_time": "2022-05-23T23:52:21.320942", + "exception": false, + "start_time": "2022-05-23T23:52:21.231892", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "So let's say we wanted to split into \"C\" in one group, vs \"Q\" or \"S\" in the other group. Then we just have to split on codes `<=0` (since `C` is mapped to category `0`). Note that if we wanted to split into \"Q\" in one group, we'd need to use two binary splits, first to separate \"C\" from \"Q\" and \"S\", and then a second split to separate \"Q\" from \"S\". For this reason, sometimes it can still be helpful to use dummy variables for categorical variables with few levels (like this one).\n", + "\n", + "In practice, I often use dummy variables for <4 levels, and numeric codes for >=4 levels." + ] + }, + { + "cell_type": "markdown", + "id": "2b45887e", + "metadata": { + "papermill": { + "duration": 0.086779, + "end_time": "2022-05-23T23:52:21.494320", + "exception": false, + "start_time": "2022-05-23T23:52:21.407541", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## The random forest" + ] + }, + { + "cell_type": "markdown", + "id": "1976d0fb", + "metadata": { + "papermill": { + "duration": 0.086362, + "end_time": "2022-05-23T23:52:21.667495", + "exception": false, + "start_time": "2022-05-23T23:52:21.581133", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We can't make the decision tree much bigger than the example above, since some leaf nodes already have only 50 rows in them. That's not a lot of data to make a prediction.\n", + "\n", + "So how could we use bigger trees? One big insight came from Leo Breiman: what if we create lots of bigger trees, and take the average of their predictions? 
def get_tree(prop=0.75):
    """Fit a decision tree on a random sample of the training rows.

    Args:
        prop: Sample size expressed as a fraction of the training-set length.

    Returns:
        A ``DecisionTreeClassifier`` (``min_samples_leaf=5``) fitted on the
        sampled rows of ``trn_xs`` / ``trn_y``.
    """
    n_rows = len(trn_y)
    n_sample = int(n_rows * prop)
    # NOTE(review): `random` is presumably numpy.random here (the call passes a
    # population size and a sample size) — confirm against the notebook's imports.
    sample_idxs = random.choice(n_rows, n_sample)
    sample_xs = trn_xs.iloc[sample_idxs]
    sample_y = trn_y.iloc[sample_idxs]
    tree = DecisionTreeClassifier(min_samples_leaf=5)
    return tree.fit(sample_xs, sample_y)
"2022-05-23T23:52:22.211276Z", + "iopub.status.idle": "2022-05-23T23:52:22.483121Z", + "shell.execute_reply": "2022-05-23T23:52:22.482418Z" + }, + "papermill": { + "duration": 0.364166, + "end_time": "2022-05-23T23:52:22.485456", + "exception": false, + "start_time": "2022-05-23T23:52:22.121290", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "trees = [get_tree() for t in range(100)]" + ] + }, + { + "cell_type": "markdown", + "id": "4171fcfb", + "metadata": { + "papermill": { + "duration": 0.086759, + "end_time": "2022-05-23T23:52:22.659496", + "exception": false, + "start_time": "2022-05-23T23:52:22.572737", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Our prediction will be the average of these trees' predictions:" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "69a26910", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:22.835922Z", + "iopub.status.busy": "2022-05-23T23:52:22.835355Z", + "iopub.status.idle": "2022-05-23T23:52:22.976514Z", + "shell.execute_reply": "2022-05-23T23:52:22.975799Z" + }, + "papermill": { + "duration": 0.231614, + "end_time": "2022-05-23T23:52:22.978449", + "exception": false, + "start_time": "2022-05-23T23:52:22.746835", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "np.float64(0.22748878923766816)" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_probs = [t.predict(val_xs) for t in trees]\n", + "avg_probs = np.stack(all_probs).mean(0)\n", + "\n", + "mean_absolute_error(val_y, avg_probs)" + ] + }, + { + "cell_type": "markdown", + "id": "2ab6c8b4", + "metadata": { + "papermill": { + "duration": 0.086497, + "end_time": "2022-05-23T23:52:23.151915", + "exception": false, + "start_time": "2022-05-23T23:52:23.065418", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "This is nearly identical to what `sklearn`'s 
`RandomForestClassifier` does. The main extra piece in a \"real\" random forest is that as well as choosing a random sample of data for each tree, it also picks a random subset of columns for each split. Here's how we repeat the above process with a random forest:" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "3ec0ada5", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:23.328862Z", + "iopub.status.busy": "2022-05-23T23:52:23.328403Z", + "iopub.status.idle": "2022-05-23T23:52:23.622038Z", + "shell.execute_reply": "2022-05-23T23:52:23.620968Z" + }, + "papermill": { + "duration": 0.385511, + "end_time": "2022-05-23T23:52:23.624308", + "exception": false, + "start_time": "2022-05-23T23:52:23.238797", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "np.float64(0.18834080717488788)" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "rf = RandomForestClassifier(100, min_samples_leaf=5)\n", + "rf.fit(trn_xs, trn_y);\n", + "mean_absolute_error(val_y, rf.predict(val_xs))" + ] + }, + { + "cell_type": "markdown", + "id": "2495c6d7", + "metadata": { + "papermill": { + "duration": 0.086944, + "end_time": "2022-05-23T23:52:23.799357", + "exception": false, + "start_time": "2022-05-23T23:52:23.712413", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We can submit that to Kaggle too:" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "83c6187f", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:23.979091Z", + "iopub.status.busy": "2022-05-23T23:52:23.978811Z", + "iopub.status.idle": "2022-05-23T23:52:24.005874Z", + "shell.execute_reply": "2022-05-23T23:52:24.005221Z" + }, + "papermill": { + "duration": 0.120066, + "end_time": "2022-05-23T23:52:24.008236", + "exception": false, + "start_time": 
"2022-05-23T23:52:23.888170", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "subm(rf.predict(tst_xs), 'rf')" + ] + }, + { + "cell_type": "markdown", + "id": "1afe6ee9", + "metadata": { + "papermill": { + "duration": 0.086793, + "end_time": "2022-05-23T23:52:24.182155", + "exception": false, + "start_time": "2022-05-23T23:52:24.095362", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "I found that gave a nearly identical result to our single tree (which, in turn, was slightly lower than our linear and neural net models in the previous notebook)." + ] + }, + { + "cell_type": "markdown", + "id": "86e64d72", + "metadata": { + "papermill": { + "duration": 0.087065, + "end_time": "2022-05-23T23:52:24.357493", + "exception": false, + "start_time": "2022-05-23T23:52:24.270428", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "One particularly nice feature of random forests is they can tell us which independent variables were the most important in the model, using `feature_importances_`:" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "3af99972", + "metadata": { + "execution": { + "iopub.execute_input": "2022-05-23T23:52:24.533906Z", + "iopub.status.busy": "2022-05-23T23:52:24.533390Z", + "iopub.status.idle": "2022-05-23T23:52:24.766416Z", + "shell.execute_reply": "2022-05-23T23:52:24.765411Z" + }, + "papermill": { + "duration": 0.324146, + "end_time": "2022-05-23T23:52:24.768794", + "exception": false, + "start_time": "2022-05-23T23:52:24.444648", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "pd.DataFrame(dict(cols=trn_xs.columns, imp=m.feature_importances_)).plot('cols', 'imp', 'barh');" + ] + }, + { + "cell_type": "markdown", + "id": "288344e4", + "metadata": { + "papermill": { + "duration": 0.088243, + "end_time": "2022-05-23T23:52:24.947669", + "exception": false, + "start_time": "2022-05-23T23:52:24.859426", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We can see that `Sex` is by far the most important predictor, with `Pclass` a distant second, and `LogFare` and `Age` behind that. In datasets with many columns, I generally recommend creating a feature importance plot as soon as possible, in order to find which columns are worth studying more closely. (Note also that we didn't really need to take the `log()` of `Fare`, since random forests only care about order, and `log()` doesn't change the order -- we only did it to make our earlier graphs easier to read.)\n", + "\n", + "For details about deriving and understanding feature importances, and the many other important diagnostic tools provided by random forests, take a look at [chapter 9](https://github.com/fastai/fastbook/blob/master/09_tabular.ipynb) of [our book](https://www.amazon.com/Deep-Learning-Coders-fastai-PyTorch/dp/1492045527)." 
+ ] + }, + { + "cell_type": "markdown", + "id": "2721efdd", + "metadata": { + "papermill": { + "duration": 0.089775, + "end_time": "2022-05-23T23:52:25.126465", + "exception": false, + "start_time": "2022-05-23T23:52:25.036690", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Conclusion" + ] + }, + { + "cell_type": "markdown", + "id": "3c6d7087", + "metadata": { + "papermill": { + "duration": 0.090308, + "end_time": "2022-05-23T23:52:25.305738", + "exception": false, + "start_time": "2022-05-23T23:52:25.215430", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "So what can we take away from all this?\n", + "\n", + "I think the first thing I'd note from this is that, clearly, more complex models aren't always better. Our \"OneR\" model, consisting of a single binary split, was nearly as good as our more complex models. Perhaps in practice a simple model like this might be much easier to use, and could be worth considering. Our random forest wasn't an improvement on the single decision tree at all.\n", + "\n", + "So we should always be careful to benchmark simple models, and see if they're good enough for our needs. In practice, you will often find that simple models will have trouble providing adequate accuracy for more complex tasks, such as recommendation systems, NLP, computer vision, or multivariate time series. But there's no need to guess -- it's so easy to try a few different models, there's no reason not to give the simpler ones a go too!\n", + "\n", + "Another thing I think we can take away is that random forests aren't actually that complicated at all. We were able to implement the key features of them in a notebook quite quickly. And they aren't sensitive to issues like normalization, interactions, or non-linear transformations, which makes them extremely easy to work with, and hard to mess up!" 
+ ] + }, + { + "cell_type": "markdown", + "id": "7b4df4a4", + "metadata": { + "papermill": { + "duration": 0.087358, + "end_time": "2022-05-23T23:52:25.482959", + "exception": false, + "start_time": "2022-05-23T23:52:25.395601", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "If you found this notebook useful, please remember to click the little up-arrow at the top to upvote it, since I like to know when people have found my work useful, and it helps others find it too. (BTW, be sure you're looking at my [original notebook here](https://www.kaggle.com/jhoward/how-random-forests-work) when you do that, and are not on your own copy of it, otherwise your upvote won't get counted!) And if you have any questions or comments, please pop them below -- I read every comment I receive!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4fecbc09", + "metadata": { + "papermill": { + "duration": 0.088049, + "end_time": "2022-05-23T23:52:25.659455", + "exception": false, + "start_time": "2022-05-23T23:52:25.571406", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + }, + "papermill": { + "default_parameters": {}, + "duration": 33.243357, + "end_time": "2022-05-23T23:52:26.679734", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2022-05-23T23:51:53.436377", + "version": "2.3.4" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "0b8cc982889f45fdb1033ccafe122f0a": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatSliderModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatSliderModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "FloatSliderView", + "continuous_update": true, + "description": "split", + "description_tooltip": null, + "disabled": false, + "layout": "IPY_MODEL_65f92e60825b419290deba2c44139999", + "max": 46.5, + "min": -15.5, + "orientation": "horizontal", + "readout": true, + "readout_format": ".2f", + "step": 0.1, + "style": "IPY_MODEL_b13d39f611c4484abb5554ef7c63eb2b", + "value": 15.5 + } + }, + "23b034cbd54b44e79b3a6145ba2958bd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": 
null + } + }, + "31db31dc75ee4255bc151e13488a9742": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3531f9ef201a4046ad01de663d88b766": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "IntSliderModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "IntSliderModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "IntSliderView", + "continuous_update": true, + "description": "split", + "description_tooltip": null, + "disabled": false, + "layout": "IPY_MODEL_31db31dc75ee4255bc151e13488a9742", + "max": 6, + "min": -2, + "orientation": "horizontal", + "readout": true, + "readout_format": "d", + "step": 1, + "style": 
"IPY_MODEL_ef14a963d4464fcca9bd5f220e3cc6e8", + "value": 2 + } + }, + "3a8c1e7fe19b4897afbf7a8afdc727e7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "_dom_classes": [ + "widget-interact" + ], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "VBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "VBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ae6af183567d4b24b08275d9c029a4b4", + "IPY_MODEL_3531f9ef201a4046ad01de663d88b766", + "IPY_MODEL_400c599e06fb4b56b19bf0feb4b6fd9a" + ], + "layout": "IPY_MODEL_d59abcbb498440be9a46da1e06af0cad" + } + }, + "3c94e92aa5a1464b97e34c6a67ff58e0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "400c599e06fb4b56b19bf0feb4b6fd9a": { + "model_module": "@jupyter-widgets/output", + "model_module_version": "1.0.0", + "model_name": "OutputModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/output", + "_model_module_version": "1.0.0", + "_model_name": "OutputModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/output", + "_view_module_version": "1.0.0", + "_view_name": "OutputView", + "layout": "IPY_MODEL_5becffbcacf0414d916acfd1cb12dbfe", + "msg_id": "", + "outputs": [ + { + "data": { + "text/plain": "0.4854373192013831" + }, + "metadata": {}, + "output_type": "display_data" + } + ] + } + }, + "5becffbcacf0414d916acfd1cb12dbfe": { + "model_module": "@jupyter-widgets/base", + "model_module_version": 
"1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "65f92e60825b419290deba2c44139999": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": 
null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8774162a3a05417793bd6858db7a9cf0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a9b3e19b0bb44c8aa1ae814e10407426": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "_dom_classes": [ + "widget-interact" + ], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "VBoxModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "VBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_b8a234465ac142698f56edd47c27194a", + "IPY_MODEL_0b8cc982889f45fdb1033ccafe122f0a", + "IPY_MODEL_d0a0589701364ab99133b75660ecac38" + ], + "layout": "IPY_MODEL_8774162a3a05417793bd6858db7a9cf0" + } + }, + "ae6af183567d4b24b08275d9c029a4b4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DropdownModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DropdownModel", + "_options_labels": [ + "Sex", + "Embarked" + ], + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "DropdownView", + "description": "nm", + "description_tooltip": null, + "disabled": false, + "index": 0, + "layout": "IPY_MODEL_f344a411d06b4828b293d8d1b95b5d2d", + "style": "IPY_MODEL_3c94e92aa5a1464b97e34c6a67ff58e0" + } + }, + "b13d39f611c4484abb5554ef7c63eb2b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "SliderStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "SliderStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "", + "handle_color": null + } + }, + "b8a234465ac142698f56edd47c27194a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DropdownModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DropdownModel", + "_options_labels": [ + "Age", + "SibSp", + "Parch", + "LogFare", + "Pclass" + ], + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "DropdownView", + "description": "nm", + "description_tooltip": null, + "disabled": false, + "index": 0, + "layout": "IPY_MODEL_f5726258f2ff4c3c977973151f1bc424", + "style": "IPY_MODEL_ce52044b466649ba9ef4d45558063b6e" + } + }, + "ce52044b466649ba9ef4d45558063b6e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d0a0589701364ab99133b75660ecac38": { + "model_module": "@jupyter-widgets/output", + "model_module_version": "1.0.0", + "model_name": "OutputModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/output", + "_model_module_version": "1.0.0", + "_model_name": "OutputModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/output", + "_view_module_version": "1.0.0", + "_view_name": "OutputView", + "layout": "IPY_MODEL_23b034cbd54b44e79b3a6145ba2958bd", + "msg_id": "", + "outputs": [ + { + "data": { + "text/plain": "0.48105614369455413" + }, + "metadata": {}, + "output_type": "display_data" + } + ] + } + }, + "d59abcbb498440be9a46da1e06af0cad": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + 
"grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ef14a963d4464fcca9bd5f220e3cc6e8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "SliderStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "SliderStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "", + "handle_color": null + } + }, + "f344a411d06b4828b293d8d1b95b5d2d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": 
null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f5726258f2ff4c3c977973151f1bc424": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + }, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ds701_book/img.jpg b/ds701_book/img.jpg new file mode 100644 index 00000000..ef8594ed Binary files /dev/null and b/ds701_book/img.jpg differ diff --git a/ds701_book/intro_dm_classification.ipynb b/ds701_book/intro_dm_classification.ipynb new file mode 100644 
index 00000000..9454fed1 --- /dev/null +++ b/ds701_book/intro_dm_classification.ipynb @@ -0,0 +1,1425 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Module 6: Classification\n", + "\n", + "The following tutorial contains Python examples for solving classification problems. You should refer to Chapters 3 \n", + "and 4 of the **\"Introduction to Data Mining\"** book to understand some of the concepts introduced in this tutorial. \n", + "This notebook can be downloaded from http://www.cse.msu.edu/~ptan/dmbook/tutorials/tutorial6/tutorial6.ipynb.\n", + "\n", + "The dataset can be downloaded from [here](http://www.cse.msu.edu/~ptan/dmbook/tutorials/tutorial6/vertebrate.csv).\n", + "\n", + "Classification is the task of predicting a nominal-valued attribute (known as the class label) based on the values of other\n", + "attributes (known as predictor variables). The goals for this tutorial are as follows:\n", + "1. To provide examples of using different classification techniques from the scikit-learn library package.\n", + "2. To demonstrate the problem of model overfitting.\n", + "\n", + "Read the step-by-step instructions below carefully. To execute the code, click on the corresponding cell and press the SHIFT-ENTER keys simultaneously.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6.1 Vertebrate Dataset\n", + "\n", + "We use a variation of the vertebrate data described in Example 3.1 of Chapter 3. Each vertebrate is classified into one of 5 categories: mammals, reptiles, birds, fishes, and amphibians, based on a set of explanatory attributes (predictor variables). Except for \"name\", the rest of the attributes have been converted into a *one hot encoding* binary representation. To illustrate this, we will first load the data into a Pandas DataFrame object and display its content." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameWarm-bloodedGives BirthAquatic CreatureAerial CreatureHas LegsHibernatesClass
0human110010mammals
1python000001reptiles
2salmon001000fishes
3whale111000mammals
4frog001011amphibians
5komodo000010reptiles
6bat110111mammals
7pigeon100110birds
8cat110010mammals
9leopard shark011000fishes
10turtle001010reptiles
11penguin101010birds
12porcupine110011mammals
13eel001000fishes
14salamander001011amphibians
\n", + "
" + ], + "text/plain": [ + " Name Warm-blooded Gives Birth Aquatic Creature \\\n", + "0 human 1 1 0 \n", + "1 python 0 0 0 \n", + "2 salmon 0 0 1 \n", + "3 whale 1 1 1 \n", + "4 frog 0 0 1 \n", + "5 komodo 0 0 0 \n", + "6 bat 1 1 0 \n", + "7 pigeon 1 0 0 \n", + "8 cat 1 1 0 \n", + "9 leopard shark 0 1 1 \n", + "10 turtle 0 0 1 \n", + "11 penguin 1 0 1 \n", + "12 porcupine 1 1 0 \n", + "13 eel 0 0 1 \n", + "14 salamander 0 0 1 \n", + "\n", + " Aerial Creature Has Legs Hibernates Class \n", + "0 0 1 0 mammals \n", + "1 0 0 1 reptiles \n", + "2 0 0 0 fishes \n", + "3 0 0 0 mammals \n", + "4 0 1 1 amphibians \n", + "5 0 1 0 reptiles \n", + "6 1 1 1 mammals \n", + "7 1 1 0 birds \n", + "8 0 1 0 mammals \n", + "9 0 0 0 fishes \n", + "10 0 1 0 reptiles \n", + "11 0 1 0 birds \n", + "12 0 1 1 mammals \n", + "13 0 0 0 fishes \n", + "14 0 1 1 amphibians " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "data = pd.read_csv('data/vertebrate.csv',header='infer')\n", + "data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Given the limited number of training examples, suppose we convert the problem into a binary classification task (mammals versus non-mammals). We can do so by replacing the class labels of the instances to *non-mammals* except for those that belong to the *mammals* class." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameWarm-bloodedGives BirthAquatic CreatureAerial CreatureHas LegsHibernatesClass
0human110010mammals
1python000001non-mammals
2salmon001000non-mammals
3whale111000mammals
4frog001011non-mammals
5komodo000010non-mammals
6bat110111mammals
7pigeon100110non-mammals
8cat110010mammals
9leopard shark011000non-mammals
10turtle001010non-mammals
11penguin101010non-mammals
12porcupine110011mammals
13eel001000non-mammals
14salamander001011non-mammals
\n", + "
" + ], + "text/plain": [ + " Name Warm-blooded Gives Birth Aquatic Creature \\\n", + "0 human 1 1 0 \n", + "1 python 0 0 0 \n", + "2 salmon 0 0 1 \n", + "3 whale 1 1 1 \n", + "4 frog 0 0 1 \n", + "5 komodo 0 0 0 \n", + "6 bat 1 1 0 \n", + "7 pigeon 1 0 0 \n", + "8 cat 1 1 0 \n", + "9 leopard shark 0 1 1 \n", + "10 turtle 0 0 1 \n", + "11 penguin 1 0 1 \n", + "12 porcupine 1 1 0 \n", + "13 eel 0 0 1 \n", + "14 salamander 0 0 1 \n", + "\n", + " Aerial Creature Has Legs Hibernates Class \n", + "0 0 1 0 mammals \n", + "1 0 0 1 non-mammals \n", + "2 0 0 0 non-mammals \n", + "3 0 0 0 mammals \n", + "4 0 1 1 non-mammals \n", + "5 0 1 0 non-mammals \n", + "6 1 1 1 mammals \n", + "7 1 1 0 non-mammals \n", + "8 0 1 0 mammals \n", + "9 0 0 0 non-mammals \n", + "10 0 1 0 non-mammals \n", + "11 0 1 0 non-mammals \n", + "12 0 1 1 mammals \n", + "13 0 0 0 non-mammals \n", + "14 0 1 1 non-mammals " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data['Class'] = data['Class'].replace(['fishes','birds','amphibians','reptiles'],'non-mammals')\n", + "data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can apply Pandas cross-tabulation to examine the relationship between the Warm-blooded and Gives Birth attributes with respect to the class. " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Classmammalsnon-mammals
Warm-bloodedGives Birth
0007
101
1002
150
\n", + "
" + ], + "text/plain": [ + "Class mammals non-mammals\n", + "Warm-blooded Gives Birth \n", + "0 0 0 7\n", + " 1 0 1\n", + "1 0 0 2\n", + " 1 5 0" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.crosstab([data['Warm-blooded'],data['Gives Birth']],data['Class'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The results above show that it is possible to distinguish mammals from non-mammals using these two attributes alone since each combination of their attribute values would yield only instances that belong to the same class. For example, mammals can be identified as warm-blooded vertebrates that give birth to their young. Such a relationship can also be derived using a decision tree classifier, as shown by the example given in the next subsection." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6.2 Decision Tree Classifier\n", + "\n", + "In this section, we apply a decision tree classifier to the vertebrate dataset described in the previous subsection." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import tree\n", + "\n", + "Y = data['Class']\n", + "X = data.drop(['Name','Class'],axis=1)\n", + "\n", + "clf = tree.DecisionTreeClassifier(criterion='entropy',max_depth=3)\n", + "clf = clf.fit(X, Y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The preceding commands will extract the predictor (X) and target class (Y) attributes from the vertebrate dataset and create a decision tree classifier object using entropy as the impurity measure for its splitting criterion. The decision tree class in the Python sklearn library also supports using 'gini' as the impurity measure. The classifier above is also constrained to generate trees with a maximum depth equal to 3. Next, the classifier is trained on the labeled data using the fit() function. 
\n", + "\n", + "We can plot the resulting decision tree obtained after training the classifier. To do this, you must first install both graphviz (http://www.graphviz.org) and its Python interface called pydotplus (http://pydotplus.readthedocs.io/)." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pydotplus \n", + "from IPython.display import Image\n", + "\n", + "dot_data = tree.export_graphviz(clf, feature_names=X.columns, class_names=['mammals','non-mammals'], filled=True, \n", + " out_file=None) \n", + "graph = pydotplus.graph_from_dot_data(dot_data) \n", + "Image(graph.create_png())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, suppose we apply the decision tree to classify the following test examples." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameWarm-bloodedGives BirthAquatic CreatureAerial CreatureHas LegsHibernatesClass
0gila monster000011non-mammals
1platypus100011mammals
2owl100110non-mammals
3dolphin111000mammals
\n", + "
" + ], + "text/plain": [ + " Name Warm-blooded Gives Birth Aquatic Creature Aerial Creature \\\n", + "0 gila monster 0 0 0 0 \n", + "1 platypus 1 0 0 0 \n", + "2 owl 1 0 0 1 \n", + "3 dolphin 1 1 1 0 \n", + "\n", + " Has Legs Hibernates Class \n", + "0 1 1 non-mammals \n", + "1 1 1 mammals \n", + "2 1 0 non-mammals \n", + "3 0 0 mammals " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "testData = [['gila monster',0,0,0,0,1,1,'non-mammals'],\n", + " ['platypus',1,0,0,0,1,1,'mammals'],\n", + " ['owl',1,0,0,1,1,0,'non-mammals'],\n", + " ['dolphin',1,1,1,0,0,0,'mammals']]\n", + "testData = pd.DataFrame(testData, columns=data.columns)\n", + "testData" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We first extract the predictor and target class attributes from the test data and then apply the decision tree classifier to predict their classes." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NamePredicted Class
0gila monsternon-mammals
1platypusnon-mammals
2owlnon-mammals
3dolphinmammals
\n", + "
" + ], + "text/plain": [ + " Name Predicted Class\n", + "0 gila monster non-mammals\n", + "1 platypus non-mammals\n", + "2 owl non-mammals\n", + "3 dolphin mammals" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "testY = testData['Class']\n", + "testX = testData.drop(['Name','Class'],axis=1)\n", + "\n", + "predY = clf.predict(testX)\n", + "predictions = pd.concat([testData['Name'],pd.Series(predY,name='Predicted Class')], axis=1)\n", + "predictions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Except for platypus, which is an egg-laying mammal, the classifier correctly predicts the class label of the test examples. We can calculate the accuracy of the classifier on the test data as shown by the example given below." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy on test data is 0.75\n" + ] + } + ], + "source": [ + "from sklearn.metrics import accuracy_score\n", + "\n", + "print('Accuracy on test data is %.2f' % (accuracy_score(testY, predY)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6.3 Model Overfitting\n", + "\n", + "To illustrate the problem of model overfitting, we consider a two-dimensional dataset containing 1500 labeled instances, each of which is assigned to one of two classes, 0 or 1. Instances from each class are generated as follows:\n", + "1. Instances from class 1 are generated from a mixture of 3 Gaussian distributions, centered at [6,14], [10,6], and [14,14], respectively. \n", + "2. Instances from class 0 are generated from a uniform distribution in a square region, whose sides have a length equal to 20.\n", + "\n", + "For simplicity, both classes have an equal number of labeled instances. The code for generating and plotting the data is shown below. 
All instances from class 1 are shown in red while those from class 0 are shown in black." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[,\n", + " ]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from numpy.random import random\n", + "\n", + "%matplotlib inline\n", + "\n", + "N = 1500\n", + "\n", + "mean1 = [6, 14]\n", + "mean2 = [10, 6]\n", + "mean3 = [14, 14]\n", + "cov = [[3.5, 0], [0, 3.5]] # diagonal covariance\n", + "\n", + "np.random.seed(50)\n", + "X = np.random.multivariate_normal(mean1, cov, int(N/6))\n", + "X = np.concatenate((X, np.random.multivariate_normal(mean2, cov, int(N/6))))\n", + "X = np.concatenate((X, np.random.multivariate_normal(mean3, cov, int(N/6))))\n", + "X = np.concatenate((X, 20*np.random.rand(int(N/2),2)))\n", + "Y = np.concatenate((np.ones(int(N/2)),np.zeros(int(N/2))))\n", + "\n", + "plt.plot(X[:int(N/2),0],X[:int(N/2),1],'r+',X[int(N/2):,0],X[int(N/2):,1],'k.',ms=4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, we reserve 20% of the labeled data for training and the remaining 80% for testing (the code below passes test_size=0.8 to train_test_split). We then fit decision trees of different maximum depths (from 2 to 50) to the training set and plot their respective accuracies when applied to the training and test sets. " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'Accuracy')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#########################################\n", + "# Training and Test set creation\n", + "#########################################\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.8, random_state=1)\n", + "\n", + "from sklearn import tree\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "#########################################\n", + "# Model fitting and evaluation\n", + "#########################################\n", + "\n", + "maxdepths = [2,3,4,5,6,7,8,9,10,15,20,25,30,35,40,45,50]\n", + "\n", + "trainAcc = np.zeros(len(maxdepths))\n", + "testAcc = np.zeros(len(maxdepths))\n", + "\n", + "index = 0\n", + "for depth in maxdepths:\n", + " clf = tree.DecisionTreeClassifier(max_depth=depth)\n", + " clf = clf.fit(X_train, Y_train)\n", + " Y_predTrain = clf.predict(X_train)\n", + " Y_predTest = clf.predict(X_test)\n", + " trainAcc[index] = accuracy_score(Y_train, Y_predTrain)\n", + " testAcc[index] = accuracy_score(Y_test, Y_predTest)\n", + " index += 1\n", + " \n", + "#########################################\n", + "# Plot of training and test accuracies\n", + "#########################################\n", + " \n", + "plt.plot(maxdepths,trainAcc,'ro-',maxdepths,testAcc,'bv--')\n", + "plt.legend(['Training Accuracy','Test Accuracy'])\n", + "plt.xlabel('Max depth')\n", + "plt.ylabel('Accuracy')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The plot above shows that training accuracy will continue to improve as the maximum depth of the tree increases (i.e., as the model becomes more complex). However, the test accuracy initially improves up to a maximum depth of 5, before it gradually decreases due to model overfitting." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6.4 Alternative Classification Techniques\n", + "\n", + "Besides the decision tree classifier, the Python sklearn library also supports other classification techniques. In this section, we provide examples to illustrate how to apply the k-nearest neighbor classifier, linear classifiers (logistic regression and support vector machine), as well as ensemble methods (boosting, bagging, and random forest) to the 2-dimensional data given in the previous section." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6.4.1 K-Nearest Neighbor Classifier\n", + "\n", + "In this approach, the class label of a test instance is predicted based on the majority class of its *k* closest training instances. The number of nearest neighbors, *k*, is a hyperparameter that must be provided by the user, along with the distance metric. By default, we can use Euclidean distance (which is equivalent to Minkowski distance with an exponent of p=2):\n", + "\n", + "\\begin{equation*}\n", + "\\textrm{Minkowski distance}(x,y) = \\bigg[\\sum_{i=1}^N |x_i-y_i|^p \\bigg]^{\\frac{1}{p}}\n", + "\\end{equation*}" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'Accuracy')" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.neighbors import KNeighborsClassifier\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "numNeighbors = [1, 5, 10, 15, 20, 25, 30]\n", + "trainAcc = []\n", + "testAcc = []\n", + "\n", + "for k in numNeighbors:\n", + " clf = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2)\n", + " clf.fit(X_train, Y_train)\n", + " Y_predTrain = clf.predict(X_train)\n", + " Y_predTest = clf.predict(X_test)\n", + " trainAcc.append(accuracy_score(Y_train, Y_predTrain))\n", + " testAcc.append(accuracy_score(Y_test, Y_predTest))\n", + "\n", + "plt.plot(numNeighbors, trainAcc, 'ro-', numNeighbors, testAcc,'bv--')\n", + "plt.legend(['Training Accuracy','Test Accuracy'])\n", + "plt.xlabel('Number of neighbors')\n", + "plt.ylabel('Accuracy')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6.4.2 Linear Classifiers\n", + "\n", + "Linear classifiers such as logistic regression and support vector machines (SVM) construct a linear separating hyperplane to distinguish instances from different classes. 
\n", + "\n", + "For logistic regression, the model can be described by the following equation:\n", + "\\begin{equation*}\n", + "P(y=1|x) = \\frac{1}{1 + e^{-w^Tx - b}} = \\sigma(w^Tx + b)\n", + "\\end{equation*}\n", + "The model parameters (w,b) are estimated by optimizing the following regularized negative log-likelihood function:\n", + "\\begin{equation*}\n", + "(w^*,b^*) = \\arg\\min_{w,b} - \\sum_{i=1}^N \\bigg\\{ y_i \\log\\bigg[\\sigma(w^Tx_i + b)\\bigg] + (1-y_i) \\log\\bigg[\\sigma(-w^Tx_i - b)\\bigg] \\bigg\\} + \\frac{1}{C} \\Omega([w,b])\n", + "\\end{equation*}\n", + "where $C$ is a hyperparameter that is the inverse of the regularization strength (smaller values imply stronger regularization) while $\\Omega(\\cdot)$ is the regularization term, which, by default, is assumed to be an $l_2$-norm in sklearn.\n", + "\n", + "For the support vector machine, the model parameters $(w^*,b^*)$ are estimated by solving the following constrained optimization problem:\n", + "\\begin{eqnarray*}\n", + "&&\\min_{w,b,\\{\\xi_i\\}} \\frac{\\|w\\|^2}{2} + C \\sum_i \\xi_i \\\\\n", + "\\textrm{s.t.} && \\forall i: y_i\\bigg[w^T \\phi(x_i) + b\\bigg] \\ge 1 - \\xi_i, \\ \\ \\xi_i \\ge 0 \n", + "\\end{eqnarray*}" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'Accuracy')" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA/oAAAIRCAYAAAD6EsZUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/TGe4hAAAACXBIWXMAAA9hAAAPYQGoP6dpAACQ3UlEQVR4nOzde1wU9f7H8fdyFxS8BXhBNDXF8hYi4jHvhVlpaCc1SzPDbmrqL1NPaabniCfLrLRjxyPYybxUZmaWaV6ywpOmkqamedcC1FQIL6C78/tjYnXlIiC4sLyej8c82Jn57nc+O8zy5TPzne9YDMMwBAAAAAAAXIKbswMAAAAAAADFh0QfAAAAAAAXQqIPAAAAAIALIdEHAAAAAMCFkOgDAAAAAOBCSPQBAAAAAHAhJPoAAAAAALgQD2cHUFbZbDb99ttvqlSpkiwWi7PDAQBAhmHojz/+UM2aNeXmxrn860VbDwAobQra1pPoF9Fvv/2mkJAQZ4cBAEAOR48eVe3atZ0dRplHWw8AKK2u1daT6BdRpUqVJJk72N/f38nRAAAgpaenKyQkxN5G4frQ1gMASpuCtvUk+kWU3YXP39+fxh8AUKrQzbx40NYDAEqra7X13MAHAAAAAIALIdEHAAAAAMCFkOgDAAAAAOBCuEcfAGQ+RisrK8vZYQD58vT0lLu7u7PDAAAApRyJPoByLysrSwcPHpTNZnN2KMA1Va5cWcHBwQy4BwAA8kSiD6BcMwxDycnJcnd3V0hIiNzcuKMJpZNhGDp37pyOHz8uSapRo4aTIwIAAKUViT6Acu3SpUs6d+6catasKV9fX2eHA+SrQoUKkqTjx48rMDCQbvwAACBXXLoCUK5ZrVZJkpeXl5MjAQom+4TUxYsXnRwJAAAorUj0AUDifmeUGRyrAADgWkj0AQAAAABwIST6AABJUt26dTVjxowCl1+/fr0sFovOnDlTYjEBAACg8Ej0AaA4WK3S+vXSwoXmzz/v/S8JFosl32nixIlFqnfz5s0aMmRIgcu3bdtWycnJCggIKNL2iqJx48by9vZWSkrKDdsmAABAWUOiDwDX6+OPpbp1pU6dpIceMn/WrWsuLwHJycn2acaMGfL393dY9txzz9nLGoahS5cuFajem266qVBPHvDy8rqhz3P/9ttvdf78eT3wwAN69913b8g288NgeAAAoLQi0Xe2G3gVEEAJ+Phj6YEHpGPHHJf/+qu5vASS/eDgYPsUEBAgi8Vin//5559VqVIlffHFFwoPD5e3t7e+/fZb7d+/Xz179lRQUJAqVqyoiIgIffXVVw71Xt1132Kx6D//+Y9iYmLk6+urhg0b6tNPP7Wvv7rr/rx581S5cmV9+eWXCgsLU8WKFdWtWzclJyfb33Pp0iUNHz5clStXVrVq1TRmzBgNHDhQ999//zU/99y5c/XQQw/pkUceUXx8fI71x44dU79+/VS1alX5+fmpVatW+v777+3rly9froiICPn4+Kh69eqKiYlx+KyffPKJQ32VK1fWvHnzJEmHDh2SxWLR4sWL1aFDB/n4+Oj999/X77//rn79+qlWrVry9fVV06ZNtXDhQod6bDabXnnlFTVo0EDe3t6qU6eO/vGPf0iSOnfurKFDhzqUP3HihLy8vLRmzZpr7hOUEbT1pZszfz9l5dgoiTgLWmdZ2UfOVF6P4bJybDgrTsPJZs6caYSGhhre3t5G69atje+//z7PsgkJCYYkh8nb29uhzB9//GE888wzRq1atQwfHx8jLCzM+Ne//uVQ5vz588bTTz9tVK1a1fDz8zN69eplpKSkFCrutLQ0Q5KRlpZWqPc5WLLEMGrXNgzp8lS7trkcwA1x/vx5Y9euXcb58+fNBTabYWRkFGxKSzOMWrUcv8NXThaL+Z1OSytYfTZboeNPSEgwAgIC7PPr1q0zJBnNmjUzVq1aZezbt8/4/fffjaSkJGP27NnGjh07jL179xovvvii4ePjYxw+fNj
+3tDQUOP111+3z0syateubSxYsMD45ZdfjOHDhxsVK1Y0fv/9d4dtnT592h6Lp6en0bVrV2Pz5s3Gli1bjLCwMOOhhx6y1/n3v//dqFq1qvHxxx8bu3fvNp588knD39/f6NmzZ76fMz093fDz8zN++ukn49KlS0ZQUJCxYcMG+/o//vjDuPnmm4077rjD+Oabb4xffvnFWLx4sZGYmGgYhmF89tlnhru7uzFhwgRj165dRlJSkjFlyhSHz7p06VKHbQYEBBgJCQmGYRjGwYMHDUlG3bp1jSVLlhgHDhwwfvvtN+PYsWPGtGnTjG3bthn79+833nzzTcPd3d2hLXv++eeNKlWqGPPmzTP27dtnfPPNN8acOXMMwzCM999/36hSpYpx4cIFe/np06cbdevWNWx5HA85jtkrFEvbBDva+nLAmb+fsnJslEScBa2zrOwjZyqvx3BZOTZKIM6Ctk1OTfQXLVpkeHl5GfHx8cbOnTuN2NhYo3LlykZqamqu5RMSEgx/f38jOTnZPl2doMfGxhr169c31q1bZxw8eNB45513DHd3d2PZsmX2Mk8++aQREhJirFmzxvjhhx+MNm3aGG3bti1U7Nfd+C9ZYiYBuSUGFkvpO0gBF5UjacrIyDtxL+kpI6PQ8eeV6H/yySfXfO+tt95qvPXWW/b53BL9F1980T6fkZFhSDK++OILh21dmehLMvbt22d/z6xZs4ygoCD7fFBQkDFt2jT7/KVLl4w6depcM9H/97//bbRo0cI+/+yzzxoDBw60z7/zzjtGpUqV7CchrhYVFWX0798/z/oLmujPmDEj3zgNwzDuuece4//+7/8MwzBPUHh7e9sT+6udP3/eqFKlirF48WL7smbNmhkTJ07Ms34S/RuHtt7FOfP3U1aOjZKIs6B1lpV95Ezl9RguK8dGCcVZ0LbJ48b0G8jd9OnTFRsbq0GDBkmSZs+erRUrVig+Pl5jx47N9T3ZXVTzkpiYqIEDB6pjx46SpCFDhuidd97Rpk2b1KNHD6WlpWnu3LlasGCBOnfuLElKSEhQWFiY/ve//6lNmzbF+yFzY7VKzz5r/qqvZhiSxSKNGCH17Cm5u5d8PABcTqtWrRzmMzIyNHHiRK1YsULJycm6dOmSzp8/ryNHjuRbT7Nmzeyv/fz85O/vr+PHj+dZ3tfXV/Xr17fP16hRw14+LS1Nqampat26tX29u7u7wsPDZbPZ8o0jPj5eDz/8sH3+4YcfVocOHfTWW2+pUqVKSkpKUsuWLVW1atVc35+UlKTY2Nh8t1EQV+9Xq9WqKVOm6IMPPtCvv/6qrKwsZWZm2sc62L17tzIzM9WlS5dc6/Px8bHfivDggw9q69at+umnnxxukUAZVZC2/tlnpa5daeudwWqVhg93zu/HmdsujJKIs6B1dupUNvaRM5XXY9iVvj8lnO85LdHPysrSli1bNG7cOPsyNzc3de3aVRs3bszzfRkZGQoNDZXNZtPtt9+uKVOm6NZbb7Wvb9u2rT799FM99thjqlmzptavX6+9e/fq9ddflyRt2bJFFy9eVNeuXe3vady4serUqaONGzfmmehnZmYqMzPTPp+enl7kz65vvsl5P++VDEM6etQs9+cJCwA3iK+vlJFRsLIbNkjdu1+73OefS+3bF2zbxcTPz89h/rnnntPq1av16quvqkGDBqpQoYIeeOABZWVl5VuPp6enw7zFYsk3Kc+tvJFbI1cIu3bt0v/+9z9t2rRJY8aMsS+3Wq1atGiRYmNjVaFChXzruNb63OLMbbC9q/frtGnT9MYbb2jGjBlq2rSp/Pz8NGLECPt+vdZ2Jenxxx9XixYtdOzYMSUkJKhz584KDQ295vtQyhWkrT92TLqBT61AITjz91NWjo2SiDO7zjxO2pbotl1NeT2Gy8qxcQPyPacNxnfy5ElZrVYFBQU5LA8KCsrzsUmNGjVSfHy8li1bpvnz58tms6lt27Y6dkVD+tZbb6lJkyaqXbu
2vLy81K1bN82aNUvt//wnOyUlRV5eXqpcuXKBtytJcXFxCggIsE8hISFF/OSSrhiYqljKASg+Fovk51ew6a67pNq1zffkVVdIiFmuIPWV4Oj13333nR599FHFxMSoadOmCg4O1qFDh0pse7kJCAhQUFCQNm/ebF9mtVq1devWfN83d+5ctW/fXj/++KOSkpLs06hRozR37lxJZs+DpKQknTp1Ktc6mjVrlu/gdjfddJPDoIG//PKLzp07d83P9N1336lnz556+OGH1bx5c918883au3evfX3Dhg1VoUKFfLfdtGlTtWrVSnPmzNGCBQv02GOPXXO7KANowwEA11KCbYVTu+4XVlRUlKKiouzzbdu2VVhYmN555x1NnjxZkpno/+9//9Onn36q0NBQbdiwQc8884xq1qzpcBW/sMaNG6dRo0bZ59PT04ue7NeoUbBy1zqbCcC53N2lN94wR9e3WBy7Z2Un7TNmlIpuhQ0bNtTHH3+s++67TxaLRePHj79md/mSMGzYMMXFxalBgwZq3Lix3nrrLZ0+fTrPR/RdvHhR7733niZNmqTbbrvNYd3jjz+u6dOna+fOnerXr5+mTJmi+++/X3FxcapRo4a2bdummjVrKioqSi+99JK6dOmi+vXrq2/fvrp06ZI+//xzew+Bzp07a+bMmYqKipLVatWYMWNy9E7ITcOGDfXRRx8pMTFRVapU0fTp05WamqomTZpIMrvmjxkzRs8//7y8vLz0l7/8RSdOnNDOnTs1ePBgh88ydOhQ+fn5OTwNAGVYQdv6gvb4QfEq7h5ZZWXbhVEScRa0zqlTpTxu4y3ytl1NeT2GXe37U9C2oiiKNAJAMcjMzDTc3d1zDH40YMAAo0ePHgWu54EHHjD69u1rGIZhnDt3zvD09DQ+++wzhzKDBw82oqOjDcMwjDVr1jgMHpWtTp06xvTp0wu83esaoOfSJXO0xdwGZ7hyCg01jI8+KtJI3AAKJr+BzQostxFVQ0JuyGAweQ3Gd/XfuIMHDxqdOnUyKlSoYISEhBgzZ840OnToYDz77LP2MrkNxpffAHW5DcZ3ZSyGYRhLly41rmxqLl68aAwdOtTw9/c3qlSpYowZM8b461//av87frWPPvrIcHNzy/PJKGFhYcbIkSMNwzCMQ4cOGb179zb8/f0NX19fo1WrVg6j3y9ZssRo0aKF4eXlZVSvXt3o1auXfd2vv/5q3HXXXYafn5/RsGFD4/PPP891ML5t27Y5bP/33383evbsaVSsWNEIDAw0XnzxRWPAgAEOgwtarVbj73//uxEaGmp4enoaderUcRjx3zDMpwb4+voaTz/9dK6f80oMxnfjlGhbb7GYfycuXSr+wHFtzvz9lJVjoyTiLGidmZllYx85U3k9hsvz9+dPZWLU/datWxtDhw61z1utVqNWrVpGXFxcgd5/6dIlo1GjRvZ/8rI/9Oeff+5QbsiQIcadd95pGIZhnDlzxvD09DQ++ugj+/qff/7ZkGRs3LixwLEX20i8V//ys+erVr28rGNHw0hKKtp2AOSrWBJ9wzD/UK9bZxgLFpg/nd3AlBFWq9W45ZZbHEb3L48OHjxouLm5GVu2bLlmWRL9G6dE2/rSNDJ0eeXM309ZOTZKIs6C1llW9pEzlddjuKwcGyUUZ5lI9BctWmR4e3sb8+bNM3bt2mUMGTLEqFy5sv3KzSOPPGKMHTvWXv7ll182vvzyS2P//v3Gli1bjL59+xo+Pj7Gzp077WU6dOhg3Hrrrca6deuMAwcOGAkJCYaPj4/x9ttv28s8+eSTRp06dYy1a9caP/zwgxEVFWVERUUVKvYSe7Zu9lXAjAzDmDDBMHx8zOVubobx5JOGcfx40bcHIIdiS/RRIIcOHTL+/e9/G3v27DG2b99uDBkyxPD09DR27drl7NCcIisry0hOTjb69+9f4Me8kujfOCXe1sP5nPn7KSvHRknEWdA6y8o+cqbyegyXlWOjBOIsaNtkMQz
DKLkbA65t5syZmjZtmlJSUtSiRQu9+eabioyMlCR17NhRdevW1bx58yRJI0eO1Mcff6yUlBRVqVJF4eHh+vvf/66WLVva60tJSdG4ceO0atUqnTp1SqGhoRoyZIhGjhxpvwf0woUL+r//+z8tXLhQmZmZio6O1ttvv53vY/uulp6eroCAAKWlpcnf37/oO8BqNUdbTE4279G44w7H+3kPH5aef1764ANzPiBAmjhReuYZqQD3jwLI34ULF3Tw4EHVq1dPPj4+zg7H5R09elR9+/bVTz/9JMMwdNttt2nq1Kn2AVPLm/Xr16tTp0665ZZb9NFHH6lp06bXfE9+x2yxtU2QdAPbejiXM38/ZeXYKIk4C1pnWdlHzlRej+GycmwUc5wFbZucnuiXVTf8n6kNG8xnQiYlmfONG0uvvy5161by2wZcGIk+yhoS/RuH/QkAKG0K2jY57fF6KKT27aUffpD+/W/pppukn3+W7r5buvde6YpHOQEAAAAAyjcS/bLE3V2KjZV++UX6v/+TPDykFSukW2+VnntOSktzdoQAAAAAACcj0S+LAgKkV1+VfvpJuuce6dIl6bXXpIYNpTlzzPtAAAAAAADlEol+WdaokfTZZ9Lnn5uvT5yQhgyRWrUy7+kHAAAAAJQ7JPqu4O67pR07zMH5AgLMAfs6dJD69DFH7QcAAAAAlBsk+q7C01MaMcK8f//JJyU3N/ORfI0bSy+9JJ096+wIAQAAAAA3AIm+q7npJulf/5K2bpU6dpQuXJAmTTK79i9YIPE0RQAAAABwaST6rqp5c2ntWumjj6S6daVff5X695fatTMf0wegWBw9ap5Xy2s6dqz4t2mxWPKdJk6ceF11f/LJJwUu/8QTT8jd3V0ffvhhkbcJAACA4uXh7ABQgiwWqXdvqXt3afp0KS5OSkyUIiKkQYOkKVOk4GBnRwmUWZmZ5tcpNTXvMsHB0qFDkrd38W03OTnZ/nrx4sWaMGGC9uzZY19WsWLF4ttYPs6dO6dFixbp+eefV3x8vP7617/ekO3mJSsrS15eXk6NAQAAoDTgin55UKGC9MIL0p490iOPmMsSEszH8b3yipmtACg0Ly+pTh1zSIzcuLlJISFmueIUHBxsnwICAmSxWByWLVq0SGFhYfLx8VHjxo319ttv29+blZWloUOHqkaNGvLx8VFoaKji4uIkSXXr1pUkxcTEyGKx2Ofz8uGHH6pJkyYaO3asNmzYoKNHjzqsz8zM1JgxYxQSEiJvb281aNBAc+fOta/fuXOn7r33Xvn7+6tSpUq64447tH//fklSx44dNWLECIf67r//fj366KP2+bp162ry5MkaMGCA/P39NWTIEEnSmDFjdMstt8jX11c333yzxo8fr4sXLzrUtXz5ckVERMjHx0fVq1dXTEyMJGnSpEm67bbbcnzWFi1aaPz48fnuDwAAgNKCRL88qVVL+u9/pY0bpdatpYwMacwY6dZbpWXLuH8fuMLZs3lPFy6YZSwWafJkyWbLvQ6bTXrxRbPcteotLu+//74mTJigf/zjH9q9e7emTJmi8ePH691335Ukvfnmm/r000/1wQcfaM+ePXr//fftCf3mzZslSQkJCUpOTrbP52Xu3Ll6+OGHFRAQoLvvvlvz5s1zWD9gwAAtXLhQb775pnbv3q133nnH3tvg119/Vfv27eXt7a21a9dqy5Yteuyxx3Tp0qVCfd5XX31VzZs317Zt2+yJeKVKlTRv3jzt2rVLb7zxhubMmaPXX3/d/p4VK1YoJiZG3bt317Zt27RmzRq1bt1akvTYY49p9+7dDp9927Zt2r59uwYNGlSo2MqTWbNmqW7duvLx8VFkZKQ2bdqUZ9l58+bluN3Ex8cnR7ndu3erR48eCggIkJ+fnyIiInTkyBGHMhs3blTnzp3l5+cnf39/tW/fXufPn7evr1u3bo5tTZ06tfg+OAAApZWBIklLSzMkGWlpac4OpWisVsN4913DqFHDMMw
U3zC6djWMHTucHRlwQ50/f97YtWuXcf78eYfl2V+L3Kbu3S+Xs9kMw2LJu2z79o7bq14993JFlZCQYAQEBNjn69evbyxYsMChzOTJk42oqCjDMAxj2LBhRufOnQ2bzZZrfZKMpUuXXnO7e/fuNTw9PY0TJ04YhmEYS5cuNerVq2evd8+ePYYkY/Xq1bm+f9y4cUa9evWMrKysXNd36NDBePbZZx2W9ezZ0xg4cKB9PjQ01Lj//vuvGeu0adOM8PBw+3xUVJTRv3//PMvffffdxlNPPWWfHzZsmNGxY8drbudGyeuYNQzntE2LFi0yvLy8jPj4eGPnzp1GbGysUblyZSM1NTXX8gkJCYa/v7+RnJxsn1JSUhzK7Nu3z6hataoxevRoY+vWrca+ffuMZcuWOdSZmJho+Pv7G3FxccZPP/1k/Pzzz8bixYuNCxcu2MuEhoYakyZNcthWRkZGgT9bmW/rAQAup6BtE1f0yys3N2nAAGnvXulvfzNvIP7qK6lFC2nYMOnUKWdHCJQJFkv+XfOvvJpf0s6ePav9+/dr8ODBqlixon36+9//bu8S/+ijjyopKUmNGjXS8OHDtWrVqiJtKz4+XtHR0apevbokqXv37kpLS9PatWslSUlJSXJ3d1eHDh1yfX9SUpLuuOMOeXp6Fmn72Vq1apVj2eLFi/WXv/xFwcHBqlixol588UWHK8FJSUnq0qVLnnXGxsZq4cKFunDhgrKysrRgwQI99thj1xWnK5s+fbpiY2M1aNAgNWnSRLNnz5avr6/i4+PzfM/Vt5sEBQU5rH/hhRfUvXt3vfLKK2rZsqXq16+vHj16KDAw0F5m5MiRGj58uMaOHatbb71VjRo10oMPPijvqwbEqFSpksO2/Pz8incHAABQCpHol3cVK0r/+Ie0a5fUq5dktUozZ5r378+aJRWyGy3gKjIy8p6WLHEse/KkdPvtkru7Oe/ubs7/8Yf0xReOZQ8dyr3O4onZrGjOnDlKSkqyTz/99JP+97//SZJuv/12HTx4UJMnT9b58+f14IMP6oEHHijUdqxWq959912tWLFCHh4e8vDwkK+vr06dOmVP7ipUqJBvHdda7+bmJuOq24muvs9eUo6kbePGjerfv7+6d++uzz77TNu2bdMLL7ygrKysAm/7vvvuk7e3t5YuXarly5fr4sWLhd5H5UVWVpa2bNmirl272pe5ubmpa9eu2rhxY57vy8jIUGhoqEJCQtSzZ0/t3LnTvs5ms2nFihW65ZZbFB0drcDAQEVGRjo8DeL48eP6/vvvFRgYqLZt2yooKEgdOnTQt99+m2NbU6dOVbVq1dSyZUtNmzYt39tDMjMzlZ6e7jABAFAWkejDdPPNZvayZo3UtKl5RX/oUPMK/1dfOTs64Ibz88t7uvp24ooVzYdYWK3mvNVqzlesaI6FWZB6i0NQUJBq1qypAwcOqEGDBg5TvXr17OX8/f3Vp08fzZkzR4sXL9aSJUt06s9ePJ6enrJmf5A8fP755/rjjz+0bds2hxMKCxcu1Mcff6wzZ86oadOmstls+vrrr3Oto1mzZvrmm29yTd4l6aabbnJ4uoDVatVPP/10zX2QmJio0NBQvfDCC2rVqpUaNmyow4cP59j2mjVr8qzDw8NDAwcOVEJCghISEtS3b99rnhwor06ePCmr1ZrjinxQUJBSUlJyfU+jRo0UHx+vZcuWaf78+bLZbGrbtq2O/fksyuPHjysjI0NTp05Vt27dtGrVKsXExKhXr1724+nAgQOSpIkTJyo2NlYrV67U7bffri5duuiXX36xb2v48OFatGiR1q1bpyeeeEJTpkzR888/n+fniYuLU0BAgH0KCQm5rv0DAIDT3Jg7CVyPS9+3d/GiYbz9tmFUq3b5BuKePQ3jl1+cHRlQ7PK737kwbDbDiIgwvy4REeb8jXD1Pfpz5swxKlSoYLzxxhvGnj17jO3btxvx8fHGa6+9ZhiGYbz22mvGggULjN27dxt
79uwxBg8ebAQHBxtWq9UwDMNo2LCh8dRTTxnJycnGqVOnct1mz549jT59+uRYbrVajeDgYGPmzJmGYRjGo48+aoSEhBhLly41Dhw4YKxbt85YvHixYRiGcfLkSaNatWpGr169jM2bNxt79+41/vvf/xo///yzYRiGMXv2bMPX19f47LPPjN27dxuxsbGGv79/jnv0X3/9dYcYli1bZnh4eBgLFy409u3bZ7zxxhtG1apVHfbRunXrDDc3N2PChAnGrl27jO3btxtTp051qGfv3r2Gu7u74e7ubvzvf/+79i/iBipN9+j/+uuvhiQjMTHRYfno0aON1q1bF6iOrKwso379+saLL77oUGe/fv0cyt13331G3759DcMwjO+++86QZIwbN86hTNOmTY2xY8fmua25c+caHh4eDvfxX+nChQtGWlqafTp69KjrtvUAgDKJe/RRdB4e0lNPSb/8Ij37rNkPedkyc3T+sWPN/sgAHFgs5lX8sDDz5428N/9Kjz/+uP7zn/8oISFBTZs2VYcOHTRv3jz7Ff1KlSrplVdeUatWrRQREaFDhw7p888/l9ufzwh87bXXtHr1aoWEhKhly5Y56k9NTdWKFSvUu3fvHOvc3NwUExNjf4Tev/71Lz3wwAN6+umn1bhxY8XGxursn48YqFatmtauXauMjAx16NBB4eHhmjNnjv2e/ccee0wDBw7UgAED1KFDB918883q1KnTNT9/jx49NHLkSA0dOlQtWrRQYmJijsfidezYUR9++KE+/fRTtWjRQp07d84xSnzDhg3Vtm1bNW7cWJGRkdfcbnlVvXp1ubu7KzU11WF5amqqgoODC1SHp6enWrZsqX379tnr9PDwUJMmTRzKhYWF2cdaqFGjhiTlWyY3kZGRunTpkg4dOpTrem9vb/n7+ztMAACURRbD4JlqRZGenq6AgAClpaW5/j8Cu3ZJI0dK2YN2BQdLcXHmYH55PUAcKCMuXLiggwcPql69erk+4gvlk2EYatiwoZ5++mmNGjXK2eE4yO+YdUbbFBkZqdatW+utt96SZN5jX6dOHQ0dOlRjx4695vutVqtuvfVWde/eXdOnT5cktW3bVvXr19d7771nLxcTE6MKFSpowYIFMgxDtWvX1mOPPabJkyfby7Rs2VJ33323pkyZkuu23n//fQ0YMEAnT55UlSpVrhlbuWrrAQBlQkHbJo8bGBPKqiZNpJUrpRUrzIR/3z5p0CBzsL433pDatnV2hABQbE6cOKFFixYpJSVFgwYNcnY4pd6oUaM0cOBAtWrVSq1bt9aMGTN09uxZ+74bMGCAatWqpbi4OEnSpEmT1KZNGzVo0EBnzpzRtGnTdPjwYT3++OP2OkePHq0+ffqoffv26tSpk1auXKnly5dr/fr1ksxR+0ePHq2XXnpJzZs3V4sWLfTuu+/q559/1kcffSTJHJjx+++/V6dOnVSpUiVt3LhRI0eO1MMPP1ygJB8AgLKMRB8FY7FI994r3XWX9Oab0uTJ0g8/SH/5i/TQQ9I//ynVru3sKAHgugUGBqp69er697//TUJYAH369NGJEyc0YcIEpaSkqEWLFlq5cqV9gL4jR47Ybw2RpNOnTys2NlYpKSmqUqWKwsPDlZiY6NANPyYmRrNnz1ZcXJyGDx+uRo0aacmSJWrXrp29zIgRI3ThwgWNHDlSp06dUvPmzbV69WrVr19fktkNf9GiRZo4caIyMzNVr149jRw5stT10AAAoCTQdb+Iyn13vtRU6cUXpblzzeH6fH3N+/efey7nMONAKUbXfZQ1pa3rvitjfwIASpuCtk3cYI2iCQqS5swxr+q3ayedOydNmCA1bix9+KGZ/AMAAAAAbjgSfVyf22+XNmyQFi2SQkKkI0ekBx+UOnSQtm1zdnRAgdG5CWUFxyoAALgWEn1cP4tF6tNH+vlnaeJEs+v+N99I4eHSkCHS8ePOjhDIk7u7uyQpKyvLyZEABXPu3DlJsj+KEAAA4Go
Mxofi4+srvfSS9Nhj0pgx0sKFZvf+xYvN5UOHSl5ezo4ScODh4SFfX1+dOHFCnp6eDoOGAaWJYRg6d+6cjh8/rsqVK9tPUgEAAFyNwfiKiAF6CuDbb6Vnn5W2bjXnb7lFmj5d6t7d7AUAlBJZWVk6ePCgbDabs0MBrqly5coKDg6WJZe/o7RNxYv9CQAobQraNnFFHyWnXTtp82Zp3jzpb3+T9u41H9HXrZuZ8IeFOTtCQJLk5eWlhg0b0n0fpZ6npydX8gEAwDWR6KNkubmZXfkfeED6xz+k11+XVq6UvvrK7Mo/YYLEc6pRCri5ufF4PQAAALgEbkbFjeHvL/3zn9LOnVKPHtKlS9KMGWZ3/nfekaxWZ0cIAAAAAC6BRB83VsOG0rJl0pdfSk2aSCdPSk8+aT6mb906Z0cHAAAAAGUeiT6c4667pKQk6c03za7727dLnTubXfwPHnR2dAAAAABQZpHow3k8PaVhw6RffpGeeca8n3/JEnOQvhdflDIynB0hAAAAAJQ5JPpwvmrVpJkzzSv8nTtLmZnmwH2NGknz50s88gwAAAAACoxEH6VH06bmaPxLl0o33yz99pv0yCNS27bS9987OzoAAAAAKBNI9FG6WCzS/fdLu3ZJU6dKFSuaSX6bNtLAgWbyDwAAAADIE4k+Sidvb2nMGGnvXunRR81l//2v+Ti+uDjpwgWnhgcAAAAApRWJPkq3GjWkhARp0ybzqv7Zs9Lf/mY+mm/pUskwnB0hAAAAAJQqJPooGyIipMREc3C+WrXMR/D16iV16WI+mg8AAAAAIIlEH2WJxSL17y/t2SONHy/5+Ejr1kktW0pPPy2dPOnsCAEAAADA6Uj0Ufb4+UmTJkm7d0t//av5+L1//Utq2FB6803p4kVnRwgAAAAATkOij7Krbl3pgw+k9eul5s2lM2ekZ581X3/5pZODAwAAAADnINFH2dehg7Rli/TOO1L16uaV/m7dpPvuM0ftBwAAAIByhEQfrsHdXRoyRPrlF2nUKMnDQ/rsM+m226TRo6W0NGdHCAAAAAA3BIk+XEvlytJrr0k//SR1727er//qq9Itt0hz50pWq7MjBAAAAIASRaIP19SokbRihTk1aiQdPy49/rjUurX07bfOjg4AAAAASgyJPlxb9+7Sjh3S669LAQHS1q3SHXdIfftKR444OzoAAAAAKHYk+nB9np7SiBHm/ftPPCFZLNLixeaV/okTpXPnnB0hAAAAABQbEn2UHzfdJM2ebV7V79BBunBBevllM+FftEgyDGdHCAAAAADXzcPZAQA3XIsW0rp10pIl0nPPSYcPS/36STNnSm+8IYWHXy5rtUrffCMlJ0s1apjd/t3db0ycztx2YZREnAWts6zsI2dx9v7h+5O/shAjAAAomwwUSVpamiHJSEtLc3YouB7nzhnG3/9uGL6+hiEZhsViGI89ZhjJyYaxZIlh1K5tLs+eatc2l5c0Z267MEoizoLWWVb2kbM4e//w/clfCcVI21S82J8AgNKmoG2TxTDor1wU6enpCggIUFpamvz9/Z0dDq7Xr79KY8dK8+eb8z4+Ztf+q1ks5s+PPpJ69SqZWD7+WHrggZy3EtyIbRdGScRZ0DrLyj5yFmfvH2du39mfvSBKMEbapuLF/gQAlDYFbZtI9IuIxt9FbdwoDR8u/fBD3mUsFql2bengweLvZmu1SnXrSseO3fhtF0ZJxFnQOvftk+rXL/37yFmcfQw5c/vO/uwFUcIx0jYVL/YnAKC0IdEvYTT+LmztWqlLl2uX8/eXvLyKd9tZWVJ6unO2XRglEWdB6/T1LdiTEpy9j5zF2ceQM7fv7M9eEAWNcd06qWPHQldP21S82J8AgNKmoG0Tg/EBV0tNLVi5gvyzXlKcue3CKIk4C/o4xLKyj5zF2fuH70/+kpOdHQEAACjDSPSBq9WoUbByCQlS69bFu+1Nm6RBg5yz7cIoiTgLWueYMdI//1m823Ylzj6GnLl
9Z3/2gihojAX9OwQAAJALuu4XEd35XFj2PbS//ppzsCzpxtxj7IxtF0ZJxFnQOrPv0S/t+8hZnH0M8f3JXwnHSNtUvNifAIDSpqBtk9sNjAkoG9zdpTfeMF9nj4KdLXt+xoySSRScue3CKIk4C1qnl1fZ2EfO4uxjiO9P/spCjAAAoMwj0Qdy06uX+YirWrUcl9euXfKP53LmtgujJOIsaJ1lZR85i7P3D9+f/JWFGAEAQJlG1/0iojtfOWG1St98Yw6MVaOGdMcdN+5KmzO3XRglEWdB6ywr+8hZnL1/+P7krwRipG0qXuxPAEBpw+P1ShiNPwCgtKFtKl7sTwBAacM9+gAAAAAAlEMk+gAAAAAAuBASfQAAAAAAXAiJPgAAAAAALoREHwAAAAAAF0KiDwAAAACACyHRBwAAAADAhZDoAwAAAADgQkj0AQAAAABwIST6AAAAAAC4EBJ9AAAAAABcCIk+AAAAAAAuhEQfAAAAAAAXQqIPAACuy6xZs1S3bl35+PgoMjJSmzZtyrPsvHnzZLFYHCYfH58c5Xbv3q0ePXooICBAfn5+ioiI0JEjRxzKbNy4UZ07d5afn5/8/f3Vvn17nT9/3r7+1KlT6t+/v/z9/VW5cmUNHjxYGRkZxffBAQAopUj0AQBAkS1evFijRo3SSy+9pK1bt6p58+aKjo7W8ePH83yPv7+/kpOT7dPhw4cd1u/fv1/t2rVT48aNtX79em3fvl3jx493OCGwceNGdevWTXfddZc2bdqkzZs3a+jQoXJzu/yvTf/+/bVz506tXr1an332mTZs2KAhQ4YU/04AAKCUsRiGYTg7iLIoPT1dAQEBSktLk7+/v7PDAQDAKW1TZGSkIiIiNHPmTEmSzWZTSEiIhg0bprFjx+YoP2/ePI0YMUJnzpzJs86+ffvK09NT7733Xp5l2rRpozvvvFOTJ0/Odf3u3bvVpEkTbd68Wa1atZIkrVy5Ut27d9exY8dUs2bNa3422noAQGlT0LaJK/oAAKBIsrKytGXLFnXt2tW+zM3NTV27dtXGjRvzfF9GRoZCQ0MVEhKinj17aufOnfZ1NptNK1as0C233KLo6GgFBgYqMjJSn3zyib3M8ePH9f333yswMFBt27ZVUFCQOnTooG+//dZeZuPGjapcubI9yZekrl27ys3NTd9//32ucWVmZio9Pd1hAgCgLCLRBwAARXLy5ElZrVYFBQU5LA8KClJKSkqu72nUqJHi4+O1bNkyzZ8/XzabTW3bttWxY8ckmUl8RkaGpk6dqm7dumnVqlWKiYlRr1699PXXX0uSDhw4IEmaOHGiYmNjtXLlSt1+++3q0qWLfvnlF0lSSkqKAgMDHbbt4eGhqlWr5hlbXFycAgIC7FNISEjRdw4AAE5Eog8AAG6YqKgoDRgwQC1atFCHDh308ccf66abbtI777wjybyiL0k9e/bUyJEj1aJFC40dO1b33nuvZs+e7VDmiSee0KBBg9SyZUu9/vrr9pMIRTVu3DilpaXZp6NHj17npwUAwDlKRaJf3KP1Xr0+e5o2bZq9TN26dXOsnzp1aol9RgAAXE316tXl7u6u1NRUh+WpqakKDg4uUB2enp5q2bKl9u3bZ6/Tw8NDTZo0cSgXFhZmH3W/Ro0akpRvmeDg4BwDAl66dEmnTp3KMzZvb2/5+/s7TAAAlEVOT/RLYrTeK9clJycrPj5eFotFvXv3dig3adIkh3LDhg0rkc8IAIAr8vLyUnh4uNasWWNfZrPZtGbNGkVFRRWoDqvVqh07dtiTdy8vL0VERGjPnj0O5fbu3avQ0FBJ5sn6mjVr5lsmKipKZ86c0ZYtW+zr165dK5vNpsjIyMJ/WAAAyhAPZwcwffp0xcbGatCgQZKk2bNna8WKFYqPj891tF7JvGKf35WCq9ctW7ZMnTp10s033+ywvFKlSgW+4gAAAHIaNWqUBg4cqFatWql169aaMWOGzp49a2/XBwwYoFq1aikuLk6
SeZK9TZs2atCggc6cOaNp06bp8OHDevzxx+11jh49Wn369FH79u3VqVMnrVy5UsuXL9f69eslmf8HjB49Wi+99JKaN2+uFi1a6N1339XPP/+sjz76SJJ5db9bt26KjY3V7NmzdfHiRQ0dOlR9+/Yt0Ij7AACUZU5N9LNH6x03bpx9WWFG67XZbLr99ts1ZcoU3XrrrbmWTU1N1YoVK/Tuu+/mWDd16lRNnjxZderU0UMPPaSRI0fKwyP3XZKZmanMzEz7PCPxAgAg9enTRydOnNCECROUkpKiFi1aaOXKlfYB+o4cOeLwbPvTp08rNjZWKSkpqlKlisLDw5WYmOjQDT8mJkazZ89WXFychg8frkaNGmnJkiVq166dvcyIESN04cIFjRw5UqdOnVLz5s21evVq1a9f317m/fff19ChQ9WlSxe5ubmpd+/eevPNN2/AXgEAwLkshmEYztr4b7/9plq1aikxMdGhi9/zzz+vr7/+OtfH32zcuFG//PKLmjVrprS0NL366qvasGGDdu7cqdq1a+co/8orr2jq1Kn67bffHO7lnz59um6//XZVrVpViYmJGjdunAYNGqTp06fnGuvEiRP18ssv51jOs3UBAKUFz30vXuxPAEBpU9C2yeld9wsrKirK4aRA27ZtFRYWpnfeeUeTJ0/OUT4+Pl79+/fPMWDfqFGj7K+bNWsmLy8vPfHEE4qLi5O3t3eOesaNG+fwnvT0dB67AwAAAAAodZya6JfEaL1X+uabb7Rnzx4tXrz4mvVERkbq0qVLOnTokBo1apRjvbe3d64nAAAAAAAAKE2cOup+SYzWe6W5c+cqPDxczZs3v2Y9SUlJcnNzU2BgYME/AAAAAAAApYzTu+6XxGi9ktm1/sMPP9Rrr72WY5sbN27U999/r06dOqlSpUrauHGjRo4cqYcfflhVqlQp+Q8NAAAAAEAJcXqiXxKj9UrSokWLZBiG+vXrl2Ob3t7eWrRokSZOnKjMzEzVq1dPI0eOdLgHHwAAAACAssipo+6XZYzECwAobWibihf7EwBQ2hS0bXLqPfoAAAAAAKB4kegDAAAAAOBCSPQBAAAAAHAhJPoAAAAAALgQEn0AAAAAAFwIiT4AAAAAAC6ERB8AAAAAABdCog8AAAAAgAsh0QcAAAAAwIWQ6AMAAAAA4EJI9AEAAAAAcCEk+gAAAAAAuBASfQAAAAAAXAiJPgAAAAAALoREHwAAAAAAF0KiDwAAAACACyHRBwAAAADAhZDoAwAAAADgQkj0AQAAAABwIST6AAAAAAC4EBJ9AAAAAABcCIk+AAAAAAAuhEQfAAAAAAAXQqIPAAAAAIALIdEHAAAAAMCFkOgDAAAAAOBCSPQBAAAAAHAhJPoAAAAAALgQEn0AAAAAAFwIiT4AAAAAAC6ERB8AAAAAABdCog8AAAAAgAsh0QcAAAAAwIWQ6AMAAAAA4EJI9AEAAAAAcCEk+gAAAAAAuBASfQAAAAAAXAiJPgAAAAAALoREHwAAAAAAF0KiDwAAAACACyHRBwAAAADAhZDoAwAAAADgQkj0AQAAAABwIST6AAAAAAC4EBJ9AAAAAABcCIk+AAAAAAAuhEQfAABcl1mzZqlu3bry8fFRZGSkNm3alGfZefPmyWKxOEw+Pj45yu3evVs9evRQQECA/Pz8FBERoSNHjtjXd+zYMUc9Tz75pEMdV6+3WCxatGhR8X1wAABKKQ9nBwAAAMquxYsXa9SoUZo9e7YiIyM1Y8YMRUdHa8+ePQoMDMz1Pf7+/tqzZ4993mKxOKzfv3+/2rVrp8GDB+vll1+Wv7+/du7cmeOEQGxsrCZNmmSf9/X1zbGthIQEdevWzT5fuXLlonxMAADKFBJ9AABQZNOnT1dsbKwGDRokSZo9e7ZWrFih+Ph4jR07Ntf3WCwWBQcH51nnCy+8oO7du+uVV16xL6tfv36Ocr6+vvnWI5mJ/bXKAADgaui6DwAAiiQrK0tbtmxR165d7cv
c3NzUtWtXbdy4Mc/3ZWRkKDQ0VCEhIerZs6d27txpX2ez2bRixQrdcsstio6OVmBgoCIjI/XJJ5/kqOf9999X9erVddttt2ncuHE6d+5cjjLPPPOMqlevrtatWys+Pl6GYeQZV2ZmptLT0x0mAADKIhJ9AABQJCdPnpTValVQUJDD8qCgIKWkpOT6nkaNGik+Pl7Lli3T/PnzZbPZ1LZtWx07dkySdPz4cWVkZGjq1Knq1q2bVq1apZiYGPXq1Utff/21vZ6HHnpI8+fP17p16zRu3Di99957evjhhx22NWnSJH3wwQdavXq1evfuraefflpvvfVWnp8nLi5OAQEB9ikkJKSouwYAAKeyGPmd2kae0tPTFRAQoLS0NPn7+zs7HAAAbnjb9Ntvv6lWrVpKTExUVFSUffnzzz+vr7/+Wt9///0167h48aLCwsLUr18/TZ482V5nv379tGDBAnu5Hj16yM/PTwsXLsy1nrVr16pLly7at29frt38JWnChAlKSEjQ0aNHc12fmZmpzMxM+3x6erpCQkJo6wEApUZB23qu6AMAgCKpXr263N3dlZqa6rA8NTW1wPfFe3p6qmXLltq3b5+9Tg8PDzVp0sShXFhYmMOo+1eLjIyUJHs9eZU5duyYQzJ/JW9vb/n7+ztMAACURST6AACgSLy8vBQeHq41a9bYl9lsNq1Zs8bhCn9+rFarduzYoRo1atjrjIiIcBiVX5L27t2r0NDQPOtJSkqSJHs9eZWpUqWKvL29CxQbAABlFaPuAwCAIhs1apQGDhyoVq1aqXXr1poxY4bOnj1rH4V/wIABqlWrluLi4iSZ9823adNGDRo00JkzZzRt2jQdPnxYjz/+uL3O0aNHq0+fPmrfvr06deqklStXavny5Vq/fr0k8/F7CxYsUPfu3VWtWjVt375dI0eOVPv27dWsWTNJ0vLly5Wamqo2bdrIx8dHq1ev1pQpU/Tcc8/d2B0EAIATkOgDAIAi69Onj06cOKEJEyYoJSVFLVq00MqVK+0D9B05ckRubpc7EJ4+fVqxsbFKSUlRlSpVFB4ersTERIeu+jExMZo9e7bi4uI0fPhwNWrUSEuWLFG7du0kmVf9v/rqK/tJhZCQEPXu3VsvvviivQ5PT0/NmjVLI0eOlGEYatCggf1RgAAAuDoG4ysiBuMDAJQ2tE3Fi/0JAChtGIwPAAAAAIByiEQfAAAAAAAXQqIPAAAAAIALIdEHAAAAAMCFkOgDAAAAAOBCSPQBAAAAAHAhJPoAAAAAALgQEn0AAAAAAFwIiT4AAAAAAC6ERB8AAAAAABdCog8AAAAAgAsh0QcAAAAAwIWQ6AMAAAAA4EJI9AEAAAAAcCEk+gAAAAAAuBASfQAAAAAAXAiJPgAAAAAALoREHwAAAAAAF0KiDwAAAACACyHRBwAAAADAhZDoAwAAAADgQkj0AQAAAABwIaUi0Z81a5bq1q0rHx8fRUZGatOmTXmWnTdvniwWi8Pk4+PjUObq9dnTtGnT7GVOnTql/v37y9/fX5UrV9bgwYOVkZFRYp8RAAAAAIAbwemJ/uLFizVq1Ci99NJL2rp1q5o3b67o6GgdP348z/f4+/srOTnZPh0+fNhh/ZXrkpOTFR8fL4vFot69e9vL9O/fXzt37tTq1av12WefacOGDRoyZEiJfU4AAAAAAG4ED2cHMH36dMXGxmrQoEGSpNmzZ2vFihWKj4/X2LFjc32PxWJRcHBwnnVevW7ZsmXq1KmTbr75ZknS7t27tXLlSm3evFmtWrWSJL311lvq3r27Xn31VdWsWbM4PhoAAAAAADecU6/oZ2VlacuWLeratat9mZubm7p27aqNGzfm+b6MjAyFhoYqJCREPXv21M6dO/Msm5qaqhUrVmjw4MH2ZRs3blTlypXtSb4kde3aVW5ubvr+++9zrSczM1Pp6ekOEwAAAAAApY1TE/2TJ0/KarUqKCjIYXlQUJBSUlJyfU+jRo0UHx+vZcu
Waf78+bLZbGrbtq2OHTuWa/l3331XlSpVUq9evezLUlJSFBgY6FDOw8NDVatWzXO7cXFxCggIsE8hISGF+agAAAAAANwQTr9Hv7CioqI0YMAAtWjRQh06dNDHH3+sm266Se+8806u5ePj49W/f/8cA/YV1rhx45SWlmafjh49el31AQAAAABQEgqd6NetW1eTJk3SkSNHrnvj1atXl7u7u1JTUx2Wp6am5nsP/pU8PT3VsmVL7du3L8e6b775Rnv27NHjjz/usDw4ODjHYH+XLl3SqVOn8tyut7e3/P39HSYAAMqi4mzLAQBA6VPoRH/EiBH6+OOPdfPNN+vOO+/UokWLlJmZWaSNe3l5KTw8XGvWrLEvs9lsWrNmjaKiogpUh9Vq1Y4dO1SjRo0c6+bOnavw8HA1b97cYXlUVJTOnDmjLVu22JetXbtWNptNkZGRRfosAACUFcXZlgMAgNKnSIl+UlKSNm3apLCwMA0bNkw1atTQ0KFDtXXr1kIHMGrUKM2ZM0fvvvuudu/eraeeekpnz561j8I/YMAAjRs3zl5+0qRJWrVqlQ4cOKCtW7fq4Ycf1uHDh3NctU9PT9eHH36YY7kkhYWFqVu3boqNjdWmTZv03XffaejQoerbty8j7gMAXF5xt+UAAKB0KfI9+rfffrvefPNN/fbbb3rppZf0n//8RxEREWrRooXi4+NlGEaB6unTp49effVVTZgwQS1atFBSUpJWrlxpH6DvyJEjSk5Otpc/ffq0YmNjFRYWpu7duys9PV2JiYlq0qSJQ72LFi2SYRjq169frtt9//331bhxY3Xp0kXdu3dXu3bt9O9//7uIewMAgLKnuNpyAABQuliMIrbiFy9e1NKlS5WQkKDVq1erTZs2Gjx4sI4dO6ZZs2apc+fOWrBgQXHHW2qkp6crICBAaWlp3K8PACgVCts2lfe2/Fpo6wEApU1B2yaPwla8detWJSQkaOHChXJzc9OAAQP0+uuvq3HjxvYyMTExioiIKFrkAACgRNGWAwDg2gqd6EdEROjOO+/Uv/71L91///3y9PTMUaZevXrq27dvsQQIAACKF205AACurdCJ/oEDBxQaGppvGT8/PyUkJBQ5KAAAUHJoywEAcG2FHozv+PHj+v7773Ms//777/XDDz8US1AAAKDk0JYDAODaCp3oP/PMMzp69GiO5b/++queeeaZYgkKAACUHNpyAABcW6ET/V27dun222/Psbxly5batWtXsQQFAABKDm05AACurdCJvre3t1JTU3MsT05OlodHoW/5BwAANxhtOQAArq3Qif5dd92lcePGKS0tzb7szJkz+tvf/qY777yzWIMDAADFj7YcAADXVujT9q+++qrat2+v0NBQtWzZUpKUlJSkoKAgvffee8UeIAAAKF605QAAuLZCJ/q1atXS9u3b9f777+vHH39UhQoVNGjQIPXr1y/X5/ACAIDShbYcAADXVqQb8fz8/DRkyJDijgUAANwgtOUAALiuIo+4s2vXLh05ckRZWVkOy3v06HHdQQEAgJJHWw4AgGsqdKJ/4MABxcTEaMeOHbJYLDIMQ5JksVgkSVartXgjBAAAxYq2HAAA11boUfefffZZ1atXT8ePH5evr6927typDRs2qFWrVlq/fn0JhAgAAIoTbTkAAK6t0Ff0N27cqLVr16p69epyc3OTm5ub2rVrp7i4OA0fPlzbtm0riTgBAEAxoS0HAMC1FfqKvtVqVaVKlSRJ1atX12+//SZJCg0N1Z49e4o3OgAAUOxoywEAcG2FvqJ/22236ccff1S9evUUGRmpV155RV5eXvr3v/+tm2++uSRiBAAAxYi2HAAA11boRP/FF1/U2bNnJUmTJk3SvffeqzvuuEPVqlXT4sWLiz1AAABQvGjLAQBwbRYje6jd63Dq1ClVqVLFPlpveZCenq6AgAClpaXJ39/f2eEAAHBdbVN5bMuvhbYeAFDaFLRtKtQ9+hcvXpSHh4d++uk
nh+VVq1blHwMAAMqAkmjLZ82apbp168rHx0eRkZHatGlTnmXnzZsni8XiMPn4+OQot3v3bvXo0UMBAQHy8/NTRESEjhw5Yl/fsWPHHPU8+eSTDnUcOXJE99xzj3x9fRUYGKjRo0fr0qVLRfqMAACUJYXquu/p6ak6derwfF0AAMqo4m7LFy9erFGjRmn27NmKjIzUjBkzFB0drT179igwMDDX9/j7+zsM+nf1CYb9+/erXbt2Gjx4sF5++WX5+/tr586dOU4IxMbGatKkSfZ5X19f+2ur1ap77rlHwcHBSkxMVHJysgYMGCBPT09NmTKlOD46AAClVqFH3X/hhRf0t7/9TadOnSqJeAAAQAkrzrZ8+vTpio2N1aBBg9SkSRPNnj1bvr6+io+Pz/M9FotFwcHB9ikoKChHfN27d9crr7yili1bqn79+urRo0eOEwe+vr4O9VzZhXHVqlXatWuX5s+frxYtWujuu+/W5MmTNWvWLGVlZV335wYAoDQrdKI/c+ZMbdiwQTVr1lSjRo10++23O0wAAKB0K662PCsrS1u2bFHXrl3ty9zc3NS1a1dt3Lgxz/dlZGQoNDRUISEh6tmzp3bu3GlfZ7PZtGLFCt1yyy2Kjo5WYGCgIiMj9cknn+So5/3331f16tV12223ady4cTp37px93caNG9W0aVOHkwjR0dFKT0932N6VMjMzlZ6e7jABAFAWFXrU/fvvv78EwgAAADdKcbXlJ0+elNVqzXFFPigoSD///HOu72nUqJHi4+PVrFkzpaWl6dVXX1Xbtm21c+dO1a5dW8ePH1dGRoamTp2qv//97/rnP/+plStXqlevXlq3bp06dOggSXrooYcUGhqqmjVravv27RozZoz27Nmjjz/+WJKUkpKSa1zZ63ITFxenl19++br2CQAApUGhE/2XXnqpJOIAAAA3iDPb8qioKEVFRdnn27Ztq7CwML3zzjuaPHmybDabJKlnz54aOXKkJKlFixZKTEzU7Nmz7Yn+kCFD7HU0bdpUNWrUUJcuXbR//37Vr1+/SLGNGzdOo0aNss+np6crJCSkSHUBAOBMhe66DwAAIEnVq1eXu7u7UlNTHZanpqYqODi4QHV4enqqZcuW2rdvn71ODw8PNWnSxKFcWFiYw6j7V4uMjJQkez3BwcG5xpW9Ljfe3t7y9/d3mAAAKIsKnei7ubnJ3d09zwkAAJRuxdWWe3l5KTw8XGvWrLEvs9lsWrNmjcNV+/xYrVbt2LFDNWrUsNcZERHhMCq/JO3du1ehoaF51pOUlCRJ9nqioqK0Y8cOHT9+3F5m9erV8vf3z3ESAQAAV1PorvtLly51mL948aK2bdumd999l/vaAAAoA4qzLR81apQGDhyoVq1aqXXr1poxY4bOnj2rQYMGSZIGDBigWrVqKS4uTpI0adIktWnTRg0aNNCZM2c0bdo0HT58WI8//ri9ztGjR6tPnz5q3769OnXqpJUrV2r58uVav369JPPxewsWLFD37t1VrVo1bd++XSNHjlT79u3VrFkzSdJdd92lJk2a6JFHHtErr7yilJQUvfjii3rmmWfk7e1d1F0HAECZUOhEv2fPnjmWPfDAA7r11lu1ePFiDR48uFgCAwAAJaM42/I+ffroxIkTmjBhglJSUtSiRQutXLnSPvDdkSNH5OZ2uQPh6dOnFRsbq5SUFFWpUkXh4eFKTEx0uMoeExOj2bNnKy4uTsOHD1ejRo20ZMkStWvXTpJ51f+rr76yn1QICQlR79699eKLL9rrcHd312effaannnpKUVFR8vPz08CBAzVp0qRC7y8AAMoai2EYRnFUdODAATVr1kwZGRnFUV2pl56eroCAAKWlpXEPHwCgVLjetqm8teXXQlsPAChtCto2FctgfOfPn9ebb76pWrVqFUd1AADgBqMtBwDAdRS6636VKlVksVjs84Zh6I8//pCvr6/mz59frMEBAIDiR1sOAIBrK3Si//rrrzv8c+Dm5qa
bbrpJkZGRqlKlSrEGBwAAih9tOQAArq3Qif6jjz5aAmEAAIAbhbYcAADXVuh79BMSEvThhx/mWP7hhx/q3XffLZagAABAyaEtBwDAtRU60Y+Li1P16tVzLA8MDNSUKVOKJSgAAFByaMsBAHBthU70jxw5onr16uVYHhoaqiNHjhRLUAAAoOTQlgMA4NoKnegHBgZq+/btOZb/+OOPqlatWrEEBQAASg5tOQAArq3QiX6/fv00fPhwrVu3TlarVVarVWvXrtWzzz6rvn37lkSMAACgGNGWAwDg2go96v7kyZN16NAhdenSRR4e5tttNpsGDBjAfX0AAJQBtOUAALg2i2EYRlHe+MsvvygpKUkVKlRQ06ZNFRoaWtyxlWrp6ekKCAhQWlqa/P39nR0OAACFbpvKe1t+LbT1AIDSpqBtU6Gv6Gdr2LChGjZsWNS3AwAAJ6MtBwDANRX6Hv3evXvrn//8Z47lr7zyiv76178WS1AAAKDk0JYDAODaCp3ob9iwQd27d8+x/O6779aGDRuKJSgAAFByaMsBAHBthU70MzIy5OXllWO5p6en0tPTiyUoAABQcmjLAQBwbYVO9Js2barFixfnWL5o0SI1adKkWIICAAAlh7YcAADXVujB+MaPH69evXpp//796ty5syRpzZo1WrBggT766KNiDxAAABQv2nIAAFxboRP9++67T5988ommTJmijz76SBUqVFDz5s21du1aVa1atSRiBAAAxYi2HAAA12YxDMO4ngrS09O1cOFCzZ07V1u2bJHVai2u2Eo1nq0LAChtito2lde2/Fpo6wEApU1B26ZC36OfbcOGDRo4cKBq1qyp1157TZ07d9b//ve/olYHAABuMNpyAABcU6G67qekpGjevHmaO3eu0tPT9eCDDyozM1OffPIJg/cAAFAG0JYDAOD6CnxF/7777lOjRo20fft2zZgxQ7/99pveeuutkowNAAAUI9pyAADKhwJf0f/iiy80fPhwPfXUU2rYsGFJxgQAAEoAbTkAAOVDga/of/vtt/rjjz8UHh6uyMhIzZw5UydPnizJ2AAAQDGiLQcAoHwocKLfpk0bzZkzR8nJyXriiSe0aNEi1axZUzabTatXr9Yff/xRknECAIDrRFsOAED5cF2P19uzZ4/mzp2r9957T2fOnNGdd96pTz/9tDjjK7V45A4AoLQpSttUntvya6GtBwCUNiX+eD1JatSokV555RUdO3ZMCxcuvJ6qAACAE9CWAwDgeq7rin55xll+AEBpQ9tUvNifAIDS5oZc0QcAAAAAAKULiT4AAAAAAC6ERB8AAAAAABdCog8AAAAAgAsh0QcAAAAAwIWQ6AMAAAAA4EJI9AEAAAAAcCEk+gAAAAAAuBASfQAAAAAAXAiJPgAAAAAALoREHwAAAAAAF0KiDwAAAACACyHRBwAAAADAhZDoAwAAAADgQkj0AQAAAABwIST6AAAAAAC4EBJ9AAAAAABcCIk+AAAAAAAuhEQfAAAAAAAXQqIPAAAAAIALIdEHAAAAAMCFOD3RnzVrlurWrSsfHx9FRkZq06ZNeZadN2+eLBaLw+Tj45Oj3O7du9WjRw8FBATIz89PEREROnLkiH19x44dc9Tz5JNPlsjnAwAAAADgRvJw5sYXL16sUaNGafbs2YqMjNSMGTMUHR2tPXv2KDAwMNf3+Pv7a8+ePfZ5i8XisH7//v1q166dBg8erJdffln+/v7auXNnjhMCsbGxmjRpkn3e19e3GD8ZAAAAAADO4dREf/r06YqNjdWgQYMkSbNnz9aKFSsUHx+vsWPH5voei8Wi4ODgPOt84YUX1L17d73yyiv2ZfXr189RztfXN996AAAAAAAoi5zWdT8rK0tbtmxR165dLwfj5qauXbtq48aNeb4vIyNDoaGhCgkJUc+ePbVz5077OpvNphUrVuiWW25RdHS0AgMDFRkZqU8++SRHPe+//76qV6+u2267TePGjdO5c+fyjTczM1P
p6ekOEwAAcM5teNkMw9Ddd98ti8WSo72/ejsWi0WLFi267s8LAEBp57RE/+TJk7JarQoKCnJYHhQUpJSUlFzf06hRI8XHx2vZsmWaP3++bDab2rZtq2PHjkmSjh8/royMDE2dOlXdunXTqlWrFBMTo169eunrr7+21/PQQw9p/vz5WrduncaNG6f33ntPDz/8cL7xxsXFKSAgwD6FhIRc5x4AAKDsy74N76WXXtLWrVvVvHlzRUdH6/jx43m+x9/fX8nJyfbp8OHDDuuzb8Nr3Lix1q9fr+3bt2v8+PG5nhCYMWNGjtv4rpSQkOCwrfvvv7/InxUAgLLCqV33CysqKkpRUVH2+bZt2yosLEzvvPOOJk+eLJvNJknq2bOnRo4cKUlq0aKFEhMTNXv2bHXo0EGSNGTIEHsdTZs2VY0aNdSlSxft378/127+kjRu3DiNGjXKPp+enk6yDwAo95x5G15SUpJee+01/fDDD6pRo0audVWuXJlb9QAA5Y7TruhXr15d7u7uSk1NdViemppa4AbZ09NTLVu21L59++x1enh4qEmTJg7lwsLCcu3uly0yMlKS7PXkxtvbW/7+/g4TAADlmTNvwzt37pweeughzZo1K9//G5555hlVr15drVu3Vnx8vAzDyLMst+kBAFyF0xJ9Ly8vhYeHa82aNfZlNptNa9ascbhqnx+r1aodO3bYz+J7eXkpIiLCYVR+Sdq7d69CQ0PzrCcpKUmS8rwaAAAAcnLmbXgjR45U27Zt1bNnzzzjmzRpkj744AOtXr1avXv31tNPP6233norz/LcpgcAcBVO7bo/atQoDRw4UK1atVLr1q01Y8YMnT171t79b8CAAapVq5bi4uIkmQ12mzZt1KBBA505c0bTpk3T4cOH9fjjj9vrHD16tPr06aP27durU6dOWrlypZYvX67169dLMu/7W7Bggbp3765q1app+/btGjlypNq3b69mzZrd8H0AAEB5Uhy34X366adau3attm3blu+2xo8fb3/dsmVLnT17VtOmTdPw4cNzLc9tegAAV+HURL9Pnz46ceKEJkyYoJSUFLVo0UIrV660Xxk4cuSI3Nwudzo4ffq0YmNjlZKSoipVqig8PFyJiYkOXfVjYmI0e/ZsxcXFafjw4WrUqJGWLFmidu3aSTKv+n/11Vf2kwohISHq3bu3XnzxxRv74QEAKONu9G143377rSRp7dq12r9/vypXruxQpnfv3rrjjjvsJ/evFhkZqcmTJyszM1Pe3t451nt7e+e6HACAssZi5HezGvKUnp6ugIAApaWlcb8+AKBUcEbbFBkZqdatW9u7xNtsNtWpU0dDhw7NczC+K1mtVt16663q3r27pk+fLsm8yl+/fn2999579nIxMTGqUKGCFixYoJSUFJ08edKhnqZNm+qNN97Qfffdp3r16uW6rX/84x967bXXdOrUqQJ9Ntp6AEBpU9C2qUyNug8AAEoXZ9yGFxwcnGuPgTp16tiT/OXLlys1NVVt2rSRj4+PVq9erSlTpui5554r4T0CAIDzkegDAIAic8ZteAXh6empWbNmaeTIkTIMQw0aNLA/ChAAAFdH1/0iojsfAKC0oW0qXuxPAEBpU9C2yWmP1wMAAAAAAMWPRB8AAAAAABdCog8AAAAAgAsh0QcAAAAAwIWQ6AMAAAAA4EJI9AEAAAAAcCEk+gAAAAAAuBASfQAAAAAAXAiJPgAAAAAALoREHwAAAAAAF0KiDwAAAACACyHRBwAAAADAhZDoAwAAAADgQkj0AQAAAABwIST6AAAAAAC4EBJ9AAAAAABcCIk+AAAAAAAuhEQfAAAAAAAXQqIPAAAAAIALIdEHAAAAAMCFkOgDAAAAAOBCSPQBAAAAAHAhJPoAAAAAALgQEn0AAAAAAFwIiT4AAAAAAC6ERB8AAAAAABdCog8AAAAAgAsh0QcAAAAAwIWQ6AMAAAAA4EJI9AEAAAAAcCEezg4AAAAAuNGOHpVOnMh
7fWCgVLu26227MEoizoLWWVb2kTOV12O4rBwbzo6TRB8AAADlSmamFBEhpabmXSY4WDp0SPL2dp1tF0ZJxFnQOvfsKRv7yJnK6zFcnr8/hUXXfQAAAJQrXl5SnTqSWx7/Cbu5SSEhZjlX2nZhlEScBa2zYsWysY+cqbwew+X5+1NYJPoAAAAoVywWafJkyWbLfb3NZq63WFxr24VREnEWtE43t7Kxj5ypvB7D5fn7U+gYDMMwSq5615Wenq6AgAClpaXJ39/f2eEAAEDbVMzYn67NMKTISGnrVslqdVxXtap08uTlf8Jr1za74ubmjjukjz++PN+woXTmTO5lw8OllSsvb3vz5pxl3N2lypWlW2+Vvv768vKoKGnfvtzrrVvXsa7OnaUdO3IvGxgo7dx5ef7ee6Xvv8+9rJ+fWT63fZQdZ/Y+cnNz7Kb86KPSihU56zSMy/snvzqzy11dxmIxt3Xltq+UlCTVqmW+Hj9emj07988mSYmJ5u9LkuLipOnT8y67Zo3UrJn5+o03pL//Pe+yy5dLbdqYr+fMkf72t7zLLlokdelivn7/fWnEiLzLJiSYvy9JWrpUio3NfR9JUv360i+/mPto5UrpkUfyrnfaNPP3JUkbNki9e+dddtIk6amnzN/PbbdJu3blLJP9exw3Tvq//zOX7dwpdeyYd73Dh5u/L0k6eFBq3TrvsoMHS2vXFuy4fOgh8/clSX/8Id18c9713n+/+fuSzHqDg/MuGx0tzZ9/eT63vxHZx7DNZr6+Msbbbze/d0VJ9AvaNnGPPgAAAModi0V66aXLidOVGjZ0/Af85Mm8E/20NMf533+XTp/OvWx2gpt9ta9bt5xlrNbc6zh92owjN1f/r3/mTN5l3d0d59PS8i6bmSm9807+cWa7uotyenre9ebl6jpzYxj5l7vyCurZs/nHcGWSeO5c/mUvXbr8+vz5/MtevFi0shcu5F82K+vy68zM/PdVTMzlYzgrK/96L1xwjCe/sufPmz8tFvNEw8iROctk/37OnXNcll+9Z89efm2z5V/23Llrf3+yZWRcfm0Y+df7xx+O8/mVTU/PWTavvxG5xXgjeh1wRb+IOMsPAChtaJuKF/vTtV26ZF7t+/DDy8vc3KQmTaTPPpNCQy8v373b8YrclXx9zSvq2X7+Oe/uuhUqSPXqma8NQ2re3LzSabNd3vaiRWYC4O1tXpXNtm+fY6J3JU/Py1emJenAAcfk7Uru7lKjRpfnDx1yTMiu5OZmlr2y58PVcV6pSZPLr48ezZk4ZTMM8wrytm1mne7uUlhYzjoNQ+rb19z/2eVuu8288p1XktSwobk/JCklRTp1Kvdykrl/swdCO348/8SuXj3z9yeZ5Y4fz7tsaKjZG0Iyk878BmSrU8cck0AyT+YkJ+ddtnbtyyd10tKkX3+9vI927XI8jjZskKpUMcv+8Yf5+8hLcLDZi0UyE+7Dh/MuGxQkVat2uWybNjm3nf17vOkmc5LM4/HAgbzrrVbNrFsyj/O8eq9IZqxBQQU7LitXlmrWNF9breZAj3nx9788Cr5hmMddXrLHkciW198IwzD/zuzcefkYvp6r+VLB2yYS/SKi8QcAlDa0TcWL/ena1q0zu0y7uTle2V250uyWeyN8+aXjVckbue3CKIk4C1pnWdlHzuTMfVRet10YxR1nQdsmBuMDAABAudOpk3nP85Il5mOwJPPnXXfduBjuust52y6MkoizoHWWlX3kTM7cR+V124XhrDhJ9AEAAFAuWK2OXbkHDpR69pSmTDG7jk+ZcmNH67ZYnLftwiiJOAtaZ1nZR87kzH1UXrddGM6Kk677RUR3PgBAaUPbVLzYn67FajVH6960yRyxO78RtQGgtKLrPgAAACBzoLDYWOndd6W9e6UtW5wdEQCULBJ9AAAAuCybTXriCfN+fHd3acEC6Z57nB0VAJQsEn0AAAC4JJtNeuop6T//MUfXnz9fevBBZ0cFACWPRB8AAAA
uxzCkoUOlf//bTPL/+1/zeeMAUB6Q6AMAAMDl/P67+bxqi0WaN0/q39/ZEQHAjePh7AAAAACA4la9uvT119LGjXTXB1D+kOgDAADAJRiGtH271Ly5OR8SYk4AUN7QdR8AAABlnmFIzz0nhYdLH37o7GgAwLm4og8AAIAyzTCkMWOk6dPN+VOnnBsPADgbV/QBAABQZhmG9Le/SdOmmfOzZklPPOHcmADA2Uj0AQAAUCYZhjR+vDR1qjn/1lvS0087NyYAKA1I9AEAwHWZNWuW6tatKx8fH0VGRmrTpk15lp03b54sFovD5OPjk6Pc7t271aNHDwUEBMjPz08RERE6cuRIjnKGYejuu++WxWLRJ5984rDuyJEjuueee+Tr66vAwECNHj1aly5duu7Pi9Jj4kTpH/8wX7/xhjR0qFPDAYBSg3v0AQBAkS1evFijRo3S7NmzFRkZqRkzZig6Olp79uxRYGBgru/x9/fXnj177PMWi8Vh/f79+9WuXTsNHjxYL7/8svz9/bVz585cTwjMmDEjx/slyWq16p577lFwcLASExOVnJysAQMGyNPTU1OmTLnOT43SwDCk48fN19OnS8OHOzceAChNLIZhGM4OoixKT09XQECA0tLS5O/v7+xwAABwStsUGRmpiIgIzZw5U5Jks9kUEhKiYcOGaezYsTnKz5s3TyNGjNCZM2fyrLNv377y9PTUe++9l++2k5KSdO+99+qHH35QjRo1tHTpUt1///2SpC+++EL33nuvfvvtNwUFBUmSZs+erTFjxujEiRPy8vK65mejrS/9DENau1bq0sXZkQDAjVHQtomu+wAAoEiysrK0ZcsWde3a1b7Mzc1NXbt21caNG/N8X0ZGhkJDQxUSEqKePXtq586d9nU2m00rVqzQLbfcoujoaAUGBioyMjJHt/xz587poYce0qxZsxQcHJxjGxs3blTTpk3tSb4kRUdHKz093WF7V8rMzFR6errDhNJn6VLp4kXztcVCkg8AuSHRBwAARXLy5ElZrVaHZFqSgoKClJKSkut7GjVqpPj4eC1btkzz58+XzWZT27ZtdezYMUnS8ePHlZGRoalTp6pbt25atWqVYmJi1KtXL3399df2ekaOHKm2bduqZ8+euW4nJSUl17iy1+UmLi5OAQEB9ikkJKRgOwI3zCuvSL16SX36SFars6MBgNKLe/QBAMANExUVpaioKPt827ZtFRYWpnfeeUeTJ0+WzWaTJPXs2VMjR46UJLVo0UKJiYmaPXu2OnTooE8//VRr167Vtm3bijW2cePGadSoUfb59PR0kv1S5LXXpDFjzNe33y65uzs3HgAozbiiDwAAiqR69epyd3dXamqqw/LU1NRcu9PnxtPTUy1bttS+ffvsdXp4eKhJkyYO5cLCwuyj7q9du1b79+9X5cqV5eHhIQ8P87pF79691bFjR0lScHBwrnFlr8uNt7e3/P39HSaUDq+/Lj33nPn65ZelF190bjwAUNqR6AMAgCLx8vJSeHi41qxZY19ms9m0Zs0ah6v2+bFardqxY4dq1KhhrzMiIsJhVH5J2rt3r0JDQyVJY8eO1fbt25WUlGSfJOn1119XQkKCJLPnwI4dO3Q8e1h2SatXr5a/v3+Okwgo3d54Q8ruaDFhgjkBAPJH130AAFBko0aN0sCBA9WqVSu1bt1aM2bM0NmzZzVo0CBJ0oABA1SrVi3FxcVJkiZNmqQ2bdqoQYMGOnPmjKZNm6bDhw/r8ccft9c5evRo9enTR+3bt1enTp20cuVKLV++XOvXr5dkXpHP7ap8nTp1VK9ePUnSXXfdpSZNmuiRRx7RK6+8opSUFL344ot65pln5O3tXcJ7BcXlX/+SRowwX7/wgjRxojOjAYCyg0QfAAAUWZ8+fXTixAlNmDBBKSkpatGihVauXGkf+O7IkSNyc7vcgfD06dOKjY1VSkqKqlSpovDwcCUmJjpcZY+JidHs2bMVFxen4cOHq1GjRlqyZIn
atWtX4Ljc3d312Wef6amnnlJUVJT8/Pw0cOBATZo0qfg+PEpc48ZShQpmsj95sjnKPgDg2iyGYRjODqIs4tm6AIDShrapeLE/S4f9+6WbbybJBwCp4G0T9+gDAACg1Hj3XWnnzsvz9euT5ANAYZHoAwAAoFSYO1d69FGpUyfp11+dHQ0AlF0k+gAAAHC6efOk2Fjz9UMPSTVrOjUcACjTSPQBAADgVP/9r/TYY5JhSEOHSq+/Tnd9ALgeJPoAAABwmvnzze76hiE99ZT05psk+QBwvUj0AQAA4BRffCENHGgm+U88Ic2cSZIPAMXBw9kBAAAAoHz6y1+kNm2kJk2kt9+W3LgEBQDFgkQfAAAATuHvL61aJVWoQJIPAMWJP6kAAAC4YZYskaZNuzzv50eSDwDFjSv6AAAAuCGWLpX69pUuXZIaNZJ69HB2RADgmjh/CgAAgBK3bJn04INmkv/QQ9I99zg7IgBwXST6AAAAKFHLl0t//auZ5PftK737ruTu7uyoAMB1kegDAACgxKxYIfXuLV28aF7Rf+89yYObRwGgRDk90Z81a5bq1q0rHx8fRUZGatOmTXmWnTdvniwWi8Pk4+OTo9zu3bvVo0cPBQQEyM/PTxERETpy5Ih9/YULF/TMM8+oWrVqqlixonr37q3U1NQS+XwAAADl1aFDUq9eZpL/wAPS+++T5APAjeDURH/x4sUaNWqUXnrpJW3dulXNmzdXdHS0jh8/nud7/P39lZycbJ8OHz7ssH7//v1q166dGjdurPXr12v79u0aP368wwmBkSNHavny5frwww/19ddf67ffflOvXr1K7HMCAACUR3XrSlOnmlf0FywgyQeAG8ViGIbhrI1HRkYqIiJCM2fOlCTZbDaFhIRo2LBhGjt2bI7y8+bN04gRI3TmzJk86+zbt688PT313nvv5bo+LS1NN910kxYsWKAHHnhAkvTzzz8rLCxMGzduVJs2bQoUe3p6ugICApSWliZ/f/8CvQcAgJJE21S82J9FZxiSxZL3PACgaAraNjntin5WVpa2bNmirl27Xg7GzU1du3bVxo0b83xfRkaGQkNDFRISop49e2rnzp32dTabTStWrNAtt9yi6OhoBQYGKjIyUp988om9zJYtW3Tx4kWH7TZu3Fh16tTJd7uZmZlKT093mAAAAOBozRqpUyfpyusyJPkAcGM5LdE/efKkrFargoKCHJYHBQUpJSUl1/c0atRI8fHxWrZsmebPny+bzaa2bdvq2LFjkqTjx48rIyNDU6dOVbdu3bRq1SrFxMSoV69e+vrrryVJKSkp8vLyUuXKlQu8XUmKi4tTQECAfQoJCbmOTw8AAOB61q2T7rtP+vprKS7O2dEAQPlVpu6UioqKUlRUlH2+bdu2CgsL0zvvvKPJkyfLZrNJknr27KmRI0dKklq0aKHExETNnj1bHTp0KPK2x40bp1GjRtnn09PTSfYBAAD+9PXX0r33SufPS927S5MmOTsiACi/nJboV69eXe7u7jlGu09NTVVwcHCB6vD09FTLli21b98+e50eHh5q0qSJQ7mwsDB9++23kqTg4GBlZWXpzJkzDlf1r7Vdb29veXt7FyguAACA8uSbb6R77pHOnZO6dZOWLJH4twkAnMdpXfe9vLwUHh6uNWvW2JfZbDatWbPG4ap9fqxWq3bs2KEaNWrY64yIiNCePXscyu3du1ehoaGSpPDwcHl6ejpsd8+ePTpy5EiBtwsAAADTd9+ZV/DPnpXuuktaulTK5enHAIAbyKld90eNGqWBAweqVatWat26tWbMmKGzZ89q0KBBkqQBAwaoVq1aivvzJq9JkyapTZs2atCggc6cOaNp06bp8OHDevzxx+11jh49Wn369FH79u3VqVMnrVy5UsuXL9f69eslSQEBARo8eLBGjRqlqlWryt/fX8OGDVNUVFSBR9wHAACAdOmSNGiQlJEhde0qffIJST4AlAZOTfT79OmjEydOaMKECUp
JSVGLFi20cuVK+wB9R44ckZvb5U4Hp0+fVmxsrFJSUlSlShWFh4crMTHRoat+TEyMZs+erbi4OA0fPlyNGjXSkiVL1K5dO3uZ119/XW5uburdu7cyMzMVHR2tt99++8Z9cAAAABfg4SF9+qk0ebI0Z45UoYKzIwIASJLFMAzD2UGURTxbFwBQ2tA2FS/2Z97OnyepBwBnKGjb5LR79AEAAFD2/PCDVL++tGqVsyMBAOSFRB8AAAAFsnWrdOedUnKy9OqrEv1CAaB0ItEHAADANW3bZg64d+aM9Je/mI/Qs1icHRUAIDck+gAAAMjXjz+aSf7p01JUlPT551KlSs6OCgCQFxJ9AAAA5GnHDqlLF+nUKal1a+mLLyTGJgSA0o1EHwAAAHl6+23p99+liAjpyy+lgABnRwQAuBYPZwcAAACA0uutt6TAQGnkSKlyZWdHAwAoCK7oAwAAwMGxY5LNZr728JBefpkkHwDKEhJ9AAAA2O3ZY3bTj429nOwDAMoWEn0AAABIkvbulTp1klJSpM2bpT/+cHZEAICiINEHAACA9u0zk/zkZOm226Q1axh4DwDKKhJ9AACAcm7/fjPJ/+03qUkTM8m/6SZnRwUAKCoSfQAAgHLswAEzyT92TAoLk9auNUfZBwCUXST6AAAA5djPP5v35DdubCb5QUHOjggAcL08nB0AAAAAnKd7d2n5cqlZMyk42NnRAACKA4k+AABAOXP0qGS1SnXrmvPR0U4NBwBQzOi6DwAAUI4cO2bek9+hg3l/PgDA9ZDoAwAAlBO//mom+fv3Sx4ekqensyMCAJQEEn0AAIBy4LffpM6dpX37zC7769ZJISHOjgoAUBJI9AEAAFxccrKZ5O/dK9WpYyb5deo4OyoAQElhMD4AAAAXlppqJvl79phX8NevvzwIH1DeWK1WXbx40dlhAHny9PSUu7v7dddDog8AAODC3NzMe/Fr1zav5Ner5+yIgBvPMAylpKTozJkzzg4FuKbKlSsrODhYFoulyHWQ6AMAALiwm26S1q6V0tKk+vWdHQ3gHNlJfmBgoHx9fa8rgQJKimEYOnfunI4fPy5JqlGjRpHrItEHAABwMSdPmlfv//pXc756dXMCyiOr1WpP8qtVq+bscIB8VahQQZJ0/PhxBQYGFrkbP4PxAQAAuJDff5e6dpUefFCaN8/Z0QDOl31Pvq+vr5MjAQom+1i9nvEkSPQBAABcxKlT0p13Sj/+KAUFSW3aODsioPSguz7KiuI4Vkn0AQAAXMDp09Jdd0nbtkmBgeZ9+Y0bOzsqAIAzkOgDAACUcWfOmEn+li2XB99r0sTZUQEAnIVEHwAAXJdZs2apbt268vHxUWRkpDZt2pRn2Xnz5slisThMPj4+Ocrt3r1bPXr0UEBAgPz8/BQREaEjR47Y1z/xxBOqX7++KlSooJtuukk9e/bUzz//7FDH1duxWCxatGhR8X3wUuLCBSk6WvrhB3PAvTVrpFtvdXZUgAuyWqX166WFC82fVquzIyq0unXrasaMGQUuv379elksFh5LWAaR6AMAgCJbvHixRo0apZdeeklbt25V8+bNFR0dbX80UG78/f2VnJxsnw4fPuywfv/+/WrXrp0aN26s9evXa/v27Ro/frzDCYHw8HAlJCRo9+7d+vLLL2UYhu666y5Zr/rHOyEhwWFb999/f7F+/tLA29u8ml+1qvTVV1LTps6OCHBBH38s1a0rdeokPfSQ+bNuXXN5CcjtROWV08SJE4tU7+bNmzVkyJACl2/btq2Sk5MVEBBQpO0VRePGjeXt7a2UlJQbtk1XZDEMw3B2EGVRenq6AgIClJaWJn9/f2eHAwCAU9qmyMhIRUREaObMmZIkm82mkJAQDRs2TGPHjs1Rft68eRoxYkS+V4f69u0rT09PvffeewWOY/v27WrevLn27dun+n8+LN5isWjp0qVFTu7LUltvGFJKinQdj1wGXNaFCxd08OBB1atXL9ceRNf08cf
SAw+YX7QrZQ+Y9tFHUq9e1x/oFa5MchcvXqwJEyZoz5499mUVK1ZUxYoVJZnPXrdarfLwKPtPTv/222/Vv39/tWvXTs2aNdOYMWOcGs/Fixfl6el5w7eb3zFb0LaJK/oAAKBIsrKytGXLFnXt2tW+zM3NTV27dtXGjRvzfF9GRoZCQ0MVEhKinj17aufOnfZ1NptNK1as0C233KLo6GgFBgYqMjJSn3zySZ71nT17VgkJCapXr55CQkIc1j3zzDOqXr26Wrdurfj4eOV3fSMzM1Pp6ekOU2mVkSGNHi2dO2fOWywk+UChGIZ09uy1p/R0afjwnEl+dh2S9OyzZrmC1FfAa6zBwcH2KSAgQBaLxT7/888/q1KlSvriiy8UHh4ub29vffvtt9q/f7969uypoKAgVaxYUREREfrqq68c6r26677FYtF//vMfxcTEyNfXVw0bNtSnn35qX3911/158+apcuXK+vLLLxUWFqaKFSuqW7duSk5Otr/n0qVLGj58uCpXrqxq1appzJgxGjhwYIFOus6dO1cPPfSQHnnkEcXHx+dYf+zYMfXr109Vq1aVn5+fWrVqpe+//96+fvny5YqIiJCPj4+qV6+umJgYh896dVtSuXJlzfvzOaSHDh2SxWLR4sWL1aFDB/n4+Oj999/X77//rn79+qlWrVry9fVV06ZNtXDhQod6bDabXnnlFTVo0EDe3t6qU6eO/vGPf0iSOnfurKFDhzqUP3HihLy8vLRmzZpr7pOiItEHAABFcvLkSVmtVgUFBTksDwoKyrPLZaNGjRQfH69ly5Zp/vz5stlsatu2rY4dOyZJOn78uDIyMjR16lR169ZNq1atUkxMjHr16qWvv/7aoa63337bflXriy++0OrVq+Xl5WVfP2nSJH3wwQdavXq1evfuraefflpvvfVWnp8nLi5OAQEB9unqkwalRUaG1L279Oqr0sMPOzsaoIw6d06qWPHaU0CA9OuveddjGNKxY2a5gtSXfXauGIwdO1ZTp07V7t271axZM2VkZKh79+5as2aNtm3bpm7duum+++5zGN8kNy+//LIefPBBbd++Xd27d1f//v116tSpPMufO3dOr776qt577z1t2LBBR44c0XPPPWdf/89//lPvv/++EhIS9N133yk9PT3fk7XZ/vjjD3344Yd6+OGHdeeddyotLU3ffPONfX1GRoY6dOigX3/9VZ9++ql+/PFHPf/887LZbJKkFStWKCYmRt27d9e2bdu0Zs0atW7d+prbvdrYsWP17LPPavfu3YqOjtaFCxcUHh6uFStW6KefftKQIUP0yCOPOIxHM27cOE2dOlXjx4/Xrl27tGDBAnvb+Pjjj2vBggXKzMy0l58/f75q1aqlzp07Fzq+gir7/TuAEnD0qHTiRN7rAwOl2rVdb9uFURJxFrTOsrKPnMXZ+4fvT/7KQowlKSoqSlFRUfb5tm3bKiwsTO+8844mT55s/4etZ8+eGjlypCSpRYsWSkxM1OzZs9WhQwf7e/v3768777xTycnJevXVV/Xggw/qu+++s3dzHD9+vL1sy5YtdfbsWU2bNk3Dhw/PNbZx48Zp1KhR9vn09PRSl+yfPSvde6/0zTdmXjFunLMjAuAskyZN0p133mmfr1q1qpo3b26fnzx5spYuXapPP/00xxXlKz366KPq16+fJGnKlCl68803tWnTJnXr1i3X8hcvXtTs2bPtt0kNHTpUkyZNsq9/6623NG7cOPvV9JkzZ+rzzz+/5udZtGiRGjZsqFv/HE20b9++mjt3ru644w5J0oIFC3TixAlt3rxZVatWlSQ1aNDA/v5//OMf6tu3r15++WX7siv3R0GNGDFCva66FePKExnDhg3Tl19+qQ8++ECtW7fWH3/8oTfeeEMzZ87UwIEDJUn169dXu3btJEm9evXS0KFDtWzZMj344IOSzJ4Rjz76qCzZt3+UABJ94CqZmVJEhJSamneZ4GDp0CFzACRX2XZhlES
cBa1zz56ysY+cxdnHEN+f/JWFGAujevXqcnd3V+pVHyg1NVXBwcEFqsPT01MtW7bUvn377HV6eHioyVXPhgsLC9O3337rsCz7ynvDhg3Vpk0bValSRUuXLrX/w3q1yMhITZ48WZmZmfLOZQd7e3vnury0OHdOuu8+6euvJX9/6csvzeMJQBH4+prdY65lwwazC821fP651L59wbZbTFq1auUwn5GRoYkTJ2rFihVKTk7WpUuXdP78+Wte0W/WrJn9tZ+fn/z9/fMdUNXX19ee5EtSjRo17OXT0tKUmprqcCXd3d1d4eHh9hO5eYmPj9fDV3RTevjhh9WhQwe99dZbqlSpkpKSktSyZUt7kn+1pKQkxcbG5ruNgrh6v1qtVk2ZMkUffPCBfv31V2VlZSkzM1O+f/4ud+/erczMTHXp0iXX+nx8fOy3Ijz44IPaunWrfvrpJ4dbJEoCXfeBq3h5SXXqSG55fDvc3KSQELOcK227MEoizoLWWbFi2dhHzuLsY4jvT/7KQoyF4eXlpfDwcId7DG02m9asWeNw1T4/VqtVO3bsUI0/bzD38vJSRESEw6BTkrR3716FhobmWY9hGDIMw6Fr5NWSkpJUpUqVUp3M5+X8ealnT2ndOvPv4MqVUmSks6MCyjCLRfLzu/Z0111mN6u8rrxaLOYf7rvuKlh9xXgF18/Pz2H+ueee09KlSzVlyhR98803SkpKUtOmTZWVlZVvPVcPNmexWPJNynMrf73ju+/atUv/+9//9Pzzz8vDw0MeHh5q06aNzp07Z38saoUKFfKt41rrc4vz4sWLOcpdvV+nTZumN954Q2PGjNG6deuUlJSk6Oho+3691nYls/v+6tWrdezYMSUkJKhz5875tmnFgUQfuIrFIk2eLOX1981mM9eXRE8bZ267MEoizoLW6eZWNvaRszj7GOL7k7+yEGNhjRo1SnPmzNG7776r3bt366mnntLZs2c1aNAgSdKAAQM07or+5ZMmTdKqVat04MABbd26VQ8//LAOHz6sxx9/3F5m9OjRWrx4sebMmaN9+/Zp5syZWr58uZ5++mlJ0oEDBxQXF6ctW7boyJEjSkxM1F//+ldVqFBB3f+88rZ8+XL95z//0U8//aR9+/bpX//6l6ZMmaJhw4bdwL1TfB591Hx0XnaSX8DzKACul7u79MYb5uur/zhnz8+YYZZzsu+++06PPvqoYmJi1LRpUwUHB+vQoUM3NIaAgAAFBQVp8+bN9mVWq1Vbt27N931z585V+/bt9eOPPyopKck+jRo1SnPnzpVk9jxISkrKc/yAZs2a5Tu43U033eQwaOAvv/yicwUYM+G7775Tz5499fDDD6t58+a6+eabtXfvXvv6hg0bqkKFCvluu2nTpmrVqpXmzJmjBQsW6LHHHrvmdq8XXfeBK+zfL+3aJe3dK910U877aN3dpdtvN0/aDh4s5dfj5vDhy72zhg2T/jwZmavdu6Xq1c3XX31lbueqR0Hbt9+48eX5iROlWbPyrvfbb6VGjczXr7wiTZuWd9lVq6SWLc3Xb70lXXGrVQ6ffGLug4gI6Ycfcg4g6+4u9e9vtn8LF0rZA3IvXGgOXJuXOXPMOrduzfn5r6zTMKSbbzb3cX7lsv3zn1L239PvvpPyG/T1pZek7NvYtm6VoqPzLjt6tPT88+brn3+W/ryFLFdDh5p1S9KRI1J4eN5lH3vMjFkyj8GrejA76NvX/H1J5r27deua+ye3Y8hikVq1Mn93knmM56VrV/P3la1OHfOKYm7atpWWLbs8P3x43sewv//l7Uvm9+no0dzrbdTIPIaz/eUv5nczNyEh5u8r+7i84v8Lu+xj46abzO9ctp49pcTE3Ov19TWPs2x9+0r5DZB75d+MvP5GZP9+JMd9dOXfl7KkT58+OnHihCZMmKCUlBS1aNFCK1eutA9CdOTIEbld0YXh9OnTio2NVUpKiqpUqaLw8HAlJiY6dNWPiYnR7NmzFRcXp+H
Dh6tRo0ZasmSJ/X5HHx8fffPNN5oxY4ZOnz6toKAgtW/fXomJiQoMDJRkXnGaNWuWRo4cKcMw1KBBA02fPr1YunU6w/PPSxs3SgsWmN8FADdQr17mI/SefdYceC9b7dpmkl/Mj9YrqoYNG+rjjz/WfffdJ4vFovHjx1+zu3xJGDZsmOLi4tSgQQM1btxYb731lk6fPp3n/egXL17Ue++9p0mTJum2225zWPf4449r+vTp2rlzp/r166cpU6bo/vvvV1xcnGrUqKFt27apZs2aioqK0ksvvaQuXbqofv366tu3ry5duqTPP//c/oi+zp07a+bMmYqKipLVatWYMWMK9Oi8hg0b6qOPPlJiYqKqVKmi6dOnKzU11d5u+fj4aMyYMXr++efl5eWlv/zlLzpx4oR27typwYMHO3yWoUOHys/Pz+FpACWFRB/lysWL5r2vv/xiTr/+aibA2YYONa+U5MVqvXy1LT1dOnmyYNv944/8y16ZKJ89m3uClL39q8vmV++V9Zw7l3/ZS5cuvz5/Pv+yFy9evjKZ2zgtVqv0++/m6yt7i2VmXrvegtQpSQMHStOnX7ucJF244LiN/GK4Mpm9dCn/sleeBLZa8y979uzl1zZb/mWvvGXQMPIv+8cfjvPXOs6uvFqcX9mrnyp28mTeiX5amuP8qVN5H8PBwY4nYU6dyjuOP/M1u9On8y6b3cuuIMfl1d3m09LyrvfqWykL870vTNns+Mra1fxsQ4cOzXOgp/Xr1zvMv/7663r99devWedjjz2W5xWPmjVrXnNgp27duuU5kFRZFB5utltl8K4DwDX06mWeGf7mGyk52Xye5R13lIor+dmmT5+uxx57TG3btlX16tU1ZswYpzwmdMyYMUpJSdGAAQPk7u6uIUOGKDo6Wu557KtPP/1Uv//+e67Jb1hYmMLCwjR37lxNnz5dq1at0v/93/+pe/fuunTpkpo0aaJZf1716tixoz788ENNnjxZU6dOlb+/v9pfMW7Ca6+9pkGDBumOO+5QzZo19cYbb2jLli3X/DwvvviiDhw4oOjoaPn6+mrIkCG6//77lXbFP0Djx4+Xh4eHJkyYoN9++001atTQk08+6VBPv379NGLECPXr188+aGxJshjXe0NFOZWenq6AgAClpaXJ39/f2eHgClar49/cf/1LWr7c/Afp4MGcCcjp01LlyubrsWPNwY0aNpQaNDCvwh8+bCZmbm7mP1rff2/+I370aM4k60qNG19OKH79NWcydKVbbpE8/jztlpxsJj99+5q9C7K33aSJGU/Dhpfv301NzZnUXql+/cv/FJ44kf9I3/XqSdm3GJ08KeUzBotCQ83EyjDMffLjjznjzE5WQkKkSpXM12fOSL/9lne9tWubZSMjL1/Vz61OyWxfo6OvXU4yk8vscVvOnnW8Qnu1wMDLvSvOnzePmbzcdNPlq+IXLkgHDuRdtlo1KfsJZFlZ0p/jjuWqSpXLz8O+dCnvq9iSOep2rVrma5vN7Fkgmb+bq4+hpk2lbdsu759du/KuN3sshGy7d+f96F9fX7MnQbY9e8y4czuGP/nEPC6z/fKLefIlN97ejmX37XM8cXQlT0/zu5H92Zs3l3buzP24dHe/3NNFMk/+5dVzz2KRwsIuzx85kv/YTVf2vsjvb4RhmF2xt227/Dfr9tsv/30pKtqm4uXM/ZmZafbuGTZMatPmhm4acDkXLlzQwYMHVa9evRuSYMGRzWZTWFiYHnzwQU2ePNnZ4TjNoUOHVL9+fW3evFm33357vmXzO2YL2jaR6BcR/0w5l9Vq/hOdfWX+yungQTNRzf61PP20mexnq1DBTOIbNjSn0aPNJCw3X37peGVw5cr8u3IXJ2duuzBKIs6C1llW9pGzOHv/8P3JX0nESNtUvJy1P7OypAceME9SBwWZJxGLcaBuoNwh0b+xDh8+rFWrVqlDhw7KzMzUzJkzlZCQoB9//FFhV549LycuXryo33//Xc8995w
OHjyo77777prvIdF3Iv6ZKnk2m3klPDuBf+SRy//oXJ28X23LFvPqmGT2sPr558vJfc2aeY94fTXDMK8ub95s3vd7vVfbCsOZ2y6MkoizoHWWlX3kLM7eP3x/8lcSMdI2FS9n7M+LF6UHHzR7v3h7m8n+FY/JBlAEJPo31tGjR9W3b1/99NNPMgxDt912m6ZOnerQjb48Wb9+vTp16qRbbrlFH330kZo2bXrN9xRHos89+ig1vvlG+uyzy4n9/v2O9wS3aWN2x5XM7ryenubP7CvzV061a19+3x135D9IWn4sFmnKFHNwsSlTbmyi4MxtF0ZJxFnQOsvKPnIWZ+8fvj/5Kwsx4sa6eNG85SU7yV+2jCQfQNkTEhJSoKvW5UXHjh2v+/GDRcEV/SLiqknBGYZ5L/m+fTm72S9cePn+13/+07xH/koeHua94w0bmv8IZyf6Fy6YiX4pGv8EAJyOtql43cj9efGi9NBD5sDeXl5mku9CYwkCTsUVfZQ1XNFHqZE9Mvgvv5iJe5Uq5vK5c6WRI/MekOrnny8n+u3amaPeX3llvm7dy4PUXYm/0QAAV/L662aS7+kpffwxST4A4PqQ6KPQjh2T1q/PeXU+e1T5ZcukHj3M1wEBZpJvsZgjeF/dxf7KkYT/8heeDQwAKJ+GD5cSE6XBg6V77nF2NACAso5EHzmkpeVM4p96Smrb1lz/3XfmwHi5CQlxfGZ5167mI65uvpmr8AAAXMlmM0+EWyxmG7l0KWM1AACKB4l+OZXdlT77+eYbN0rPPWcm9bk9az08/HKi36SJ1KFDzqvz9etffg57tsqVLz+jHgAAmKxWadAgKTjYHKMmO+EHAKA4kOg7ydGjuSfU2QIDHUeOL4rMTPMe+Oyr8lcOhpeSIr35pjRsmFnW3d3sMpgtONgxie/Q4fK6pk3NrvsAACBvebX1Vqs0aZL5pBl3d+nhh6VmzW58fAAA10Wi7wSZmeYzk1NT8y4THCwdOmQ+Xic/5887JvCtW0udOpnrtm69fBU+N8eOXX7dpIm0eLGZ1DdocPlKPwAAKLyCtPWS9N57JPlAaXcjLtBdzXKNLj4vvfSSJk6cWOS6ly5dqvvvv79A5Z944gn95z//0aJFi/TXv/61SNvEjUei7wReXubAdCdOmPfnXc3NzbzX3cvLnDeMy935fvtNevnly4n9lcm6JI0YcTnRv+UWqWrV3J8z37ChOVBetooVpQcfLPaPCgBAuXSttl4yx6/p2/fGxgWgcIrzAl1hJCcn218vXrxYEyZM0J49e+zLKlasWHwby8e5c+e0aNEiPf/884qPj3d6op+VlSWv7CQJ+XJzdgDlkcUiTZ6cd8Nvs5lnBu+6y3y83JXPlnd3l/79b2ndustJfuXK5h+ghx4yf2arVk36/Xfpf/8zrxhMmCD16ye1auWY5AMAgOJ1rbZekt5+m/vygdIu+6SdWx5Z09UX6IpLcHCwfQoICJDFYnFYtmjRIoWFhcnHx0eNGzfW22+/bX9vVlaWhg4dqho1asjHx0ehoaGKi4uTJNWtW1eSFBMTI4vFYp/Py4cffqgmTZpo7Nix2rBhg44ePeqwPjMzU2PGjFFISIi8vb3VoEEDzZ07175+586duvfee+Xv769KlSrpjjvu0P79+yVJHTt21IgRIxzqu//++/Xoo4/a5+vWravJkydrwIAB8vf315AhQyRJY8aM0S233CJfX1/dfPPNGj9+vC5evOhQ1/LlyxURESEfHx9Vr15dMTExkqRJkybptttuy/FZW7RoofHjx+e7P8oSrug7yV13mUn51q3mvXpXW7Hi8uuff778OjBQmjhRqlfv8pX5atX4RwEAgNImr7bezc0c5Pauu5wXGwDT2bN5r3N3N5+IMXmy1K1b7mVsNumFF8ynTl05KHVu9fr5XV+s2d5//31NmDBBM2fOVMuWLbVt2zbFxsbKz89PAwcO1JtvvqlPP/1UH3zwgerUqaOjR4/aE/TNmzc
rMDBQCQkJ6tatm9zd3fPd1ty5c/Xwww8rICBAd999t+bNm+eQDA8YMEAbN27Um2++qebNm+vgwYM6efKkJOnXX39V+/bt1bFjR61du1b+/v767rvvdOnSpUJ93ldffVUTJkzQSy+9ZF9WqVIlzZs3TzVr1tSOHTsUGxurSpUq6fnnn5ckrVixQjExMXrhhRf03//+V1lZWfr8888lSY899phefvllbd68WRF/XiXdtm2btm/fro8//rhQsZVqBookLS3NkGSkpaUVuY6VKw3D7JjvOLVrZxjjxhlGfLxhfPONYRw/XoyBAwBcVnG0TbisJNv6lSuLMVAA+Tp//ryxa9cu4/z58znW5fb9zJ66dzfL2GyGERGRf9kOHRzrrV49Z5miSkhIMAICAuzz9evXNxYsWOBQZvLkyUZUVJRhGIYxbNgwo3PnzobNZsu1PknG0qVLr7ndvXv3Gp6ensaJEycMwzCMpUuXGvXq1bPXu2fPHkOSsXr16lzfP27cOKNevXpGVlZWrus7dOhgPPvssw7LevbsaQwcONA+Hxoaatx///3XjHXatGlGeHi4fT4qKsro379/nuXvvvtu46mnnrLPDxs2zOjYseM1t3Oj5HfMFrRtouu+E2Wf6c8+kebubs5v2CBNmWI+dqddO+mmm5wbJwAAKJq82nqu5gNlR/atOKXB2bNntX//fg0ePFgVK1a0T3//+9/tXeIfffRRJSUlqVGjRho+fLhWrVpVpG3Fx8crOjpa1atXlyR1795daWlpWrt2rSQpKSlJ7u7u6nDl47mukJSUpDvuuEOenp5F2n62Vq1a5Vi2ePFi/eUvf1FwcLAqVqyoF198UUeOHHHYdpcuXfKsMzY2VgsXLtSFCxeUlZWlBQsW6LHHHruuOEsbuu47UfYfjeyuQFarOU83fAAAXANtPVC6ZWTkve7KXu133SXdfrv044/m99jdXWreXPr6a/P7fPU9/IcOlUi4yvgz4Dlz5igyMvKqeM2Ab7/9dh08eFBffPGFvvrqKz344IPq2rWrPvroowJvx2q16t1331VKSoo8PDwclsfHx6tLly6qcOW9Crm41no3NzcZhuGw7Or77CXJ76p7HjZu3Kj+/fvr5ZdfVnR0tAICArRo0SK99tprBd72fffdJ29vby1dulReXl66ePGiHnjggXzfU9aQ6DtZ9pn+zZs5ww8AgCuirQdKr4LeN2+xmD1urzxpN2WK+eSq66m3sIKCglSzZk0dOHBA/fv3z7Ocv7+/+vTpoz59+uiBBx5Qt27ddOrUqf9v735jqiz/OI5/Dii4TM5RUfkjljomUshxyGlmDSQ2sybLaY0nSTh7ImaDHrTWA9aD1lbTtRxrM5vZ5qajpmtCf4lqgxxltWT9dVNz8q9GCOgshPv3QDkL+Xd+cDjXde7zfm3OcZ/rXHz1e1/393zPfe77aMGCBZo9e7YGx7pJ2H/U19err69P33///Yjr+FtbW1VeXq6enh7l5ORoaGhIX375pYqLi0fNsWbNGh05ckQDAwNjntVftGjRiG8XGBwcVGtrqzYOf4XYOJqbm3XXXXfpxRdfDG67ePHiqN/d0NCg8vLyMeeYNWuWysrKdPjwYSUkJKi0tHTSNweiDY2+YcMHjb17b/7NO/wAALgLtR5wB1vetHvppZe0d+9eeb1ePfzww/rnn3/07bff6u+//1ZVVZX279+v1NRUrV27VnFxcaqtrVVKSop8Pp+km3eyb2ho0IYNG5SYmKj58+eP+h1vv/22Hn30UeXm5o7Ynp2drcrKSh09elQVFRUqKyvTzp07gzfju3jxorq6uvTEE09oz549OnDggEpLS/XCCy/I6/Xq9OnTCgQCWrVqlYqKilRVVaW6ujqtXLlS+/fvV09Pz6T//szMTP3xxx86duyY8vPzVVdXpxMnTowYU11drYceekgrV65UaWmpbty4ofr6ej3//PPBMbt27dLq1aslSU1NTf9nFuzHNfoWKC6Wfvr
p5t8AAMB9qPVA9Bt+0271arNv2u3atUuHDh3S4cOHlZOTo4KCAr3zzjtavny5pJt3pH/11Ve1bt065efn68KFC6qvr1fcresL9u3bp08//VQZGRlau3btqPk7OztVV1enbdu2jXosLi5OW7duDX6F3ptvvqnt27dr9+7dysrK0tNPP62rt75yYOHChfr888/V39+vgoIC5eXl6a233gqe3d+5c6fKysq0Y8cOFRQUaMWKFZOezZekkpISVVZWas+ePfL7/Wpubh71tXiFhYWqra3VBx98IL/fr6KiIrW0tIwYk5mZqfvvv19ZWVmjLoNwA49z+4URCElvb6+8Xq+uXLmipKQk0+EAAEBtCjP+PwF3uH79us6fP6/ly5drzpw5psOBJRzHUWZmpnbv3q2qqirT4Yww0T4bam3io/sAAAAAgJjx559/6tixY+ro6Bj3Ov5oR6MPAAAAAIgZixcvVnJysg4ePDjmPQrcgEYfAAAAABAzYuHqdW7GBwAAAACAi9DoAwAAAHC9WDiLC3cIx75Kow8AAADAtYa/zu3atWuGIwFCM7yvDu+7U8E1+gAAAABcKz4+Xj6fT11dXZKkO+64Qx6Px3BUwGiO4+jatWvq6uqSz+dTfHz8lOei0QcAAADgaikpKZIUbPYBm/l8vuA+O1U0+gAAAABczePxKDU1VYsXL9bAwIDpcIBxzZ49e1pn8ofR6AMAAACICfHx8WFpogDbcTM+AAAAAABchEYfAAAAAAAXodEHAAAAAMBFuEZ/ihzHkST19vYajgQAgJuGa9JwjcL0UOsBALYJtdbT6E9RX1+fJCkjI8NwJAAAjNTX1yev12s6jKhHrQcA2GqyWu9xeNt/SoaGhtTW1qZ58+bJ4/EoPz9f33zzzahxY22/fVtvb68yMjJ06dIlJSUlzXjstxsv9kjNFepzJhs30ePkZ2ZzE8pY8hP+ecK1diYbQ37szc/t2x3HUV9fn9LS0hQXx9V50zWdWn/7dtNr4fZ4Ij0PtX5ivBYjP+F4DvmJ/FwmXiuHWus5oz9FcXFxWrp0afDn+Pj4MXfssbaPNzYpKcnI4hgvnkjNFepzJhs30ePkZ2ZzE8pY8hP+ecK1diYbQ37szc9Y2zmTHz7TqfXjbTe1FsaLJ1LzUOsnxmsx8hOO55CfyM9l6rVyKLWet/vDpKKiIuTt4401JZzxTGWuUJ8z2biJHic/M5ubUMaSn/DPE661M9kY8mNvfmz7/3a7/zcPtuXHDeuBY9XMzEV+JkZ+phZPpJjMj+nXyhPho/sW6O3tldfr1ZUrV4y9y4/xkR+7kR+7kR/gJtaC3ciP3ciP3ciPnTijb4HExERVV1crMTHRdCgYA/mxG/mxG/kBbmIt2I382I382I382Ikz+gAAAAAAuAhn9AEAAAAAcBEafQAAAAAAXIRGHwAAAAAAF6HRBwAAAADARWj0AQAAAABwERr9KHLp0iUVFhYqOztba9asUW1tremQcJutW7dq/vz52r59u+lQIOnUqVNatWqVMjMzdejQIdPh4DasF2Bs1Hv7cfyyB7XebqwVc/h6vSjS3t6uzs5O+f1+dXR0KC8vT7/99pvmzp1rOjTc8sUXX6ivr09HjhzRe++9ZzqcmHbjxg1lZ2ersbFRXq9XeXl5am5u1sKFC02HhltYL8DYqPf24/hlB2q9/Vgr5nBGP4qkpqbK7/dLklJSUpScnKzu7m6zQWGEwsJCzZs3z3QYkNTS0qJ77rlH6enpuvPOO7V582Z98sknpsPCf7BegLFR7+3H8csO1Hr7sVbModEPo6+++kpbtmxRWlqaPB6PTp48OWpMTU2N7r77bs2ZM0f33XefWlpapvS7zpw5o8HBQWVkZEwz6tgRyfxg+qabr7a2NqWnpwd/Tk9P1+XLlyMRekxgPSGWUe/txvEpelDr7cZaim40+mF09epV5ebmqqamZszHjx8/rqqqKlVXV+u7775Tbm6uNm3apK6uruAYv9+ve++
9d9Sftra24Jju7m7t2LFDBw8enPF/k5tEKj8Ij3DkCzOH/CCWUe/tRr2PHtQSu5GfKOdgRkhyTpw4MWJbIBBwKioqgj8PDg46aWlpziuvvBLyvNevX3cefPBB59133w1XqDFppvLjOI7T2NjobNu2LRxh4pap5Kupqcl57LHHgo8/++yzztGjRyMSb6yZznpivSDaUe/tRr2PHtR6u1Hrow9n9CPk33//1ZkzZ1RcXBzcFhcXp+LiYn399dchzeE4jp566ikVFRXpySefnKlQY1I48oPICSVfgUBAra2tunz5svr7+/Xhhx9q06ZNpkKOKawnxDLqvd04PkUPar3dWEv2o9GPkL/++kuDg4NasmTJiO1LlixRR0dHSHM0NTXp+PHjOnnypPx+v/x+v86ePTsT4caccORHkoqLi/X444+rvr5eS5cu5UA3Q0LJ16xZs7Rv3z5t3LhRfr9fzz33HHfhjZBQ1xPrBW5Evbcb9T56UOvtRq233yzTASB0DzzwgIaGhkyHgQl89tlnpkPAf5SUlKikpMR0GBgH6wUYG/Xefhy/7EGttxtrxRzO6EdIcnKy4uPj1dnZOWJ7Z2enUlJSDEWFYeQnupAvu5EfxDL2f7uRn+hBruxGfuxHox8hCQkJysvLU0NDQ3Db0NCQGhoatH79eoORQSI/0YZ82Y38IJax/9uN/EQPcmU38mM/ProfRv39/Tp37lzw5/Pnz+uHH37QggULtGzZMlVVVamsrEzr1q1TIBDQ66+/rqtXr6q8vNxg1LGD/EQX8mU38oNYxv5vN/ITPciV3chPlDN92383aWxsdCSN+lNWVhYcc+DAAWfZsmVOQkKCEwgEnNOnT5sLOMaQn+hCvuxGfhDL2P/tRn6iB7myG/mJbh7HcZyZexsBAAAAAABEEtfoAwAAAADgIjT6AAAAAAC4CI0+AAAAAAAuQqMPAAAAAICL0OgDAAAAAOAiNPoAAAAAALgIjT4AAAAAAC5Cow8AAAAAgIvQ6AMAAAAA4CI0+gAAAAAAuAiNPgBjOjo69Mwzz2jFihVKTExURkaGtmzZooaGBtOhAQCAMKDWA2bMMh0AgNh04cIFbdiwQT6fT6+99ppycnI0MDCgjz/+WBUVFfrll19MhwgAAKaBWg+Y43EcxzEdBIDY88gjj+jHH3/Ur7/+qrlz5454rKenRz6fz0xgAAAgLKj1gDl8dB9AxHV3d+ujjz5SRUXFqMIvicIPAECUo9YDZtHoA4i4c+fOyXEcZWVlmQ4FAADMAGo9YBaNPoCI44ohAADcjVoPmEWjDyDiMjMz5fF4uAkPAAAuRa0HzOJmfACM2Lx5s86ePcsNegAAcClqPWAOZ/QBGFFTU6PBwUEFAgG9//77+v333/Xzzz/rjTfe0Pr1602HBwAApolaD5jDGX0AxrS3t+vll1/WqVOn1N7erkWLFikvL0+VlZUqLCw0HR4AAJgmaj1gBo0+AAAAAAAuwkf3AQAAAABwERp9AAAAAABchEYfAAAAAAAXodEHAAAAAMBFaPQBAAAAAHARGn0AAAAAAFyERh8AAAAAABeh0QcAAAAAwEVo9AEAAAAAcBEafQAAAAAAXIRGHwAAAAAAF/kfMblrmwTVRSUAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn import linear_model\n", + "from sklearn.svm import SVC\n", + "\n", + "C = [0.01, 0.1, 0.2, 0.5, 0.8, 1, 5, 10, 20, 50]\n", + "LRtrainAcc = []\n", + "LRtestAcc = []\n", + "SVMtrainAcc = []\n", + "SVMtestAcc = []\n", + "\n", + "for param in C:\n", + " clf = linear_model.LogisticRegression(C=param)\n", + " clf.fit(X_train, Y_train)\n", + " Y_predTrain = clf.predict(X_train)\n", + " Y_predTest = clf.predict(X_test)\n", + " LRtrainAcc.append(accuracy_score(Y_train, Y_predTrain))\n", + " LRtestAcc.append(accuracy_score(Y_test, Y_predTest))\n", + "\n", + " clf = SVC(C=param,kernel='linear')\n", + " clf.fit(X_train, Y_train)\n", + " Y_predTrain = clf.predict(X_train)\n", + " Y_predTest = clf.predict(X_test)\n", + " SVMtrainAcc.append(accuracy_score(Y_train, Y_predTrain))\n", + " SVMtestAcc.append(accuracy_score(Y_test, Y_predTest))\n", + "\n", + "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12,6))\n", + "ax1.plot(C, LRtrainAcc, 'ro-', C, LRtestAcc,'bv--')\n", + "ax1.legend(['Training Accuracy','Test Accuracy'])\n", + "ax1.set_xlabel('C')\n", + "ax1.set_xscale('log')\n", + "ax1.set_ylabel('Accuracy')\n", + "\n", + "ax2.plot(C, SVMtrainAcc, 'ro-', C, SVMtestAcc,'bv--')\n", + "ax2.legend(['Training Accuracy','Test Accuracy'])\n", + "ax2.set_xlabel('C')\n", + "ax2.set_xscale('log')\n", + "ax2.set_ylabel('Accuracy')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that linear classifiers perform poorly on the data since the true decision boundaries between classes are nonlinear for the given 2-dimensional dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.4.3 Nonlinear Support Vector Machine\n", + "\n", + "The code below shows an example of using nonlinear support vector machine with a Gaussian radial basis function kernel to fit the 2-dimensional dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'Accuracy')" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.svm import SVC\n", + "\n", + "C = [0.01, 0.1, 0.2, 0.5, 0.8, 1, 5, 10, 20, 50]\n", + "SVMtrainAcc = []\n", + "SVMtestAcc = []\n", + "\n", + "for param in C:\n", + " clf = SVC(C=param,kernel='rbf',gamma='auto')\n", + " clf.fit(X_train, Y_train)\n", + " Y_predTrain = clf.predict(X_train)\n", + " Y_predTest = clf.predict(X_test)\n", + " SVMtrainAcc.append(accuracy_score(Y_train, Y_predTrain))\n", + " SVMtestAcc.append(accuracy_score(Y_test, Y_predTest))\n", + "\n", + "plt.plot(C, SVMtrainAcc, 'ro-', C, SVMtestAcc,'bv--')\n", + "plt.legend(['Training Accuracy','Test Accuracy'])\n", + "plt.xlabel('C')\n", + "plt.xscale('log')\n", + "plt.ylabel('Accuracy')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Observe that the nonlinear SVM can achieve a higher test accuracy compared to linear SVM." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.4.4 Ensemble Methods\n", + "\n", + "An ensemble classifier constructs a set of base classifiers from the training data and performs classification by taking a vote on the predictions made by each base classifier. We consider 3 types of ensemble classifiers in this example: bagging, boosting, and random forest. Detailed explanation about these classifiers can be found in Section 4.10 of the book.\n", + "\n", + "In the example below, we fit 500 base classifiers to the 2-dimensional dataset using each ensemble method. The base classifier corresponds to a decision tree with maximum depth equals to 10." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/tomg/Source/courses/tools4ds/DS701-Course-Notes/.venv/lib/python3.12/site-packages/sklearn/ensemble/_weight_boosting.py:527: FutureWarning: The SAMME.R algorithm (the default) is deprecated and will be removed in 1.6. 
Use the SAMME algorithm to circumvent this warning.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": [ + "[Text(1.5, 0, 'Random Forest'),\n", + " Text(2.5, 0, 'Bagging'),\n", + " Text(3.5, 0, 'AdaBoost')]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn import ensemble\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "\n", + "numBaseClassifiers = 500\n", + "maxdepth = 10\n", + "trainAcc = []\n", + "testAcc = []\n", + "\n", + "clf = ensemble.RandomForestClassifier(n_estimators=numBaseClassifiers)\n", + "clf.fit(X_train, Y_train)\n", + "Y_predTrain = clf.predict(X_train)\n", + "Y_predTest = clf.predict(X_test)\n", + "trainAcc.append(accuracy_score(Y_train, Y_predTrain))\n", + "testAcc.append(accuracy_score(Y_test, Y_predTest))\n", + "\n", + "clf = ensemble.BaggingClassifier(DecisionTreeClassifier(max_depth=maxdepth),n_estimators=numBaseClassifiers)\n", + "clf.fit(X_train, Y_train)\n", + "Y_predTrain = clf.predict(X_train)\n", + "Y_predTest = clf.predict(X_test)\n", + "trainAcc.append(accuracy_score(Y_train, Y_predTrain))\n", + "testAcc.append(accuracy_score(Y_test, Y_predTest))\n", + "\n", + "clf = ensemble.AdaBoostClassifier(DecisionTreeClassifier(max_depth=maxdepth),n_estimators=numBaseClassifiers)\n", + "clf.fit(X_train, Y_train)\n", + "Y_predTrain = clf.predict(X_train)\n", + "Y_predTest = clf.predict(X_test)\n", + "trainAcc.append(accuracy_score(Y_train, Y_predTrain))\n", + "testAcc.append(accuracy_score(Y_test, Y_predTest))\n", + "\n", + "methods = ['Random Forest', 'Bagging', 'AdaBoost']\n", + "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12,6))\n", + "ax1.bar([1.5,2.5,3.5], trainAcc)\n", + "ax1.set_xticks([1.5,2.5,3.5])\n", + "ax1.set_xticklabels(methods)\n", + "ax2.bar([1.5,2.5,3.5], testAcc)\n", + "ax2.set_xticks([1.5,2.5,3.5])\n", + "ax2.set_xticklabels(methods)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.5 Summary\n", + "\n", + "This section provides several examples of using Python sklearn library to build classification models from a given input data. 
We also illustrate the problem of model overfitting and show how to apply different classification methods to the given dataset." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/ds701_book/iris b/ds701_book/iris new file mode 100644 index 00000000..864ab2b1 --- /dev/null +++ b/ds701_book/iris @@ -0,0 +1,37 @@ +digraph Tree { +node [shape=box, fontname="helvetica"] ; +edge [fontname="helvetica"] ; +0 [label="x[2] <= 2.45\ngini = 0.667\nsamples = 150\nvalue = [50, 50, 50]"] ; +1 [label="gini = 0.0\nsamples = 50\nvalue = [50, 0, 0]"] ; +0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ; +2 [label="x[3] <= 1.75\ngini = 0.5\nsamples = 100\nvalue = [0, 50, 50]"] ; +0 -> 2 [labeldistance=2.5, labelangle=-45, headlabel="False"] ; +3 [label="x[2] <= 4.95\ngini = 0.168\nsamples = 54\nvalue = [0, 49, 5]"] ; +2 -> 3 ; +4 [label="x[3] <= 1.65\ngini = 0.041\nsamples = 48\nvalue = [0, 47, 1]"] ; +3 -> 4 ; +5 [label="gini = 0.0\nsamples = 47\nvalue = [0, 47, 0]"] ; +4 -> 5 ; +6 [label="gini = 0.0\nsamples = 1\nvalue = [0, 0, 1]"] ; +4 -> 6 ; +7 [label="x[3] <= 1.55\ngini = 0.444\nsamples = 6\nvalue = [0, 2, 4]"] ; +3 -> 7 ; +8 [label="gini = 0.0\nsamples = 3\nvalue = [0, 0, 3]"] ; +7 -> 8 ; +9 [label="x[2] <= 5.45\ngini = 0.444\nsamples = 3\nvalue = [0, 2, 1]"] ; +7 -> 9 ; +10 [label="gini = 0.0\nsamples = 2\nvalue = [0, 2, 0]"] ; +9 -> 10 ; +11 [label="gini = 0.0\nsamples = 1\nvalue = [0, 0, 1]"] ; +9 -> 11 ; +12 [label="x[2] <= 4.85\ngini = 0.043\nsamples = 46\nvalue = [0, 1, 45]"] ; +2 -> 12 ; +13 [label="x[1] <= 3.1\ngini = 0.444\nsamples = 3\nvalue = [0, 1, 2]"] ; +12 -> 13 ; +14 
[label="gini = 0.0\nsamples = 2\nvalue = [0, 0, 2]"] ; +13 -> 14 ; +15 [label="gini = 0.0\nsamples = 1\nvalue = [0, 1, 0]"] ; +13 -> 15 ; +16 [label="gini = 0.0\nsamples = 43\nvalue = [0, 0, 43]"] ; +12 -> 16 ; +} diff --git a/ds701_book/iris.png b/ds701_book/iris.png new file mode 100644 index 00000000..9141bc29 Binary files /dev/null and b/ds701_book/iris.png differ diff --git a/ds701_book/nvidia_data.csv b/ds701_book/nvidia_data.csv new file mode 100644 index 00000000..d862bc45 --- /dev/null +++ b/ds701_book/nvidia_data.csv @@ -0,0 +1,251 @@ +Date,Open,High,Low,Close,Adj Close,Volume +2023-01-03,14.85099983215332,14.996000289916992,14.095999717712402,14.3149995803833,14.305581092834473,401277000 +2023-01-04,14.567000389099121,14.852999687194824,14.241000175476074,14.74899959564209,14.739295959472656,431324000 +2023-01-05,14.491000175476074,14.564000129699707,14.14799976348877,14.265000343322754,14.255613327026367,389168000 +2023-01-06,14.473999977111816,15.010000228881836,14.034000396728516,14.859000205993652,14.849224090576172,405044000 +2023-01-09,15.284000396728516,16.055999755859375,15.140999794006348,15.628000259399414,15.617716789245605,504231000 +2023-01-10,15.506999969482422,15.961999893188477,15.472000122070312,15.909000396728516,15.89853286743164,384101000 +2023-01-11,15.84000015258789,16.027999877929688,15.562999725341797,16.000999450683594,15.990469932556152,353285000 +2023-01-12,16.100000381469727,16.636999130249023,15.491999626159668,16.51099967956543,16.50013542175293,551409000 +2023-01-13,16.277999877929688,16.922000885009766,16.165000915527344,16.89900016784668,16.88787841796875,447287000 +2023-01-17,16.89900016784668,17.72800064086914,16.89900016784668,17.70199966430664,17.690353393554688,511102000 +2023-01-18,17.66699981689453,17.87299919128418,17.281999588012695,17.37700080871582,17.365568161010742,439624000 +2023-01-19,17.035999298095703,17.19700050354004,16.731000900268555,16.764999389648438,16.75396728515625,452932000 
+2023-01-20,17.01099967956543,17.856000900268555,16.825000762939453,17.839000701904297,17.82726287841797,564967000 +2023-01-23,18.06399917602539,19.2450008392334,17.81800079345703,19.19300079345703,19.180374145507812,655163000 +2023-01-24,18.82699966430664,19.4950008392334,18.81999969482422,19.264999389648438,19.252323150634766,496204000 +2023-01-25,18.913000106811523,19.3700008392334,18.579999923706055,19.322999954223633,19.310287475585938,449537000 +2023-01-26,19.701000213623047,20.166000366210938,19.277999877929688,19.802000045776367,19.788972854614258,489535000 +2023-01-27,19.461999893188477,20.628000259399414,19.405000686645508,20.364999771118164,20.351600646972656,542142000 +2023-01-30,19.950000762939453,20.139999389648438,19.149999618530273,19.16200065612793,19.14939308166504,488611000 +2023-01-31,19.170000076293945,19.687000274658203,18.950000762939453,19.53700065612793,19.524145126342773,498017000 +2023-02-01,19.69099998474121,21.191999435424805,19.611000061035156,20.94300079345703,20.92922019958496,660477000 +2023-02-02,21.0,21.948999404907227,20.700000762939453,21.708999633789062,21.69471549987793,564276000 +2023-02-03,21.0,21.7450008392334,20.788999557495117,21.100000381469727,21.086116790771484,429366000 +2023-02-06,20.805999755859375,21.56999969482422,20.785999298095703,21.089000701904297,21.075122833251953,452197000 +2023-02-07,21.381999969482422,22.259000778198242,21.149999618530273,22.17300033569336,22.158411026000977,664150000 +2023-02-08,22.386999130249023,22.878000259399414,22.007999420166016,22.204999923706055,22.190387725830078,559651000 +2023-02-09,22.606000900268555,23.020000457763672,22.1200008392334,22.336999893188477,22.322301864624023,523876000 +2023-02-10,21.690000534057617,22.077999114990234,20.81100082397461,21.264999389648438,21.251008987426758,550737000 +2023-02-13,21.538000106811523,22.04800033569336,20.961999893188477,21.788000106811523,21.773662567138672,474919000 
+2023-02-14,21.577999114990234,23.048999786376953,21.365999221801758,22.97100067138672,22.955886840820312,675474000 +2023-02-15,22.548999786376953,22.854999542236328,22.106000900268555,22.763999938964844,22.749019622802734,420575000 +2023-02-16,22.132999420166016,22.549999237060547,21.927000045776367,22.00200080871582,21.987524032592773,412026000 +2023-02-17,21.631000518798828,21.739999771118164,20.975000381469727,21.38800048828125,21.37392807006836,465888000 +2023-02-21,21.0,21.493999481201172,20.618000030517578,20.655000686645508,20.641408920288086,410015000 +2023-02-22,20.707000732421875,21.104000091552734,20.42099952697754,20.753999710083008,20.740346908569336,513184000 +2023-02-23,23.440000534057617,23.88800048828125,23.024999618530273,23.663999557495117,23.648427963256836,1117995000 +2023-02-24,23.225000381469727,23.474000930786133,22.94700050354004,23.285999298095703,23.27067756652832,589716000 +2023-02-27,23.670000076293945,23.8799991607666,23.45400047302246,23.500999450683594,23.48553466796875,452994000 +2023-02-28,23.371999740600586,23.825000762939453,23.20599937438965,23.215999603271484,23.20072364807129,455963000 +2023-03-01,23.191999435424805,23.256000518798828,22.507999420166016,22.697999954223633,22.68306541442871,460026000 +2023-03-02,22.488000869750977,23.3799991607666,22.43199920654297,23.31399917602539,23.29865837097168,389415000 +2023-03-03,23.31999969482422,23.899999618530273,23.1299991607666,23.889999389648438,23.87428092956543,412393000 +2023-03-06,23.891000747680664,24.24799919128418,23.48699951171875,23.554000854492188,23.538503646850586,437429000 +2023-03-07,23.600000381469727,24.125,23.240999221801758,23.288000106811523,23.276628494262695,515154000 +2023-03-08,23.48699951171875,24.200000762939453,23.423999786376953,24.180999755859375,24.169193267822266,513573000 +2023-03-09,24.174999237060547,24.45400047302246,23.382999420166016,23.43600082397461,23.424558639526367,501257000 
+2023-03-10,23.409000396728516,23.62700080871582,22.72599983215332,22.96500015258789,22.953784942626953,474866000 +2023-03-13,22.75200080871582,23.29800033569336,22.297000885009766,22.965999603271484,22.954784393310547,421890000 +2023-03-14,23.496000289916992,24.2189998626709,23.459999084472656,24.062999725341797,24.051250457763672,474910000 +2023-03-15,23.76099967956543,24.285999298095703,23.360000610351562,24.22800064086914,24.21617317199707,524486000 +2023-03-16,24.027000427246094,25.58799934387207,23.893999099731445,25.541000366210938,25.528528213500977,583253000 +2023-03-17,25.98200035095215,26.39900016784668,25.667999267578125,25.725000381469727,25.712440490722656,848547000 +2023-03-20,25.614999771118164,26.02400016784668,25.1299991607666,25.899999618530273,25.88735580444336,432747000 +2023-03-21,26.18000030517578,26.392000198364258,25.381000518798828,26.198999404907227,26.186208724975586,547408000 +2023-03-22,26.424999237060547,27.589000701904297,26.23699951171875,26.468000411987305,26.455076217651367,797295000 +2023-03-23,27.114999771118164,27.499000549316406,26.690000534057617,27.19099998474121,27.177722930908203,564889000 +2023-03-24,27.0310001373291,27.16699981689453,26.354999542236328,26.77899932861328,26.76592445373535,454920000 +2023-03-27,26.836999893188477,27.0,26.364999771118164,26.5310001373291,26.518049240112305,361026000 +2023-03-28,26.44700050354004,26.51300048828125,25.850000381469727,26.40999984741211,26.397106170654297,356104000 +2023-03-29,26.825000762939453,27.077999114990234,26.597000122070312,26.983999252319336,26.970823287963867,393694000 +2023-03-30,27.229000091552734,27.499000549316406,27.101999282836914,27.382999420166016,27.36962890625,364516000 +2023-03-31,27.139999389648438,27.833999633789062,27.104999542236328,27.777000427246094,27.763439178466797,433933000 +2023-04-03,27.509000778198242,28.0,27.336000442504883,27.96500015258789,27.951345443725586,398716000 
+2023-04-04,27.965999603271484,28.0,27.30699920654297,27.452999114990234,27.439594268798828,368592000 +2023-04-05,26.82900047302246,26.99799919128418,26.395000457763672,26.881000518798828,26.867876052856445,515015000 +2023-04-06,26.583999633789062,27.079999923706055,26.427000045776367,27.03700065612793,27.0237979888916,397654000 +2023-04-10,26.822999954223633,27.621000289916992,26.66900062561035,27.57900047302246,27.565536499023438,395279000 +2023-04-11,27.724000930786133,27.790000915527344,27.125999450683594,27.16900062561035,27.15573501586914,314378000 +2023-04-12,27.3700008392334,27.468000411987305,26.447999954223633,26.4950008392334,26.482065200805664,446259000 +2023-04-13,26.733999252319336,26.886999130249023,26.32900047302246,26.46299934387207,26.4500789642334,353615000 +2023-04-14,26.503999710083008,26.882999420166016,26.219999313354492,26.757999420166016,26.74493408203125,395660000 +2023-04-17,26.565000534057617,27.006000518798828,26.433000564575195,27.00200080871582,26.98881721496582,321471000 +2023-04-18,27.533000946044922,28.110000610351562,27.35700035095215,27.66699981689453,27.653491973876953,604812000 +2023-04-19,27.361000061035156,28.0,27.23200035095215,27.930999755859375,27.9173641204834,358660000 +2023-04-20,27.67099952697754,28.030000686645508,27.0,27.104000091552734,27.090768814086914,427713000 +2023-04-21,26.95199966430664,27.183000564575195,26.722000122070312,27.118999481201172,27.105758666992188,345103000 +2023-04-24,27.01300048828125,27.365999221801758,26.67099952697754,27.04199981689453,27.028797149658203,339282000 +2023-04-25,27.077999114990234,27.246999740600586,26.225000381469727,26.240999221801758,26.228187561035156,378527000 +2023-04-26,27.00200080871582,27.329999923706055,26.704999923706055,26.95599937438965,26.942838668823242,406337000 +2023-04-27,27.363000869750977,27.4950008392334,26.625,27.22599983215332,27.21270751953125,377161000 
+2023-04-28,27.225000381469727,27.757999420166016,27.070999145507812,27.749000549316406,27.73545265197754,292216000 +2023-05-01,27.84000015258789,29.058000564575195,27.780000686645508,28.90999984741211,28.895885467529297,570329000 +2023-05-02,28.68000030517578,28.836000442504883,28.08300018310547,28.209999084472656,28.196226119995117,402730000 +2023-05-03,27.84000015258789,28.367000579833984,27.472000122070312,27.802000045776367,27.78842544555664,383387000 +2023-05-04,27.650999069213867,27.858999252319336,27.239999771118164,27.562000274658203,27.548542022705078,321850000 +2023-05-05,27.826000213623047,28.7549991607666,27.731000900268555,28.68000030517578,28.665998458862305,361494000 +2023-05-08,28.52199935913086,29.219999313354492,28.350000381469727,29.150999069213867,29.13676643371582,344979000 +2023-05-09,28.89900016784668,28.972999572753906,28.450000762939453,28.570999145507812,28.557048797607422,314869000 +2023-05-10,29.0049991607666,29.113000869750977,28.461000442504883,28.885000228881836,28.870899200439453,369350000 +2023-05-11,28.895999908447266,28.950000762939453,28.246000289916992,28.577999114990234,28.56404685974121,322149000 +2023-05-12,28.52899932861328,28.780000686645508,28.04599952697754,28.34000015258789,28.326162338256836,274222000 +2023-05-15,28.507999420166016,28.9689998626709,28.152000427246094,28.952999114990234,28.93886375427246,293926000 +2023-05-16,28.840999603271484,29.8700008392334,28.832000732421875,29.21299934387207,29.198734283447266,449405000 +2023-05-17,29.583999633789062,30.198999404907227,29.43000030517578,30.17799949645996,30.163267135620117,432535000 +2023-05-18,30.40999984741211,31.827999114990234,30.31999969482422,31.67799949645996,31.66253089904785,748725000 +2023-05-19,31.535999298095703,31.579999923706055,30.916000366210938,31.263999938964844,31.248735427856445,473908000 +2023-05-22,30.900999069213867,31.520000457763672,30.68000030517578,31.176000595092773,31.16077995300293,372000000 
+2023-05-23,31.0,31.288000106811523,30.631000518798828,30.687999725341797,30.673015594482422,356253000 +2023-05-24,30.209999084472656,30.60700035095215,29.805999755859375,30.538000106811523,30.523088455200195,721419000 +2023-05-25,38.52299880981445,39.47999954223633,36.6349983215332,37.97999954223633,37.96146011352539,1543911000 +2023-05-26,37.88999938964844,39.16999816894531,37.54999923706055,38.94599914550781,38.92698669433594,714397000 +2023-05-30,40.595001220703125,41.9379997253418,39.94900131225586,40.111000061035156,40.0914192199707,923401000 +2023-05-31,39.487998962402344,40.29499816894531,37.821998596191406,37.83399963378906,37.815528869628906,1002580000 +2023-06-01,38.48899841308594,40.04999923706055,38.34000015258789,39.77000045776367,39.750579833984375,635873000 +2023-06-02,40.09700012207031,40.5,39.05799865722656,39.32699966430664,39.307796478271484,482731000 +2023-06-05,38.909000396728516,39.564998626708984,38.707000732421875,39.17100143432617,39.15187454223633,396094000 +2023-06-06,38.83000183105469,39.15999984741211,38.14799880981445,38.65399932861328,38.6351318359375,388729000 +2023-06-07,38.915000915527344,39.499000549316406,37.35599899291992,37.474998474121094,37.46057891845703,511998000 +2023-06-08,37.7239990234375,38.86399841308594,37.505001068115234,38.5099983215332,38.49517822265625,417772000 +2023-06-09,39.0369987487793,39.71099853515625,38.56700134277344,38.77000045776367,38.75508117675781,427717000 +2023-06-12,39.20000076293945,39.529998779296875,38.61800003051758,39.481998443603516,39.46680450439453,388701000 +2023-06-13,40.18600082397461,41.10100173950195,39.7400016784668,41.02199935913086,41.0062141418457,613208000 +2023-06-14,40.82400131225586,43.0,40.551998138427734,42.99700164794922,42.98045349121094,740465000 +2023-06-15,42.60200119018555,43.28900146484375,42.14699935913086,42.65299987792969,42.63658905029297,568622000 +2023-06-16,43.45000076293945,43.72100067138672,42.6609992980957,42.69200134277344,42.67557144165039,655709000 
+2023-06-20,42.99800109863281,43.9900016784668,42.67399978637695,43.80799865722656,43.791141510009766,451153000 +2023-06-21,43.500999450683594,43.6150016784668,42.08000183105469,43.04499816894531,43.0284309387207,551603000 +2023-06-22,42.25299835205078,43.42599868774414,42.23400115966797,43.025001525878906,43.008445739746094,417737000 +2023-06-23,42.4640007019043,42.808998107910156,42.01499938964844,42.20899963378906,42.1927604675293,358140000 +2023-06-26,42.46099853515625,42.763999938964844,40.099998474121094,40.63199996948242,40.616363525390625,594322000 +2023-06-27,40.79899978637695,41.939998626708984,40.448001861572266,41.875999450683594,41.859886169433594,462175000 +2023-06-28,40.65999984741211,41.845001220703125,40.518001556396484,41.117000579833984,41.10117721557617,582639000 +2023-06-29,41.55799865722656,41.599998474121094,40.599998474121094,40.821998596191406,40.8062858581543,380514000 +2023-06-30,41.68000030517578,42.54999923706055,41.500999450683594,42.301998138427734,42.28572082519531,501148000 +2023-07-03,42.516998291015625,42.89799880981445,42.20199966430664,42.41299819946289,42.39667510986328,198209000 +2023-07-05,42.1349983215332,43.176998138427734,42.084999084472656,42.31700134277344,42.300716400146484,323618000 +2023-07-06,41.84400177001953,42.17900085449219,41.34600067138672,42.10300064086914,42.08679962158203,303582000 +2023-07-07,42.321998596191406,43.2140007019043,42.18000030517578,42.50299835205078,42.48664093017578,355881000 +2023-07-10,42.65700149536133,42.81100082397461,41.64899826049805,42.18000030517578,42.16376495361328,353908000 +2023-07-11,42.48099899291992,42.757999420166016,42.06700134277344,42.404998779296875,42.38867950439453,298244000 +2023-07-12,43.03300094604492,43.935001373291016,42.777000427246094,43.902000427246094,43.88510513305664,481277000 +2023-07-13,44.518001556396484,46.154998779296875,44.492000579833984,45.97700119018555,45.95930862426758,478204000 
+2023-07-14,46.58300018310547,48.0880012512207,45.060001373291016,45.46900177001953,45.45150375366211,772075000 +2023-07-17,46.28900146484375,46.49599838256836,45.262001037597656,46.46099853515625,46.44312286376953,510488000 +2023-07-18,46.70100021362305,47.895999908447266,45.73400115966797,47.49399948120117,47.47572326660156,569164000 +2023-07-19,47.4640007019043,47.81800079345703,46.742000579833984,47.07699966430664,47.05888366699219,427502000 +2023-07-20,46.50699996948242,47.08700180053711,45.0620002746582,45.52000045776367,45.502479553222656,537865000 +2023-07-21,45.78799819946289,45.86600112915039,44.099998474121094,44.308998107910156,44.29194641113281,963769000 +2023-07-24,44.73099899291992,45.10900115966797,44.040000915527344,44.61199951171875,44.59483337402344,382516000 +2023-07-25,44.941001892089844,46.18299865722656,44.92300033569336,45.67900085449219,45.66142654418945,348081000 +2023-07-26,46.020999908447266,46.053001403808594,44.630001068115234,45.45199966430664,45.434513092041016,364237000 +2023-07-27,46.51900100708008,47.39500045776367,45.75,45.900001525878906,45.88233947753906,455976000 +2023-07-28,46.667999267578125,47.027000427246094,46.38100051879883,46.75,46.73200988769531,331194000 +2023-07-31,46.75400161743164,47.130001068115234,46.50600051879883,46.729000091552734,46.71101379394531,251055000 +2023-08-01,46.459999084472656,46.900001525878906,46.027000427246094,46.50699996948242,46.48910140991211,237858000 +2023-08-02,45.83100128173828,45.84000015258789,43.387001037597656,44.26900100708008,44.25196838378906,530142000 +2023-08-03,43.79999923706055,45.11800003051758,43.79999923706055,44.51499938964844,44.49787139892578,324176000 +2023-08-04,44.98500061035156,45.641998291015625,44.393001556396484,44.68000030517578,44.66280746459961,362666000 +2023-08-07,45.111000061035156,45.540000915527344,44.5629997253418,45.41699981689453,45.3995246887207,322154000 
+2023-08-08,44.85300064086914,45.242000579833984,44.055999755859375,44.66400146484375,44.646820068359375,353843000 +2023-08-09,44.27399826049805,44.3120002746582,42.13399887084961,42.55400085449219,42.537628173828125,586449000 +2023-08-10,42.15999984741211,43.57400131225586,41.834999084472656,42.38800048828125,42.3716926574707,492705000 +2023-08-11,41.750999450683594,42.018001556396484,40.638999938964844,40.85499954223633,40.83928298950195,533085000 +2023-08-14,40.486000061035156,43.79999923706055,40.31100082397461,43.75299835205078,43.73616027832031,690286000 +2023-08-15,44.560001373291016,45.268001556396484,43.709999084472656,43.939998626708984,43.923091888427734,676512000 +2023-08-16,44.52000045776367,44.67499923706055,43.40599822998047,43.486000061035156,43.46926498413086,527451000 +2023-08-17,43.970001220703125,44.0620002746582,43.000999450683594,43.34400177001953,43.32732009887695,452395000 +2023-08-18,42.6349983215332,43.577999114990234,41.65999984741211,43.29899978637695,43.2823371887207,583768000 +2023-08-21,44.49399948120117,47.064998626708984,44.22200012207031,46.96699905395508,46.94892883300781,692573000 +2023-08-22,48.1349983215332,48.1870002746582,45.33300018310547,45.667999267578125,45.65042495727539,755293000 +2023-08-23,45.86600112915039,47.20000076293945,45.20800018310547,47.11600112915039,47.09787368774414,779046000 +2023-08-24,50.215999603271484,50.26599884033203,47.159000396728516,47.16299819946289,47.14485168457031,1156044000 +2023-08-25,47.012001037597656,47.80500030517578,45.02399826049805,46.018001556396484,46.00028991699219,925341000 +2023-08-28,46.481998443603516,46.97999954223633,44.88800048828125,46.834999084472656,46.816978454589844,685192000 +2023-08-29,46.66600036621094,49.08100128173828,46.39099884033203,48.784000396728516,48.765228271484375,701397000 +2023-08-30,49.04399871826172,49.926998138427734,48.42499923706055,49.263999938964844,49.24504470825195,735206000 
+2023-08-31,49.380001068115234,49.74399948120117,48.95800018310547,49.35499954223633,49.33600997924805,528570000 +2023-09-01,49.762001037597656,49.79999923706055,48.141998291015625,48.50899887084961,48.490333557128906,463830000 +2023-09-05,48.222999572753906,48.85100173950195,47.86000061035156,48.54800033569336,48.529319763183594,382653000 +2023-09-06,48.441001892089844,48.54899978637695,46.58000183105469,47.06100082397461,47.04676818847656,468670000 +2023-09-07,45.525001525878906,46.34400177001953,45.152000427246094,46.24100112915039,46.22701644897461,433330000 +2023-09-08,45.94200134277344,46.60599899291992,45.270999908447266,45.571998596191406,45.5582160949707,473069000 +2023-09-11,46.14799880981445,46.16299819946289,44.3120002746582,45.178001403808594,45.16433334350586,473966000 +2023-09-12,44.737998962402344,45.67300033569336,44.53099822998047,44.869998931884766,44.85642623901367,349256000 +2023-09-13,44.599998474121094,45.93000030517578,44.50299835205078,45.48500061035156,45.47124481201172,398355000 +2023-09-14,45.95000076293945,45.98699951171875,45.13100051879883,45.58100128173828,45.56721496582031,370097000 +2023-09-15,45.340999603271484,45.5989990234375,43.80799865722656,43.900001525878906,43.886722564697266,506831000 +2023-09-18,42.74800109863281,44.242000579833984,42.0,43.965999603271484,43.95269775390625,500271000 +2023-09-19,43.83300018310547,43.965999603271484,43.00199890136719,43.52000045776367,43.5068359375,373064000 +2023-09-20,43.599998474121094,43.90299987792969,42.222999572753906,42.23899841308594,42.226226806640625,367108000 +2023-09-21,41.58300018310547,42.099998474121094,40.97999954223633,41.016998291015625,41.00459289550781,450736000 +2023-09-22,41.571998596191406,42.1150016784668,41.23099899291992,41.61000061035156,41.597412109375,479236000 +2023-09-25,41.590999603271484,42.5359992980957,41.176998138427734,42.22200012207031,42.209228515625,419091000 
+2023-09-26,42.000999450683594,42.81999969482422,41.654998779296875,41.9109992980957,41.89832305908203,402282000 +2023-09-27,42.33000183105469,42.87200164794922,41.62900161743164,42.46799850463867,42.45515441894531,444935000 +2023-09-28,42.459999084472656,43.44599914550781,42.1150016784668,43.0890007019043,43.075965881347656,424663000 +2023-09-29,43.82699966430664,44.14400100708008,43.30699920654297,43.499000549316406,43.48584747314453,397830000 +2023-10-02,44.029998779296875,45.17499923706055,43.861000061035156,44.78200149536133,44.768455505371094,433298000 +2023-10-03,44.80799865722656,45.130001068115234,43.24599838256836,43.516998291015625,43.50383377075195,470850000 +2023-10-04,43.742000579833984,44.143001556396484,43.29199981689453,44.04100036621094,44.027679443359375,361821000 +2023-10-05,44.04999923706055,44.900001525878906,43.88800048828125,44.6879997253418,44.67448043823242,393483000 +2023-10-06,44.19300079345703,45.78900146484375,44.0260009765625,45.762001037597656,45.74816131591797,434436000 +2023-10-09,44.84199905395508,45.60499954223633,44.36800003051758,45.27299880981445,45.25930404663086,409675000 +2023-10-10,45.310001373291016,46.25899887084961,45.0880012512207,45.79800033569336,45.784149169921875,368582000 +2023-10-11,46.19599914550781,46.85900115966797,46.04999923706055,46.805999755859375,46.791839599609375,378137000 +2023-10-12,46.777000427246094,47.60900115966797,46.33000183105469,46.94499969482422,46.9307975769043,481325000 +2023-10-13,46.959999084472656,47.11600112915039,45.279998779296875,45.46099853515625,45.44724655151367,474115000 +2023-10-16,45.0629997253418,46.224998474121094,44.9119987487793,46.095001220703125,46.08106231689453,375099000 +2023-10-17,44.0,44.75400161743164,42.47999954223633,43.9379997253418,43.924713134765625,812333000 +2023-10-18,42.590999603271484,43.21900177001953,41.82500076293945,42.19599914550781,42.1832389831543,627294000 
+2023-10-19,42.81100082397461,43.297000885009766,41.88199996948242,42.10100173950195,42.0882682800293,501233000 +2023-10-20,41.88999938964844,42.470001220703125,41.077999114990234,41.387001037597656,41.374481201171875,477266000 +2023-10-23,41.229000091552734,43.24800109863281,40.94499969482422,42.974998474121094,42.96200180053711,478530000 +2023-10-24,43.07699966430664,43.696998596191406,42.691001892089844,43.66299819946289,43.6497917175293,401463000 +2023-10-25,43.39799880981445,43.650001525878906,41.55500030517578,41.77899932861328,41.766361236572266,398379000 +2023-10-26,41.85300064086914,42.25600051879883,39.880001068115234,40.32600021362305,40.313804626464844,541001000 +2023-10-27,41.130001068115234,41.20600128173828,40.01499938964844,40.5,40.48775100708008,416784000 +2023-10-30,41.08700180053711,41.76599884033203,40.48099899291992,41.1609992980957,41.14855194091797,388028000 +2023-10-31,40.45000076293945,40.87900161743164,39.22999954223633,40.779998779296875,40.76766586303711,517969000 +2023-11-01,40.88399887084961,42.38100051879883,40.86899948120117,42.32500076293945,42.31220245361328,437593000 +2023-11-02,43.327999114990234,43.88399887084961,42.89400100708008,43.50600051879883,43.49284362792969,409172000 +2023-11-03,44.02000045776367,45.308998107910156,43.722999572753906,45.005001068115234,44.991390228271484,424610000 +2023-11-06,45.28499984741211,45.935001373291016,44.89899826049805,45.750999450683594,45.737159729003906,400733000 +2023-11-07,45.71900177001953,46.21799850463867,45.15800094604492,45.95500183105469,45.941104888916016,343165000 +2023-11-08,46.099998474121094,46.867000579833984,45.96799850463867,46.57400131225586,46.559913635253906,346719000 +2023-11-09,47.46699905395508,48.22999954223633,46.75,46.95000076293945,46.93579864501953,540496000 +2023-11-10,47.5,48.47200012207031,47.28300094604492,48.334999084472656,48.320377349853516,421245000 
+2023-11-13,48.31999969482422,49.11600112915039,48.0989990234375,48.619998931884766,48.60529327392578,384136000 +2023-11-14,49.68000030517578,49.83399963378906,49.040000915527344,49.65599822998047,49.6409797668457,416954000 +2023-11-15,49.935001373291016,49.959999084472656,48.20000076293945,48.88800048828125,48.87321853637695,475497000 +2023-11-16,48.67900085449219,49.525001525878906,48.33000183105469,49.47999954223633,49.46503829956055,339756000 +2023-11-17,49.52399826049805,49.71699905395508,49.00699996948242,49.29800033569336,49.2830924987793,325205000 +2023-11-20,49.3120002746582,50.54800033569336,49.180999755859375,50.409000396728516,50.39375305175781,414120000 +2023-11-21,50.125999450683594,50.516998291015625,49.22200012207031,49.944000244140625,49.92889404296875,565747000 +2023-11-22,49.85200119018555,50.334999084472656,47.689998626708984,48.715999603271484,48.701263427734375,899420000 +2023-11-24,48.470001220703125,48.92100143432617,47.744998931884766,47.7760009765625,47.76155090332031,294645000 +2023-11-27,47.79999923706055,48.529998779296875,47.652000427246094,48.242000579833984,48.2274055480957,395662000 +2023-11-28,48.236000061035156,48.323001861572266,47.472999572753906,47.82099914550781,47.80653381347656,401491000 +2023-11-29,48.37900161743164,48.762001037597656,47.86000061035156,48.13999938964844,48.12544250488281,382005000 +2023-11-30,48.02399826049805,48.11000061035156,46.422000885009766,46.77000045776367,46.75585174560547,526247000 +2023-12-01,46.525001525878906,47.20000076293945,46.1870002746582,46.76499938964844,46.750850677490234,369317000 +2023-12-04,46.07699966430664,46.07699966430664,45.0099983215332,45.5099983215332,45.49623107910156,437543000 +2023-12-05,45.465999603271484,46.599998474121094,45.270999908447266,46.566001892089844,46.55601119995117,371718000 +2023-12-06,47.21500015258789,47.387001037597656,45.4119987487793,45.50299835205078,45.49323272705078,380590000 
+2023-12-07,45.70000076293945,46.62900161743164,45.604000091552734,46.59600067138672,46.586002349853516,350823000 +2023-12-08,46.595001220703125,47.74100112915039,46.54999923706055,47.50600051879883,47.49580764770508,359224000 +2023-12-11,47.49100112915039,47.53099822998047,45.83000183105469,46.62699890136719,46.61699676513672,509728000 +2023-12-12,46.04600143432617,47.66600036621094,46.04600143432617,47.65700149536133,47.64677429199219,372387000 +2023-12-13,47.62900161743164,48.59400177001953,47.608001708984375,48.0880012512207,48.07768630981445,447792000 +2023-12-14,48.38999938964844,48.66999816894531,47.422000885009766,48.349998474121094,48.339622497558594,391232000 +2023-12-15,48.194000244140625,49.40399932861328,48.119998931884766,48.88999938964844,48.87950897216797,479948000 +2023-12-18,49.400001525878906,50.43299865722656,49.150001525878906,50.07699966430664,50.066253662109375,412587000 +2023-12-19,49.42399978637695,49.70000076293945,48.89500045776367,49.604000091552734,49.59335708618164,464444000 +2023-12-20,49.654998779296875,49.999000549316406,48.097999572753906,48.111000061035156,48.10067367553711,397894000 +2023-12-21,48.81100082397461,49.095001220703125,48.41899871826172,48.9900016784668,48.9794921875,300425000 +2023-12-22,49.19499969482422,49.382999420166016,48.46699905395508,48.83000183105469,48.81952667236328,252507000 +2023-12-26,48.96799850463867,49.599998474121094,48.959999084472656,49.27899932861328,49.26842498779297,244200000 +2023-12-27,49.51100158691406,49.68000030517578,49.084999084472656,49.41699981689453,49.406394958496094,233648000 +2023-12-28,49.643001556396484,49.88399887084961,49.4119987487793,49.52199935913086,49.511375427246094,246587000 +2023-12-29,49.8129997253418,49.99700164794922,48.750999450683594,49.52199935913086,49.511375427246094,389293000 diff --git a/ds701_book/remove_dev_fences.py b/ds701_book/remove_dev_fences.py new file mode 100644 index 00000000..6a8d9330 --- /dev/null +++ b/ds701_book/remove_dev_fences.py @@ -0,0 
import sys
import json


def remove_div_fences(cell):
    """Remove div-fence lines from a markdown notebook cell.

    A "fence" here is any source line that starts with ':::' at column 0
    (presumably Pandoc/Quarto fenced-div markers -- confirm against the
    book's build pipeline). Indented fences are left alone.

    Args:
        cell: A notebook cell dict with at least 'cell_type' and 'source'.
            'source' may be a list of line strings or a single multi-line
            string; the notebook format allows both.

    Returns:
        The same cell object. Markdown cells are modified in place and
        their 'source' is stored as one joined string; all other cell
        types are returned untouched.
    """
    if cell['cell_type'] != 'markdown':
        return cell

    source = cell['source']
    if isinstance(source, str):
        # Iterating a str yields characters, so no "line" would ever start
        # with ':::' and nothing would be stripped. Split into real lines,
        # keeping their terminators so the re-join below is lossless.
        lines = source.splitlines(keepends=True)
    else:
        lines = source

    cell['source'] = ''.join(line for line in lines if not line.startswith(':::'))
    return cell


def main():
    """Filter a notebook read as JSON from stdin and write the result to stdout."""
    notebook = json.load(sys.stdin)
    notebook['cells'] = [remove_div_fences(cell) for cell in notebook['cells']]
    json.dump(notebook, sys.stdout)


if __name__ == '__main__':
    main()
Some tree and algorithm combinations support\n", + " missing values.\n", + "\n", + "- The cost of using the tree (i.e., predicting data) is logarithmic in the\n", + " number of data points used to train the tree.\n", + "\n", + "- Able to handle both numerical and categorical data. However, the scikit-learn\n", + " implementation does not support categorical variables for now. Other\n", + " techniques are usually specialized in analyzing datasets that have only one type\n", + " of variable. See the tree algorithms section of the scikit-learn user guide for more\n", + " information.\n", + "\n", + "- Able to handle multi-output problems.\n", + "\n", + "- Uses a white box model. If a given situation is observable in a model,\n", + " the condition is easily explained by boolean logic.\n", + " By contrast, in a black box model (e.g., in an artificial neural\n", + " network), results may be more difficult to interpret.\n", + "\n", + "- Possible to validate a model using statistical tests. That makes it\n", + " possible to account for the reliability of the model.\n", + "\n", + "- Performs well even if its assumptions are somewhat violated by\n", + " the true model from which the data were generated.\n", + "\n", + "\n", + "The disadvantages of decision trees include:\n", + "\n", + "- Decision-tree learners can create over-complex trees that do not\n", + " generalize the data well. This is called overfitting.
Mechanisms\n", + " such as pruning, setting the minimum number of samples required\n", + " at a leaf node or setting the maximum depth of the tree are\n", + " necessary to avoid this problem.\n", + "\n", + "- Decision trees can be unstable because small variations in the\n", + " data might result in a completely different tree being generated.\n", + " This problem is mitigated by using decision trees within an\n", + " ensemble.\n", + "\n", + "- Predictions of decision trees are neither smooth nor continuous, but\n", + " piecewise constant approximations as seen in the above figure. Therefore,\n", + " they are not good at extrapolation.\n", + "\n", + "- The problem of learning an optimal decision tree is known to be\n", + " NP-complete under several aspects of optimality and even for simple\n", + " concepts. Consequently, practical decision-tree learning algorithms\n", + " are based on heuristic algorithms such as the greedy algorithm where\n", + " locally optimal decisions are made at each node. Such algorithms\n", + " cannot guarantee to return the globally optimal decision tree. 
This\n", + " can be mitigated by training multiple trees in an ensemble learner,\n", + " where the features and samples are randomly sampled with replacement.\n", + "\n", + "- There are concepts that are hard to learn because decision trees\n", + " do not express them easily, such as XOR, parity or multiplexer problems.\n", + "\n", + "- Decision tree learners create biased trees if some classes dominate.\n", + " It is therefore recommended to balance the dataset prior to fitting\n", + " with the decision tree.\n", + "\n", + "\n", + "## Classification\n", + "\n", + "`DecisionTreeClassifier` is a class capable of performing multi-class\n", + "classification on a dataset.\n", + "\n", + "As with other classifiers, :class:`DecisionTreeClassifier` takes as input two arrays:\n", + "an array X, sparse or dense, of shape ``(n_samples, n_features)`` holding the\n", + "training samples, and an array Y of integer values, shape ``(n_samples,)``,\n", + "holding the class labels for the training samples::" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import tree\n", + "X = [[0, 0], [1, 1]]\n", + "Y = [0, 1]\n", + "clf = tree.DecisionTreeClassifier()\n", + "clf = clf.fit(X, Y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After being fitted, the model can then be used to predict the class of samples::" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1])" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.predict([[2., 2.]])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In case that there are multiple classes with the same and highest\n", + "probability, the classifier will predict the class with the lowest index\n", + "amongst those classes.\n", + "\n", + "As an alternative to outputting a 
specific class, the probability of each class\n", + "can be predicted, which is the fraction of training samples of the class in a\n", + "leaf::" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0., 1.]])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.predict_proba([[2., 2.]])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`DecisionTreeClassifier` is capable of both binary (where the\n", + "labels are [-1, 1]) classification and multiclass (where the labels are\n", + "[0, ..., K-1]) classification.\n", + "\n", + "Using the Iris dataset, we can construct a tree as follows::" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import load_iris\n", + "from sklearn import tree\n", + "iris = load_iris()\n", + "X, y = iris.data, iris.target\n", + "clf = tree.DecisionTreeClassifier()\n", + "clf = clf.fit(X, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once trained, you can plot the tree with the :func:`plot_tree` function::" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Text(0.5, 0.9166666666666666, 'x[2] <= 2.45\\ngini = 0.667\\nsamples = 150\\nvalue = [50, 50, 50]'),\n", + " Text(0.4230769230769231, 0.75, 'gini = 0.0\\nsamples = 50\\nvalue = [50, 0, 0]'),\n", + " Text(0.46153846153846156, 0.8333333333333333, 'True '),\n", + " Text(0.5769230769230769, 0.75, 'x[3] <= 1.75\\ngini = 0.5\\nsamples = 100\\nvalue = [0, 50, 50]'),\n", + " Text(0.5384615384615384, 0.8333333333333333, ' False'),\n", + " Text(0.3076923076923077, 0.5833333333333334, 'x[2] <= 4.95\\ngini = 0.168\\nsamples = 54\\nvalue = [0, 49, 5]'),\n", + " Text(0.15384615384615385, 0.4166666666666667, 'x[3] <= 1.65\\ngini = 
0.041\\nsamples = 48\\nvalue = [0, 47, 1]'),\n", + " Text(0.07692307692307693, 0.25, 'gini = 0.0\\nsamples = 47\\nvalue = [0, 47, 0]'),\n", + " Text(0.23076923076923078, 0.25, 'gini = 0.0\\nsamples = 1\\nvalue = [0, 0, 1]'),\n", + " Text(0.46153846153846156, 0.4166666666666667, 'x[3] <= 1.55\\ngini = 0.444\\nsamples = 6\\nvalue = [0, 2, 4]'),\n", + " Text(0.38461538461538464, 0.25, 'gini = 0.0\\nsamples = 3\\nvalue = [0, 0, 3]'),\n", + " Text(0.5384615384615384, 0.25, 'x[2] <= 5.45\\ngini = 0.444\\nsamples = 3\\nvalue = [0, 2, 1]'),\n", + " Text(0.46153846153846156, 0.08333333333333333, 'gini = 0.0\\nsamples = 2\\nvalue = [0, 2, 0]'),\n", + " Text(0.6153846153846154, 0.08333333333333333, 'gini = 0.0\\nsamples = 1\\nvalue = [0, 0, 1]'),\n", + " Text(0.8461538461538461, 0.5833333333333334, 'x[2] <= 4.85\\ngini = 0.043\\nsamples = 46\\nvalue = [0, 1, 45]'),\n", + " Text(0.7692307692307693, 0.4166666666666667, 'x[1] <= 3.1\\ngini = 0.444\\nsamples = 3\\nvalue = [0, 1, 2]'),\n", + " Text(0.6923076923076923, 0.25, 'gini = 0.0\\nsamples = 2\\nvalue = [0, 0, 2]'),\n", + " Text(0.8461538461538461, 0.25, 'gini = 0.0\\nsamples = 1\\nvalue = [0, 1, 0]'),\n", + " Text(0.9230769230769231, 0.4166666666666667, 'gini = 0.0\\nsamples = 43\\nvalue = [0, 0, 43]')]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tree.plot_tree(clf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + ".. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_002.png\n", + " :target: ../auto_examples/tree/plot_iris_dtc.html\n", + " :scale: 75\n", + " :align: center\n", + "\n", + ".. dropdown:: Alternative ways to export trees\n", + "\n", + " We can also export the tree in `Graphviz\n", + " `_ format using the :func:`export_graphviz`\n", + " exporter. If you use the `conda `_ package manager, the graphviz binaries\n", + " and the python package can be installed with `conda install python-graphviz`.\n", + "\n", + " Alternatively binaries for graphviz can be downloaded from the graphviz project homepage,\n", + " and the Python wrapper installed from pypi with `pip install graphviz`.\n", + "\n", + " Below is an example graphviz export of the above tree trained on the entire\n", + " iris dataset; the results are saved in an output file `iris.pdf`::" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'iris.pdf'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import graphviz # doctest: +SKIP\n", + "dot_data = tree.export_graphviz(clf, out_file=None) # doctest: +SKIP\n", + "graph = graphviz.Source(dot_data) # doctest: +SKIP\n", + "graph.render(\"iris\") # doctest: +SKIP" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " The :func:`export_graphviz` exporter also supports a variety of aesthetic\n", + " options, including coloring nodes by their class (or value for regression) and\n", + " using explicit variable and class names if desired. 
Jupyter notebooks also\n", + " render these plots inline automatically::" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Tree\n", + "\n", + "\n", + "\n", + "0\n", + "\n", + "petal length (cm) ≤ 2.45\n", + "gini = 0.667\n", + "samples = 150\n", + "value = [50, 50, 50]\n", + "class = setosa\n", + "\n", + "\n", + "\n", + "1\n", + "\n", + "gini = 0.0\n", + "samples = 50\n", + "value = [50, 0, 0]\n", + "class = setosa\n", + "\n", + "\n", + "\n", + "0->1\n", + "\n", + "\n", + "True\n", + "\n", + "\n", + "\n", + "2\n", + "\n", + "petal width (cm) ≤ 1.75\n", + "gini = 0.5\n", + "samples = 100\n", + "value = [0, 50, 50]\n", + "class = versicolor\n", + "\n", + "\n", + "\n", + "0->2\n", + "\n", + "\n", + "False\n", + "\n", + "\n", + "\n", + "3\n", + "\n", + "petal length (cm) ≤ 4.95\n", + "gini = 0.168\n", + "samples = 54\n", + "value = [0, 49, 5]\n", + "class = versicolor\n", + "\n", + "\n", + "\n", + "2->3\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "12\n", + "\n", + "petal length (cm) ≤ 4.85\n", + "gini = 0.043\n", + "samples = 46\n", + "value = [0, 1, 45]\n", + "class = virginica\n", + "\n", + "\n", + "\n", + "2->12\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "4\n", + "\n", + "petal width (cm) ≤ 1.65\n", + "gini = 0.041\n", + "samples = 48\n", + "value = [0, 47, 1]\n", + "class = versicolor\n", + "\n", + "\n", + "\n", + "3->4\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "7\n", + "\n", + "petal width (cm) ≤ 1.55\n", + "gini = 0.444\n", + "samples = 6\n", + "value = [0, 2, 4]\n", + "class = virginica\n", + "\n", + "\n", + "\n", + "3->7\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "5\n", + "\n", + "gini = 0.0\n", + "samples = 47\n", + "value = [0, 47, 0]\n", + "class = versicolor\n", + "\n", + "\n", + "\n", + "4->5\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "6\n", + "\n", + "gini = 0.0\n", + "samples = 1\n", + "value = [0, 0, 1]\n", + 
"class = virginica\n", + "\n", + "\n", + "\n", + "4->6\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "8\n", + "\n", + "gini = 0.0\n", + "samples = 3\n", + "value = [0, 0, 3]\n", + "class = virginica\n", + "\n", + "\n", + "\n", + "7->8\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "9\n", + "\n", + "petal length (cm) ≤ 5.45\n", + "gini = 0.444\n", + "samples = 3\n", + "value = [0, 2, 1]\n", + "class = versicolor\n", + "\n", + "\n", + "\n", + "7->9\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "10\n", + "\n", + "gini = 0.0\n", + "samples = 2\n", + "value = [0, 2, 0]\n", + "class = versicolor\n", + "\n", + "\n", + "\n", + "9->10\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "11\n", + "\n", + "gini = 0.0\n", + "samples = 1\n", + "value = [0, 0, 1]\n", + "class = virginica\n", + "\n", + "\n", + "\n", + "9->11\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "13\n", + "\n", + "sepal width (cm) ≤ 3.1\n", + "gini = 0.444\n", + "samples = 3\n", + "value = [0, 1, 2]\n", + "class = virginica\n", + "\n", + "\n", + "\n", + "12->13\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "16\n", + "\n", + "gini = 0.0\n", + "samples = 43\n", + "value = [0, 0, 43]\n", + "class = virginica\n", + "\n", + "\n", + "\n", + "12->16\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "14\n", + "\n", + "gini = 0.0\n", + "samples = 2\n", + "value = [0, 0, 2]\n", + "class = virginica\n", + "\n", + "\n", + "\n", + "13->14\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "15\n", + "\n", + "gini = 0.0\n", + "samples = 1\n", + "value = [0, 1, 0]\n", + "class = versicolor\n", + "\n", + "\n", + "\n", + "13->15\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dot_data = tree.export_graphviz(clf, out_file=None, # doctest: +SKIP\n", + " feature_names=iris.feature_names, # doctest: +SKIP\n", + " class_names=iris.target_names, # doctest: +SKIP\n", + " filled=True, rounded=True, # doctest: 
+SKIP\n", + " special_characters=True) # doctest: +SKIP\n", + "graph = graphviz.Source(dot_data) # doctest: +SKIP\n", + "graph # doctest: +SKIP" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " .. only:: html\n", + "\n", + " .. figure:: ../images/iris.svg\n", + " :align: center\n", + "\n", + " .. only:: latex\n", + "\n", + " .. figure:: ../images/iris.pdf\n", + " :align: center\n", + "\n", + " .. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_001.png\n", + " :target: ../auto_examples/tree/plot_iris_dtc.html\n", + " :align: center\n", + " :scale: 75\n", + "\n", + " Alternatively, the tree can also be exported in textual format with the\n", + " function :func:`export_text`. This method doesn't require the installation\n", + " of external libraries and is more compact:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "|--- petal width (cm) <= 0.80\n", + "| |--- class: 0\n", + "|--- petal width (cm) > 0.80\n", + "| |--- petal width (cm) <= 1.75\n", + "| | |--- class: 1\n", + "| |--- petal width (cm) > 1.75\n", + "| | |--- class: 2\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.datasets import load_iris\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.tree import export_text\n", + "iris = load_iris()\n", + "decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2)\n", + "decision_tree = decision_tree.fit(iris.data, iris.target)\n", + "r = export_text(decision_tree, feature_names=iris['feature_names'])\n", + "print(r)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " |--- petal width (cm) <= 0.80\n", + " | |--- class: 0\n", + " |--- petal width (cm) > 0.80\n", + " | |--- petal width (cm) <= 1.75\n", + " | | |--- class: 1\n", + " | |--- petal width (cm) > 1.75\n", + " | | |--- class: 2\n", + " \n", + "\n", + ".. 
rubric:: Examples\n", + "\n", + "* :ref:`sphx_glr_auto_examples_tree_plot_iris_dtc.py`\n", + "* :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py`\n", + "\n", + ".. _tree_regression:\n", + "\n", + "## Regression\n", + "\n", + ".. figure:: ../auto_examples/tree/images/sphx_glr_plot_tree_regression_001.png\n", + " :target: ../auto_examples/tree/plot_tree_regression.html\n", + " :scale: 75\n", + " :align: center\n", + "\n", + "Decision trees can also be applied to regression problems, using the\n", + ":class:`DecisionTreeRegressor` class.\n", + "\n", + "As in the classification setting, the fit method will take as argument arrays X\n", + "and y, only that in this case y is expected to have floating point values\n", + "instead of integer values::" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0.5])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn import tree\n", + "X = [[0, 0], [2, 2]]\n", + "y = [0.5, 2.5]\n", + "clf = tree.DecisionTreeRegressor()\n", + "clf = clf.fit(X, y)\n", + "clf.predict([[1, 1]])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " array([0.5])\n", + "\n", + ".. rubric:: Examples\n", + "\n", + "* :ref:`sphx_glr_auto_examples_tree_plot_tree_regression.py`\n", + "\n", + "\n", + ".. _tree_multioutput:\n", + "\n", + "## Multi-output problems\n", + "\n", + "A multi-output problem is a supervised learning problem with several outputs\n", + "to predict, that is when Y is a 2d array of shape ``(n_samples, n_outputs)``.\n", + "\n", + "When there is no correlation between the outputs, a very simple way to solve\n", + "this kind of problem is to build n independent models, i.e. one for each\n", + "output, and then to use those models to independently predict each one of the n\n", + "outputs. 
However, because it is likely that the output values related to the\n", + "same input are themselves correlated, an often better way is to build a single\n", + "model capable of predicting simultaneously all n outputs. First, it requires\n", + "lower training time since only a single estimator is built. Second, the\n", + "generalization accuracy of the resulting estimator may often be increased.\n", + "\n", + "With regard to decision trees, this strategy can readily be used to support\n", + "multi-output problems. This requires the following changes:\n", + "\n", + "- Store n output values in leaves, instead of 1;\n", + "- Use splitting criteria that compute the average reduction across all\n", + " n outputs.\n", + "\n", + "This module offers support for multi-output problems by implementing this\n", + "strategy in both :class:`DecisionTreeClassifier` and\n", + ":class:`DecisionTreeRegressor`. If a decision tree is fit on an output array Y\n", + "of shape ``(n_samples, n_outputs)`` then the resulting estimator will:\n", + "\n", + "* Output n_output values upon ``predict``;\n", + "\n", + "* Output a list of n_output arrays of class probabilities upon\n", + " ``predict_proba``.\n", + "\n", + "The use of multi-output trees for regression is demonstrated in\n", + ":ref:`sphx_glr_auto_examples_tree_plot_tree_regression_multioutput.py`. In this example, the input\n", + "X is a single real value and the outputs Y are the sine and cosine of X.\n", + "\n", + ".. figure:: ../auto_examples/tree/images/sphx_glr_plot_tree_regression_multioutput_001.png\n", + " :target: ../auto_examples/tree/plot_tree_regression_multioutput.html\n", + " :scale: 75\n", + " :align: center\n", + "\n", + "The use of multi-output trees for classification is demonstrated in\n", + ":ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py`. 
In this example, the inputs\n", + "X are the pixels of the upper half of faces and the outputs Y are the pixels of\n", + "the lower half of those faces.\n", + "\n", + ".. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_multioutput_face_completion_001.png\n", + " :target: ../auto_examples/miscellaneous/plot_multioutput_face_completion.html\n", + " :scale: 75\n", + " :align: center\n", + "\n", + ".. rubric:: Examples\n", + "\n", + "* :ref:`sphx_glr_auto_examples_tree_plot_tree_regression_multioutput.py`\n", + "* :ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py`\n", + "\n", + ".. rubric:: References\n", + "\n", + "* M. Dumont et al, `Fast multi-class image annotation with random subwindows\n", + " and multiple output randomized trees\n", + " `_,\n", + " International Conference on Computer Vision Theory and Applications 2009\n", + "\n", + ".. _tree_complexity:\n", + "\n", + "## Complexity\n", + "\n", + "In general, the run time cost to construct a balanced binary tree is\n", + "$O(n_{samples}n_{features}\\log(n_{samples}))$ and query time\n", + "$O(\\log(n_{samples}))$. Although the tree construction algorithm attempts\n", + "to generate balanced trees, they will not always be balanced. Assuming that the\n", + "subtrees remain approximately balanced, the cost at each node consists of\n", + "searching through $O(n_{features})$ to find the feature that offers the\n", + "largest reduction in the impurity criterion, e.g. log loss (which is equivalent to an\n", + "information gain).
This has a cost of\n", + ":math:`O(n_{features}n_{samples}\\log(n_{samples}))` at each node, leading to a\n", + "total cost over the entire tree (by summing the cost at each node) of\n", + ":math:`O(n_{features}n_{samples}^{2}\\log(n_{samples}))`.\n", + "\n", + "\n", + "## Tips on practical use\n", + "\n", + "* Decision trees tend to overfit on data with a large number of features.\n", + " Getting the right ratio of samples to number of features is important, since\n", + " a tree with few samples in high dimensional space is very likely to overfit.\n", + "\n", + "* Consider performing dimensionality reduction (:ref:`PCA <PCA>`,\n", + " :ref:`ICA <ICA>`, or :ref:`feature_selection`) beforehand to\n", + " give your tree a better chance of finding features that are discriminative.\n", + "\n", + "* :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` will help\n", + " in gaining more insights about how the decision tree makes predictions, which is\n", + " important for understanding the important features in the data.\n", + "\n", + "* Visualize your tree as you are training by using the ``export``\n", + " function. Use ``max_depth=3`` as an initial tree depth to get a feel for\n", + " how the tree is fitting to your data, and then increase the depth.\n", + "\n", + "* Remember that the number of samples required to populate the tree doubles\n", + " for each additional level the tree grows to. Use ``max_depth`` to control\n", + " the size of the tree to prevent overfitting.\n", + "\n", + "* Use ``min_samples_split`` or ``min_samples_leaf`` to ensure that multiple\n", + " samples inform every decision in the tree, by controlling which splits will\n", + " be considered. A very small number will usually mean the tree will overfit,\n", + " whereas a large number will prevent the tree from learning the data. Try\n", + " ``min_samples_leaf=5`` as an initial value. 
If the sample size varies\n", + " greatly, a float number can be used as percentage in these two parameters.\n", + " While ``min_samples_split`` can create arbitrarily small leaves,\n", + " ``min_samples_leaf`` guarantees that each leaf has a minimum size, avoiding\n", + " low-variance, over-fit leaf nodes in regression problems. For\n", + " classification with few classes, ``min_samples_leaf=1`` is often the best\n", + " choice.\n", + "\n", + " Note that ``min_samples_split`` considers samples directly and independent of\n", + " ``sample_weight``, if provided (e.g. a node with m weighted samples is still\n", + " treated as having exactly m samples). Consider ``min_weight_fraction_leaf`` or\n", + " ``min_impurity_decrease`` if accounting for sample weights is required at splits.\n", + "\n", + "* Balance your dataset before training to prevent the tree from being biased\n", + " toward the classes that are dominant. Class balancing can be done by\n", + " sampling an equal number of samples from each class, or preferably by\n", + " normalizing the sum of the sample weights (``sample_weight``) for each\n", + " class to the same value. 
Also note that weight-based pre-pruning criteria,\n", + " such as ``min_weight_fraction_leaf``, will then be less biased toward\n", + " dominant classes than criteria that are not aware of the sample weights,\n", + " like ``min_samples_leaf``.\n", + "\n", + "* If the samples are weighted, it will be easier to optimize the tree\n", + " structure using weight-based pre-pruning criterion such as\n", + " ``min_weight_fraction_leaf``, which ensure that leaf nodes contain at least\n", + " a fraction of the overall sum of the sample weights.\n", + "\n", + "* All decision trees use ``np.float32`` arrays internally.\n", + " If training data is not in this format, a copy of the dataset will be made.\n", + "\n", + "* If the input matrix X is very sparse, it is recommended to convert to sparse\n", + " ``csc_matrix`` before calling fit and sparse ``csr_matrix`` before calling\n", + " predict. Training time can be orders of magnitude faster for a sparse\n", + " matrix input compared to a dense matrix when features have zero values in\n", + " most of the samples.\n", + "\n", + "\n", + ".. _tree_algorithms:\n", + "\n", + "## Tree algorithms: ID3, C4.5, C5.0 and CART\n", + "\n", + "What are all the various decision tree algorithms and how do they differ\n", + "from each other? Which one is implemented in scikit-learn?\n", + "\n", + ".. dropdown:: Various decision tree algorithms\n", + "\n", + " ID3_ (Iterative Dichotomiser 3) was developed in 1986 by Ross Quinlan.\n", + " The algorithm creates a multiway tree, finding for each node (i.e. in\n", + " a greedy manner) the categorical feature that will yield the largest\n", + " information gain for categorical targets. 
Trees are grown to their\n", + " maximum size and then a pruning step is usually applied to improve the\n", + " ability of the tree to generalize to unseen data.\n", + "\n", + " C4.5 is the successor to ID3 and removed the restriction that features\n", + " must be categorical by dynamically defining a discrete attribute (based\n", + " on numerical variables) that partitions the continuous attribute value\n", + " into a discrete set of intervals. C4.5 converts the trained trees\n", + " (i.e. the output of the ID3 algorithm) into sets of if-then rules.\n", + " The accuracy of each rule is then evaluated to determine the order\n", + " in which they should be applied. Pruning is done by removing a rule's\n", + " precondition if the accuracy of the rule improves without it.\n", + "\n", + " C5.0 is Quinlan's latest version release under a proprietary license.\n", + " It uses less memory and builds smaller rulesets than C4.5 while being\n", + " more accurate.\n", + "\n", + " CART (Classification and Regression Trees) is very similar to C4.5, but\n", + " it differs in that it supports numerical target variables (regression) and\n", + " does not compute rule sets. CART constructs binary trees using the feature\n", + " and threshold that yield the largest information gain at each node.\n", + "\n", + "scikit-learn uses an optimized version of the CART algorithm; however, the\n", + "scikit-learn implementation does not support categorical variables for now.\n", + "\n", + ".. _ID3: https://en.wikipedia.org/wiki/ID3_algorithm\n", + "\n", + "\n", + ".. 
_tree_mathematical_formulation:\n", + "\n", + "## Mathematical formulation\n", + "\n", + "Given training vectors :math:`x_i \\in R^n`, i=1,..., l and a label vector\n", + ":math:`y \\in R^l`, a decision tree recursively partitions the feature space\n", + "such that the samples with the same labels or similar target values are grouped\n", + "together.\n", + "\n", + "Let the data at node :math:`m` be represented by :math:`Q_m` with :math:`n_m`\n", + "samples. For each candidate split :math:`\\theta = (j, t_m)` consisting of a\n", + "feature :math:`j` and threshold :math:`t_m`, partition the data into\n", + ":math:`Q_m^{left}(\\theta)` and :math:`Q_m^{right}(\\theta)` subsets\n", + "\n", + "$$\n", + "\\begin{aligned}\n", + "Q_m^{left}(\\theta) &= \\{(x, y) | x_j \\leq t_m\\} \\\\\n", + "Q_m^{right}(\\theta) &= Q_m \\setminus Q_m^{left}(\\theta)\n", + "\\end{aligned}\n", + "$$\n", + "\n", + "The quality of a candidate split of node :math:`m` is then computed using an\n", + "impurity function or loss function :math:`H()`, the choice of which depends on\n", + "the task being solved (classification or regression)\n", + "\n", + "$$\n", + "G(Q_m, \\theta) = \\frac{n_m^{left}}{n_m} H(Q_m^{left}(\\theta))\n", + "+ \\frac{n_m^{right}}{n_m} H(Q_m^{right}(\\theta))\n", + "$$\n", + "\n", + "Select the parameters that minimise the impurity\n", + "\n", + "$$\n", + "\\theta^* = \\operatorname{argmin}_\\theta G(Q_m, \\theta)\n", + "$$\n", + "\n", + "Recurse for subsets :math:`Q_m^{left}(\\theta^*)` and\n", + ":math:`Q_m^{right}(\\theta^*)` until the maximum allowable depth is reached,\n", + ":math:`n_m < \\min_{samples}` or :math:`n_m = 1`.\n", + "\n", + "## Classification criteria\n", + "\n", + "If a target is a classification outcome taking on values 0,1,...,K-1,\n", + "for node :math:`m`, let\n", + "\n", + "$$\n", + "p_{mk} = \\frac{1}{n_m} \\sum_{y \\in Q_m} I(y = k)\n", + "$$\n", + "\n", + "be the proportion of class k observations in node :math:`m`. 
If :math:`m` is a\n", + "terminal node, `predict_proba` for this region is set to :math:`p_{mk}`.\n", + "Common measures of impurity are the following.\n", + "\n", + "Gini:\n", + "\n", + "$$\n", + "H(Q_m) = \\sum_k p_{mk} (1 - p_{mk})\n", + "$$\n", + "\n", + "Log Loss or Entropy:\n", + "\n", + "$$\n", + "H(Q_m) = - \\sum_k p_{mk} \\log(p_{mk})\n", + "$$\n", + "\n", + ".. dropdown:: Shannon entropy\n", + "\n", + " The entropy criterion computes the Shannon entropy of the possible classes. It\n", + " takes the class frequencies of the training data points that reached a given\n", + " leaf :math:`m` as their probability. Using the **Shannon entropy as tree node\n", + " splitting criterion is equivalent to minimizing the log loss** (also known as\n", + " cross-entropy and multinomial deviance) between the true labels :math:`y_i`\n", + " and the probabilistic predictions :math:`T_k(x_i)` of the tree model :math:`T` for class :math:`k`.\n", + "\n", + " To see this, first recall that the log loss of a tree model :math:`T`\n", + " computed on a dataset :math:`D` is defined as follows:\n", + "\n", + "$$\n", + "\\mathrm{LL}(D, T) = -\\frac{1}{n} \\sum_{(x_i, y_i) \\in D} \\sum_k I(y_i = k) \\log(T_k(x_i))\n", + "$$\n", + "\n", + " where :math:`D` is a training dataset of :math:`n` pairs :math:`(x_i, y_i)`.\n", + "\n", + " In a classification tree, the predicted class probabilities within leaf nodes\n", + " are constant, that is: for all :math:`(x_i, y_i) \\in Q_m`, one has:\n", + " :math:`T_k(x_i) = p_{mk}` for each class :math:`k`.\n", + "\n", + " This property makes it possible to rewrite :math:`\\mathrm{LL}(D, T)` as the\n", + " sum of the Shannon entropies computed for each leaf of :math:`T` weighted by\n", + " the number of training data points that reached each leaf:\n", + "\n", + "$$\n", + "\\mathrm{LL}(D, T) = \\sum_{m \\in T} \\frac{n_m}{n} H(Q_m)\n", + "$$\n", + "\n", + "## Regression criteria\n", + "\n", + "If the target is a continuous value, then for node 
:math:`m`, common\n", + "criteria to minimize when determining locations for future splits are Mean\n", + "Squared Error (MSE or L2 error), Poisson deviance as well as Mean Absolute\n", + "Error (MAE or L1 error). MSE and Poisson deviance both set the predicted value\n", + "of terminal nodes to the learned mean value :math:`\\bar{y}_m` of the node\n", + "whereas the MAE sets the predicted value of terminal nodes to the median\n", + ":math:`median(y)_m`.\n", + "\n", + "Mean Squared Error:\n", + "\n", + "$$\n", + "\\bar{y}_m = \\frac{1}{n_m} \\sum_{y \\in Q_m} y\n", + "$$\n", + "\n", + "$$\n", + "H(Q_m) = \\frac{1}{n_m} \\sum_{y \\in Q_m} (y - \\bar{y}_m)^2\n", + "$$\n", + "\n", + "Mean Poisson deviance:\n", + "\n", + "$$\n", + "H(Q_m) = \\frac{2}{n_m} \\sum_{y \\in Q_m} (y \\log\\frac{y}{\\bar{y}_m} - y + \\bar{y}_m)\n", + "$$\n", + "\n", + "Setting `criterion=\"poisson\"` might be a good choice if your target is a count\n", + "or a frequency (count per some unit). In any case, :math:`y \\ge 0` is a\n", + "necessary condition to use this criterion. Note that the Poisson criterion fits\n", + "much slower than the MSE criterion. For performance reasons the actual implementation minimizes\n", + "the half mean Poisson deviance, i.e. the mean Poisson deviance divided by 2.\n", + "\n", + "Mean Absolute Error:\n", + "\n", + "$$\n", + "median(y)_m = \\underset{y \\in Q_m}{\\mathrm{median}}(y)\n", + "$$\n", + "\n", + "$$\n", + "H(Q_m) = \\frac{1}{n_m} \\sum_{y \\in Q_m} |y - median(y)_m|\n", + "$$\n", + "\n", + "Note that the MAE criterion fits much slower than the MSE criterion.\n", + "\n", + ".. 
_tree_missing_value_support:\n", + "\n", + "## Missing Values Support\n", + "\n", + ":class:`DecisionTreeClassifier` and :class:`DecisionTreeRegressor`\n", + "have built-in support for missing values when `splitter='best'` and criterion is\n", + "`'gini'`, `'entropy'`, or `'log_loss'`, for classification or\n", + "`'squared_error'`, `'friedman_mse'`, or `'poisson'` for regression.\n", + "\n", + "For each potential threshold on the non-missing data, the splitter will evaluate\n", + "the split with all the missing values going to the left node or the right node.\n", + "\n", + "Decisions are made as follows:\n", + "\n", + "- By default when predicting, the samples with missing values are classified\n", + " with the class used in the split found during training::" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 1, 1])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.tree import DecisionTreeClassifier\n", + "import numpy as np\n", + "\n", + "X = np.array([0, 1, 6, np.nan]).reshape(-1, 1)\n", + "y = [0, 0, 1, 1]\n", + "\n", + "tree = DecisionTreeClassifier(random_state=0).fit(X, y)\n", + "tree.predict(X)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " array([0, 0, 1, 1])\n", + "\n", + "- If the criterion evaluation is the same for both nodes,\n", + " then the tie for missing value at predict time is broken by going to the\n", + " right node. 
The splitter also checks the split where all the missing\n", + " values go to one child and non-missing values go to the other::" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.tree import DecisionTreeClassifier\n", + "import numpy as np\n", + "\n", + "X = np.array([np.nan, -1, np.nan, 1]).reshape(-1, 1)\n", + "y = [0, 0, 1, 1]\n", + "\n", + "tree = DecisionTreeClassifier(random_state=0).fit(X, y)\n", + "\n", + "X_test = np.array([np.nan]).reshape(-1, 1)\n", + "tree.predict(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " array([1])\n", + "\n", + "- If no missing values are seen during training for a given feature, then during\n", + " prediction missing values are mapped to the child with the most samples::" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.tree import DecisionTreeClassifier\n", + "import numpy as np\n", + "\n", + "X = np.array([0, 1, 2, 3]).reshape(-1, 1)\n", + "y = [0, 1, 1, 1]\n", + "\n", + "tree = DecisionTreeClassifier(random_state=0).fit(X, y)\n", + "\n", + "X_test = np.array([np.nan]).reshape(-1, 1)\n", + "tree.predict(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " array([1])\n", + "\n", + ".. _minimal_cost_complexity_pruning:\n", + "\n", + "## Minimal Cost-Complexity Pruning\n", + "\n", + "Minimal cost-complexity pruning is an algorithm used to prune a tree to avoid\n", + "over-fitting, described in Chapter 3 of [BRE]_. This algorithm is parameterized\n", + "by :math:`\\alpha\\ge0` known as the complexity parameter. 
The complexity\n", + "parameter is used to define the cost-complexity measure, :math:`R_\\alpha(T)` of\n", + "a given tree :math:`T`:\n", + "\n", + "$$\n", + "R_\\alpha(T) = R(T) + \\alpha|\\widetilde{T}|\n", + "$$\n", + "\n", + "where :math:`|\\widetilde{T}|` is the number of terminal nodes in :math:`T` and :math:`R(T)`\n", + "is traditionally defined as the total misclassification rate of the terminal\n", + "nodes. Alternatively, scikit-learn uses the total sample weighted impurity of\n", + "the terminal nodes for :math:`R(T)`. As shown above, the impurity of a node\n", + "depends on the criterion. Minimal cost-complexity pruning finds the subtree of\n", + ":math:`T` that minimizes :math:`R_\\alpha(T)`.\n", + "\n", + "The cost complexity measure of a single node is\n", + ":math:`R_\\alpha(t)=R(t)+\\alpha`. The branch, :math:`T_t`, is defined to be a\n", + "tree where node :math:`t` is its root. In general, the impurity of a node\n", + "is greater than the sum of impurities of its terminal nodes,\n", + ":math:`R(T_t)` and :ref:`regression +`. The goal is to create a model that predicts the value of a +target variable by learning simple decision rules inferred from the data +features. A tree can be seen as a piecewise constant approximation. + +For instance, in the example below, decision trees learn from data to +approximate a sine curve with a set of if-then-else decision rules. The deeper +the tree, the more complex the decision rules and the fitter the model. + +.. figure:: ../auto_examples/tree/images/sphx_glr_plot_tree_regression_001.png + :target: ../auto_examples/tree/plot_tree_regression.html + :scale: 75 + :align: center + +Some advantages of decision trees are: + +- Simple to understand and to interpret. Trees can be visualized. + +- Requires little data preparation. Other techniques often require data + normalization, dummy variables need to be created and blank values to + be removed. 
Some tree and algorithm combinations support + :ref:`missing values <tree_missing_value_support>`. + +- The cost of using the tree (i.e., predicting data) is logarithmic in the + number of data points used to train the tree. + +- Able to handle both numerical and categorical data. However, the scikit-learn + implementation does not support categorical variables for now. Other + techniques are usually specialized in analyzing datasets that have only one type + of variable. See :ref:`algorithms <tree_algorithms>` for more + information. + +- Able to handle multi-output problems. + +- Uses a white box model. If a given situation is observable in a model, + the explanation for the condition is easily explained by boolean logic. + By contrast, in a black box model (e.g., in an artificial neural + network), results may be more difficult to interpret. + +- Possible to validate a model using statistical tests. That makes it + possible to account for the reliability of the model. + +- Performs well even if its assumptions are somewhat violated by + the true model from which the data were generated. + + +The disadvantages of decision trees include: + +- Decision-tree learners can create over-complex trees that do not + generalize well from the data. This is called overfitting. Mechanisms + such as pruning, setting the minimum number of samples required + at a leaf node or setting the maximum depth of the tree are + necessary to avoid this problem. + +- Decision trees can be unstable because small variations in the + data might result in a completely different tree being generated. + This problem is mitigated by using decision trees within an + ensemble. + +- Predictions of decision trees are neither smooth nor continuous, but + piecewise constant approximations as seen in the above figure. Therefore, + they are not good at extrapolation. + +- The problem of learning an optimal decision tree is known to be + NP-complete under several aspects of optimality and even for simple + concepts. 
Consequently, practical decision-tree learning algorithms + are based on heuristic algorithms such as the greedy algorithm where + locally optimal decisions are made at each node. Such algorithms + cannot guarantee to return the globally optimal decision tree. This + can be mitigated by training multiple trees in an ensemble learner, + where the features and samples are randomly sampled with replacement. + +- There are concepts that are hard to learn because decision trees + do not express them easily, such as XOR, parity or multiplexer problems. + +- Decision tree learners create biased trees if some classes dominate. + It is therefore recommended to balance the dataset prior to fitting + with the decision tree. + + +## Classification + +`DecisionTreeClassifier` is a class capable of performing multi-class +classification on a dataset. + +As with other classifiers, :class:`DecisionTreeClassifier` takes as input two arrays: +an array X, sparse or dense, of shape ``(n_samples, n_features)`` holding the +training samples, and an array Y of integer values, shape ``(n_samples,)``, +holding the class labels for the training samples:: + +```{python} +from sklearn import tree +X = [[0, 0], [1, 1]] +Y = [0, 1] +clf = tree.DecisionTreeClassifier() +clf = clf.fit(X, Y) +``` + +After being fitted, the model can then be used to predict the class of samples:: + +```{python} +clf.predict([[2., 2.]]) +``` + +In case that there are multiple classes with the same and highest +probability, the classifier will predict the class with the lowest index +amongst those classes. + +As an alternative to outputting a specific class, the probability of each class +can be predicted, which is the fraction of training samples of the class in a +leaf:: + +```{python} +clf.predict_proba([[2., 2.]]) +``` + +`DecisionTreeClassifier` is capable of both binary (where the +labels are [-1, 1]) classification and multiclass (where the labels are +[0, ..., K-1]) classification. 
+ +Using the Iris dataset, we can construct a tree as follows:: + +```{python} +from sklearn.datasets import load_iris +from sklearn import tree +iris = load_iris() +X, y = iris.data, iris.target +clf = tree.DecisionTreeClassifier() +clf = clf.fit(X, y) +``` + +Once trained, you can plot the tree with the :func:`plot_tree` function:: + +```{python} +tree.plot_tree(clf) +``` + +.. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_002.png + :target: ../auto_examples/tree/plot_iris_dtc.html + :scale: 75 + :align: center + +.. dropdown:: Alternative ways to export trees + + We can also export the tree in `Graphviz + `_ format using the :func:`export_graphviz` + exporter. If you use the `conda `_ package manager, the graphviz binaries + and the python package can be installed with `conda install python-graphviz`. + + Alternatively binaries for graphviz can be downloaded from the graphviz project homepage, + and the Python wrapper installed from pypi with `pip install graphviz`. + + Below is an example graphviz export of the above tree trained on the entire + iris dataset; the results are saved in an output file `iris.pdf`:: + +```{python} +import graphviz # doctest: +SKIP +dot_data = tree.export_graphviz(clf, out_file=None) # doctest: +SKIP +graph = graphviz.Source(dot_data) # doctest: +SKIP +graph.render("iris") # doctest: +SKIP +``` + + The :func:`export_graphviz` exporter also supports a variety of aesthetic + options, including coloring nodes by their class (or value for regression) and + using explicit variable and class names if desired. 
Jupyter notebooks also + render these plots inline automatically:: + +```{python} +dot_data = tree.export_graphviz(clf, out_file=None, # doctest: +SKIP + feature_names=iris.feature_names, # doctest: +SKIP + class_names=iris.target_names, # doctest: +SKIP + filled=True, rounded=True, # doctest: +SKIP + special_characters=True) # doctest: +SKIP +graph = graphviz.Source(dot_data) # doctest: +SKIP +graph # doctest: +SKIP +``` + + .. only:: html + + .. figure:: ../images/iris.svg + :align: center + + .. only:: latex + + .. figure:: ../images/iris.pdf + :align: center + + .. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_001.png + :target: ../auto_examples/tree/plot_iris_dtc.html + :align: center + :scale: 75 + + Alternatively, the tree can also be exported in textual format with the + function :func:`export_text`. This method doesn't require the installation + of external libraries and is more compact: + +```{python} +from sklearn.datasets import load_iris +from sklearn.tree import DecisionTreeClassifier +from sklearn.tree import export_text +iris = load_iris() +decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2) +decision_tree = decision_tree.fit(iris.data, iris.target) +r = export_text(decision_tree, feature_names=iris['feature_names']) +print(r) +``` + + + |--- petal width (cm) <= 0.80 + | |--- class: 0 + |--- petal width (cm) > 0.80 + | |--- petal width (cm) <= 1.75 + | | |--- class: 1 + | |--- petal width (cm) > 1.75 + | | |--- class: 2 + + +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_tree_plot_iris_dtc.py` +* :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` + +.. _tree_regression: + +## Regression + +.. figure:: ../auto_examples/tree/images/sphx_glr_plot_tree_regression_001.png + :target: ../auto_examples/tree/plot_tree_regression.html + :scale: 75 + :align: center + +Decision trees can also be applied to regression problems, using the +:class:`DecisionTreeRegressor` class. 
+ +As in the classification setting, the fit method will take as argument arrays X +and y, only that in this case y is expected to have floating point values +instead of integer values:: + +```{python} +from sklearn import tree +X = [[0, 0], [2, 2]] +y = [0.5, 2.5] +clf = tree.DecisionTreeRegressor() +clf = clf.fit(X, y) +clf.predict([[1, 1]]) +``` + + array([0.5]) + +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_tree_plot_tree_regression.py` + + +.. _tree_multioutput: + +## Multi-output problems + +A multi-output problem is a supervised learning problem with several outputs +to predict, that is when Y is a 2d array of shape ``(n_samples, n_outputs)``. + +When there is no correlation between the outputs, a very simple way to solve +this kind of problem is to build n independent models, i.e. one for each +output, and then to use those models to independently predict each one of the n +outputs. However, because it is likely that the output values related to the +same input are themselves correlated, an often better way is to build a single +model capable of predicting simultaneously all n outputs. First, it requires +lower training time since only a single estimator is built. Second, the +generalization accuracy of the resulting estimator may often be increased. + +With regard to decision trees, this strategy can readily be used to support +multi-output problems. This requires the following changes: + +- Store n output values in leaves, instead of 1; +- Use splitting criteria that compute the average reduction across all + n outputs. + +This module offers support for multi-output problems by implementing this +strategy in both :class:`DecisionTreeClassifier` and +:class:`DecisionTreeRegressor`. If a decision tree is fit on an output array Y +of shape ``(n_samples, n_outputs)`` then the resulting estimator will: + +* Output n_output values upon ``predict``; + +* Output a list of n_output arrays of class probabilities upon + ``predict_proba``. 
+ +The use of multi-output trees for regression is demonstrated in +:ref:`sphx_glr_auto_examples_tree_plot_tree_regression_multioutput.py`. In this example, the input +X is a single real value and the outputs Y are the sine and cosine of X. + +.. figure:: ../auto_examples/tree/images/sphx_glr_plot_tree_regression_multioutput_001.png + :target: ../auto_examples/tree/plot_tree_regression_multioutput.html + :scale: 75 + :align: center + +The use of multi-output trees for classification is demonstrated in +:ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py`. In this example, the inputs +X are the pixels of the upper half of faces and the outputs Y are the pixels of +the lower half of those faces. + +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_multioutput_face_completion_001.png + :target: ../auto_examples/miscellaneous/plot_multioutput_face_completion.html + :scale: 75 + :align: center + +.. rubric:: Examples + +* :ref:`sphx_glr_auto_examples_tree_plot_tree_regression_multioutput.py` +* :ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py` + +.. rubric:: References + +* M. Dumont et al, `Fast multi-class image annotation with random subwindows + and multiple output randomized trees + `_, + International Conference on Computer Vision Theory and Applications 2009 + +.. _tree_complexity: + +## Complexity + +In general, the run time cost to construct a balanced binary tree is +:math:`O(n_{samples}n_{features}\log(n_{samples}))` and query time +:math:`O(\log(n_{samples}))`. Although the tree construction algorithm attempts +to generate balanced trees, they will not always be balanced. Assuming that the +subtrees remain approximately balanced, the cost at each node consists of +searching through :math:`O(n_{features})` to find the feature that offers the +largest reduction in the impurity criterion, e.g. log loss (which is equivalent to an +information gain). 
This has a cost of +:math:`O(n_{features}n_{samples}\log(n_{samples}))` at each node, leading to a +total cost over the entire tree (by summing the cost at each node) of +:math:`O(n_{features}n_{samples}^{2}\log(n_{samples}))`. + + +## Tips on practical use + +* Decision trees tend to overfit on data with a large number of features. + Getting the right ratio of samples to number of features is important, since + a tree with few samples in high dimensional space is very likely to overfit. + +* Consider performing dimensionality reduction (:ref:`PCA <PCA>`, + :ref:`ICA <ICA>`, or :ref:`feature_selection`) beforehand to + give your tree a better chance of finding features that are discriminative. + +* :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` will help + in gaining more insights about how the decision tree makes predictions, which is + important for understanding the important features in the data. + +* Visualize your tree as you are training by using the ``export`` + function. Use ``max_depth=3`` as an initial tree depth to get a feel for + how the tree is fitting to your data, and then increase the depth. + +* Remember that the number of samples required to populate the tree doubles + for each additional level the tree grows to. Use ``max_depth`` to control + the size of the tree to prevent overfitting. + +* Use ``min_samples_split`` or ``min_samples_leaf`` to ensure that multiple + samples inform every decision in the tree, by controlling which splits will + be considered. A very small number will usually mean the tree will overfit, + whereas a large number will prevent the tree from learning the data. Try + ``min_samples_leaf=5`` as an initial value. If the sample size varies + greatly, a float number can be used as percentage in these two parameters. 
+ While ``min_samples_split`` can create arbitrarily small leaves, + ``min_samples_leaf`` guarantees that each leaf has a minimum size, avoiding + low-variance, over-fit leaf nodes in regression problems. For + classification with few classes, ``min_samples_leaf=1`` is often the best + choice. + + Note that ``min_samples_split`` considers samples directly and independent of + ``sample_weight``, if provided (e.g. a node with m weighted samples is still + treated as having exactly m samples). Consider ``min_weight_fraction_leaf`` or + ``min_impurity_decrease`` if accounting for sample weights is required at splits. + +* Balance your dataset before training to prevent the tree from being biased + toward the classes that are dominant. Class balancing can be done by + sampling an equal number of samples from each class, or preferably by + normalizing the sum of the sample weights (``sample_weight``) for each + class to the same value. Also note that weight-based pre-pruning criteria, + such as ``min_weight_fraction_leaf``, will then be less biased toward + dominant classes than criteria that are not aware of the sample weights, + like ``min_samples_leaf``. + +* If the samples are weighted, it will be easier to optimize the tree + structure using weight-based pre-pruning criterion such as + ``min_weight_fraction_leaf``, which ensure that leaf nodes contain at least + a fraction of the overall sum of the sample weights. + +* All decision trees use ``np.float32`` arrays internally. + If training data is not in this format, a copy of the dataset will be made. + +* If the input matrix X is very sparse, it is recommended to convert to sparse + ``csc_matrix`` before calling fit and sparse ``csr_matrix`` before calling + predict. Training time can be orders of magnitude faster for a sparse + matrix input compared to a dense matrix when features have zero values in + most of the samples. + + +.. 
_tree_algorithms: + +## Tree algorithms: ID3, C4.5, C5.0 and CART + +What are all the various decision tree algorithms and how do they differ +from each other? Which one is implemented in scikit-learn? + +.. dropdown:: Various decision tree algorithms + + ID3_ (Iterative Dichotomiser 3) was developed in 1986 by Ross Quinlan. + The algorithm creates a multiway tree, finding for each node (i.e. in + a greedy manner) the categorical feature that will yield the largest + information gain for categorical targets. Trees are grown to their + maximum size and then a pruning step is usually applied to improve the + ability of the tree to generalize to unseen data. + + C4.5 is the successor to ID3 and removed the restriction that features + must be categorical by dynamically defining a discrete attribute (based + on numerical variables) that partitions the continuous attribute value + into a discrete set of intervals. C4.5 converts the trained trees + (i.e. the output of the ID3 algorithm) into sets of if-then rules. + The accuracy of each rule is then evaluated to determine the order + in which they should be applied. Pruning is done by removing a rule's + precondition if the accuracy of the rule improves without it. + + C5.0 is Quinlan's latest version, released under a proprietary license. + It uses less memory and builds smaller rulesets than C4.5 while being + more accurate. + + CART (Classification and Regression Trees) is very similar to C4.5, but + it differs in that it supports numerical target variables (regression) and + does not compute rule sets. CART constructs binary trees using the feature + and threshold that yield the largest information gain at each node. + +scikit-learn uses an optimized version of the CART algorithm; however, the +scikit-learn implementation does not support categorical variables for now. + +.. _ID3: https://en.wikipedia.org/wiki/ID3_algorithm + + +..
_tree_mathematical_formulation: + +## Mathematical formulation + +Given training vectors :math:`x_i \in R^n`, i=1,..., l and a label vector +:math:`y \in R^l`, a decision tree recursively partitions the feature space +such that the samples with the same labels or similar target values are grouped +together. + +Let the data at node :math:`m` be represented by :math:`Q_m` with :math:`n_m` +samples. For each candidate split :math:`\theta = (j, t_m)` consisting of a +feature :math:`j` and threshold :math:`t_m`, partition the data into +:math:`Q_m^{left}(\theta)` and :math:`Q_m^{right}(\theta)` subsets + +$$ +\begin{aligned} +Q_m^{left}(\theta) &= \{(x, y) | x_j \leq t_m\} \\ +Q_m^{right}(\theta) &= Q_m \setminus Q_m^{left}(\theta) +\end{aligned} +$$ + +The quality of a candidate split of node :math:`m` is then computed using an +impurity function or loss function :math:`H()`, the choice of which depends on +the task being solved (classification or regression) + +$$ +G(Q_m, \theta) = \frac{n_m^{left}}{n_m} H(Q_m^{left}(\theta)) ++ \frac{n_m^{right}}{n_m} H(Q_m^{right}(\theta)) +$$ + +Select the parameters that minimise the impurity + +$$ +\theta^* = \operatorname{argmin}_\theta G(Q_m, \theta) +$$ + +Recurse for subsets :math:`Q_m^{left}(\theta^*)` and +:math:`Q_m^{right}(\theta^*)` until the maximum allowable depth is reached, +:math:`n_m < \min_{samples}` or :math:`n_m = 1`. + +## Classification criteria + +If a target is a classification outcome taking on values 0,1,...,K-1, +for node :math:`m`, let + +$$ +p_{mk} = \frac{1}{n_m} \sum_{y \in Q_m} I(y = k) +$$ + +be the proportion of class k observations in node :math:`m`. If :math:`m` is a +terminal node, `predict_proba` for this region is set to :math:`p_{mk}`. +Common measures of impurity are the following. + +Gini: + +$$ +H(Q_m) = \sum_k p_{mk} (1 - p_{mk}) +$$ + +Log Loss or Entropy: + +$$ +H(Q_m) = - \sum_k p_{mk} \log(p_{mk}) +$$ + +..
dropdown:: Shannon entropy + + The entropy criterion computes the Shannon entropy of the possible classes. It + takes the class frequencies of the training data points that reached a given + leaf :math:`m` as their probability. Using the **Shannon entropy as tree node + splitting criterion is equivalent to minimizing the log loss** (also known as + cross-entropy and multinomial deviance) between the true labels :math:`y_i` + and the probabilistic predictions :math:`T_k(x_i)` of the tree model :math:`T` for class :math:`k`. + + To see this, first recall that the log loss of a tree model :math:`T` + computed on a dataset :math:`D` is defined as follows: + +$$ +\mathrm{LL}(D, T) = -\frac{1}{n} \sum_{(x_i, y_i) \in D} \sum_k I(y_i = k) \log(T_k(x_i)) +$$ + + where :math:`D` is a training dataset of :math:`n` pairs :math:`(x_i, y_i)`. + + In a classification tree, the predicted class probabilities within leaf nodes + are constant, that is: for all :math:`(x_i, y_i) \in Q_m`, one has: + :math:`T_k(x_i) = p_{mk}` for each class :math:`k`. + + This property makes it possible to rewrite :math:`\mathrm{LL}(D, T)` as the + sum of the Shannon entropies computed for each leaf of :math:`T` weighted by + the number of training data points that reached each leaf: + +$$ +\mathrm{LL}(D, T) = \sum_{m \in T} \frac{n_m}{n} H(Q_m) +$$ + +## Regression criteria + +If the target is a continuous value, then for node :math:`m`, common +criteria to minimize when determining locations for future splits are Mean +Squared Error (MSE or L2 error), Poisson deviance as well as Mean Absolute +Error (MAE or L1 error). MSE and Poisson deviance both set the predicted value +of terminal nodes to the learned mean value :math:`\bar{y}_m` of the node +whereas the MAE sets the predicted value of terminal nodes to the median +:math:`median(y)_m`.
+ +Mean Squared Error: + +$$ +\bar{y}_m = \frac{1}{n_m} \sum_{y \in Q_m} y +$$ + +$$ +H(Q_m) = \frac{1}{n_m} \sum_{y \in Q_m} (y - \bar{y}_m)^2 +$$ + +Mean Poisson deviance: + +$$ +H(Q_m) = \frac{2}{n_m} \sum_{y \in Q_m} (y \log\frac{y}{\bar{y}_m} - y + \bar{y}_m) +$$ + +Setting `criterion="poisson"` might be a good choice if your target is a count +or a frequency (count per some unit). In any case, :math:`y \ge 0` is a +necessary condition to use this criterion. Note that it fits much slower than +the MSE criterion. For performance reasons the actual implementation minimizes +the half mean poisson deviance, i.e. the mean poisson deviance divided by 2. + +Mean Absolute Error: + +$$ +median(y)_m = \underset{y \in Q_m}{\mathrm{median}}(y) +$$ + +$$ +H(Q_m) = \frac{1}{n_m} \sum_{y \in Q_m} |y - median(y)_m| +$$ + +Note that it fits much slower than the MSE criterion. + +.. _tree_missing_value_support: + +## Missing Values Support + +:class:`DecisionTreeClassifier` and :class:`DecisionTreeRegressor` +have built-in support for missing values when `splitter='best'` and criterion is +`'gini'`, `'entropy'`, or `'log_loss'`, for classification or +`'squared_error'`, `'friedman_mse'`, or `'poisson'` for regression. + +For each potential threshold on the non-missing data, the splitter will evaluate +the split with all the missing values going to the left node or the right node. + +Decisions are made as follows: + +- By default when predicting, the samples with missing values are classified + with the class used in the split found during training:: + +```{python} +from sklearn.tree import DecisionTreeClassifier +import numpy as np + +X = np.array([0, 1, 6, np.nan]).reshape(-1, 1) +y = [0, 0, 1, 1] + +tree = DecisionTreeClassifier(random_state=0).fit(X, y) +tree.predict(X) +``` + + array([0, 0, 1, 1]) + +- If the criterion evaluation is the same for both nodes, + then the tie for missing value at predict time is broken by going to the + right node.
The splitter also checks the split where all the missing + values go to one child and non-missing values go to the other:: + +```{python} +from sklearn.tree import DecisionTreeClassifier +import numpy as np + +X = np.array([np.nan, -1, np.nan, 1]).reshape(-1, 1) +y = [0, 0, 1, 1] + +tree = DecisionTreeClassifier(random_state=0).fit(X, y) + +X_test = np.array([np.nan]).reshape(-1, 1) +tree.predict(X_test) +``` + + array([1]) + +- If no missing values are seen during training for a given feature, then during + prediction missing values are mapped to the child with the most samples:: + +```{python} +from sklearn.tree import DecisionTreeClassifier +import numpy as np + +X = np.array([0, 1, 2, 3]).reshape(-1, 1) +y = [0, 1, 1, 1] + +tree = DecisionTreeClassifier(random_state=0).fit(X, y) + +X_test = np.array([np.nan]).reshape(-1, 1) +tree.predict(X_test) +``` + + array([1]) + +.. _minimal_cost_complexity_pruning: + +## Minimal Cost-Complexity Pruning + +Minimal cost-complexity pruning is an algorithm used to prune a tree to avoid +over-fitting, described in Chapter 3 of [BRE]_. This algorithm is parameterized +by :math:`\alpha\ge0` known as the complexity parameter. The complexity +parameter is used to define the cost-complexity measure, :math:`R_\alpha(T)` of +a given tree :math:`T`: + +$$ +R_\alpha(T) = R(T) + \alpha|\widetilde{T}| +$$ + +where :math:`|\widetilde{T}|` is the number of terminal nodes in :math:`T` and :math:`R(T)` +is traditionally defined as the total misclassification rate of the terminal +nodes. Alternatively, scikit-learn uses the total sample weighted impurity of +the terminal nodes for :math:`R(T)`. As shown above, the impurity of a node +depends on the criterion. Minimal cost-complexity pruning finds the subtree of +:math:`T` that minimizes :math:`R_\alpha(T)`. + +The cost complexity measure of a single node is +:math:`R_\alpha(t)=R(t)+\alpha`. The branch, :math:`T_t`, is defined to be a +tree where node :math:`t` is its root. 
In general, the impurity of a node +is greater than the sum of impurities of its terminal nodes, +:math:`R(T_t)\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dateApplianceslightsT1RH_1T2RH_2T3RH_3T4...T9RH_9T_outPress_mm_hgRH_outWindspeedVisibilityTdewpointrv1rv2
02016-01-11 17:00:00603019.8947.59666719.244.79000019.7944.73000019.000000...17.03333345.536.600000733.592.07.00000063.0000005.313.27543313.275433
12016-01-11 17:10:00603019.8946.69333319.244.72250019.7944.79000019.000000...17.06666745.566.483333733.692.06.66666759.1666675.218.60619518.606195
22016-01-11 17:20:00503019.8946.30000019.244.62666719.7944.93333318.926667...17.00000045.506.366667733.792.06.33333355.3333335.128.64266828.642668
32016-01-11 17:30:00504019.8946.06666719.244.59000019.7945.00000018.890000...17.00000045.406.250000733.892.06.00000051.5000005.045.41038945.410389
42016-01-11 17:40:00604019.8946.33333319.244.53000019.7945.00000018.890000...17.00000045.406.133333733.992.05.66666747.6666674.910.08409710.084097
\n", + "

5 rows × 29 columns

\n", + "" + ], + "text/plain": [ + " date Appliances lights T1 RH_1 T2 RH_2 \\\n", + "0 2016-01-11 17:00:00 60 30 19.89 47.596667 19.2 44.790000 \n", + "1 2016-01-11 17:10:00 60 30 19.89 46.693333 19.2 44.722500 \n", + "2 2016-01-11 17:20:00 50 30 19.89 46.300000 19.2 44.626667 \n", + "3 2016-01-11 17:30:00 50 40 19.89 46.066667 19.2 44.590000 \n", + "4 2016-01-11 17:40:00 60 40 19.89 46.333333 19.2 44.530000 \n", + "\n", + " T3 RH_3 T4 ... T9 RH_9 T_out Press_mm_hg \\\n", + "0 19.79 44.730000 19.000000 ... 17.033333 45.53 6.600000 733.5 \n", + "1 19.79 44.790000 19.000000 ... 17.066667 45.56 6.483333 733.6 \n", + "2 19.79 44.933333 18.926667 ... 17.000000 45.50 6.366667 733.7 \n", + "3 19.79 45.000000 18.890000 ... 17.000000 45.40 6.250000 733.8 \n", + "4 19.79 45.000000 18.890000 ... 17.000000 45.40 6.133333 733.9 \n", + "\n", + " RH_out Windspeed Visibility Tdewpoint rv1 rv2 \n", + "0 92.0 7.000000 63.000000 5.3 13.275433 13.275433 \n", + "1 92.0 6.666667 59.166667 5.2 18.606195 18.606195 \n", + "2 92.0 6.333333 55.333333 5.1 28.642668 28.642668 \n", + "3 92.0 6.000000 51.500000 5.0 45.410389 45.410389 \n", + "4 92.0 5.666667 47.666667 4.9 10.084097 10.084097 \n", + "\n", + "[5 rows x 29 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "\n", + "file_path = 'energydata_complete.csv'\n", + "url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv\"\n", + "\n", + "if os.path.exists(file_path):\n", + " data = pd.read_csv(file_path)\n", + "else:\n", + " data = pd.read_csv(url)\n", + " data.to_csv(file_path, index=False)\n", + "\n", + "data.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "# Save the dataframe locally if it doesn't exist\n", + "if not os.path.exists(file_path):\n", + " data.to_csv(file_path, index=False)" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "### Column Descriptions\n", + "\n", + "| Column | Description |\n", + "| ---- | ----------- |\n", + "| date | time year-month-day hour:minute:second |\n", + "| Appliances | energy use in Wh |\n", + "| lights | energy use of light fixtures in the house in Wh |\n", + "| T1 | Temperature in kitchen area, in Celsius |\n", + "| RH_1 | Humidity in kitchen area, in % |\n", + "| T2 | Temperature in living room area, in Celsius |\n", + "| RH_2 | Humidity in living room area, in % |\n", + "| T3 | Temperature in laundry room area |\n", + "| RH_3 | Humidity in laundry room area, in % |\n", + "| T4 | Temperature in office room, in Celsius |\n", + "| RH_4 | Humidity in office room, in % |\n", + "| T5 | Temperature in bathroom, in Celsius |\n", + "| RH_5 | Humidity in bathroom, in % |\n", + "| T6 | Temperature outside the building (north side), in Celsius |\n", + "| RH_6 | Humidity outside the building (north side), in % |\n", + "| T7 | Temperature in ironing room , in Celsius |\n", + "| RH_7 | Humidity in ironing room, in % |\n", + "| T8 | Temperature in teenager room 2, in Celsius |\n", + "| RH_8 | Humidity in teenager room 2, in % |\n", + "| T9 | Temperature in parents room, in Celsius |\n", + "| RH_9 | Humidity in parents room, in % |\n", + "| To | Temperature outside (from Chievres weather station), in Celsius |\n", + "| Pressure | (from Chievres weather station), in mm Hg |\n", + "| RH_out | Humidity outside (from Chievres weather station), in % |\n", + "| Wind speed | (from Chievres weather station), in m/s |\n", + "| Visibility | (from Chievres weather station), in km |\n", + "| Tdewpoint | (from Chievres weather station), °C |\n", + "| rv1 | Random variable 1, nondimensional |\n", + "| rv2 | Random variable 2, nondimensional |\n", + "\n", + "Where indicated, hourly data (then interpolated) from the nearest airport weather station (Chievres Airport, Belgium) was downloaded from a public data set from Reliable Prognosis, rp5.ru. 
Permission was obtained from Reliable Prognosis for the distribution of the 4.5 months of weather data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ApplianceslightsT1RH_1T2RH_2T3RH_3T4RH_4...T9RH_9T_outPress_mm_hgRH_outWindspeedVisibilityTdewpointrv1rv2
date
2016-01-11 17:00:00603019.8947.59666719.244.79000019.7944.73000019.00000045.566667...17.03333345.536.600000733.592.07.00000063.0000005.313.27543313.275433
2016-01-11 17:10:00603019.8946.69333319.244.72250019.7944.79000019.00000045.992500...17.06666745.566.483333733.692.06.66666759.1666675.218.60619518.606195
2016-01-11 17:20:00503019.8946.30000019.244.62666719.7944.93333318.92666745.890000...17.00000045.506.366667733.792.06.33333355.3333335.128.64266828.642668
2016-01-11 17:30:00504019.8946.06666719.244.59000019.7945.00000018.89000045.723333...17.00000045.406.250000733.892.06.00000051.5000005.045.41038945.410389
2016-01-11 17:40:00604019.8946.33333319.244.53000019.7945.00000018.89000045.530000...17.00000045.406.133333733.992.05.66666747.6666674.910.08409710.084097
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " Appliances lights T1 RH_1 T2 RH_2 \\\n", + "date \n", + "2016-01-11 17:00:00 60 30 19.89 47.596667 19.2 44.790000 \n", + "2016-01-11 17:10:00 60 30 19.89 46.693333 19.2 44.722500 \n", + "2016-01-11 17:20:00 50 30 19.89 46.300000 19.2 44.626667 \n", + "2016-01-11 17:30:00 50 40 19.89 46.066667 19.2 44.590000 \n", + "2016-01-11 17:40:00 60 40 19.89 46.333333 19.2 44.530000 \n", + "\n", + " T3 RH_3 T4 RH_4 ... T9 \\\n", + "date ... \n", + "2016-01-11 17:00:00 19.79 44.730000 19.000000 45.566667 ... 17.033333 \n", + "2016-01-11 17:10:00 19.79 44.790000 19.000000 45.992500 ... 17.066667 \n", + "2016-01-11 17:20:00 19.79 44.933333 18.926667 45.890000 ... 17.000000 \n", + "2016-01-11 17:30:00 19.79 45.000000 18.890000 45.723333 ... 17.000000 \n", + "2016-01-11 17:40:00 19.79 45.000000 18.890000 45.530000 ... 17.000000 \n", + "\n", + " RH_9 T_out Press_mm_hg RH_out Windspeed \\\n", + "date \n", + "2016-01-11 17:00:00 45.53 6.600000 733.5 92.0 7.000000 \n", + "2016-01-11 17:10:00 45.56 6.483333 733.6 92.0 6.666667 \n", + "2016-01-11 17:20:00 45.50 6.366667 733.7 92.0 6.333333 \n", + "2016-01-11 17:30:00 45.40 6.250000 733.8 92.0 6.000000 \n", + "2016-01-11 17:40:00 45.40 6.133333 733.9 92.0 5.666667 \n", + "\n", + " Visibility Tdewpoint rv1 rv2 \n", + "date \n", + "2016-01-11 17:00:00 63.000000 5.3 13.275433 13.275433 \n", + "2016-01-11 17:10:00 59.166667 5.2 18.606195 18.606195 \n", + "2016-01-11 17:20:00 55.333333 5.1 28.642668 28.642668 \n", + "2016-01-11 17:30:00 51.500000 5.0 45.410389 45.410389 \n", + "2016-01-11 17:40:00 47.666667 4.9 10.084097 10.084097 \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "data['date'] = pd.to_datetime(data['date'])\n", + "data.set_index('date', inplace=True)\n", + "\n", + "data.head()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We're interested in the `Appliances` 
column, which is the energy use of the appliances in Wh. \n", + "\n", + "First, we'll resample the data to hourly resolution and fill missing values using the forward fill method." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/ly/jkydg4dj2vs93b_ds7yp5t7r0000gn/T/ipykernel_38583/973969212.py:1: FutureWarning: Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.\n", + " data = data['Appliances'].resample('h').mean().fillna(method='ffill') # Resample and fill missing\n" + ] + }, + { + "data": { + "text/plain": [ + "date\n", + "2016-01-11 17:00:00 55.000000\n", + "2016-01-11 18:00:00 176.666667\n", + "2016-01-11 19:00:00 173.333333\n", + "2016-01-11 20:00:00 125.000000\n", + "2016-01-11 21:00:00 103.333333\n", + "Freq: h, Name: Appliances, dtype: float64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = data['Appliances'].resample('h').mean().fillna(method='ffill') # Resample and fill missing\n", + "\n", + "data.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Scale the values to be between 0 and 1 and convert to a numpy array." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "(3290, 1)\n" + ] + } + ], + "source": [ + "# Normalize data\n", + "scaler = MinMaxScaler()\n", + "data_scaled = scaler.fit_transform(data.values.reshape(-1, 1))\n", + "\n", + "print(type(data_scaled))\n", + "print(data_scaled.shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Prepare data for LSTM\n", + "class TimeSeriesDataset(Dataset):\n", + " def __init__(self, data, seq_length):\n", + " self.data = data\n", + " self.seq_length = seq_length\n", + "\n", + " def __len__(self):\n", + " return len(self.data) - self.seq_length\n", + "\n", + " def __getitem__(self, index):\n", + " X = self.data[index:index + self.seq_length]\n", + " y = self.data[index + self.seq_length]\n", + " return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3266\n" + ] + } + ], + "source": [ + "\n", + "seq_length = 24\n", + "dataset = TimeSeriesDataset(data_scaled, seq_length)\n", + "\n", + "print(len(dataset))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "82\n", + "21\n" + ] + } + ], + "source": [ + "\n", + "# Split data into training and testing\n", + "train_size = int(len(dataset) * 0.8)\n", + "test_size = len(dataset) - train_size\n", + "\n", + "train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])\n", + "\n", + "train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)\n", + "test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)\n", + "\n", + "print(len(train_loader))\n", + 
"print(len(test_loader))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# let's look at the first batch\n", + "for X, y in train_loader:\n", + " print(X.shape)\n", + " print(y.shape)\n", + " break\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Define the LSTM model\n", + "class LSTMModel(nn.Module):\n", + " def __init__(self, input_size=1, hidden_size=50, output_size=1):\n", + " super(LSTMModel, self).__init__()\n", + " self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)\n", + " self.fc = nn.Linear(hidden_size, output_size)\n", + "\n", + " def forward(self, x):\n", + " x, _ = self.lstm(x)\n", + " x = self.fc(x[:, -1, :]) # Use the output of the last time step\n", + " return x\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "model = LSTMModel()\n", + "criterion = nn.MSELoss()\n", + "optimizer = torch.optim.Adam(model.parameters(), lr=0.001)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Train the model\n", + "epochs = 20\n", + "for epoch in range(epochs):\n", + " model.train()\n", + " train_loss = 0.0\n", + " for X, y in train_loader:\n", + " X = X.unsqueeze(-1) # Add input dimension\n", + " y = y.unsqueeze(-1) # Add target dimension\n", + "\n", + " optimizer.zero_grad()\n", + " outputs = model(X)\n", + " loss = criterion(outputs, y)\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " train_loss += loss.item()\n", + "\n", + " print(f\"Epoch {epoch+1}/{epochs}, Loss: {train_loss/len(train_loader):.4f}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Evaluate the model\n", + "model.eval()\n", + "predictions = []\n", + "actuals = []\n", + "with torch.no_grad():\n", + " for X, y in 
test_loader:\n", + " X = X.unsqueeze(-1)\n", + " y = y.unsqueeze(-1)\n", + " preds = model(X)\n", + " predictions.extend(preds.numpy())\n", + " actuals.extend(y.numpy())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Rescale predictions and actuals to original scale\n", + "predictions_rescaled = scaler.inverse_transform(predictions)\n", + "actuals_rescaled = scaler.inverse_transform(actuals)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Plot results\n", + "plt.figure(figsize=(10, 6))\n", + "plt.plot(actuals_rescaled, label='True Values')\n", + "plt.plot(predictions_rescaled, label='Predicted Values', alpha=0.7)\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Case Study and Discussion\n", + "\n", + "## Real-world case study: Application of time series analysis\n", + "\n", + "- **Case Study**: Let's explore a real-world case study where time series analysis is applied.\n", + " - **Industry**: Choose an industry (e.g., finance, healthcare, retail).\n", + " - **Problem Statement**: Define the problem that needs to be addressed using time series analysis.\n", + " - **Data Collection**: Describe the data collection process and the type of data used.\n", + " - **Model Selection**: Select appropriate time series models for the analysis.\n", + " - **Analysis**: Perform the time series analysis and interpret the results.\n", + " - **Outcome**: Discuss the outcomes and how the analysis helped in decision-making.\n", + " \n", + "## Group discussion on potential projects or applications\n", + "\n", + "- **Group Discussion**: Let's engage in a group discussion to brainstorm potential projects or applications of time series analysis.\n", + " - **Project Ideas**: Share and discuss various project ideas that can benefit from time series analysis.\n", + " 
- **Application Areas**: Identify different application areas such as finance, healthcare, retail, and more.\n", + " - **Challenges**: Discuss the potential challenges and limitations of applying time series analysis in these projects.\n", + " - **Collaboration**: Explore opportunities for collaboration and knowledge sharing within the group." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/ds701_book/timeseries-chatgpt.qmd b/ds701_book/timeseries-chatgpt.qmd new file mode 100644 index 00000000..baed192c --- /dev/null +++ b/ds701_book/timeseries-chatgpt.qmd @@ -0,0 +1,391 @@ +--- +title: Classic and Deep Learning Time Series Forecasting +jupyter: python3 +--- + + + +## ARIMA/SARIMA Example + +```{python} +import pandas as pd +import numpy as np +from statsmodels.tsa.seasonal import seasonal_decompose +from statsmodels.tsa.statespace.sarimax import SARIMAX +import matplotlib.pyplot as plt +``` + +```{python} +path = '~/.cache/kagglehub/datasets/chirag19/air-passengers/versions/1/AirPassengers.csv' + +data = pd.read_csv(path) + +data.head() +``` + +```{python} +# Load the dataset +# from statsmodels.datasets.airline import load_pandas +#data = load_pandas().data +data['Month'] = pd.date_range(start='1949-01', periods=len(data), freq='ME') +data.set_index('Month', inplace=True) + +data.head() +``` + +```{python} + +# Log transform to stabilize variance +data['Log_Passengers'] = np.log(data['#Passengers']) + +data.head() +``` + +```{python} +# Seasonal decomposition +decomposition = seasonal_decompose(data['Log_Passengers'], model='additive') +decomposition.plot() +plt.show() +``` + 
+```{python}
+# SARIMA model
+model = SARIMAX(data['Log_Passengers'],
+                order=(1, 1, 1),
+                seasonal_order=(1, 1, 1, 12),
+                freq='ME')
+results = model.fit()
+
+# Summary and diagnostics
+print(results.summary())
+results.plot_diagnostics(figsize=(15, 10))
+plt.show()
+```
+
+```{python}
+
+# Forecasting
+forecast = results.get_forecast(steps=24)
+forecast_index = pd.date_range(data.index[-1] + pd.DateOffset(months=1), periods=24, freq='ME')
+forecast_values = np.exp(forecast.predicted_mean)  # Convert back from log
+confidence_intervals = np.exp(forecast.conf_int())
+
+# Plot
+plt.figure(figsize=(10, 6))
+plt.plot(data['#Passengers'], label='Observed')
+plt.plot(forecast_index, forecast_values, label='Forecast', color='red')
+plt.fill_between(forecast_index, confidence_intervals.iloc[:, 0], confidence_intervals.iloc[:, 1], color='pink', alpha=0.3)
+plt.legend()
+plt.show()
+```
+
+## Neural Network Example -- TensorFlow/Keras
+
+Dataset: Energy Consumption Dataset (available via UCI Machine Learning Repository)
+
+```{python}
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.preprocessing import MinMaxScaler
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense, LSTM
+```
+
+```{python}
+
+# Load dataset
+url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv"
+data = pd.read_csv(url)
+data['date'] = pd.to_datetime(data['date'])
+data.set_index('date', inplace=True)
+# Resample to hourly means and forward-fill gaps. The lowercase 'h' alias and
+# .ffill() replace the deprecated 'H' alias and fillna(method='ffill'); this
+# also matches the spelling used in the PyTorch example in this document.
+data = data['Appliances'].resample('h').mean().ffill()
+
+# Normalize data
+scaler = MinMaxScaler()
+data_scaled = scaler.fit_transform(data.values.reshape(-1, 1))
+
+# Prepare data for LSTM
+def create_sequences(data, seq_length):
+    """Build sliding-window samples from a sequence.
+
+    For every start position ``i`` with a value available right after the
+    window, the input is ``data[i:i + seq_length]`` and the target is
+    ``data[i + seq_length]``.
+
+    Parameters
+    ----------
+    data : sliceable sequence (here the scaled array produced above)
+    seq_length : int
+        Number of consecutive steps in each input window.
+
+    Returns
+    -------
+    tuple of np.ndarray
+        ``(X, y)`` with one row per window; both are empty when ``data`` has
+        no more than ``seq_length`` entries.
+    """
+    X, y = [], []
+    for i in range(len(data) - seq_length):
+        X.append(data[i:i + seq_length])
+        y.append(data[i + seq_length])
+    return np.array(X), np.array(y)
+
+seq_length = 24
+X, y = create_sequences(data_scaled, seq_length)
+
+# Chronological 80/20 split: the earliest windows train, the latest test.
+# X and y have the same length, so one split index serves both.
+split = int(len(X) * 0.8)
+X_train, X_test = X[:split], X[split:]
+y_train, y_test = y[:split], y[split:]
+
+# LSTM model
+model = Sequential([
+    LSTM(50, activation='relu', input_shape=(seq_length, 1)),
+    Dense(1)
+])
+model.compile(optimizer='adam', loss='mse')
+
+# Train model
+history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=32, verbose=1)
+
+# Evaluate and predict
+predictions = model.predict(X_test)
+predictions_rescaled = scaler.inverse_transform(predictions)
+y_test_rescaled = scaler.inverse_transform(y_test.reshape(-1, 1))
+
+# Plot results
+plt.figure(figsize=(10, 6))
+plt.plot(y_test_rescaled, label='True Values')
+plt.plot(predictions_rescaled, label='Predicted Values', alpha=0.7)
+plt.legend()
+plt.show()
+```
+
+## Neural Network Example -- PyTorch
+
+Dataset: Energy Consumption Dataset (UCI Machine Learning Repository)
+
+```{python}
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.preprocessing import MinMaxScaler
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader, Dataset
+```
+
+Load dataset from https://archive.ics.uci.edu/dataset/374/appliances+energy+prediction
+
+```{python}
+import os
+
+file_path = 'energydata_complete.csv'
+url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv"
+
+# Use the local copy when present; otherwise download once and cache it.
+if os.path.exists(file_path):
+    data = pd.read_csv(file_path)
+else:
+    data = pd.read_csv(url)
+    data.to_csv(file_path, index=False)
+
+data.head()
+```
+
+```{python}
+# Save the dataframe locally if it doesn't exist
+if not os.path.exists(file_path):
+    data.to_csv(file_path, index=False)
+```
+
+### Column Descriptions
+
+| Column | Description |
+| ---- | ----------- |
+| date | time year-month-day hour:minute:second |
+| Appliances | energy use in Wh |
+| lights | energy use of light fixtures in the house in Wh |
+| T1 | Temperature in kitchen area, in Celsius |
+| RH_1 | Humidity in kitchen area, in % |
+| T2 | Temperature in living room area, in Celsius |
+| RH_2 | Humidity in living room area, in % |
+| T3 | Temperature in laundry room area |
+| RH_3 | Humidity in laundry room area, in % |
+| T4 | Temperature in office room, in Celsius |
+| RH_4 | Humidity in office room, in % |
+| T5 | Temperature in bathroom, in Celsius |
+| RH_5 | Humidity in bathroom, in % |
+| T6 | Temperature outside the building (north side), in Celsius |
+| RH_6 | Humidity outside the building (north side), in % |
+| T7 | Temperature in ironing room , in Celsius |
+| RH_7 | Humidity in ironing room, in % |
+| T8 | Temperature in teenager room 2, in Celsius |
+| RH_8 | Humidity in teenager room 2, in % |
+| T9 | Temperature in parents room, in Celsius |
+| RH_9 | Humidity in parents room, in % |
+| To | Temperature outside (from Chievres weather station), in Celsius |
+| Pressure | (from Chievres weather station), in mm Hg |
+| RH_out | Humidity outside (from Chievres weather station), in % |
+| Wind speed | (from Chievres weather station), in m/s |
+| Visibility | (from Chievres weather station), in km |
+| Tdewpoint | (from Chievres weather station), °C |
+| rv1 | Random variable 1, nondimensional |
+| rv2 | Random variable 2, nondimensional |
+
+Where indicated, hourly data (then interpolated) from the nearest airport weather station (Chievres Airport, Belgium) was downloaded from a public data set from Reliable Prognosis, rp5.ru. Permission was obtained from Reliable Prognosis for the distribution of the 4.5 months of weather data.
+
+```{python}
+data.info()
+```
+
+```{python}
+
+data['date'] = pd.to_datetime(data['date'])
+data.set_index('date', inplace=True)
+
+data.head()
+```
+
+We're interested in the `Appliances` column, which is the energy use of the appliances in Wh.
+
+First, we'll resample the data to hourly resolution and fill missing values using the forward fill method.
+
+```{python}
+# Resample to hourly means and forward-fill gaps.
+# .ffill() replaces the deprecated fillna(method='ffill').
+data = data['Appliances'].resample('h').mean().ffill()
+
+data.head()
+```
+
+Scale the values to be between 0 and 1 and convert to a numpy array.
+
+```{python}
+# Normalize data
+scaler = MinMaxScaler()
+data_scaled = scaler.fit_transform(data.values.reshape(-1, 1))
+
+print(type(data_scaled))
+print(data_scaled.shape)
+```
+
+```{python}
+
+# Prepare data for LSTM
+class TimeSeriesDataset(Dataset):
+    """Sliding-window dataset over a time series.
+
+    Item ``index`` is the pair ``(window, target)`` where ``window`` is
+    ``data[index:index + seq_length]`` and ``target`` is the value that
+    immediately follows it, ``data[index + seq_length]``. Both are returned
+    as ``float32`` tensors.
+    """
+
+    def __init__(self, data, seq_length):
+        self.data = data
+        self.seq_length = seq_length
+
+    def __len__(self):
+        # One sample per window that still has a target after it; never
+        # negative, so len() stays valid when the series is too short.
+        return max(0, len(self.data) - self.seq_length)
+
+    def __getitem__(self, index):
+        # Reject out-of-range positions explicitly; a negative index would
+        # otherwise slice an empty or misaligned window without any error.
+        if not 0 <= index < len(self):
+            raise IndexError(f"index {index} out of range for {len(self)} windows")
+        X = self.data[index:index + self.seq_length]
+        y = self.data[index + self.seq_length]
+        return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)
+```
+
+```{python}
+
+seq_length = 24
+dataset = TimeSeriesDataset(data_scaled, seq_length)
+
+print(len(dataset))
+```
+
+```{python}
+
+# Split data into training and testing
+train_size = int(len(dataset) * 0.8)
+test_size = len(dataset) - train_size
+
+# Split chronologically (first 80% train, last 20% test), as in the Keras
+# example. A random split would put windows that overlap the test period into
+# the training set and would scramble the time order of the final plot.
+train_dataset = torch.utils.data.Subset(dataset, range(train_size))
+test_dataset = torch.utils.data.Subset(dataset, range(train_size, train_size + test_size))
+
+train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
+test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
+
+print(len(train_loader))
+print(len(test_loader))
+```
+
+```{python}
+# let's look at the first batch
+for X, y in train_loader:
+    print(X.shape)
+    print(y.shape)
+    break
+```
+
+```{python}
+
+# Define the LSTM model
+class LSTMModel(nn.Module):
+    """Single-layer LSTM followed by a linear read-out of the last time step.
+
+    ``forward`` takes ``x`` of shape ``(batch, seq_len, input_size)`` (the
+    LSTM is built with ``batch_first=True``) and returns a tensor of shape
+    ``(batch, output_size)``.
+    """
+
+    def __init__(self, input_size=1, hidden_size=50, output_size=1):
+        super().__init__()
+        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
+        self.fc = nn.Linear(hidden_size, output_size)
+
+    def forward(self, x):
+        x, _ = self.lstm(x)
+        x = self.fc(x[:, -1, :])  # Use the output of the last time step
+        return x
+```
+
+```{python}
+model = LSTMModel()
+criterion = nn.MSELoss()
+optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
+```
+
+```{python}
+# Train the model
+epochs = 20
+for epoch in range(epochs):
+    model.train()
+    train_loss = 0.0
+    for X, y in train_loader:
+        # data_scaled is (N, 1), so batches already arrive as
+        # X: (batch, seq_length, 1) and y: (batch, 1). No unsqueeze is needed:
+        # an extra axis would make X 4-D, which nn.LSTM rejects, and would
+        # leave y with a different shape from the model output.
+        optimizer.zero_grad()
+        outputs = model(X)
+        loss = criterion(outputs, y)
+        loss.backward()
+        optimizer.step()
+
+        train_loss += loss.item()
+
+    print(f"Epoch {epoch+1}/{epochs}, Loss: {train_loss/len(train_loader):.4f}")
+```
+
+```{python}
+
+# Evaluate the model
+model.eval()
+predictions = []
+actuals = []
+with torch.no_grad():
+    for X, y in test_loader:
+        # Same shapes as in training: X (batch, seq_length, 1), y (batch, 1).
+        preds = model(X)
+        predictions.extend(preds.numpy())
+        actuals.extend(y.numpy())
+```
+
+```{python}
+
+# Rescale predictions and actuals to original scale
+# The scaler expects a 2-D (n_samples, 1) array, so stack the per-sample rows.
+predictions_rescaled = scaler.inverse_transform(np.asarray(predictions).reshape(-1, 1))
+actuals_rescaled = scaler.inverse_transform(np.asarray(actuals).reshape(-1, 1))
+```
+
+```{python}
+
+# Plot results
+plt.figure(figsize=(10, 6))
+plt.plot(actuals_rescaled, label='True Values')
+plt.plot(predictions_rescaled, label='Predicted Values', alpha=0.7)
+plt.legend()
+plt.show()
+```
+
+# Case Study and Discussion
+
+## Real-world case study: Application of time series analysis
+
+- **Case Study**: Let's explore a real-world case study where time series analysis is applied.
+  - **Industry**: Choose an industry (e.g., finance, healthcare, retail).
+  - **Problem Statement**: Define the problem that needs to be addressed using time series analysis.
+  - **Data Collection**: Describe the data collection process and the type of data used.
+  - **Model Selection**: Select appropriate time series models for the analysis.
+  - **Analysis**: Perform the time series analysis and interpret the results.
+  - **Outcome**: Discuss the outcomes and how the analysis helped in decision-making.
+ +## Group discussion on potential projects or applications + +- **Group Discussion**: Let's engage in a group discussion to brainstorm potential projects or applications of time series analysis. + - **Project Ideas**: Share and discuss various project ideas that can benefit from time series analysis. + - **Application Areas**: Identify different application areas such as finance, healthcare, retail, and more. + - **Challenges**: Discuss the potential challenges and limitations of applying time series analysis in these projects. + - **Collaboration**: Explore opportunities for collaboration and knowledge sharing within the group. + diff --git a/ds701_book/yahoo_data.csv b/ds701_book/yahoo_data.csv new file mode 100644 index 00000000..3b4f22b7 --- /dev/null +++ b/ds701_book/yahoo_data.csv @@ -0,0 +1,252 @@ +Date,Open,High,Low,Close,Adj Close,Volume +2015-01-02,55.459999084472656,55.599998474121094,54.2400016784668,55.150001525878906,55.150001525878906,1664500 +2015-01-05,54.540000915527344,54.95000076293945,52.33000183105469,52.529998779296875,52.529998779296875,2023000 +2015-01-06,52.54999923706055,53.93000030517578,50.75,52.439998626708984,52.439998626708984,3762800 +2015-01-07,53.31999969482422,53.75,51.7599983215332,52.209999084472656,52.209999084472656,1548200 +2015-01-08,52.59000015258789,54.13999938964844,51.7599983215332,53.83000183105469,53.83000183105469,2015300 +2015-01-09,55.959999084472656,56.9900016784668,54.720001220703125,56.06999969482422,56.06999969482422,6224200 +2015-01-12,56.0,56.060001373291016,53.43000030517578,54.02000045776367,54.02000045776367,2407700 +2015-01-13,54.470001220703125,54.79999923706055,52.52000045776367,53.18000030517578,53.18000030517578,1958400 +2015-01-14,52.79999923706055,53.68000030517578,51.459999084472656,52.20000076293945,52.20000076293945,1854600 +2015-01-15,53.0,53.61000061035156,50.029998779296875,50.119998931884766,50.119998931884766,2647800 
+2015-01-16,50.18000030517578,51.4900016784668,50.029998779296875,51.38999938964844,51.38999938964844,2183300 +2015-01-20,51.650001525878906,51.779998779296875,50.689998626708984,51.40999984741211,51.40999984741211,1235000 +2015-01-21,51.20000076293945,53.5,51.20000076293945,53.40999984741211,53.40999984741211,3248100 +2015-01-22,53.869998931884766,55.279998779296875,53.119998931884766,54.79999923706055,54.79999923706055,2295400 +2015-01-23,54.65999984741211,55.63999938964844,54.29999923706055,55.189998626708984,55.189998626708984,1636400 +2015-01-26,55.119998931884766,55.790000915527344,54.83000183105469,55.40999984741211,55.40999984741211,1450300 +2015-01-27,56.060001373291016,56.15999984741211,54.56999969482422,55.630001068115234,55.630001068115234,2410400 +2015-01-28,56.150001525878906,56.150001525878906,52.91999816894531,53.0,53.0,2013100 +2015-01-29,52.849998474121094,53.310001373291016,51.40999984741211,52.93000030517578,52.93000030517578,1844100 +2015-01-30,52.59000015258789,53.41999816894531,52.04999923706055,52.470001220703125,52.470001220703125,1875400 +2015-02-02,52.939998626708984,53.5,51.209999084472656,53.470001220703125,53.470001220703125,2105500 +2015-02-03,53.83000183105469,55.93000030517578,53.40999984741211,55.779998779296875,55.779998779296875,2885400 +2015-02-04,55.529998779296875,57.06999969482422,55.25,56.7400016784668,56.7400016784668,2498600 +2015-02-05,57.599998474121094,57.70000076293945,56.08000183105469,57.470001220703125,57.470001220703125,4657300 +2015-02-06,47.70000076293945,48.16999816894531,44.86000061035156,45.11000061035156,45.11000061035156,25180900 +2015-02-09,44.90999984741211,45.040000915527344,42.099998474121094,42.16999816894531,42.16999816894531,13079300 +2015-02-10,43.83000183105469,45.54999923706055,43.310001373291016,44.65999984741211,44.65999984741211,11267700 +2015-02-11,45.38999938964844,46.43000030517578,44.810001373291016,46.18000030517578,46.18000030517578,6359400 
+2015-02-12,46.45000076293945,47.84000015258789,45.95000076293945,47.630001068115234,47.630001068115234,4375000 +2015-02-13,48.5099983215332,49.04999923706055,47.220001220703125,47.529998779296875,47.529998779296875,4713100 +2015-02-17,47.439998626708984,48.619998931884766,47.029998779296875,48.189998626708984,48.189998626708984,2390100 +2015-02-18,47.939998626708984,48.689998626708984,47.20000076293945,47.540000915527344,47.540000915527344,2541300 +2015-02-19,47.15999984741211,47.790000915527344,46.869998931884766,47.34000015258789,47.34000015258789,1642200 +2015-02-20,47.400001525878906,47.91999816894531,47.099998474121094,47.790000915527344,47.790000915527344,1688500 +2015-02-23,47.54999923706055,47.7400016784668,46.529998779296875,47.349998474121094,47.349998474121094,2086000 +2015-02-24,46.97999954223633,47.72999954223633,46.619998931884766,47.290000915527344,47.290000915527344,1506500 +2015-02-25,46.939998626708984,47.45000076293945,46.5,47.150001525878906,47.150001525878906,1924600 +2015-02-26,48.630001068115234,48.810001373291016,47.560001373291016,47.75,47.75,3059400 +2015-02-27,48.31999969482422,48.439998626708984,47.04999923706055,48.0,48.0,2118400 +2015-03-02,48.02000045776367,48.459999084472656,47.189998626708984,47.86000061035156,47.86000061035156,1933700 +2015-03-03,47.75,48.97999954223633,47.33000183105469,48.58000183105469,48.58000183105469,2352100 +2015-03-04,48.68000030517578,48.869998931884766,47.310001373291016,47.790000915527344,47.790000915527344,2218800 +2015-03-05,47.689998626708984,48.70000076293945,47.40999984741211,47.939998626708984,47.939998626708984,1696900 +2015-03-06,47.75,48.58000183105469,46.91999816894531,47.04999923706055,47.04999923706055,1994800 +2015-03-09,46.959999084472656,46.959999084472656,45.34000015258789,45.81999969482422,45.81999969482422,2554800 +2015-03-10,45.040000915527344,45.900001525878906,44.25,45.22999954223633,45.22999954223633,2379500 
+2015-03-11,45.08000183105469,46.72999954223633,44.72999954223633,45.72999954223633,45.72999954223633,2227300 +2015-03-12,45.900001525878906,46.81999969482422,45.540000915527344,46.79999923706055,46.79999923706055,1657400 +2015-03-13,46.7599983215332,47.5,46.130001068115234,46.45000076293945,46.45000076293945,2259400 +2015-03-16,46.349998474121094,46.75,45.599998474121094,46.709999084472656,46.709999084472656,1607800 +2015-03-17,46.599998474121094,47.59000015258789,46.310001373291016,47.220001220703125,47.220001220703125,1723000 +2015-03-18,47.040000915527344,47.56999969482422,46.599998474121094,46.81999969482422,46.81999969482422,2514800 +2015-03-19,46.66999816894531,47.2400016784668,44.34000015258789,45.18000030517578,45.18000030517578,9280900 +2015-03-20,45.31999969482422,46.400001525878906,44.86000061035156,44.939998626708984,44.939998626708984,4240400 +2015-03-23,44.86000061035156,47.150001525878906,44.7400016784668,47.029998779296875,47.029998779296875,3681700 +2015-03-24,46.970001220703125,47.36000061035156,46.529998779296875,47.040000915527344,47.040000915527344,2266600 +2015-03-25,47.040000915527344,47.2400016784668,45.72999954223633,45.7599983215332,45.7599983215332,2524700 +2015-03-26,45.650001525878906,46.599998474121094,45.310001373291016,45.709999084472656,45.709999084472656,1692000 +2015-03-27,45.81999969482422,47.349998474121094,45.7599983215332,47.15999984741211,47.15999984741211,1889500 +2015-03-30,47.099998474121094,48.2400016784668,46.709999084472656,47.38999938964844,47.38999938964844,3333800 +2015-03-31,47.029998779296875,47.91999816894531,46.790000915527344,47.349998474121094,47.349998474121094,1922500 +2015-04-01,47.25,47.369998931884766,45.09000015258789,45.5,45.5,3670000 +2015-04-02,45.400001525878906,47.5,45.31999969482422,47.13999938964844,47.13999938964844,2594600 +2015-04-06,46.25,47.81999969482422,46.08000183105469,47.310001373291016,47.310001373291016,1560500 
+2015-04-07,47.279998779296875,48.16999816894531,46.959999084472656,46.9900016784668,46.9900016784668,1434800 +2015-04-08,46.81999969482422,47.779998779296875,46.369998931884766,47.43000030517578,47.43000030517578,1856500 +2015-04-09,47.400001525878906,48.099998474121094,46.779998779296875,47.0,47.0,1184400 +2015-04-10,47.04999923706055,47.720001220703125,46.54999923706055,47.650001525878906,47.650001525878906,1407400 +2015-04-13,47.720001220703125,48.34000015258789,47.27000045776367,47.400001525878906,47.400001525878906,1485900 +2015-04-14,47.36000061035156,47.95000076293945,46.709999084472656,47.599998474121094,47.599998474121094,1692500 +2015-04-15,47.619998931884766,50.20000076293945,47.58000183105469,49.56999969482422,49.56999969482422,5395300 +2015-04-16,49.33000183105469,50.0,48.529998779296875,49.369998931884766,49.369998931884766,2694500 +2015-04-17,48.90999984741211,49.0099983215332,47.939998626708984,48.29999923706055,48.29999923706055,2068500 +2015-04-20,48.45000076293945,48.4900016784668,47.83000183105469,48.25,48.25,1112200 +2015-04-21,48.41999816894531,49.75,48.150001525878906,49.310001373291016,49.310001373291016,1686900 +2015-04-22,49.959999084472656,51.150001525878906,49.790000915527344,50.45000076293945,50.45000076293945,2499300 +2015-04-23,50.5099983215332,50.75,49.70000076293945,50.31999969482422,50.31999969482422,1457100 +2015-04-24,50.83000183105469,51.220001220703125,50.27000045776367,50.61000061035156,50.61000061035156,1433500 +2015-04-27,50.97999954223633,52.5099983215332,50.880001068115234,51.02000045776367,51.02000045776367,2388300 +2015-04-28,51.29999923706055,52.25,51.060001373291016,51.220001220703125,51.220001220703125,2433700 +2015-04-29,51.0,51.72999954223633,50.380001068115234,51.279998779296875,51.279998779296875,6480400 +2015-04-30,41.25,42.290000915527344,38.75,39.38999938964844,39.38999938964844,25307400 +2015-05-01,39.310001373291016,39.88999938964844,38.540000915527344,39.7599983215332,39.7599983215332,5876800 
+2015-05-04,39.689998626708984,39.709999084472656,38.68000030517578,39.61000061035156,39.61000061035156,5171100 +2015-05-05,39.540000915527344,39.9900016784668,38.689998626708984,38.880001068115234,38.880001068115234,2500500 +2015-05-06,38.79999923706055,39.09000015258789,37.90999984741211,38.220001220703125,38.220001220703125,2689200 +2015-05-07,38.220001220703125,48.72999954223633,38.220001220703125,47.0099983215332,47.0099983215332,33831600 +2015-05-08,47.25,50.9900016784668,47.20000076293945,49.93000030517578,49.93000030517578,24155600 +2015-05-11,49.36000061035156,50.290000915527344,47.880001068115234,48.619998931884766,48.619998931884766,10430300 +2015-05-12,48.220001220703125,49.86000061035156,48.150001525878906,48.83000183105469,48.83000183105469,7587400 +2015-05-13,48.83000183105469,49.349998474121094,47.25,47.84000015258789,47.84000015258789,5577100 +2015-05-14,47.849998474121094,48.470001220703125,47.2599983215332,47.349998474121094,47.349998474121094,3539400 +2015-05-15,47.36000061035156,47.56999969482422,46.68000030517578,46.88999938964844,46.88999938964844,3567700 +2015-05-18,45.70000076293945,46.619998931884766,45.5,46.560001373291016,46.560001373291016,2893600 +2015-05-19,46.52000045776367,48.560001373291016,46.0,46.459999084472656,46.459999084472656,4677600 +2015-05-20,46.54999923706055,46.93000030517578,45.75,46.43000030517578,46.43000030517578,1605000 +2015-05-21,46.099998474121094,46.880001068115234,45.849998474121094,46.13999938964844,46.13999938964844,1929800 +2015-05-22,46.0,46.83000183105469,46.0,46.47999954223633,46.47999954223633,1373800 +2015-05-26,46.130001068115234,46.810001373291016,45.36000061035156,45.54999923706055,45.54999923706055,2142300 +2015-05-27,45.849998474121094,46.68000030517578,44.83000183105469,45.52000045776367,45.52000045776367,2544100 +2015-05-28,45.099998474121094,47.91999816894531,44.849998474121094,47.75,47.75,4838900 +2015-05-29,47.15999984741211,48.689998626708984,46.75,47.90999984741211,47.90999984741211,3602000 
+2015-06-01,47.709999084472656,48.27000045776367,46.869998931884766,47.40999984741211,47.40999984741211,2296200 +2015-06-02,47.45000076293945,48.900001525878906,47.310001373291016,48.58000183105469,48.58000183105469,2060700 +2015-06-03,48.65999984741211,48.79999923706055,47.279998779296875,47.560001373291016,47.560001373291016,2200200 +2015-06-04,47.33000183105469,47.86000061035156,46.790000915527344,47.150001525878906,47.150001525878906,3160400 +2015-06-05,47.529998779296875,48.5,47.099998474121094,48.220001220703125,48.220001220703125,2715200 +2015-06-08,47.900001525878906,48.040000915527344,45.619998931884766,45.66999816894531,45.66999816894531,3709500 +2015-06-09,45.970001220703125,46.27000045776367,45.22999954223633,45.439998626708984,45.439998626708984,2635900 +2015-06-10,45.810001373291016,45.810001373291016,44.400001525878906,44.459999084472656,44.459999084472656,3719400 +2015-06-11,44.61000061035156,44.86000061035156,43.40999984741211,43.540000915527344,43.540000915527344,5014700 +2015-06-12,43.380001068115234,44.29999923706055,43.25,44.040000915527344,44.040000915527344,3401900 +2015-06-15,43.4900016784668,44.38999938964844,43.4900016784668,44.0,44.0,2341400 +2015-06-16,43.939998626708984,46.220001220703125,43.83000183105469,44.70000076293945,44.70000076293945,5072500 +2015-06-17,44.939998626708984,45.189998626708984,44.18000030517578,44.599998474121094,44.599998474121094,1711700 +2015-06-18,44.5099983215332,45.83000183105469,44.36000061035156,45.439998626708984,45.439998626708984,2045600 +2015-06-19,45.060001373291016,45.459999084472656,44.75,45.119998931884766,45.119998931884766,2302200 +2015-06-22,45.349998474121094,46.02000045776367,44.83000183105469,45.279998779296875,45.279998779296875,2115400 +2015-06-23,45.5099983215332,46.400001525878906,45.08000183105469,46.02000045776367,46.02000045776367,1833600 +2015-06-24,46.0,46.25,45.189998626708984,45.25,45.25,1427000 
+2015-06-25,45.540000915527344,45.849998474121094,44.84000015258789,44.880001068115234,44.880001068115234,1368800 +2015-06-26,44.959999084472656,45.2400016784668,44.209999084472656,44.5099983215332,44.5099983215332,2859400 +2015-06-29,43.66999816894531,44.31999969482422,42.31999969482422,42.470001220703125,42.470001220703125,2402100 +2015-06-30,42.970001220703125,43.349998474121094,42.540000915527344,43.029998779296875,43.029998779296875,1834600 +2015-07-01,43.36000061035156,43.4900016784668,42.04999923706055,42.439998626708984,42.439998626708984,1696000 +2015-07-02,42.369998931884766,42.369998931884766,36.099998474121094,38.18000030517578,38.18000030517578,13264600 +2015-07-06,37.709999084472656,38.34000015258789,36.650001525878906,37.4900016784668,37.4900016784668,6536900 +2015-07-07,37.310001373291016,37.36000061035156,35.099998474121094,36.29999923706055,36.29999923706055,6272400 +2015-07-08,35.63999938964844,36.25,35.13999938964844,35.400001525878906,35.400001525878906,2428400 +2015-07-09,35.91999816894531,36.060001373291016,34.630001068115234,34.75,34.75,2778900 +2015-07-10,35.099998474121094,35.43000030517578,34.650001525878906,34.72999954223633,34.72999954223633,2223300 +2015-07-13,34.84000015258789,35.70000076293945,34.650001525878906,35.43000030517578,35.43000030517578,4069800 +2015-07-14,35.43000030517578,36.040000915527344,34.72999954223633,35.9900016784668,35.9900016784668,2972400 +2015-07-15,35.79999923706055,36.08000183105469,34.849998474121094,35.0,35.0,2763000 +2015-07-16,35.25,35.54999923706055,34.54999923706055,35.11000061035156,35.11000061035156,2186800 +2015-07-17,35.459999084472656,35.900001525878906,34.66999816894531,34.959999084472656,34.959999084472656,3014200 +2015-07-20,33.939998626708984,35.349998474121094,33.349998474121094,34.54999923706055,34.54999923706055,4215400 +2015-07-21,34.54999923706055,35.650001525878906,34.5,35.56999969482422,35.56999969482422,2824500 
+2015-07-22,34.95000076293945,35.290000915527344,33.88999938964844,34.439998626708984,34.439998626708984,2852400 +2015-07-23,34.439998626708984,35.45000076293945,34.34000015258789,35.040000915527344,35.040000915527344,2188300 +2015-07-24,35.290000915527344,35.29999923706055,34.0,34.560001373291016,34.560001373291016,2094600 +2015-07-27,33.97999954223633,34.31999969482422,33.529998779296875,33.72999954223633,33.72999954223633,3470300 +2015-07-28,33.060001373291016,33.79999923706055,32.36000061035156,33.5099983215332,33.5099983215332,8529200 +2015-07-29,24.940000534057617,25.5,23.65999984741211,25.059999465942383,25.059999465942383,34598900 +2015-07-30,24.90999984741211,26.440000534057617,24.809999465942383,26.030000686645508,26.030000686645508,7878800 +2015-07-31,25.690000534057617,27.479999542236328,25.690000534057617,26.399999618530273,26.399999618530273,7889400 +2015-08-03,26.600000381469727,27.700000762939453,25.559999465942383,25.920000076293945,25.920000076293945,6636900 +2015-08-04,25.8799991607666,26.31999969482422,25.399999618530273,26.18000030517578,26.18000030517578,3176900 +2015-08-05,26.290000915527344,27.06999969482422,26.049999237060547,26.1299991607666,26.1299991607666,2830200 +2015-08-06,26.020000457763672,26.229999542236328,25.040000915527344,25.3799991607666,25.3799991607666,2389000 +2015-08-07,25.280000686645508,25.75,24.989999771118164,25.350000381469727,25.350000381469727,2167200 +2015-08-10,25.600000381469727,26.459999084472656,25.530000686645508,26.0,26.0,2139100 +2015-08-11,25.649999618530273,25.969999313354492,24.90999984741211,25.350000381469727,25.350000381469727,2356000 +2015-08-12,24.93000030517578,25.6299991607666,24.56999969482422,25.469999313354492,25.469999313354492,2341800 +2015-08-13,25.520000457763672,25.75,24.510000228881836,24.540000915527344,24.540000915527344,2263600 +2015-08-14,25.260000228881836,26.139999389648438,24.90999984741211,25.770000457763672,25.770000457763672,4245900 
+2015-08-17,25.729999542236328,27.040000915527344,25.5,26.639999389648438,26.639999389648438,4077300 +2015-08-18,26.610000610351562,26.65999984741211,25.010000228881836,25.40999984741211,25.40999984741211,4037700 +2015-08-19,25.1299991607666,25.68000030517578,24.709999084472656,25.25,25.25,2257700 +2015-08-20,24.719999313354492,25.040000915527344,24.0,24.010000228881836,24.010000228881836,2930700 +2015-08-21,23.709999084472656,23.719999313354492,22.670000076293945,23.09000015258789,23.09000015258789,5473700 +2015-08-24,21.65999984741211,24.229999542236328,20.5,22.979999542236328,22.979999542236328,4882000 +2015-08-25,23.8700008392334,24.149999618530273,22.719999313354492,22.75,22.75,2860500 +2015-08-26,23.34000015258789,23.510000228881836,22.510000228881836,23.3799991607666,23.3799991607666,3354500 +2015-08-27,24.040000915527344,24.530000686645508,23.15999984741211,23.979999542236328,23.979999542236328,4050900 +2015-08-28,23.639999389648438,24.329999923706055,23.639999389648438,23.959999084472656,23.959999084472656,2625100 +2015-08-31,23.90999984741211,24.579999923706055,23.700000762939453,24.280000686645508,24.280000686645508,2258500 +2015-09-01,23.59000015258789,24.389999389648438,23.15999984741211,23.399999618530273,23.399999618530273,3158300 +2015-09-02,23.700000762939453,24.290000915527344,23.239999771118164,24.280000686645508,24.280000686645508,2358900 +2015-09-03,24.299999237060547,24.479999542236328,23.8799991607666,24.170000076293945,24.170000076293945,1614900 +2015-09-04,23.700000762939453,24.65999984741211,23.600000381469727,24.209999084472656,24.209999084472656,1894400 +2015-09-08,24.610000610351562,24.610000610351562,23.940000534057617,23.979999542236328,23.979999542236328,3224400 +2015-09-09,24.079999923706055,25.329999923706055,24.06999969482422,24.68000030517578,24.68000030517578,2669400 +2015-09-10,24.729999542236328,25.5,24.600000381469727,25.1200008392334,25.1200008392334,1786900 
+2015-09-11,25.0,25.3700008392334,24.6299991607666,24.709999084472656,24.709999084472656,1702900 +2015-09-14,24.770000457763672,24.940000534057617,24.170000076293945,24.469999313354492,24.469999313354492,1678700 +2015-09-15,24.40999984741211,25.200000762939453,24.299999237060547,24.940000534057617,24.940000534057617,1316500 +2015-09-16,24.809999465942383,25.93000030517578,24.729999542236328,25.290000915527344,25.290000915527344,3299800 +2015-09-17,25.1200008392334,25.68000030517578,24.899999618530273,25.1299991607666,25.1299991607666,2011400 +2015-09-18,24.739999771118164,25.440000534057617,24.549999237060547,24.75,24.75,2576500 +2015-09-21,24.860000610351562,24.959999084472656,24.100000381469727,24.229999542236328,24.229999542236328,2279600 +2015-09-22,23.920000076293945,24.489999771118164,23.209999084472656,23.350000381469727,23.350000381469727,2394800 +2015-09-23,23.5,23.5,22.93000030517578,23.100000381469727,23.100000381469727,1655100 +2015-09-24,22.469999313354492,22.670000076293945,22.0,22.579999923706055,22.579999923706055,2778600 +2015-09-25,22.68000030517578,22.989999771118164,21.959999084472656,22.170000076293945,22.170000076293945,1701300 +2015-09-28,22.149999618530273,22.31999969482422,21.3700008392334,21.3799991607666,21.3799991607666,1974700 +2015-09-29,21.670000076293945,21.969999313354492,21.0,21.139999389648438,21.139999389648438,2474400 +2015-09-30,21.489999771118164,22.110000610351562,21.219999313354492,21.65999984741211,21.65999984741211,1965700 +2015-10-01,21.649999618530273,21.8700008392334,20.75,20.8700008392334,20.8700008392334,2769100 +2015-10-02,20.75,22.479999542236328,20.600000381469727,22.3799991607666,22.3799991607666,2248900 +2015-10-05,22.3799991607666,22.709999084472656,22.059999465942383,22.68000030517578,22.68000030517578,1809300 +2015-10-06,22.719999313354492,23.190000534057617,22.510000228881836,22.68000030517578,22.68000030517578,1704300 +2015-10-07,22.8799991607666,23.530000686645508,22.459999084472656,23.5,23.5,1527900 
+2015-10-08,23.459999084472656,23.8700008392334,22.8799991607666,23.610000610351562,23.610000610351562,1299600 +2015-10-09,23.610000610351562,25.020000457763672,23.350000381469727,24.8799991607666,24.8799991607666,3320600 +2015-10-12,24.690000534057617,24.81999969482422,22.219999313354492,22.739999771118164,22.739999771118164,4755000 +2015-10-13,22.809999465942383,23.299999237060547,22.469999313354492,22.489999771118164,22.489999771118164,1696300 +2015-10-14,22.479999542236328,22.8799991607666,21.8700008392334,21.979999542236328,21.979999542236328,2223100 +2015-10-15,22.200000762939453,22.84000015258789,22.059999465942383,22.610000610351562,22.610000610351562,2083700 +2015-10-16,22.709999084472656,22.799999237060547,22.040000915527344,22.649999618530273,22.649999618530273,1720600 +2015-10-19,22.510000228881836,22.84000015258789,22.309999465942383,22.600000381469727,22.600000381469727,1402900 +2015-10-20,22.579999923706055,23.1200008392334,22.1299991607666,22.520000457763672,22.520000457763672,1696800 +2015-10-21,22.520000457763672,22.75,22.190000534057617,22.389999389648438,22.389999389648438,1340800 +2015-10-22,22.510000228881836,23.0,22.06999969482422,22.520000457763672,22.520000457763672,2341200 +2015-10-23,22.799999237060547,22.889999389648438,22.110000610351562,22.559999465942383,22.559999465942383,3188700 +2015-10-26,22.600000381469727,24.799999237060547,22.31999969482422,24.43000030517578,24.43000030517578,6890700 +2015-10-27,24.299999237060547,24.299999237060547,22.010000228881836,22.899999618530273,22.899999618530273,6616200 +2015-10-28,21.639999389648438,22.809999465942383,21.43000030517578,22.06999969482422,22.06999969482422,9403700 +2015-10-29,23.260000228881836,24.200000762939453,22.40999984741211,22.950000762939453,22.950000762939453,9740500 +2015-10-30,23.110000610351562,23.149999618530273,22.0,22.25,22.25,5010000 +2015-11-02,22.260000228881836,23.850000381469727,22.209999084472656,23.799999237060547,23.799999237060547,6459600 
+2015-11-03,23.670000076293945,24.450000762939453,23.600000381469727,24.1200008392334,24.1200008392334,2482100 +2015-11-04,24.030000686645508,24.600000381469727,23.8799991607666,24.420000076293945,24.420000076293945,1704600 +2015-11-05,24.5,25.559999465942383,24.31999969482422,25.049999237060547,25.049999237060547,2794300 +2015-11-06,25.049999237060547,25.530000686645508,24.5,25.5,25.5,2095400 +2015-11-09,25.34000015258789,25.649999618530273,24.6299991607666,24.959999084472656,24.959999084472656,1567200 +2015-11-10,24.850000381469727,25.360000610351562,24.59000015258789,25.15999984741211,25.15999984741211,1267900 +2015-11-11,25.100000381469727,25.309999465942383,24.540000915527344,24.979999542236328,24.979999542236328,1364400 +2015-11-12,25.18000030517578,27.219999313354492,24.899999618530273,25.899999618530273,25.899999618530273,5213300 +2015-11-13,26.719999313354492,27.489999771118164,26.1200008392334,27.100000381469727,27.100000381469727,4976000 +2015-11-16,27.0,27.59000015258789,26.469999313354492,27.440000534057617,27.440000534057617,3066700 +2015-11-17,27.34000015258789,27.610000610351562,26.860000610351562,27.540000915527344,27.540000915527344,2018000 +2015-11-18,27.540000915527344,28.829999923706055,27.309999465942383,28.229999542236328,28.229999542236328,3091600 +2015-11-19,28.190000534057617,28.690000534057617,27.90999984741211,28.059999465942383,28.059999465942383,1487500 +2015-11-20,28.100000381469727,31.25,28.049999237060547,31.209999084472656,31.209999084472656,6697500 +2015-11-23,30.579999923706055,30.809999465942383,29.149999618530273,29.860000610351562,29.860000610351562,4029900 +2015-11-24,29.459999084472656,30.6299991607666,29.450000762939453,30.010000228881836,30.010000228881836,2584500 +2015-11-25,29.790000915527344,30.540000915527344,29.709999084472656,30.510000228881836,30.510000228881836,1287100 +2015-11-27,30.5,30.600000381469727,29.610000610351562,30.18000030517578,30.18000030517578,1058900 
+2015-11-30,30.110000610351562,30.719999313354492,29.770000457763672,30.1299991607666,30.1299991607666,2015600 +2015-12-01,30.110000610351562,30.459999084472656,29.799999237060547,30.309999465942383,30.309999465942383,1886000 +2015-12-02,30.299999237060547,32.470001220703125,30.290000915527344,31.389999389648438,31.389999389648438,4650300 +2015-12-03,31.389999389648438,32.2400016784668,30.479999542236328,30.6299991607666,30.6299991607666,2698900 +2015-12-04,30.530000686645508,30.860000610351562,29.31999969482422,30.450000762939453,30.450000762939453,2313800 +2015-12-07,30.3799991607666,30.639999389648438,29.6299991607666,30.040000915527344,30.040000915527344,1362300 +2015-12-08,29.809999465942383,31.3799991607666,29.5,30.920000076293945,30.920000076293945,1830200 +2015-12-09,30.979999542236328,31.139999389648438,29.260000228881836,30.0,30.0,2238500 +2015-12-10,30.110000610351562,31.299999237060547,29.989999771118164,30.829999923706055,30.829999923706055,1252900 +2015-12-11,30.690000534057617,30.75,29.600000381469727,29.649999618530273,29.649999618530273,1415000 +2015-12-14,29.600000381469727,29.889999389648438,28.850000381469727,29.579999923706055,29.579999923706055,2328600 +2015-12-15,29.68000030517578,30.0,26.459999084472656,26.8700008392334,26.8700008392334,5759200 +2015-12-16,26.889999389648438,28.239999771118164,26.260000228881836,28.030000686645508,28.030000686645508,2992100 +2015-12-17,28.139999389648438,28.31999969482422,27.190000534057617,27.420000076293945,27.420000076293945,1483900 +2015-12-18,27.309999465942383,27.90999984741211,26.899999618530273,27.170000076293945,27.170000076293945,1299800 +2015-12-21,27.170000076293945,27.360000610351562,26.030000686645508,26.25,26.25,1947600 +2015-12-22,26.25,28.700000762939453,26.149999618530273,27.93000030517578,27.93000030517578,2952700 +2015-12-23,27.950000762939453,28.420000076293945,27.440000534057617,28.149999618530273,28.149999618530273,1001000 
+2015-12-24,28.270000457763672,28.59000015258789,27.899999618530273,28.399999618530273,28.399999618530273,587400 +2015-12-28,28.1200008392334,28.3799991607666,27.770000457763672,27.8799991607666,27.8799991607666,1004500 +2015-12-29,27.950000762939453,28.540000915527344,27.739999771118164,28.479999542236328,28.479999542236328,1103900 +2015-12-30,28.579999923706055,28.780000686645508,28.170000076293945,28.25,28.25,1068000