diff --git a/.gitignore b/.gitignore index 994be26..8a93e4a 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,8 @@ *.fls *.fdb_latexmk .DS_Store -.vscode/ \ No newline at end of file +.vscode/ + +notes/2025/tikz/4 +notes/2025/mvp/chapters/4-nn.tex +notes/2025/mvp/chapters/4-nn.pdf \ No newline at end of file diff --git a/notes/2025/mvp/chapters/1-lr.pdf b/notes/2025/mvp/chapters/1-lr.pdf index 26d7e6d..4c0f9a7 100644 Binary files a/notes/2025/mvp/chapters/1-lr.pdf and b/notes/2025/mvp/chapters/1-lr.pdf differ diff --git a/notes/2025/mvp/chapters/1-lr.tex b/notes/2025/mvp/chapters/1-lr.tex index ea22c9b..fcb97bb 100644 --- a/notes/2025/mvp/chapters/1-lr.tex +++ b/notes/2025/mvp/chapters/1-lr.tex @@ -8,10 +8,7 @@ \chapter{Linear Regression} \item Ridge Regression ($L_2$ regularization) \item Lasso Regression ($L_1$ regularization) \end{introduction} -\section{Basic Knowledge} -\begin{example} -\textbf{Linear Regression} -\end{example} +\section{Basic Knowledge of Linear Regression} {Settings.} \begin{itemize} @@ -31,9 +28,8 @@ \section{Basic Knowledge} \end{definition} \end{itemize} -\textbf{Quiz.} How to determine whether a parameter is learnable? -Quiz: How to determine $w$ and $b$? -Quiz: How to determine $w$ and $b$? +\noindent\textbf{Quiz.} How to determine whether a parameter is learnable? How to determine $w$ and $b$? + \noindent Ans: \textbf{ERM} (Empirical Risk Minimization) @@ -41,7 +37,6 @@ \section{Basic Knowledge} \item Loss function. Squared Loss (SE) is commonly used during optimization. The training objective can be written as: \begin{equation} \argmin_{w,b}{\color{blue}{\frac{1}{n}}}\sum_{i\in[n]}\left(y_i-(w^\top x_i+b)\right)^2 - \argmin_{w,b}{\color{blue}{\frac{1}{n}}}\sum_{i\in[n]}\left(y_i-(w^\top x_i+b)\right)^2 \end{equation} The blue factor $1/n$ can be omitted in theoretical analysis, but is often kept in practice to stabilize the loss function during implementation. diff --git a/notes/2025/mvp/chapters/2-lr.pdf b/notes/2025/mvp/chapters/2-lr.pdf index 3ff66b2..111241a 100644 Binary files a/notes/2025/mvp/chapters/2-lr.pdf and b/notes/2025/mvp/chapters/2-lr.pdf differ diff --git a/notes/2025/mvp/chapters/2-lr.tex b/notes/2025/mvp/chapters/2-lr.tex index d11d928..6c82f6a 100644 --- a/notes/2025/mvp/chapters/2-lr.tex +++ b/notes/2025/mvp/chapters/2-lr.tex @@ -9,9 +9,9 @@ \chapter{Logistic Regression} \item Maximum a posteriori \end{introduction} \section{Classification} -\begin{example} - \textbf{Binary Classification Problem} -\end{example} + + \subsection{Binary Classification Problem} + Settings. \begin{itemize} \item Dataset: $D=\{(x_i,y_i)\}_{i=1}^n$, where $x_i\in\mathbb{R}^d$ and $y_i\in\{0,1\}$. @@ -194,9 +194,9 @@ \section{Classification} \end{remark} \vspace{1em} -\begin{example} - \textbf{Multi-Class Classificaiton (Softmax Regression)} -\end{example} + + \subsection{Multi-Class Classificaiton (Softmax Regression)} + We can combine $k$ sub-classifiers to solve a $k$-class classification task. Specifically, we apply a sigmoid-like transformation to each sub-linear model $f_k(x) = w_k^\top x + b_k$, and obtain the probability of class $k$ using a normalized expression: @@ -249,9 +249,9 @@ \section{Classification} via the reparameterization $w = w_1 - w_2$ and $b = b_1 - b_2$. 
\end{remark} \section{Rethink of Linear Regression} -\begin{example} - \textbf{MLE Explanation for Linear Regression} -\end{example} + + \subsection{MLE Explanation for Linear Regression} + \begin{definition}[Gaussian/Normal Distribution] \begin{equation} x\sim\mathcal N(\mu,\sigma^2)\quad\Leftrightarrow\quad P(x)=\frac{1}{\sqrt{2\pi\sigma^2}}\exp\left(-\frac{(x-\mu)^2}{2\sigma^2}\right) @@ -289,8 +289,8 @@ \section{Rethink of Linear Regression} \end{align} where the equivalence follows by dropping constants and positive scalings. Equation~\eqref{2.24} recovers ERM with the squared-loss objective. -\begin{example}\textbf{Maximum a Posteriori (MAP)} -\end{example} +\subsection{Maximum a Posteriori (MAP)} + In the MLE perspective, $w$ and $b$ are treated as unknown fixed constants. In the Bayesian framework, however, even $w,b$ are considered as random variables (R.V.). A ``fixed constant'' can be seen as a random variable with an extremely sharp distribution (near $\delta$-distribution). diff --git a/notes/2025/mvp/chapters/3-svm.pdf b/notes/2025/mvp/chapters/3-svm.pdf index e5a1ded..a53eb86 100644 Binary files a/notes/2025/mvp/chapters/3-svm.pdf and b/notes/2025/mvp/chapters/3-svm.pdf differ diff --git a/notes/2025/mvp/chapters/3-svm.tex b/notes/2025/mvp/chapters/3-svm.tex index 1c43d70..9e701e7 100644 --- a/notes/2025/mvp/chapters/3-svm.tex +++ b/notes/2025/mvp/chapters/3-svm.tex @@ -347,7 +347,7 @@ \section{Dual Form Of SVM} \subsection{Preliminaries from Convex Analysis*} \begin{tcolorbox} \vspace{0.5em} - \color{red!50!black}$\circledast $\quad\emph{\textbf{This subsection is included out of personal interest and is not part of the course material. + \color{red!50!black}$\circledast $\quad\emph{{This subsection is included out of personal interest and is not part of the course material. Readers who wish to focus solely on the course content may safely skip to the next subsection.}} \vspace{0.5em} \end{tcolorbox} @@ -557,6 +557,7 @@ \subsection{Preliminaries from Convex Analysis*} \end{equation} \end{theorem} \begin{definition}[Hilbert Space] + \label{def:3.4} A \textbf{Hilbert space} is a complete inner product space. % Formally, let $\mathcal{H}$ be a vector space over $\mathbb{R}$ or $\mathbb{C}$ endowed with an inner product % \begin{equation} @@ -1042,10 +1043,66 @@ \section{Kernel} In this way, the model, known as the \textbf{Soft-Margin SVM}, allows small violations of the margin to achieve a better balance between robustness and generalization. This is the most commonly used form of SVM in practice. -% \begin{figure}[H] -% \centering -% \includegraphics{../../tikz/3/3.pdf} -% \caption{Hinge loss and CE loss.} -% \label{3-svm} -% \end{figure} +\vspace{1em} + +We claim that the constraints on the slack variables are equivalent to the following formulation: +\begin{equation} + \xi_i \ge \max(0,\, 1 - y_i(w^\top x_i + b)). +\end{equation} + +In the minimization problem +\begin{equation} + \arg\min_{w,b,\xi}\ \frac{1}{2}\|w\|^2 + C\sum_i \xi_i, +\end{equation} +it is crucial to observe that each $\xi_i$ will always take the minimal feasible value,either $0$ or $1 - y_i(w^\top x_i + b)$. +Otherwise, if $\xi_i$ were larger than necessary, the objective $\sum_i \xi_i$ could be further reduced by decreasing $\xi_i$. + +Thus, the optimization problem can be reformulated as an unconstrained objective: +\begin{equation} + \argmin_{w,b}\ \frac{1}{2}\|w\|^2 + C\sum_{i=1}^n \max(0,\, 1 - y_i(w^\top x_i + b)). +\end{equation} + +This formulation introduces the \textbf{Hinge loss}. 
+ +\begin{definition}[Hinge loss] +The hinge loss function $L(z)$ is defined as: +\begin{equation} + L(z) = \max(0,\, 1 - z). +\end{equation} +\end{definition} +\begin{figure}[H] + \centering + \includegraphics{../../tikz/3/3.pdf} + \caption{Hinge loss and CE loss.} + \label{4-svm} +\end{figure} +Intuitively, the hinge loss $L_{\rm hinge}(t)$ remains zero when $t \ge 1$. +This property directly enforces a margin: samples that already lie beyond the decision boundary (i.e., sufficiently far from the separating hyperplane) incur no loss, +while those within or on the wrong side of the margin contribute a positive penalty. +In this sense, the hinge loss explicitly encourages a wider margin between classes. + +For comparison, consider the cross-entropy loss: +\begin{equation} + -\log(\sigma(z)) = \log(1 + e^{-z}), +\end{equation} +where $\sigma(z)$ denotes the sigmoid function. + +\begin{remark} + In class, we used $\log_2$ instead of the natural logarithm, leading to an approximation when $x\to -\infty$: + \begin{equation} + \log(1 + e^{-z}) \sim \frac{-z}{\log_2 e}, + \end{equation} + which results in a slightly different scaling in the plotted curve. +\end{remark} + +Both the cross-entropy loss and the hinge loss serve as \emph{substitute losses} for the zero-one loss: +\begin{equation} + L_{\rm zero\text{-}one}(z) = + \begin{cases} + 1, & z < 0,\\[4pt] + 0, & z \ge 0. + \end{cases} +\end{equation} +The zero-one loss is non-differentiable and provides no gradient information, +so gradient-based optimization methods cannot be directly applied in this case. \end{document} \ No newline at end of file diff --git a/notes/2025/mvp/chapters/5-rt.pdf b/notes/2025/mvp/chapters/5-rt.pdf new file mode 100644 index 0000000..4f1ccdb Binary files /dev/null and b/notes/2025/mvp/chapters/5-rt.pdf differ diff --git a/notes/2025/mvp/chapters/5-rt.tex b/notes/2025/mvp/chapters/5-rt.tex new file mode 100644 index 0000000..3a8ad55 --- /dev/null +++ b/notes/2025/mvp/chapters/5-rt.tex @@ -0,0 +1,487 @@ +\documentclass[../main]{subfiles} +\begin{document} + +\chapter{Representer Theorem} +\begin{introduction} + \item Representer Theorem + \item Dual form of Linear Regression +\end{introduction} +\section{Revisit the Preliminaries} +We first provide a brief overview of the previous chapters on Support Vector Machines (SVM). +The introduction of slack variables $\xi_i \ge 0,\ \forall i \in [n]$ allows the model to tolerate outliers or samples that do not strictly follow the shared distribution. +However, it is important to properly scale the penalty on the slack variables, since inappropriate choices of this scaling factor $C$ may severely affect the model’s generalization ability. + +The primal optimization problem of the soft-margin SVM can be written as: +\begin{gather*} + \argmin_{w,b,\xi}\ \frac{1}{2}\|w\|^2 + C\sum_{i=1}^{n}\xi_i \\ + \text{s.t.}\quad y_i(w^\top x_i + b) \ge 1 - \xi_i,\quad \xi_i \ge 0,\quad \forall i \in [n]. +\end{gather*} + +We can further express the decision function of the SVM as: +\begin{equation} + f(x) = w^\top \varphi(x), +\end{equation} +where $\varphi(x)$ denotes a feature mapping that transforms the input $x$ into a (possibly higher-dimensional) feature space. + +The objective function can then be rewritten as: +\begin{equation} + \argmin_w \sum_{i \in [n]} \mathrm{Hinge}\bigl(f(x_i)\bigr) + \frac{\lambda}{2}\|w\|^2. 
+\end{equation}
+
+\begin{remark}
+    There is a minor difference between the above expression and the original SVM formulation:
+    the bias term $b$ can be absorbed into $w$ by augmenting the input vector with an additional dimension (e.g., appending $x_0 = 1$).
+    % Further discussion could be held here.%
+\end{remark}
+
+By rewriting the $\mathrm{Hinge}\bigl(f(x_i)\bigr)$ term in terms of the slack variables $\xi_i$,
+we can derive the dual formulation of the soft-margin SVM:
+\begin{equation}
+    \argmax_{\alpha \ge 0,\, \beta \ge 0}\;
+    \argmin_{w,b}
+    \left[
+        \sum_{i \in [n]} \xi_i
+        + \frac{\lambda}{2} w^\top w
+        + \sum_{i \in [n]} \alpha_i\bigl(1 - y_i w^\top \varphi(x_i) - \xi_i\bigr)
+        - \sum_{i \in [n]} \beta_i \xi_i
+    \right].
+    \label{5.1.3}
+\end{equation}
+Here, $\alpha_i$ and $\beta_i$ are Lagrange multipliers. Setting the gradients of the objective above to zero yields:
+\begin{gather}\left.
+    \begin{matrix}
+        \partial_w L=0\\
+        \partial_{\xi_i} L=0
+    \end{matrix}\right\}\quad \Rightarrow\quad\begin{cases}
+        \displaystyle w=\frac{1}{\lambda}\sum_{i\in [n]}\alpha_i y_i\varphi(x_i)\\
+        \alpha_i+\beta_i=1
+    \end{cases}
+\end{gather}
+Substituting these back into \eqref{5.1.3} gives:
+\begin{align}
+    \mathrm{Dual}&:\frac{1}{2}\lambda\left(\frac{1}{\lambda}\right)^2\sum_{i,j\in[n]}\alpha_i\alpha_jy_iy_j\varphi^\top(x_i)\varphi(x_j)-\sum_{i\in[n]}\alpha_iy_i\varphi^\top(x_i)\frac{1}{\lambda}\sum_{j\in[n]}\alpha_jy_j\varphi(x_j)+\sum_{i\in[n]}\alpha_i
+\end{align}
+After simplification and the kernel trick:
+\begin{align}
+    \mathrm{Dual}&:\argmax_{\alpha}-\frac{1}{2\lambda}\sum_{i,j\in[n]}\alpha_i\alpha_jy_iy_jk(x_i,x_j)+\sum_{i\in[n]}\alpha_i\quad \text{s.t.}\; 0\le\alpha_i\le 1
+\end{align}
+It can be observed that the only difference between the dual problems of the soft-margin and hard-margin SVMs lies in the constraint on $\alpha_i$.
+In the soft-margin formulation, each $\alpha_i$ is required to satisfy $\alpha_i \le 1$
+(which is equivalent to introducing $\beta_i \ge 0$ and enforcing $\alpha_i + \beta_i = 1$).
+This constraint limits the influence of any single support vector, preventing the model from overemphasizing outliers or noisy samples.
+
+Ultimately, the prediction function $f(x)$ can be reformulated as a linear combination of kernel evaluations between the input point $x$ and the training data points $x_i$:
+\begin{equation}
+    f(x) = \sum_{i=1}^{n} \alpha_i\, k(x_i, x),
+\end{equation}
+where $k(x_i, x) = \varphi(x_i)^\top \varphi(x)$ denotes the kernel function.
+\vspace{1em}
+
+In the previous part, we introduced the notion of the \emph{feature space}.
+We now provide a new example, \textbf{Ridge Regression}, to demonstrate that this concept is widely used in the field of machine learning.
+
+The objective function of Ridge Regression is:
+\begin{equation}
+    \argmin_{w} \frac{1}{2}\sum_{i \in [n]} \bigl(y_i - w^\top \varphi(x_i)\bigr)^2 + \frac{1}{2}\lambda\, w^\top w =: L(w)
+\end{equation}
+
+Taking the derivative with respect to $w$ gives:
+\begin{equation}
+    \frac{\partial L}{\partial w}
+    = -\sum_{i \in [n]} \bigl(y_i - w^\top \varphi(x_i)\bigr)\varphi(x_i) + \lambda w = 0
+\end{equation}
+
+Rearranging terms yields:
+\begin{equation}
+    w = \frac{1}{\lambda}\sum_{i \in [n]} \bigl(y_i - w^\top \varphi(x_i)\bigr)\varphi(x_i)
+\end{equation}
+
+We can rewrite this as:
+\begin{equation}
+    w = \sum_{i \in [n]} \alpha_i\, \varphi(x_i)
+\end{equation}
+where $\alpha_i = \frac{1}{\lambda}\bigl(y_i - w^\top \varphi(x_i)\bigr)$.
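+
+As a quick numerical sanity check (a sketch of my own, not part of the lecture; it assumes NumPy and borrows the closed form $\alpha=(K+\lambda I)^{-1}y$ that is only derived in the last section of this chapter), the snippet below verifies that the primal ridge solution coincides with $w=\sum_{i\in[n]}\alpha_i\varphi(x_i)$ and that $\alpha_i=\frac{1}{\lambda}\bigl(y_i-w^\top\varphi(x_i)\bigr)$:
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(0)
+n, d, lam = 50, 5, 0.1
+Phi = rng.normal(size=(n, d))   # row i is varphi(x_i); identity feature map for simplicity
+y = rng.normal(size=n)
+
+# primal ridge solution: w = (Phi^T Phi + lam I_d)^{-1} Phi^T y
+w_primal = np.linalg.solve(Phi.T @ Phi + lam * np.eye(d), Phi.T @ y)
+
+# dual coefficients through the Gram matrix K = Phi Phi^T
+K = Phi @ Phi.T
+alpha = np.linalg.solve(K + lam * np.eye(n), y)
+w_dual = Phi.T @ alpha          # w = sum_i alpha_i varphi(x_i)
+
+print(np.allclose(w_primal, w_dual))                  # True
+print(np.allclose(alpha, (y - Phi @ w_dual) / lam))   # alpha_i = (y_i - w^T varphi(x_i)) / lam
+\end{verbatim}
+Returning to the derivation, recall the representation $w=\sum_{i\in[n]}\alpha_i\,\varphi(x_i)$.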
+ +Substituting this representation into the prediction function gives: +\begin{equation} + f(x) = w^\top \varphi(x) = \sum_{i \in [n]} \alpha_i\, k(x_i, x) +\end{equation} +where $k(x_i, x) = \varphi(x_i)^\top \varphi(x)$ is the kernel function. + +The shared form also occurs in the logistic regression problem. + +The objective function of regularized logistic regression is: +\begin{equation} + \min_{w} \sum_{i \in [n]} \log\bigl(1 + e^{-y_i w^\top \varphi(x_i)}\bigr) + + \frac{1}{2}\lambda\, w^\top w +\end{equation} + +Taking the derivative with respect to $w$ gives: +\begin{equation} + \frac{\partial L}{\partial w} + = \sum_{i \in [n]} \frac{e^{-y_i w^\top \varphi(x_i)}}{1 + e^{-y_i w^\top \varphi(x_i)}}(-y_i)\varphi(x_i) + + \lambda w = 0 +\end{equation} + +Rearranging the terms, we can express $w$ as: +\begin{equation} + w = \sum_{i \in [n]} \alpha_i\, \varphi(x_i) +\end{equation} +where +\begin{equation} + \alpha_i = \frac{1}{\lambda}\frac{y_i e^{-y_i w^\top \varphi(x_i)}}{1 + e^{-y_i w^\top \varphi(x_i)}}. +\end{equation} + +Substituting this representation into the prediction function gives: +\begin{equation} + f(x) = w^\top \varphi(x) = \sum_{i \in [n]} \alpha_i\, k(x_i, x), +\end{equation} + +The above observation suggests a deeper connection between \emph{representation} and the \emph{kernel trick}. +It shows that all the regression models we have discussed so far can be interpreted as +special cases of the same underlying idea. +\section{Representer Theorem} +We have already defined the concept of a \emph{Hilbert space} in Definition~\ref{def:3.4}. +Building upon that foundation, we now introduce the notion of a \textbf{Reproducing Kernel Hilbert Space (RKHS)}, +which extends the idea of inner product spaces to functional spaces, allowing the kernel trick to be formally grounded in Hilbert space theory. +\begin{definition}[Reproducing Kernel Hilbert Space (RKHS)] + Let $\mathcal{H}$ be a Hilbert space of functions $f:\mathcal{X}\to\mathbb{R}$ defined on a nonempty set $\mathcal{X}$. + If there exists a function $k:\mathcal{X}\times\mathcal{X}\to\mathbb{R}$ such that: + + \begin{enumerate} + \item For every $x\in\mathcal{X}$, the function $k(\cdot, x)\in\mathcal{H}$. + \item For every $f\in\mathcal{H}$ and every $x\in\mathcal{X}$, the \emph{reproducing property} holds: + \begin{equation} + f(x) = \langle f,\, k(\cdot, x) \rangle_{\mathcal{H}}, + \end{equation} + \end{enumerate} + then $\mathcal{H}$ is called a \textbf{Reproducing Kernel Hilbert Space}, and $k$ is called its \textbf{reproducing kernel}. + \end{definition} + \begin{remark} + In fact, the above definition is quite intuitive once we recall that a Hilbert space differs from a general vector space primarily in its \textbf{completeness} under an \textbf{inner product}. + In other words, a Hilbert space is a complete inner product space, which extends the familiar geometric structure of finite-dimensional Euclidean spaces to infinite dimensions. + \end{remark} + \begin{note} + The completeness of a Reproducing Kernel Hilbert Space (RKHS) refers to the property that every Cauchy sequence in the space converges to a limit that still lies within the space. + In geometric terms, this means the RKHS is ``closed under limits'' with respect to its inner-product-induced norm. + + Moreover, by a fundamental result in Hilbert space theory, every separable Hilbert space admits a \emph{complete orthonormal basis}. 
+ Hence, an RKHS possesses (possibly infinite-dimensional) orthonormal basis functions + $\{e_i\}_{i=1}^{\infty}$ such that any function $f \in \mathcal{H}$ can be expressed as + \begin{equation} + f = \sum_{i=1}^{\infty} \langle f, e_i \rangle_{\mathcal{H}}\, e_i, + \end{equation} + where the series converges in the Hilbert norm. + In the context of kernel methods, Mercer’s theorem ensures that the kernel function +itself can be decomposed as +\begin{equation} + k(x, x') = \sum_{i=1}^{\infty} \lambda_i\, \phi_i(x)\phi_i(x'), +\end{equation} +where $\{\sqrt{\lambda_i}\,\phi_i\}$ forms a complete orthonormal basis in the RKHS. +\begin{theorem}[Mercer’s Theorem] + Let $k(x, x')$ be a continuous, symmetric, and positive semi-definite kernel function + defined on a compact domain $\mathcal{X} \subset \mathbb{R}^d$. + Then there exists an orthonormal basis of eigenfunctions + $\{\phi_i\}_{i=1}^{\infty}$ in $L^2(\mathcal{X})$ and a sequence of non-negative eigenvalues + $\{\lambda_i\}_{i=1}^{\infty}$ such that + \begin{equation} + k(x, x') = \sum_{i=1}^{\infty} \lambda_i\, \phi_i(x)\, \phi_i(x'), + \end{equation} + where the series converges absolutely and uniformly. +\end{theorem} +This result shows that any Mercer kernel defines a compact, self-adjoint, and positive + operator $\mathcal{T}_k$ on $L^2(\mathcal{X})$: + \begin{equation} + (\mathcal{T}_k f)(x) = \int_{\mathcal{X}} k(x, x')\, f(x')\, dx'. + \end{equation} + The eigenfunctions $\{\phi_i\}$ and eigenvalues $\{\lambda_i\}$ then form the spectral + decomposition of $\mathcal{T}_k$. + + + \begin{proposition} + The RKHS $\mathcal{H}_k$ associated with $k$ can be realized as the completion of the span of the kernel sections + \begin{equation} + \mathcal{H}_0 = \mathrm{span}\langle\,k(x, \cdot)\mid x \in \mathcal{X}\,\rangle. + \end{equation} + Each $k(x, \cdot)$ can itself be expanded in the eigenbasis $\{\color{red}\phi_i\color{black}\}$ as + \begin{equation} + k(x, \cdot) = \sum_{i=1}^{\infty} \lambda_i\, \phi_i(x)\, \color{red}\phi_i\color{black}(\cdot). + \end{equation} + \end{proposition} + + Consequently, every function in the RKHS can be approximated arbitrarily well by + finite linear combinations of kernel sections $k(x_i, \cdot)$, since these functions span a + dense subspace of $\mathcal{H}_k$. + That is, + \begin{equation} + f = \sum_{i=1}^{\infty} \alpha_i\, k(x_i, \cdot), + \end{equation} + where the series converges in the Hilbert norm. + \end{note} + \begin{note} + I have not listed the properties of the inner product here, as they should be familiar to anyone with basic mathematical knowledge, although Prof. Zhang presented them in detail during class for the sake of completeness. + \end{note} +\vspace{1em} +Briefly, an RKHS is a space in which the value of a function at any point can be expressed as an inner product between the function itself and the corresponding kernel function. +\begin{note} + This property of the kernel function bears a striking resemblance to the idea of + \emph{tensor contraction}. + In tensor analysis, a multivariate tensor can be transformed into a new object by + contracting over one or more indices, effectively ``fixing'' certain variables while + leaving others free. + Likewise, in the context of RKHS, a kernel $k(x, x')$ can be viewed as a bivariate + function that, once one argument (say $x_i$) is fixed, induces a new function + \begin{equation} + k(x_i, \cdot): x' \mapsto k(x_i, x') + \end{equation} + in the RKHS. 
+    These kernel sections $k(x_i, \cdot)$ thus play a role analogous to partially contracted
+    tensors: each retains one free variable and collectively they span the functional space.
+\end{note}
+
+\begin{example}
+    Consider $f\in \mathcal H$, where $\mathcal H$ is a Hilbert space, $f(x)=w^\top x,\,x,w\in\mathbb R^d$. Then $k(z,x)=z^\top x$ is the reproducing kernel of $\mathcal H=\mathrm{span}\langle k(x,\cdot),\, x\in\mathbb R^d\rangle$ under the inner product $\langle k(w,\cdot),k(\cdot,x)\rangle_\mathcal H:=k(w,x)$.
+\end{example}
+\begin{proof}
+    \begin{equation}
+        \langle k(w,\cdot),k(\cdot,x)\rangle_\mathcal H=k(w,x)=w^\top x=f(x)
+    \end{equation}
+\end{proof}
+
+\begin{example}
+    General form. $f(x)=w^\top \varphi(x)$ and $w\in\mathrm{span}\langle\varphi(x):x\in\mathbb R^d\rangle$, where $\varphi(x)$ can be expanded in a polynomial basis. Prove that $k(z,x):=\varphi^\top(z)\varphi(x)$ is a reproducing kernel under the inner product $\langle k(w,\cdot),k(\cdot,x)\rangle_\mathcal H:=k(w,x)$.
+\end{example}
+\begin{proof}
+    If $\varphi^{-1}$ exists:
+    \begin{equation}
+        \langle k(\varphi^{-1}(w),\cdot),k(\cdot,x)\rangle_\mathcal H=w^\top\varphi(x)=f(x)
+    \end{equation}
+    Otherwise, expand $f$ in the kernel basis:
+    \begin{equation}
+        \sum_{i=1}^\infty\langle \alpha_ik(x_i,\cdot),k(\cdot,x)\rangle_\mathcal H=\underbrace{\sum_{i=1}^\infty\alpha_i\varphi^\top(x_i)}_{:=\,w^\top}\varphi(x)=w^\top \varphi(x)=f(x)
+    \end{equation}
+\end{proof}
+\vspace{1em}
+We can show that:
+\begin{proposition}
+    The inner product of an RKHS must satisfy:
+\begin{equation}
+    \langle k(z,\cdot),k(\cdot,x)\rangle=k(z,x)
+\end{equation}
+\end{proposition}
+\begin{proof}
+    \begin{equation}
+        \sum_{i=1}^\infty\langle \alpha_ik(x_i,\cdot),k(\cdot,x)\rangle_\mathcal H\underbrace{=}_{\rm by\ RKHS's\ properties}f(x)=\underbrace{\sum_{i=1}^\infty \alpha_ik(x_i,x)}_{\text{by $f(x)$'s def}}
+    \end{equation}
+    Matching the coefficient of each $\alpha_i$ on both sides gives:
+    \begin{equation}
+        \langle k(z,\cdot),k(\cdot,x)\rangle=k(z,x)
+    \end{equation}
+\end{proof}
+
+
+In a Reproducing Kernel Hilbert Space (RKHS) $\mathcal{H}$,
+the norm of a function $f \in \mathcal{H}$ is induced by the inner product:
+\begin{equation}
+    \|f\|_{\mathcal{H}}^2 = \langle f, f \rangle_{\mathcal{H}}.
+\end{equation}
+
+For a linear function of the form
+\begin{equation}
+    f(x) = w^\top x,
+\end{equation}
+we have
+\begin{equation}
+    \langle f, f \rangle_{\mathcal{H}}
+    = \langle k(w, \cdot),\, k(w, \cdot) \rangle_{\mathcal{H}}
+    = k(w, w)
+    = w^\top w.
+\end{equation}
+Thus, the RKHS norm of $f$ coincides with the Euclidean norm of its weight vector $w$.
+
+For a feature-mapped function
+\begin{equation}
+    f(x) = w^\top \varphi(x),
+\end{equation}
+its inner product in the RKHS is given by
+\begin{equation}
+    \langle f, f \rangle_{\mathcal{H}}
+    = \langle k(\varphi^{-1}(w), \cdot),\, k(\varphi^{-1}(w), \cdot) \rangle_{\mathcal{H}}
+    = k(\varphi^{-1}(w), \varphi^{-1}(w)).
+\end{equation}
+Using the reproducing property of the kernel, we have
+\begin{equation}
+    k(\varphi^{-1}(w), \varphi^{-1}(w))
+    = \langle \varphi(\varphi^{-1}(w)),\, \varphi(\varphi^{-1}(w)) \rangle_{\mathcal{H}}
+    = \varphi(\varphi^{-1}(w))^\top \varphi(\varphi^{-1}(w))
+    = w^\top w.
+\end{equation}
+
+Now we come to the core of this chapter.
+\begin{theorem}[Representer Theorem]
+    Let $\mathcal{H}$ be a Reproducing Kernel Hilbert Space (RKHS) over a set $\mathcal{X}$ with reproducing kernel $k(\cdot, \cdot)$.
+    Consider the regularized empirical risk minimization problem of the form:
+    \begin{equation}
+        \inf_{f \in \mathcal{H}} \; J(f)
+        := \sum_{i\in[n]}L\bigl(f(x_i),\,y_i\bigr)+
+        R( \|f\|_\mathcal H),
+        \label{eq:representer_obj}
+    \end{equation}
+    where $L:\mathbb{R}\times\mathbb{R} \to \mathbb{R}$ is an arbitrary loss function that depends on $f$ only through its values at the sample points $\{x_i\}_{i=1}^n$, and $R:[0,\infty)\to \mathbb R$ is a non-decreasing regularization function.
+
+    Then any minimizer $f^\star \in \mathcal{H}$ of \eqref{eq:representer_obj} admits the finite representation
+    \begin{equation}
+        f^\star = \sum_{i=1}^n \alpha_i\, k(x_i, \cdot),
+    \end{equation}
+    for some coefficients $\alpha_1, \dots, \alpha_n \in \mathbb{R}$.
+\end{theorem}
+\begin{proof}
+    Any $f \in \mathcal{H}$ can be decomposed into two orthogonal components:
+    \begin{equation}
+        f = f_\parallel + f_\perp,
+    \end{equation}
+    where $f_\parallel = \sum_{i\in[n]}\alpha_i k(x_i,\cdot)$ lies in the subspace $\mathcal{H}_0 := \mathrm{span}\{k(x_i, \cdot)\}_{i=1}^n$
+    and $f_\perp$ is orthogonal to $\mathcal{H}_0$.
+
+    We have
+    \begin{equation}
+        f_\perp(x_i) = \langle f_\perp,\, k(\cdot, x_i) \rangle_{\mathcal{H}} = 0,
+        \quad \forall i.
+    \end{equation}
+    Hence the loss term $\sum_{i\in[n]}L\bigl(f(x_i),y_i\bigr)$ depends only on $f_\parallel$.
+    Moreover, by the Pythagorean theorem and the monotonicity of $R$,
+    \begin{equation}
+        R(\|f\|_{\mathcal{H}})
+        = R\left(\sqrt{\Bigl\|\sum_{i\in[n]}\alpha_i k(x_i,\cdot)\Bigr\|_{\mathcal{H}}^2+\|f_{\bot}\|_{\mathcal{H}}^2}\right)\ge R\left(\Bigl\|\sum_{i\in[n]}\alpha_i k(x_i,\cdot)\Bigr\|_{\mathcal{H}}\right).
+    \end{equation}
+    Replacing $f$ with $f_\parallel$ thus never increases the objective in
+    \eqref{eq:representer_obj}, implying that an optimal solution must lie in $\mathcal{H}_0$.
+
+    Therefore, the minimizer $f^\star$ can be expressed as a finite kernel expansion:
+    \begin{equation}
+        f^\star = \sum_{i=1}^n \alpha_i\, k(x_i, \cdot).
+    \end{equation}
+\end{proof}
+    The Representer Theorem carries a profound geometric interpretation.
+    Although the feature mapping
+    \begin{equation}
+        \varphi:\mathcal{X} \to \mathcal{H}
+    \end{equation}
+    may embed the data into a potentially infinite-dimensional manifold within the RKHS,
+    the optimization of $f$ does not require direct manipulation in this high-dimensional space.
+
+    Specifically, the theorem states that the optimal function takes the form
+    \begin{equation}
+        f^\star = \sum_{i=1}^{n} \alpha_i\, k(x_i, \cdot),
+    \end{equation}
+    which lies entirely within the finite-dimensional submanifold
+    \begin{equation}
+        \mathcal{M}_n = \mathrm{span}\langle k(x_1,\cdot), k(x_2,\cdot), \dots, k(x_n,\cdot)\rangle
+        \subseteq \mathcal{H}.
+    \end{equation}
+
+    Thus, even though $\mathcal{H}$ may be infinite-dimensional,
+    the optimization problem effectively reduces to solving for the coefficients
+    \begin{equation}
+        \alpha_1,\, \alpha_2,\, \dots,\, \alpha_n,
+    \end{equation}
+    within this $n$-dimensional subspace.
+    \begin{figure}[H]
+        \centering
+        \includegraphics{../../tikz/5/1.pdf}
+    \end{figure}
+    Geometrically, the kernel trick enables us to perform optimization on a
+    low-dimensional submanifold embedded in a high-dimensional feature manifold.
+    Therefore, instead of optimizing a weight vector $w$ in the full RKHS,
+    we only need to search for the optimal combination of kernel coefficients
+    $\{\alpha_i\}$ within the subspace spanned by the training data.
+
+    The above discussion reveals that a wide range of learning algorithms admit solutions
+    expressed as weighted sums of kernel functions evaluated on finite training samples.
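+
+To make the last point concrete, here is a minimal sketch (my own illustration, assuming NumPy; the RBF kernel and the kernel-ridge choice of coefficients are convenient assumptions, not the only option) of a predictor of the form $f(x)=\sum_{i}\alpha_i\,k(x_i,x)$. Even though the RBF feature map is infinite-dimensional, both fitting and prediction only ever touch the $n$ coefficients $\alpha_i$ and kernel evaluations against the training points:
+\begin{verbatim}
+import numpy as np
+
+def rbf(A, B, gamma=1.0):
+    # k(a, b) = exp(-gamma * ||a - b||^2), evaluated for all pairs of rows
+    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
+    return np.exp(-gamma * d2)
+
+rng = np.random.default_rng(1)
+X = rng.uniform(-3, 3, size=(40, 1))              # training inputs
+y = np.sin(X[:, 0]) + 0.1 * rng.normal(size=40)   # noisy targets
+
+lam = 0.1
+K = rbf(X, X)                                     # Gram matrix on the training set
+alpha = np.linalg.solve(K + lam * np.eye(len(X)), y)   # kernel ridge coefficients
+
+def f(x_new):
+    # f(x) = sum_i alpha_i k(x_i, x): only similarities to training points are needed
+    return rbf(x_new, X) @ alpha
+
+print(f(np.array([[0.5]])))                       # prediction at a new point
+\end{verbatim}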
+ This formulation transforms the prediction task into a problem of computing + similarities between data points, rather than directly optimizing over high-dimensional parameters. + + In this sense, kernel methods serve as a bridge between \emph{parametric} and + \emph{non-parametric} models (such as $k$-Nearest Neighbors), + combining the expressive flexibility of similarity-based approaches with the + theoretical rigor of parameterized optimization frameworks. +\section{Dual Form of Linear Regression} +Consider the regularized least squares problem: +\begin{equation} + \min_{w} \; J(w) + = \frac{1}{2} \sum_{i \in [n]} \bigl(y_i - w^\top \varphi(x_i)\bigr)^2 + + \frac{1}{2}\lambda\, w^\top w, +\end{equation} +where $\varphi(x_i)$ is the feature mapping and $\lambda > 0$ is a regularization coefficient. +Taking the derivative of $J(w)$ with respect to $w$ gives +\begin{equation} + \frac{\partial J}{\partial w} + = \sum_{i \in [n]} \bigl(w^\top \varphi(x_i) - y_i\bigr)\varphi(x_i) + + \lambda w = 0. +\end{equation} + +Rearranging terms, we can express $w$ as a linear combination of feature vectors: +\begin{equation} + w = \sum_{i \in [n]} \alpha_i\, \varphi(x_i), +\end{equation} +where $\alpha_i \in \mathbb{R}$ are the dual coefficients. + +Let the feature matrix be +\begin{equation} + \Phi = + \begin{pmatrix} + \varphi(x_1)^\top \\ + \varphi(x_2)^\top \\ + \vdots \\ + \varphi(x_n)^\top + \end{pmatrix} + \in \mathbb{R}^{n \times d}. +\end{equation} +Then $w$ can be compactly written as +\begin{equation} + w = \Phi^\top \alpha. +\end{equation} + +Substituting into the objective function yields: +\begin{equation} + J(w) + = \frac{1}{2} (\Phi w - y)^\top (\Phi w - y) + + \frac{1}{2}\lambda\, w^\top w. +\end{equation} +By substituting $w = \Phi^\top \alpha$, we obtain: +\begin{equation} + J(\alpha) + = \frac{1}{2} (\Phi \Phi^\top \alpha - y)^\top (\Phi \Phi^\top \alpha - y) + + \frac{1}{2}\lambda\, \alpha^\top \Phi \Phi^\top \alpha. +\end{equation} + +Defining the kernel matrix (\textbf{Gram Matrix}): +\begin{equation} + K = \Phi \Phi^\top, \quad K_{ij} = \langle \varphi(x_i), \varphi(x_j) \rangle = k(x_i, x_j), +\end{equation} +the dual objective becomes: +\begin{equation} + J(\alpha) + = \frac{1}{2} (K\alpha - y)^\top (K\alpha - y) + + \frac{1}{2}\lambda\, \alpha^\top K \alpha. +\end{equation} +By derivation: +\begin{gather} + J(\alpha)=\frac{1}{2}\alpha^\top KK\alpha+\frac{1}{2}y^\top y-\alpha^\top Ky+\frac{\lambda}{2}\alpha^\top K\alpha\\ + \frac{\partial J(\alpha)}{\partial\alpha}=KK\alpha-Ky+\lambda K\alpha=0 +\end{gather} +If $K\succ 0$: +\begin{equation} + (K + \lambda I)\alpha = y\quad \Rightarrow\quad \alpha=(K+\lambda I)^{-1} y +\end{equation} + +By simplification, notice that $w=\Phi^\top\alpha$, the prediction function can be expressed as: +\begin{equation} + f(x) = w^\top \varphi(x) + = \begin{pmatrix} + k(x_1, x)\\ + \vdots\\ + k(x_n, x) + \end{pmatrix}(K+\lambda I)^{-1} y . 
+\end{equation} +\end{document} \ No newline at end of file diff --git a/notes/2025/mvp/chapters/6-lt.pdf b/notes/2025/mvp/chapters/6-lt.pdf new file mode 100644 index 0000000..54313db Binary files /dev/null and b/notes/2025/mvp/chapters/6-lt.pdf differ diff --git a/notes/2025/mvp/chapters/6-lt.tex b/notes/2025/mvp/chapters/6-lt.tex new file mode 100644 index 0000000..58fbc13 --- /dev/null +++ b/notes/2025/mvp/chapters/6-lt.tex @@ -0,0 +1,630 @@ +\documentclass[../main]{subfiles} +\begin{document} +\chapter{Learning Theory} +\begin{introduction} + \item In-Sample Error + \item Out-of-Sample Error + \item Generalization Error + \item PAC + \item Shelah's Lemma + \item VC-Dim +\end{introduction} +\section{Learning Theory} +Learning theory forms the foundation of machine learning. It explains why a model can generalize to data that did not appear during training. We may draw an analogy here: +\begin{itemize} +\item Training a model $\to$ Taking a course; +\item Training process $\to$ Doing exercises or homework; +\item Testing phase $\to$ Taking an exam. +\end{itemize} +From our own learning experience, we know that students who perform well during practice do not always achieve high scores in exams. Likewise, a model that fits the training data well may still perform poorly on unseen data. This motivates us to introduce theoretical concepts that can quantitatively describe and explain such phenomena. + +\begin{definition}[In-sample Error] + The \emph{in-sample error} $E_{\rm in}$ is defined as the empirical error rate of a hypothesis evaluated on the training dataset. + \end{definition} + + Let $h \in \mathcal{H}$ denote a hypothesis or model, for example a linear classifier $h(x) := \mathrm{sign}(w^\top x + b)$. + Then the in-sample error can be formally written as + \begin{equation} + E_{\rm in} = \frac{1}{n}\sum_{i \in [n]} \mathbb{1}\!\big(h(x_i) \neq y_i\big), + \qquad D := \{(x_i, y_i)\,|\, i \in [n]\}. + \end{equation} +Here $D$ denotes the training dataset, and each $y_i \in \{-1,1\}$ is the ground-truth label prepared for the classification task. + +Oppositely, we also wish to evaluate the model's ability to perform well on unseen or more general tasks. +This motivates the definition of a new concept: + +\begin{definition}[Out-of-sample Error] +The \emph{out-of-sample error} $E_{\rm out}$ measures how well a model generalizes beyond the training data. +\end{definition} + +In the previous example, $E_{\rm out}$ can be expressed as +\begin{equation} + E_{\rm out} = \mathbb{P}\big(h(x) \neq y\big) + = \mathbb{E}_{(x,y)\sim p_{xy}}\!\left[\mathbb{1}\big(h(x) \neq y\big)\right], +\end{equation} +where $(x,y) \sim p_{xy}$ are i.i.d. samples drawn from the true underlying data distribution. + +To quantitatively capture the generalization ability of a model, we define the following notion: + +\begin{definition}[Generalization Error] +The \emph{generalization error} is defined as the difference between the out-of-sample and in-sample errors: +\begin{equation} + E_{\rm out}(h) - E_{\rm in}(h). +\end{equation} +\end{definition} + +\begin{remark} +In practice, we often introduce regularization terms to prevent $E_{\rm in}$ from becoming excessively small, which can help improve generalization performance. +\end{remark} + +Our ultimate goal is to bound the quantity $E_{\rm out}(h) - E_{\rm in}(h)$. +However, it remains an open question whether such a bound can be feasibly obtained for arbitrary hypotheses. 
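+
+The phenomenon illustrated in the figure below is easy to reproduce numerically. The following sketch (my own illustration, assuming NumPy; the 1-nearest-neighbor rule is just a convenient memorizing classifier) trains on randomly labeled data: it attains $E_{\rm in}=0$ by construction, while $E_{\rm out}$ stays near $1/2$ because the labels carry no signal at all.
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(0)
+n, d = 100, 2
+X_train = rng.normal(size=(n, d))
+y_train = rng.choice([-1, 1], size=n)            # labels are pure noise
+X_test = rng.normal(size=(10000, d))
+y_test = rng.choice([-1, 1], size=10000)
+
+def predict_1nn(x):
+    # memorize the training set: copy the label of the nearest training point
+    i = np.argmin(((X_train - x) ** 2).sum(axis=1))
+    return y_train[i]
+
+E_in = np.mean([predict_1nn(x) != y for x, y in zip(X_train, y_train)])
+E_out = np.mean([predict_1nn(x) != y for x, y in zip(X_test, y_test)])
+print(E_in, E_out)   # E_in = 0.0 exactly; E_out is close to 0.5
+\end{verbatim}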
+\begin{figure}[H] + \centering + \includegraphics[scale=0.8]{../../tikz/6/1.pdf} + \caption{An example where $E_{\rm in}=0$ but $E_{\rm out}$ remains large.} +\end{figure} + +It is crucial to acknowledge that, in essence, no finite dataset can guarantee a definitive prediction of future outcomes. +The optimization of a model solely based on minimizing $E_{\rm in}(h)$ does not ensure a small generalization gap $E_{\rm out}(h) - E_{\rm in}(h)$, which may in fact be arbitrarily large. +\textbf{However}, according to probabilistic learning theory, we can establish that with high probability $(1 - \delta)$, where $\delta \ll 1$, the generalization error is bounded by a small quantity $\varepsilon(\mathcal{H}, n, \delta)$ depending on the hypothesis space $\mathcal{H}$, the number of samples $n$, and the confidence parameter $\delta$: +\begin{equation} + \mathbb{P}\!\left( + |E_{\rm out}(h) - E_{\rm in}(h)| \le \varepsilon(\mathcal{H}, n, \delta) + \right) \ge 1 - \delta. +\end{equation} +This forms the cornerstone of statistical learning theory: while perfect certainty is unattainable, we can quantify uncertainty and guarantee performance with high probability. + +The theorem above leads to the framework known as the \textbf{Probably Approximately Correct (PAC) Learning Theory}. +Before introducing the PAC formulation in detail, we first review several fundamental mathematical tools. + +\begin{theorem}[Hoeffding's Inequality] +Let $X_1, \dots, X_n$ be independent random variables such that each $X_i$ is almost surely bounded, $X_i \in [a_i, b_i]$. +Define the empirical mean $\bar{X} = \frac{1}{n}\sum_{i=1}^n X_i$. Then, for any $\varepsilon > 0$, we have +\begin{equation} + \mathbb{P}\!\left(|\bar{X} - \mathbb{E}\bar{X}| \ge \varepsilon\right) + \le 2\exp\!\left(-\frac{2n^2\varepsilon^2}{\sum_{i=1}^n (b_i - a_i)^2}\right). +\end{equation} +\end{theorem} +\begin{proof} + Let $S = \sum_{i=1}^n X_i$. We first bound the upper tail $\mathbb{P}(S - \mathbb{E}S \ge t)$; the lower tail is analogous and the two-sided bound follows by union. + + By Markov's inequality, for any $\lambda > 0$, + \begin{equation} + \mathbb{P}(S - \mathbb{E}S \ge t) + = \mathbb{P}\!\left(e^{\lambda(S - \mathbb{E}S)} \ge e^{\lambda t}\right) + \le e^{-\lambda t}\,\mathbb{E}\,e^{\lambda(S - \mathbb{E}S)}. + \end{equation} + + Because $X_1,\dots,X_n$ are independent, + \begin{equation} + \mathbb{E}\,e^{\lambda(S - \mathbb{E}S)} + = \prod_{i=1}^n \mathbb{E}\,e^{\lambda(X_i - \mathbb{E}X_i)}. + \end{equation} + + For each bounded random variable $Y \in [a,b]$ with $\mathbb{E}Y = \mu$, Hoeffding's lemma gives + \begin{equation} + \mathbb{E}\,e^{\lambda(Y - \mu)} \le \exp\!\left(\frac{\lambda^2(b - a)^2}{8}\right). + \end{equation} + + Applying this to each $X_i$ yields + \begin{equation} + \mathbb{E}\,e^{\lambda(S - \mathbb{E}S)} + \le \exp\!\left(\frac{\lambda^2}{8}\sum_{i=1}^n (b_i - a_i)^2\right). + \end{equation} + + Hence, + \begin{equation} + \mathbb{P}(S - \mathbb{E}S \ge t) + \le \exp\!\left(-\lambda t + \frac{\lambda^2}{8}\sum_{i=1}^n (b_i - a_i)^2\right). + \end{equation} + + Optimizing over $\lambda > 0$, we find + \begin{equation} + \lambda^\star = \frac{4t}{\sum_{i=1}^n (b_i - a_i)^2}, + \end{equation} + which gives + \begin{equation} + \mathbb{P}(S - \mathbb{E}S \ge t) + \le \exp\!\left(-\frac{2t^2}{\sum_{i=1}^n (b_i - a_i)^2}\right). + \end{equation} + + By symmetry, + \begin{equation} + \mathbb{P}(S - \mathbb{E}S \le -t) + \le \exp\!\left(-\frac{2t^2}{\sum_{i=1}^n (b_i - a_i)^2}\right). 
+ \end{equation} + + Combining both tails gives + \begin{equation} + \mathbb{P}\!\left(|S - \mathbb{E}S| \ge t\right) + \le 2\exp\!\left(-\frac{2t^2}{\sum_{i=1}^n (b_i - a_i)^2}\right). + \end{equation} + + Finally, setting $t = n\varepsilon$ and noting $\bar X = S/n$, we obtain + \begin{equation} + \mathbb{P}\!\left(|\bar X - \mathbb{E}\bar X| \ge \varepsilon\right) + \le 2\exp\!\left(-\frac{2n^2\varepsilon^2}{\sum_{i=1}^n (b_i - a_i)^2}\right). + \end{equation} +\end{proof} +Now, for a \textbf{given and fixed} hypothesis $h$, we obtain the following result: + +\begin{theorem}[First Bound of PAC Learning] +For any $\varepsilon > 0$, +\begin{equation} + \mathbb{P}\!\left(E_{\rm out}(h) - E_{\rm in}(h) \ge \varepsilon\right) + \le \exp(-2n\varepsilon^2). +\end{equation} +\end{theorem} + +Here, $E_{\rm in}(h)$ serves as the empirical mean of the random variables +$\mathbb{1}\!\big(h(x_i) \neq y_i\big)$, as defined previously. +In other words, each indicator $\mathbb{1}\!\big(h(x_i) \neq y_i\big)$ represents a bounded random variable in $[0,1]$, +and Hoeffding’s inequality ensures that the deviation between the empirical error $E_{\rm in}(h)$ +and the expected error $E_{\rm out}(h)$ decreases exponentially with the number of samples $n$. +\begin{proof} + By taking expectation over the definition of $E_{\rm in}$, we have + \begin{align*} + \mathbb{E}[\bar{X}] + &= \frac{1}{n}\sum_{i=1}^n \mathbb{E}[X_i] \\ + &= \frac{1}{n}\sum_{i=1}^n \mathbb{E}_{(x_i,y_i)\sim p_{xy}} + \big[\mathbb{1}(h(x_i)\neq y_i)\big] \\ + &= \frac{1}{n}\sum_{i=1}^n E_{\rm out}(h) + = E_{\rm out}(h). + \end{align*} + The third equality follows directly from the definition of $E_{\rm out}(h)$. + Applying Hoeffding’s inequality, note that each random variable $\mathbb{1}(h(x_i)\neq y_i)$ is bounded in $\{0,1\}$, + so $(b_i - a_i)^2 = 1$ for all $i$, and the denominator $\sum_{i=1}^n (b_i - a_i)^2 = n$ cancels accordingly. + This completes the proof. + \end{proof} + + However, there is a fundamental limitation in the condition of this first PAC bound: + the hypothesis $h$ must be fixed \emph{before} observing the training data. + In practice, the learning algorithm selects $h$ based on the dataset $D$, + which introduces statistical dependence between $h$ and $D$. (Parameters of model $h$ is determined by the chosen training dataset $D$) + This dependence violates the independence assumption required by Hoeffding’s inequality, + and therefore the bound no longer holds directly once $h$ is data-dependent. + + To overcome this limitation, we turn to analyzing the \textbf{union bound} over the hypothesis space $\mathcal{H}$, + which removes the dependency on a single fixed hypothesis chosen after observing the training data. + We first assume that $\mathcal{H}$ is a finite set, which actually is not always satisfied in practice + (e.g., the space of linear models has infinite cardinality, $|\mathrm{GL}_n| = \infty$). + That is, let $\mathcal{H} = \{h_1, \dots, h_M\}$. + + In the worst case, we have + \begin{align*} + \mathbb{P}\!\left(\exists\, h \in \mathcal{H} : E_{\rm out}(h) - E_{\rm in}(h) \ge \varepsilon \right) + &= \mathbb{P}\!\left(\bigcup_{i=1}^M \{E_{\rm out}(h_i) - E_{\rm in}(h_i) \ge \varepsilon\}\right) \\ + &\!\!\!\!\!\!\!\!\!\underbrace{\le}_{\text{Union Bound}} \sum_{i=1}^M + \mathbb{P}\!\left(E_{\rm out}(h_i) - E_{\rm in}(h_i) \ge \varepsilon\right) \\ + &\le M \cdot \exp(-2n\varepsilon^2). 
+ \end{align*} + + Hence, we obtain the following result: + + \begin{theorem}[First Practical Bound of PAC Learning] + With probability at least $1-\delta$, + \begin{equation} + E_{\rm out}(h_i) - E_{\rm in}(h_i) + < \sqrt{\frac{1}{2n}\log\frac{M}{\delta}}. + \end{equation} + \label{theo-6.3} + \end{theorem} +\begin{remark} + \begin{equation} + \delta:=M\cdot\exp(-2n\varepsilon^2)\quad\Leftrightarrow \quad \varepsilon=\sqrt{\frac{1}{2n}\log\frac{M}{\delta}} + \end{equation} +\end{remark} + This result reveals a fundamental trade-off between model complexity and generalization. + As the hypothesis space $\mathcal{H}$ becomes richer, the total number of possible hypotheses $M$ increases. + A larger $M$ expands the logarithmic term $\log\frac{M}{\delta}$, thereby loosening the bound on $E_{\rm out}-E_{\rm in}$. + In other words, even though a highly expressive model may achieve $E_{\rm in}=0$ by perfectly fitting the training data, + its generalization gap can still remain large. + To maintain reliable generalization for such expressive models, one must increase the sample size $n$ accordingly. + This highlights the essential balance between data volume and model capacity in learning theory. + + Now consider an infinite hypothesis space $\mathcal{H}$, for example the reproducing kernel Hilbert space (RKHS) + of linear classifiers $\{h : h(x) = w^\top x + b\}$. + We claim that the bound provided in the theorem above becomes an extreme overestimation in such cases. + Intuitively, the union bound treats all hypotheses as completely distinct, + even if many of them make identical predictions on the dataset. + For example, the following three classifiers $h_1$, $h_2$, and $h_3$ may produce exactly the same classification results, + yet the union bound still counts them as three separate events, + thereby inflating the estimated probability by a factor of three. +\begin{figure}[H] +\centering +\includegraphics[scale=0.9]{../../tikz/6/2.pdf} + +\end{figure} +\begin{note} + A natural idea is to quotient out an equivalence relation on $\mathcal{H}$ so that its effective measure decreases. +This leads us to the concept of \emph{dichotomy}, where we seek an isomorphism that maps hypotheses producing identical classification results to the same image, thereby identifying them within a common kernel space. +By quotienting out the equivalence classes induced by $\ker$, which acts as a normal subgroup, +we obtain a new structure—still a group (or more appropriately here, a reduced hypothesis space)— +a space of smaller measure, serving as a kind of \emph{renormalized} hypothesis space. +\end{note} + +Assume a binary classification setting +\begin{equation} + y \in \{-1,+1\}, \qquad x \in \mathcal{X}, +\end{equation} +with hypothesis space +\begin{equation} + \mathcal{H} = \{h:\mathcal{X}\to\{-1,+1\}\}. +\end{equation} + +Given $n$ samples +\begin{equation} + x_1,x_2,\dots,x_n \in \mathcal{X}, +\end{equation} +for any $h\in\mathcal{H}$ we obtain an $n$–tuple of labels +\begin{equation} + \bigl(h(x_1),h(x_2),\dots,h(x_n)\bigr) \in \{-1,+1\}^n, +\end{equation} +which is called a \emph{dichotomy} of the set $\{x_1,\dots,x_n\}$ induced by $\mathcal{H}$. 
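+
+Before formalizing how many dichotomies a class can realize, a small brute-force computation helps build intuition. The sketch below (my own illustration, assuming NumPy; the one-dimensional threshold class $h_t(x)=\operatorname{sign}(x-t)$ is an assumed toy example, not one from the lecture) enumerates the dichotomies realized on $n$ distinct points: only $n+1$ of the $2^n$ possible labelings appear, since the labeling depends only on which gap the threshold $t$ falls into.
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(0)
+n = 6
+x = np.sort(rng.uniform(0, 1, size=n))   # n distinct sample points on the real line
+
+# h_t(x) = sign(x - t); one candidate threshold per "gap" between points suffices
+thresholds = np.concatenate(([x[0] - 1], (x[:-1] + x[1:]) / 2, [x[-1] + 1]))
+dichotomies = {tuple(np.where(x > t, 1, -1)) for t in thresholds}
+
+print(len(dichotomies), 2 ** n)          # 7 distinct labelings out of 2^6 = 64
+\end{verbatim}
+The growth function defined next is exactly this count of distinct dichotomies, maximized over the choice of the $n$ sample points.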
+ +The growth function $m_{\mathcal{H}}(n)$ is defined as: +\begin{definition}[Growth Function] + \begin{equation} + m_{\mathcal{H}}(n) + = \max_{x_1,\dots,x_n \in \mathcal{X}} + \Bigl|\bigl\{(h(x_1),\dots,h(x_n)) : h\in\mathcal{H}\bigr\}\Bigr|, + \end{equation} +\end{definition} +i.e., the maximum number of distinct dichotomies that $\mathcal{H}$ can implement +on any set of $n$ points. It measures the effective size of the hypothesis space +$\mathcal{H}$. +\begin{note} + The growth function can be interpreted as a quotient construction. + Define the evaluation map + \begin{equation} + \Phi_n : \mathcal{H} \to \{-1,+1\}^n, \qquad + \Phi_n(h) = (h(x_1), h(x_2), \dots, h(x_n)). + \end{equation} + This map induces an equivalence relation on $\mathcal{H}$: + \begin{equation} + h_1 \sim h_2 \quad \Leftrightarrow \quad h_1(x_i) = h_2(x_i), \ \forall i = 1, \dots, n. + \end{equation} + Each equivalence class corresponds to one unique dichotomy, + so two hypotheses are considered identical if they generate the same labeling + on the given sample set. Therefore, the growth function can be written as + \begin{equation} + m_{\mathcal{H}}(n) = \bigl|\mathcal{H} / \ker(\Phi_n)\bigr|, + \end{equation} + representing the number of equivalence classes of $\mathcal{H}$ under this relation. + Intuitively, this quotient construction mods out by the kernel of the evaluation map, + focusing only on the empirically distinguishable behaviors of hypotheses on $n$ samples. + \end{note} + Since there are at most $2^n$ different $\pm1$ labelings on $n$ points, we always have +\begin{equation} + m_{\mathcal{H}}(n) \le 2^n. +\end{equation} + +Now let $S = \{x_1,\dots,x_n\} \subset \mathcal{X}$ be a finite subset. +We say that $\mathcal{H}$ \emph{shatters} $S$ if it can realize all possible +$\pm1$ assignments on $S$, that is, +\begin{equation} + \mathcal{H}(x_1,\dots,x_n) = \{-1,+1\}^n. +\end{equation} +Equivalently, for every labeling $(y_1,\dots,y_n) \in \{-1,+1\}^n$ +there exists some $h \in \mathcal{H}$ such that $h(x_i) = y_i$ for all $i=1,\dots,n$. +\begin{example} + Consider three sample points $x_1, x_2, x_3$ and a hypothesis set $\mathcal{H}$ + that induces the following dichotomies: + \begin{equation} + \mathcal{H}(x_1,x_2,x_3) + = \{ (+1,-1,-1),\; (-1,+1,-1),\; (-1,+1,+1) \}. + \end{equation} +Determine which subsets of $\{x_1, x_2, x_3\}$ can be \emph{shattered} by $\mathcal{H}$. +\end{example} +\begin{solution} + \begin{itemize} + \item For any single point $\{x_i\}$, both $+1$ and $-1$ appear in the projections + of $\mathcal{H}(x_1,x_2,x_3)$, so every singleton subset is shattered. + + \item For pairs of points: + \begin{align*} + \{x_1, x_2\} &\Rightarrow \{(+1,-1),\, (-1,+1)\} \quad + \text{(missing $(+1,+1)$ and $(-1,-1)$) $\Rightarrow$ not shattered;} \\ + \{x_1, x_3\} &\Rightarrow \{(+1,-1),\, (-1,+1),\, (-1,-1)\} \quad + \text{(missing $(+1,+1)$) $\Rightarrow$ not shattered;} \\ + \{x_2, x_3\} &\Rightarrow \{(-1,-1),\, (+1,-1),\, (+1,+1)\} \quad + \text{(all $2^2=4$ combinations not present) $\Rightarrow$ not shattered.} + \end{align*} + + \item For the full set $\{x_1,x_2,x_3\}$, since only $3<2^3=8$ dichotomies are realized, + it is clearly not shattered. + \end{itemize} + + Hence, $\mathcal{H}$ can only shatter the \emph{singleton subsets} + $\{x_1\}$, $\{x_2\}$, and $\{x_3\}$, but no pair or triple of points. 
+\end{solution} +\begin{example} + Let $\mathcal{X} = \mathbb{R}^2$, and consider the hypothesis set of all linear classifiers + \begin{equation} + \mathcal{H} = \{\, h_{w,b}(x) = \operatorname{sign}(w^\top x + b) \mid w \in \mathbb{R}^2,\, b \in \mathbb{R} \,\}. + \end{equation} + Determine the value of $m_{\mathcal{H}}(3)$, the maximum number of dichotomies that $\mathcal{H}$ + can produce on any $3$ points in $\mathbb{R}^2$. +\end{example} +\begin{solution} + + +A separating hyperplane in $\mathbb{R}^2$ is a line of the form $w^\top x + b = 0$. +The sign of $w^\top x + b$ assigns each point to one of two half-spaces. + +For any given labeling $(y_1,y_2,y_3)\in\{-1,+1\}^3$, +we can always find some line that places the $+1$ points on one side and the $-1$ points on the other, +as long as the three points are not collinear. +This is because two points determine a line, and the third point’s label can always be satisfied +by shifting or rotating that line slightly. + +Hence every possible assignment of $\pm1$ labels to the three points is realizable: +\begin{equation} + \mathcal{H}(x_1,x_2,x_3) + = \{-1,+1\}^3. +\end{equation} + +Therefore, the growth function value is +\begin{equation} + m_{\mathcal{H}}(3) = 2^3 = 8. +\end{equation} +\end{solution} +\begin{remark} + More examples have been presented in class, but I'm not goint to note these in our lecture. +\end{remark} +$m_{\mathcal H}(n)$ quantified the ability of a model. +\section{VC-Dim} + +\begin{definition}[Vapnik--Chervonenkis Dimension] +The \emph{VC dimension} $\mathrm{dvc}(\mathcal{H})$ is the largest integer $n$ such that +\begin{equation} + m_{\mathcal{H}}(n) = 2^n. +\end{equation} +\end{definition} + +\begin{remark} +Vapnik is the inventor of the Support Vector Machine (SVM) and one of the founders of +statistical learning theory. +\end{remark} + +The VC dimension measures the \emph{capacity} or \emph{expressive power} of a hypothesis class. +Formally, it is the largest integer $n$ such that $\mathcal{H}$ can realize all +$2^n$ possible dichotomies on some set of $n$ points. +Equivalently, it quantifies how complex the decision boundaries in $\mathcal{H}$ can be: +a larger VC dimension means $\mathcal{H}$ can represent more distinct labelings of data, +i.e.\ greater capacity. + +\smallskip +It turns out that if $\mathrm{dvc}(\mathcal{H})$ is finite, +then $m_{\mathcal{H}}(n)$ can be bounded in terms of $\mathrm{dvc}(\mathcal{H})$. + +\begin{theorem}[(Sauer--)Shelah's Lemma] +Let $\mathcal{H}$ be a class of binary-valued functions on a domain $\mathcal{X}$. +Define its growth function as +\begin{equation} + m_{\mathcal{H}}(n) + = \max_{\substack{S \subseteq \mathcal{X} \\ |S| = n}} + \bigl|\{\, (h(x))_{x \in S} : h \in \mathcal{H} \}\bigr|. +\end{equation} +If $\mathrm{dvc}(\mathcal{H}) = d < \infty$, then for all $n \ge 0$, +\begin{equation} + m_{\mathcal{H}}(n) + \le \sum_{i=0}^{d} \binom{n}{i}\sim O(n^{d}) +\end{equation} +\end{theorem} +\begin{proof} + We proceed by induction on $n$. The base case is trivial and we proved that in the next proof for completeness. + + \textbf{Step 1: Divide the hypothesis class.} + Fix $x_1, \dots, x_n \in \mathcal{X}$, and define + \begin{align} + \mathcal{H}_1(x_1,\dots,x_{n-1}) + &= \{ (h(x_1), \dots, h(x_{n-1})) \mid h \in \mathcal{H},\, h(x_n)=+1 \}, \\ + \mathcal{H}_2(x_1,\dots,x_{n-1}) + &= \{ (h(x_1), \dots, h(x_{n-1})) \mid h \in \mathcal{H},\, h(x_n)=-1 \}. 
+ \end{align} + Then + \begin{equation} + \mathcal{H}(x_1,\dots,x_n) + = \{ (\mathbf{v}, +1) \mid \mathbf{v} \in \mathcal{H}_1(x_1,\dots,x_{n-1}) \} + \cup + \{ (\mathbf{v}, -1) \mid \mathbf{v} \in \mathcal{H}_2(x_1,\dots,x_{n-1}) \}. + \end{equation} + + \textbf{Step 2: Counting distinct dichotomies.} + If $\mathbf{v}$ belongs to both $\mathcal{H}_1$ and $\mathcal{H}_2$, then $\mathcal{H}$ realizes both labelings $(\mathbf{v}, +1)$ and $(\mathbf{v}, -1)$. + Hence, + \begin{equation} + |\mathcal{H}(x_1,\dots,x_n)| + \le |\mathcal{H}_1(x_1,\dots,x_{n-1})| + + |\mathcal{H}_2(x_1,\dots,x_{n-1})|. + \end{equation} + + \textbf{Step 3: Bounding the VC dimension.} + If $\mathrm{dvc}(\mathcal{H}) = d$, then $\mathrm{dvc}(\mathcal{H}_1) \le d$ and $\mathrm{dvc}(\mathcal{H}_2) \le d-1$. + Otherwise, if $\mathcal{H}_2$ could shatter a subset of size $d$, then $\mathcal{H}$ could shatter that subset together with $x_n$, contradicting $\mathrm{dvc}(\mathcal{H}) = d$. + + \textbf{Step 4: Apply the induction hypothesis.} + By the inductive assumption, + \begin{equation} + |\mathcal{H}_1(x_1,\dots,x_{n-1})| \le \sum_{i=0}^{d} \binom{n-1}{i}, + \qquad + |\mathcal{H}_2(x_1,\dots,x_{n-1})| \le \sum_{i=0}^{d-1} \binom{n-1}{i}. + \end{equation} + Therefore, + \begin{align} + |\mathcal{H}(x_1,\dots,x_n)| + &\le \sum_{i=0}^{d} \binom{n-1}{i} + + \sum_{i=0}^{d-1} \binom{n-1}{i} \\ + &= \sum_{i=0}^{d} \left( \binom{n-1}{i} + \binom{n-1}{i-1} \right) + = \sum_{i=0}^{d} \binom{n}{i}, + \end{align} + where the last equality uses Pascal's identity. + Hence the claim holds for all $n$. + \end{proof} + \begin{tcolorbox} + \vspace{0.5em} + \color{red!50!black}$\circledast $\quad\emph{{The following proof is included out of personal interest and is not part of the course material. + Readers who wish to focus solely on the course content may safely skip to the next part.}} + \vspace{0.5em} + \end{tcolorbox} +\begin{proof} + Here we provide another proof. + + Fix $n$ and choose $S \subseteq \mathcal{X}$ with $|S|=n$ + such that $m_{\mathcal{H}}(n)=|\mathcal{H}|_S|$, where + \begin{equation} + \mathcal{H}|_S + := \{\, (h(x))_{x\in S} : h\in\mathcal{H} \} + \end{equation} + is the set of labelings realized on $S$. + We prove by induction on $n$ that + \begin{equation} + |\mathcal{H}|_S| + \le \sum_{i=0}^{d} \binom{n}{i}. + \end{equation} + + \textbf{Base cases.} + If $n=0$, then $|\mathcal{H}|_S|=1$ and + \begin{equation} + \sum_{i=0}^{d} \binom{0}{i} = 1, + \end{equation} + so the claim holds. + If $d=0$, then no single point can be labeled in two different ways, + hence $|\mathcal{H}|_S|=1$ for all $n$, while + \begin{equation} + \sum_{i=0}^{0} \binom{n}{i} = 1, + \end{equation} + so the bound again holds. + + \textbf{Inductive step.} + Assume the statement true for all pairs $(n',d')$ with $n' < n$ + and the same $d$. Pick some $x \in S$ and set + \begin{equation} + S' := S \setminus \{x\}, \qquad |S'| = n-1. + \end{equation} + For each $h\in\mathcal{H}$, let $h|_{S'}$ be its restriction to $S'$. + Define + \begin{equation} + \mathcal{A} + := \{\, h|_{S'} : h\in\mathcal{H} \}, + \end{equation} + \begin{equation} + \mathcal{B} + := \bigl\{\, h|_{S'} : h\in\mathcal{H},\; + \exists\,h' \in \mathcal{H} + \text{ with } h'(x)\neq h(x),\ h'|_{S'}=h|_{S'} \bigr\}. + \end{equation} + Thus, $\mathcal{A}$ is the set of all restrictions on $S'$, + and $\mathcal{B}$ collects those restrictions for which + $\mathcal{H}$ contains at least two hypotheses that agree on $S'$ + but disagree on $x$. 
+ + Every labeling of $S$ comes from choosing a restriction in $\mathcal{A}$, + and then choosing the value at $x$. + Labelings for which the value at $x$ is \emph{forced} (no alternative + hypothesis with the same restriction) contribute at most $|\mathcal{A}|$ + patterns; labelings for which the value at $x$ can be flipped + (i.e.\ restrictions in $\mathcal{B}$) contribute at most $|\mathcal{B}|$ + additional patterns. + Hence + \begin{equation} + |\mathcal{H}|_S| + \le |\mathcal{A}| + |\mathcal{B}|. + \end{equation} + + Note that $|\mathcal{A}| \le m_{\mathcal{H}}(n-1)$, because $\mathcal{A}$ + is a set of labelings on $S'$ of size $n-1$. + To bound $|\mathcal{B}|$, define a new hypothesis class + \begin{equation} + \mathcal{H}_x + := \bigl\{\, h|_{S'} : h\in\mathcal{H},\; + \exists\,h' \in \mathcal{H} + \text{ with } h'(x)\neq h(x),\ h'|_{S'}=h|_{S'} \bigr\}. + \end{equation} + Then $\mathcal{B}$ is precisely $\mathcal{H}_x$ viewed as a set of labelings on $S'$, + so $|\mathcal{B}| \le m_{\mathcal{H}_x}(n-1)$. + + We claim that + \begin{equation} + \mathrm{dvc}(\mathcal{H}_x) \le d-1. + \end{equation} + Otherwise, suppose $\mathcal{H}_x$ shatters some $T \subseteq S'$ with $|T|=d$. + By the definition of $\mathcal{H}_x$, for every labeling $y_T$ on $T$ + and either choice of label $y_x \in \{-1,+1\}$ at $x$, there exists $h\in\mathcal{H}$ + such that + \begin{equation} + h|_{T} = y_T, \qquad h(x) = y_x. + \end{equation} + Hence $\mathcal{H}$ can realize all labelings on $T \cup \{x\}$, + so $\mathcal{H}$ shatters a set of size $d+1$, contradicting + $\mathrm{dvc}(\mathcal{H}) = d$. + Thus the claim holds. + + By the induction hypothesis applied to $\mathcal{H}$ on $n-1$ points + and to $\mathcal{H}_x$ whose VC dimension is at most $d-1$, we obtain + \begin{equation} + |\mathcal{A}| \le \sum_{i=0}^{d} \binom{n-1}{i}, + \qquad + |\mathcal{B}| \le \sum_{i=0}^{d-1} \binom{n-1}{i}. + \end{equation} + Therefore, + \begin{align} + |\mathcal{H}|_S| + &\le \sum_{i=0}^{d} \binom{n-1}{i} + + \sum_{i=0}^{d-1} \binom{n-1}{i} \\ + &= \sum_{i=0}^{d} \left( \binom{n-1}{i} + \binom{n-1}{i-1} \right) \\ + &= \sum_{i=0}^{d} \binom{n}{i}. + \end{align} + \end{proof} + \begin{note} + The inductive structure of the proof for the Sauer--Shelah Lemma is deeply reminiscent + of strategies used in \emph{mathematical logic}, particularly in proving + \emph{completeness theorems} that connect syntactic derivability and semantic truth. + In both settings, we start by imposing an ordering (or enumeration) on a collection + of objects—whether they are formulas, sentences, or finite subsets of $\mathcal{X}$—so that + induction can be applied coherently. + + Just as in logic we construct models step by step according to a fixed sequence + of formulas (often by extending consistent sets in a well-ordered fashion), + here we inductively analyze the growth of $\mathcal{H}$ by removing or reintroducing one + element of the sample set at a time. + The key similarity is structural: both arguments rely on well-founded recursion over + a totally ordered domain to control how local extensions (adding one more formula or one more + point) affect global consistency or combinatorial growth. 
+    \end{note}
+
+    \begin{remark}
+        Plugging this bound into the standard uniform convergence inequality
+        for a finite hypothesis class of size $M$, with $M$ replaced by the growth function $m_{\mathcal{H}}(n)$, we obtain a generalization bound of the form
+        \begin{equation}
+            \varepsilon
+            \;=\;
+            \sqrt{\frac{1}{2n}\,\log \frac{M}{\delta}}
+            \;\sim\;
+            O\!\left(\sqrt{\frac{\mathrm{dvc}\log n}{n}}\right).
+        \end{equation}
+        Thus, when $\mathrm{dvc}(\mathcal{H})$ is finite, the estimation error
+        $\varepsilon$ decreases as $n$ grows, with the complexity of the class
+        entering only through the polynomially growing factor $m_{\mathcal{H}}(n)=O(n^{d})$.
+    \end{remark}
+    It can be rigorously shown that:
+
+    \begin{theorem}[VC Generalization Bound]
+        With probability at least $1-\delta$, the following inequality holds simultaneously for every $h\in\mathcal{H}$:
+        \begin{equation}
+            E_{\rm out}(h) - E_{\rm in}(h)
+            < \sqrt{\frac{8}{n}\log\frac{4m_{\mathcal{H}}(2n)}{\delta}}.
+        \end{equation}
+        Moreover, since $m_{\mathcal{H}}(2n)=O(n^{\mathrm{d_{VC}}})$, the right-hand side satisfies
+        \begin{equation}
+            \sqrt{\frac{8}{n}\log\frac{4m_{\mathcal{H}}(2n)}{\delta}}
+            = O\!\left(\sqrt{\frac{\mathrm{d_{VC}}\log n}{n}}\right)
+            \xrightarrow[n\to\infty]{} 0.
+        \end{equation}
+    \end{theorem}
+
+    Although the VC generalization bound is relatively loose in practice,
+    its importance lies in replacing the size of an inherently infinite hypothesis space with the finite, polynomially growing quantity $m_{\mathcal{H}}(n)$.
+    This theorem establishes a concrete analytical foundation for the entire field of machine learning,
+    demonstrating that with sufficient data, generalization is not merely empirical, but theoretically guaranteed.
+\end{document}
\ No newline at end of file
diff --git a/notes/2025/mvp/chapters/preface.pdf b/notes/2025/mvp/chapters/preface.pdf
new file mode 100644
index 0000000..caf2fcc
Binary files /dev/null and b/notes/2025/mvp/chapters/preface.pdf differ
diff --git a/notes/2025/mvp/chapters/preface.tex b/notes/2025/mvp/chapters/preface.tex
new file mode 100644
index 0000000..3abe4d6
--- /dev/null
+++ b/notes/2025/mvp/chapters/preface.tex
@@ -0,0 +1,57 @@
+\documentclass[../main]{subfiles}
+\begin{document}
+\chapter*{Preface}
+This lecture note is written to address the absence of a well-formatted, illustrated reference for this course.
+It began as spontaneous notes taken during class, and has gradually evolved through the careful revisions of the instructor and teaching assistants, as well as polishing assisted by large language models.
+
+Some parts may remain incomplete or insufficiently elaborated. I sincerely ask for your understanding.
+If you find any typos or inaccuracies, or have suggestions for improvement, you are warmly welcome to open an issue on GitHub:
+
+\begin{center}
+\url{https://github.com/PhotonYan/MachineLearningCourse2025}
+\end{center}
+
+\begin{flushright}
+    \textsc{Shaoheng Yan} \\[3pt]
+    \scriptsize\textit{Tong Class 2024, Yuanpei College} \\
+    \textit{M$\mu$ Lab, Institute for Artificial Intelligence (IAI)} \\[3pt]
+    \textit{Peking University, November 2025}
+\end{flushright}
+\begin{center}
+    { \large \textbf{Environments in This Note}}
+\end{center}
+
+
+This lecture note adopts a unified visual style for mathematical structures and remarks.
+Below are the environments used throughout the text.
+
+\begin{definition}
+\end{definition}
+
+\begin{theorem}
+\end{theorem}
+
+\begin{proposition}
+\end{proposition}
+
+\begin{remark}
+    Remark represents supplementary comments made by the instructor during class.
+\end{remark}
+
+\begin{note}
+    Note indicates additional reflections by the author, which the reader may skip if desired.
+\end{note}
+
+\begin{example}
+\end{example}
+
+% \begin{exercise}
+% \end{exercise}
+
+\begin{proof}
+\end{proof}
+\begin{solution}
+
+\end{solution}
+
+\end{document}
\ No newline at end of file
diff --git a/notes/2025/mvp/elegantbook.cls b/notes/2025/mvp/elegantbook.cls
index 6eb9bea..93f7f27 100644
--- a/notes/2025/mvp/elegantbook.cls
+++ b/notes/2025/mvp/elegantbook.cls
@@ -914,16 +914,18 @@
      lower separated=false,
      % before upper={\setlength{\parindent}{\normalparindent}},
      coltitle=white,
+      after skip=12pt,
+      left=12pt, right=12pt, top=6pt, bottom=6pt,
      colback=gray!5,
-      boxrule=0.5pt,
+      boxrule=0pt,
      fonttitle=\bfseries,
      enhanced,
      breakable,
      top=8pt,
-      before skip=8pt,
+      before skip=12pt,
      attach boxed title to top left={
-        yshift=-0.11in,
-        xshift=0.15in},
+        yshift=-0.05in,
+        xshift=0.1in},
      boxed title style={
        boxrule=0pt,
        colframe=white,
@@ -1159,43 +1161,138 @@
 \setcounter{exam}{0}
 \renewcommand{\theexam}{\thechapter.\arabic{exam}}
 \newenvironment{example}[1][]{
-  \refstepcounter{exam}
-  \par\noindent\textbf{\color{main}{\examplename} \theexam #1 }\rmfamily}{
-  \par\ignorespacesafterend}
+  \refstepcounter{exam}% make \label work
+  \begin{tcolorbox}[enhanced, breakable,
+    colback=main!2, colframe=main!18,
+    boxrule=0.5pt, arc=2mm,
+    left=10pt, right=10pt, top=6pt, bottom=6pt,
+    before skip=10pt, after skip=10pt,
+    fonttitle=\bfseries\color{main},
+    title={\textcolor{main!80!black}{%
+      \rule{0pt}{10pt}% top strut to unify the title height
+      \examplename~\theexam%
+      \ifx\relax#1\relax\else\quad--\quad #1\fi
+    }},
+  ]
+  \rmfamily
+}{
+  \end{tcolorbox}
+}
+
+
 %% Exercise with counter
 \newcounter{exer}[chapter]
 \setcounter{exer}{0}
 \renewcommand{\theexer}{\thechapter.\arabic{exer}}
 \newenvironment{exercise}[1][]{
-  \refstepcounter{exer}
-  \par\noindent\makebox[-3pt][r]{
-    \scriptsize\color{red!90}\HandPencilLeft\quad}
-  \textbf{\color{main}{\exercisename} \theexer #1 }\rmfamily}{
-  \par\ignorespacesafterend}
+  \refstepcounter{exer}% allow \label references
+  \begin{tcolorbox}[enhanced, breakable,
+    colback=main!1, colframe=main!15,
+    boxrule=0.9pt, arc=2mm,
+    left=10pt, right=10pt, top=6pt, bottom=6pt,
+    before skip=10pt, after skip=10pt,
+    fonttitle=\bfseries\color{main},
+    title={\textcolor{main!80!black}{%
+      \raisebox{1pt}{\scriptsize\color{main!80!black}\HandPencilLeft}\quad
+      \exercisename~\theexer%
+      \ifx\relax#1\relax\else\quad--\quad #1\fi
+    }},
+  ]
+  \rmfamily
+}{
+  \end{tcolorbox}
+}
 %% Problem with counter
 \newcounter{prob}[chapter]
 \setcounter{prob}{0}
 \renewcommand{\theprob}{\thechapter.\arabic{prob}}
 \newenvironment{problem}[1][]{
-  \refstepcounter{prob}
-  \par\noindent\textbf{\color{main}{\problemname} \theprob #1 }\rmfamily}{
-  \par\ignorespacesafterend}
-
-\newenvironment{note}{
-  \par\noindent\makebox[-3pt][r]{
-    \scriptsize\color{red!90}\textdbend\quad}
-  \textbf{\color{second}\notename} \citshape}{\par}
-
-\newenvironment{proof}{
-  \par\noindent\textbf{\color{second}\proofname\;}
-  \color{black!90}\cfs}{
-  % \hfill$\Box$\quad
-  \par}
-
-\newenvironment{solution}{\par\noindent\textbf{\color{main}\solutionname} \citshape}{\par}
-\newenvironment{remark}{\noindent\textbf{\color{second}\remarkname}}{\par}
+  \refstepcounter{prob}% support \label
+  \begin{tcolorbox}[enhanced, breakable,
+    colback=main!1, colframe=main!15,
+    boxrule=0.9pt, arc=2mm,
+    left=10pt, right=10pt, top=6pt, bottom=6pt,
+    before skip=10pt, after skip=10pt,
+    fonttitle=\bfseries\color{main},
+    title={\textcolor{main!80!black}{%
+      \rule{0pt}{10pt}% strut to align the title height
+      \problemname~\theprob%
+      \ifx\relax#1\relax\else\quad--\quad #1\fi
+    }},
+  ]
+  \rmfamily
+}{
+ \end{tcolorbox} +} + + + \newenvironment{note}{ + \begin{tcolorbox}[enhanced, breakable, + colback=second!2, colframe=second!13, + boxrule=1pt, arc=2mm,parbox=false, + left=10pt, right=10pt, top=6pt, bottom=6pt, + before skip=10pt, after skip=10pt, + title={\textcolor{second}{{\fontsize{7.5pt}{9pt}\selectfont\textdbend}\; \raisebox{2.5pt}{\notename}}}, + fonttitle=\bfseries\color{second}, + ]\setlength{\parindent}{0em} + \citshape + }{ + % \hfill{\color{second}$\Box$} + \end{tcolorbox} + } + + \newenvironment{proof}{ + \begin{tcolorbox}[enhanced, breakable, + colback=second!3, colframe=second!3, + boxrule=1pt, arc=0mm, + left=3pt, right=3pt, top=6pt, bottom=6pt, + before skip=10pt, after skip=10pt, + title={\textcolor{second}{\rule{0pt}{10pt}\raisebox{0pt}{\proofname}}}, + fonttitle=\bfseries\color{second}, + ]\setlength{\parskip}{0.5em} + \citshape + }{ + \hfill{\color{second}$\blacksquare$} + \end{tcolorbox} + } + +% \newenvironment{proof}{ +% \par\noindent\textbf{\color{second}\proofname\;} +% \color{black!90}\cfs}{ +% \hfill\color{second}$\blacksquare $\quad +% \par} + +\newenvironment{solution}{ + \begin{tcolorbox}[enhanced, breakable, + colback=main!3, colframe=main!3, + boxrule=1pt, arc=0mm, + left=3pt, right=3pt, top=6pt, bottom=6pt, + before skip=10pt, after skip=10pt, + title={\textcolor{main}{\rule{0pt}{10pt}\raisebox{0pt}{\solutionname}}}, + fonttitle=\bfseries\color{main}, + ] + \citshape +}{ + % \hfill{\color{main}$\Box$} + \end{tcolorbox} +} +\newenvironment{remark}{ + \begin{tcolorbox}[enhanced, breakable, + colback=third!2, colframe=third!13, + boxrule=0.5pt, arc=2mm, + left=10pt, right=10pt, top=6pt, bottom=6pt, + before skip=10pt, after skip=10pt, + title={\textcolor{third!80!black}{\rule{0pt}{10pt}\raisebox{1pt}{\remarkname}}}, + fonttitle=\bfseries\color{third}, + ] + \citshape + }{ + % \hfill{\color{second}$\Box$} + \end{tcolorbox} + } + \newenvironment{assumption}{\par\noindent\textbf{\color{third}\assumptionname} \citshape}{\par} \newenvironment{conclusion}{\par\noindent\textbf{\color{third}\conclusionname} \citshape}{\par} \newenvironment{property}{\par\noindent\textbf{\color{third}\propertyname} \citshape}{\par} @@ -1222,13 +1319,13 @@ } -\newenvironment{introduction}[1][\introductionname]{ +\newenvironment{introduction}[1][\textcolor{structurecolor}{.}\;\introductionname\;\textcolor{structurecolor}{.}]{ \begin{tcolorbox}[introductionsty,title={#1}] \begin{multicols}{2} \begin{itemize}[label=\textcolor{structurecolor}{\upshape\scriptsize\SquareShadowBottomRight}]}{ \end{itemize} \end{multicols} - \end{tcolorbox}} + \end{tcolorbox}\vspace{1em}} \RequirePackage{adforn} diff --git a/notes/2025/mvp/main.pdf b/notes/2025/mvp/main.pdf index 25acc0a..848bef3 100644 Binary files a/notes/2025/mvp/main.pdf and b/notes/2025/mvp/main.pdf differ diff --git a/notes/2025/mvp/main.tex b/notes/2025/mvp/main.tex index 298d644..b1bbdd3 100644 --- a/notes/2025/mvp/main.tex +++ b/notes/2025/mvp/main.tex @@ -19,10 +19,11 @@ \newcommand{\argmax}{\mathop{\mathrm{argmax}}} \newcommand{\argmin}{\mathop{\mathrm{argmin}}} \newcommand{\rank}{{\mathrm{rank}\,}} +\setlength{\parskip}{0.5em} \tcbset{ colback=red!5!white, colframe=red!50!black, - boxrule=0.5pt, + boxrule=0.pt, arc=2mm, left=4mm, right=4mm, @@ -47,7 +48,11 @@ \tableofcontents \mainmatter +\subfile{chapters/preface.tex} \subfile{chapters/1-lr.tex} \subfile{chapters/2-lr.tex} \subfile{chapters/3-svm.tex} +% \subfile{chapters/4-nn.tex} +\subfile{chapters/5-rt.tex} +\subfile{chapters/6-lt.tex} \end{document} \ No newline at end of 
file
diff --git a/notes/2025/tikz/3/3.pdf b/notes/2025/tikz/3/3.pdf
index dbbcbe2..5d5ffa0 100644
Binary files a/notes/2025/tikz/3/3.pdf and b/notes/2025/tikz/3/3.pdf differ
diff --git a/notes/2025/tikz/3/3.tex b/notes/2025/tikz/3/3.tex
index 7521629..a99073f 100644
--- a/notes/2025/tikz/3/3.tex
+++ b/notes/2025/tikz/3/3.tex
@@ -16,7 +16,7 @@
     major grid style={line width=.2pt, draw=gray!35},
     axis x line=middle,
     axis y line=middle,
-    xlabel={$\xi_i$ },
+    xlabel={$t$ },
     ylabel={},
     xmin=-4, xmax=4,
     ymin=0, ymax=4.5,
diff --git a/notes/2025/tikz/5/1.pdf b/notes/2025/tikz/5/1.pdf
new file mode 100644
index 0000000..533f11c
Binary files /dev/null and b/notes/2025/tikz/5/1.pdf differ
diff --git a/notes/2025/tikz/5/1.tex b/notes/2025/tikz/5/1.tex
new file mode 100644
index 0000000..b5e9a3b
--- /dev/null
+++ b/notes/2025/tikz/5/1.tex
@@ -0,0 +1,35 @@
+\documentclass[tikz, border=5pt]{standalone}
+\usepackage{tikz}
+\usepackage{tikz-3dplot}
+\usepackage{bm}
+
+\begin{document}
+
+% Set the 3D viewing angles for tikz-3dplot
+\tdplotsetmaincoords{70}{110}
+
+\begin{tikzpicture}[tdplot_main_coords, scale=3]
+
+% Coordinate axes
+\draw[->, thick] (0,0,0) -- (1.4,0,0) node[below left=-2pt]{$x_1$};
+\draw[->, thick] (0,0,0) -- (0,1.4,0) node[below left=0pt]{$x_2$};
+\draw[->, thick] (0,0,0) -- (0,0,1) node[left=3pt]{$x_3$};
+
+% Draw the red xy-plane
+\filldraw[fill=red!15, draw=red!70, opacity=0.6]
+    (0,0,0) -- (1.2,0,0) -- (1.2,1.2,0) -- (0,1.2,0) -- cycle;
+
+% Draw the vector (1,1,0)
+\draw[->, thick, blue!80!black] (0,0,0) -- (1,1,0)
+    node[below left=2pt]{$\bm w^\star$};
+\node[red] at(1.4,1.5,0) {Data Plane};
+% Draw the vector (1,1,1)
+\draw[->, thick, violet!80!black] (0,0,0) -- (1,1,1)
+    node[above left=-2pt]{$\bm w$};
+
+% Optional: auxiliary dashed line showing the projection
+\draw[dashed, gray] (1,1,1) -- (1,1,0);
+
+\end{tikzpicture}
+
+\end{document}
\ No newline at end of file
diff --git a/notes/2025/tikz/6/1.pdf b/notes/2025/tikz/6/1.pdf
new file mode 100644
index 0000000..73d36cd
Binary files /dev/null and b/notes/2025/tikz/6/1.pdf differ
diff --git a/notes/2025/tikz/6/1.tex b/notes/2025/tikz/6/1.tex
new file mode 100644
index 0000000..6235955
--- /dev/null
+++ b/notes/2025/tikz/6/1.tex
@@ -0,0 +1,25 @@
+\documentclass[tikz]{standalone}
+\usepackage{amsmath}
+\begin{document}
+\begin{tikzpicture}[>=stealth,scale=1.2]
+
+% axes
+\draw[->] (-0.2,0) -- (4.5,0) node[right] {$x$};
+\draw[->] (0,-0.2) -- (0,3.5) node[above] {$y$};
+
+% Line y = 0.75 x
+\draw[thick] (0,0) -- (4,3);
+
+% Tilted dashed sine curve: y = 0.75 x + 0.4 sin(4x)
+\draw[dashed,thick,domain=0:4,smooth,variable=\x]
+    plot ({\x},{0.75*\x + 0.4*sin(4*\x r)});
+
+% Intersections: sin(4x) = 0, i.e. x = k*pi/4
+\foreach \k in {0,...,4}{
+    \pgfmathsetmacro{\xx}{\k*pi/4}
+    \pgfmathsetmacro{\yy}{0.75*\xx}
+    \fill (\xx,\yy) circle (1.5pt);
+}
+
+\end{tikzpicture}
+\end{document}
\ No newline at end of file
diff --git a/notes/2025/tikz/6/2.pdf b/notes/2025/tikz/6/2.pdf
new file mode 100644
index 0000000..8936382
Binary files /dev/null and b/notes/2025/tikz/6/2.pdf differ
diff --git a/notes/2025/tikz/6/2.tex b/notes/2025/tikz/6/2.tex
new file mode 100644
index 0000000..452b390
--- /dev/null
+++ b/notes/2025/tikz/6/2.tex
@@ -0,0 +1,37 @@
+\documentclass[tikz]{standalone}
+\usepackage{amsmath}
+\begin{document}
+\begin{tikzpicture}[>=stealth,scale=1]
+
+% Data points (class X)
+\node at (-2,1.2) {$\times$};
+\node at (-1,1.8) {$\times$};
+\node at (0.0,1.3) {$\times$};
+\node at (0.7,1.9) {$\times$};
+
+% Data points (class O)
+\node at (-1.8,-0.4) {$\circ$};
+\node at (-0.8,-0.9) {$\circ$};
+\node at (0.1,-0.3) {$\circ$};
+\node at (0.9,-0.8) {$\circ$};
+
+% Three linear classifiers (all producing the same classification)
+\begin{scope}[shift={(0,0.3)}]
+    \draw[thick] (-2.5,0.4) -- (2.5,0.6);   % h1
+    \draw[thick] (-2.5,0.1) -- (2.5,-0.3);  % h2
+    \draw[thick] (-2.5,-0.2) -- (2.5,0.9);  % h3
+\end{scope}
+
+
+% Labels h1, h2, h3 on the right
+
+\node[right] at (2.7,1.4) {$h_1$};
+
+
+\node[right] at (2.7,0.8) {$h_2$};
+
+
+\node[right] at (2.7,-0.1) {$h_3$};
+
+\end{tikzpicture}
+\end{document}
\ No newline at end of file
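
As a quick numerical companion to the VC-dimension material above, the following self-contained Python sketch evaluates the Sauer-Shelah bound on the growth function and the resulting VC generalization bound. It is not part of the repository; the function names, the VC dimension d = 3, and the confidence level delta = 0.05 are illustrative assumptions.

import math


def growth_bound(n: int, d: int) -> int:
    """Sauer-Shelah upper bound on the growth function: m_H(n) <= sum_{i=0}^{d} C(n, i)."""
    return sum(math.comb(n, i) for i in range(min(d, n) + 1))


def vc_bound(n: int, d: int, delta: float = 0.05) -> float:
    """Upper bound on E_out - E_in from the VC generalization theorem:
    sqrt((8/n) * log(4 * m_H(2n) / delta))."""
    return math.sqrt((8.0 / n) * math.log(4.0 * growth_bound(2 * n, d) / delta))


if __name__ == "__main__":
    d = 3  # illustrative VC dimension (an assumption, not taken from the notes)
    for n in (100, 1_000, 10_000, 100_000):
        print(f"n = {n:>7}:  m_H(2n) <= {growth_bound(2 * n, d):.3e},  "
              f"E_out - E_in < {vc_bound(n, d):.4f}")
    # The printed bound shrinks roughly like sqrt(d * log(n) / n),
    # matching the O(sqrt(d_VC * log n / n)) rate stated in the theorem.

For small n the bound exceeds 1 and is therefore vacuous, which is consistent with the remark in the notes that the VC bound is relatively loose in practice; its value lies in the guaranteed decay as n grows.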