6 changes: 5 additions & 1 deletion .gitignore
@@ -8,4 +8,8 @@
*.fls
*.fdb_latexmk
.DS_Store
.vscode/
.vscode/

notes/2025/tikz/4
notes/2025/mvp/chapters/4-nn.tex
notes/2025/mvp/chapters/4-nn.pdf
Binary file modified notes/2025/mvp/chapters/1-lr.pdf
Binary file not shown.
11 changes: 3 additions & 8 deletions notes/2025/mvp/chapters/1-lr.tex
@@ -8,10 +8,7 @@ \chapter{Linear Regression}
\item Ridge Regression ($L_2$ regularization)
\item Lasso Regression ($L_1$ regularization)
\end{introduction}
\section{Basic Knowledge}
\begin{example}
\textbf{Linear Regression}
\end{example}
\section{Basic Knowledge of Linear Regression}

{Settings.}
\begin{itemize}
@@ -31,17 +28,15 @@ \section{Basic Knowledge}
\end{definition}
\end{itemize}

\textbf{Quiz.} How to determine whether a parameter is learnable?
Quiz: How to determine $w$ and $b$?
\noindent\textbf{Quiz.} How to determine whether a parameter is learnable? How to determine $w$ and $b$?


\noindent Ans: \textbf{ERM} (Empirical Risk Minimization)

\begin{itemize}
\item Loss function. The squared error (SE) loss is commonly used for optimization. The training objective can be written as:
\begin{equation}
\argmin_{w,b}{\color{blue}{\frac{1}{n}}}\sum_{i\in[n]}\left(y_i-(w^\top x_i+b)\right)^2
\end{equation}

The blue factor $1/n$ can be dropped in theoretical analysis, since it does not change the minimizer; in practice it is usually kept so that the scale of the loss (and of its gradients) does not grow with the sample size $n$.
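As a concrete illustration that is not part of the original notes, the following NumPy sketch evaluates the ERM objective above and solves it in closed form; the data and parameter values are placeholders.
\begin{verbatim}
import numpy as np

# Synthetic placeholder data: n samples, d features.
rng = np.random.default_rng(0)
n, d = 100, 3
X = rng.normal(size=(n, d))
y = X @ np.array([1.0, -2.0, 0.5]) + 0.3 + 0.1 * rng.normal(size=n)

def erm_objective(w, b, X, y):
    """Mean squared error; the 1/n factor is the blue term in the equation."""
    residuals = y - (X @ w + b)
    return np.mean(residuals ** 2)

# Closed-form least-squares solution via the augmented design matrix [X, 1].
X_aug = np.hstack([X, np.ones((n, 1))])
theta, *_ = np.linalg.lstsq(X_aug, y, rcond=None)
w_hat, b_hat = theta[:-1], theta[-1]
print(erm_objective(w_hat, b_hat, X, y))  # close to the noise variance 0.01
\end{verbatim}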
Binary file modified notes/2025/mvp/chapters/2-lr.pdf
Binary file not shown.
22 changes: 11 additions & 11 deletions notes/2025/mvp/chapters/2-lr.tex
@@ -9,9 +9,9 @@ \chapter{Logistic Regression}
\item Maximum a posteriori
\end{introduction}
\section{Classification}
\begin{example}
\textbf{Binary Classification Problem}
\end{example}

\subsection{Binary Classification Problem}

Settings.
\begin{itemize}
\item Dataset: $D=\{(x_i,y_i)\}_{i=1}^n$, where $x_i\in\mathbb{R}^d$ and $y_i\in\{0,1\}$.
@@ -194,9 +194,9 @@ \section{Classification}
\end{remark}
\vspace{1em}

\begin{example}
\textbf{Multi-Class Classification (Softmax Regression)}
\end{example}

\subsection{Multi-Class Classification (Softmax Regression)}

We can combine $k$ sub-classifiers to solve a $k$-class classification task.
Specifically, we apply a sigmoid-like transformation to each sub-linear model
$f_k(x) = w_k^\top x + b_k$, and obtain the probability of class $k$ using a normalized expression:
@@ -249,9 +249,9 @@ \section{Classification}
via the reparameterization $w = w_1 - w_2$ and $b = b_1 - b_2$.
\end{remark}
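To make the normalized (softmax) expression and the remark above concrete, here is a short NumPy sketch; it is an editorial illustration rather than course material, and all numerical values are placeholders. For $k=2$ it checks that the softmax probability of the first class coincides with the sigmoid applied to $(w_1-w_2)^\top x + (b_1-b_2)$.
\begin{verbatim}
import numpy as np

def softmax(z):
    """Numerically stable softmax over class scores z_k = w_k^T x + b_k."""
    z = z - np.max(z)
    e = np.exp(z)
    return e / e.sum()

def sigmoid(t):
    return 1.0 / (1.0 + np.exp(-t))

rng = np.random.default_rng(0)
x = rng.normal(size=4)
W = rng.normal(size=(2, 4))   # rows are w_1 and w_2 (placeholders)
b = rng.normal(size=2)

scores = W @ x + b
p_softmax = softmax(scores)[0]
p_sigmoid = sigmoid((W[0] - W[1]) @ x + (b[0] - b[1]))
print(np.isclose(p_softmax, p_sigmoid))  # True: w = w_1 - w_2, b = b_1 - b_2
\end{verbatim}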
\section{Rethink of Linear Regression}
\begin{example}
\textbf{MLE Explanation for Linear Regression}
\end{example}

\subsection{MLE Explanation for Linear Regression}

\begin{definition}[Gaussian/Normal Distribution]
\begin{equation}
x\sim\mathcal N(\mu,\sigma^2)\quad\Leftrightarrow\quad P(x)=\frac{1}{\sqrt{2\pi\sigma^2}}\exp\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)
@@ -289,8 +289,8 @@ \section{Rethink of Linear Regression}
\end{align}
where the equivalence follows by dropping constants and positive scalings.
Equation~\eqref{2.24} recovers ERM with the squared-loss objective.
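As a quick numerical sanity check (an editorial addition, not part of the notes), the pointwise Gaussian negative log-likelihood differs from the squared loss only by the positive scaling $1/(2\sigma^2)$ and an additive constant, which is exactly the equivalence invoked above; the values below are placeholders.
\begin{verbatim}
import numpy as np

sigma = 0.7          # assumed noise standard deviation (placeholder)
y, y_hat = 1.3, 0.9  # a target and a model prediction (placeholders)

# Negative log-likelihood of y under N(y_hat, sigma^2).
nll = 0.5 * np.log(2 * np.pi * sigma**2) + (y - y_hat)**2 / (2 * sigma**2)

# Squared loss, rescaled and shifted by the same constants.
sq = (y - y_hat)**2
print(np.isclose(nll, sq / (2 * sigma**2) + 0.5 * np.log(2 * np.pi * sigma**2)))  # True
\end{verbatim}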
\begin{example}\textbf{Maximum a Posteriori (MAP)}
\end{example}
\subsection{Maximum a Posteriori (MAP)}

In the MLE perspective, $w$ and $b$ are treated as unknown fixed constants.
In the Bayesian framework, however, even $w,b$ are considered as random variables (R.V.).
A ``fixed constant'' can be seen as a random variable with an extremely sharp distribution (near $\delta$-distribution).
Binary file modified notes/2025/mvp/chapters/3-svm.pdf
Binary file not shown.
71 changes: 64 additions & 7 deletions notes/2025/mvp/chapters/3-svm.tex
@@ -347,7 +347,7 @@ \section{Dual Form Of SVM}
\subsection{Preliminaries from Convex Analysis*}
\begin{tcolorbox}
\vspace{0.5em}
\color{red!50!black}$\circledast $\quad\emph{\textbf{This subsection is included out of personal interest and is not part of the course material.
\color{red!50!black}$\circledast $\quad\emph{{This subsection is included out of personal interest and is not part of the course material.
Readers who wish to focus solely on the course content may safely skip to the next subsection.}}
\vspace{0.5em}
\end{tcolorbox}
@@ -557,6 +557,7 @@ \subsection{Preliminaries from Convex Analysis*}
\end{equation}
\end{theorem}
\begin{definition}[Hilbert Space]
\label{def:3.4}
A \textbf{Hilbert space} is a complete inner product space.
% Formally, let $\mathcal{H}$ be a vector space over $\mathbb{R}$ or $\mathbb{C}$ endowed with an inner product
% \begin{equation}
@@ -1042,10 +1043,66 @@ \section{Kernel}

In this way, the model, known as the \textbf{Soft-Margin SVM}, allows small violations of the margin, trading a few margin violations on the training set for a larger margin and better generalization. This is the most commonly used form of SVM in practice.

% \begin{figure}[H]
% \centering
% \includegraphics{../../tikz/3/3.pdf}
% \caption{Hinge loss and CE loss.}
% \label{3-svm}
% \end{figure}
\vspace{1em}

We claim that the constraints on the slack variables are equivalent to the following formulation:
\begin{equation}
\xi_i \ge \max(0,\, 1 - y_i(w^\top x_i + b)).
\end{equation}
Indeed, the two original constraints $y_i(w^\top x_i + b) \ge 1 - \xi_i$ and $\xi_i \ge 0$ say exactly that $\xi_i$ is at least $1 - y_i(w^\top x_i + b)$ and at least $0$, i.e.\ at least their maximum.

In the minimization problem
\begin{equation}
\arg\min_{w,b,\xi}\ \frac{1}{2}\|w\|^2 + C\sum_i \xi_i,
\end{equation}
it is crucial to observe that at the optimum each $\xi_i$ takes the smallest feasible value, $\max(0,\, 1 - y_i(w^\top x_i + b))$, i.e.\ either $0$ or $1 - y_i(w^\top x_i + b)$.
Otherwise, if some $\xi_i$ were larger than necessary, the term $C\sum_i \xi_i$ could be reduced by decreasing $\xi_i$ without violating any constraint, contradicting optimality.

Thus, the optimization problem can be reformulated as an unconstrained objective:
\begin{equation}
\argmin_{w,b}\ \frac{1}{2}\|w\|^2 + C\sum_{i=1}^n \max(0,\, 1 - y_i(w^\top x_i + b)).
\end{equation}
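The following NumPy sketch (an editorial illustration, not part of the notes) evaluates this unconstrained objective directly, writing the slack terms as hinge losses; labels are assumed to lie in $\{-1,+1\}$ and all numbers are placeholders.
\begin{verbatim}
import numpy as np

def soft_margin_objective(w, b, X, y, C):
    """0.5*||w||^2 + C * sum_i max(0, 1 - y_i (w^T x_i + b)), y_i in {-1, +1}."""
    margins = y * (X @ w + b)
    hinge = np.maximum(0.0, 1.0 - margins)
    return 0.5 * np.dot(w, w) + C * hinge.sum()

# Tiny placeholder example.
X = np.array([[2.0, 1.0], [-1.0, -1.5], [0.2, 0.1]])
y = np.array([1.0, -1.0, 1.0])
w, b, C = np.array([1.0, 0.5]), -0.1, 1.0
print(soft_margin_objective(w, b, X, y, C))
\end{verbatim}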

This formulation introduces the \textbf{Hinge loss}.

\begin{definition}[Hinge loss]
The hinge loss function $L(z)$ is defined as:
\begin{equation}
L(z) = \max(0,\, 1 - z).
\end{equation}
\end{definition}
\begin{figure}[H]
\centering
\includegraphics{../../tikz/3/3.pdf}
\caption{Hinge loss and CE loss.}
\label{4-svm}
\end{figure}
Intuitively, the hinge loss $L(z)$ remains zero whenever $z \ge 1$.
This property directly enforces a margin: samples that already satisfy the margin condition $y_i(w^\top x_i + b) \ge 1$ (i.e., lie on the correct side of the hyperplane with functional margin at least $1$) incur no loss,
while samples inside the margin or on the wrong side contribute a positive penalty.
In this sense, the hinge loss explicitly encourages a wider margin between classes.

For comparison, consider the cross-entropy loss:
\begin{equation}
-\log(\sigma(z)) = \log(1 + e^{-z}),
\end{equation}
where $\sigma(z)$ denotes the sigmoid function.

\begin{remark}
In class, we used $\log_2$ instead of the natural logarithm. As $z\to -\infty$,
\begin{equation}
\log_2(1 + e^{-z}) \sim -z\,\log_2 e = \frac{-z}{\ln 2},
\end{equation}
so the plotted curve differs from $\log(1 + e^{-z})$ only by the constant factor $\log_2 e$.
\end{remark}

Both the cross-entropy loss and the hinge loss serve as \emph{surrogate (substitute) losses} for the zero-one loss:
\begin{equation}
L_{\rm zero\text{-}one}(z) =
\begin{cases}
1, & z < 0,\\[4pt]
0, & z \ge 0.
\end{cases}
\end{equation}
The zero-one loss is discontinuous at $z=0$ and has zero gradient everywhere else, so it provides no useful gradient information and gradient-based optimization methods cannot be applied to it directly.
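To make the comparison concrete, the short sketch below (an editorial addition, not course material) evaluates the three losses on the same margin values; the flatness of the zero-one loss away from $z=0$ is exactly what makes it useless for gradient-based optimization.
\begin{verbatim}
import numpy as np

def hinge(z):
    return np.maximum(0.0, 1.0 - z)

def cross_entropy(z):
    # -log(sigmoid(z)) = log(1 + exp(-z)); logaddexp is numerically stable.
    return np.logaddexp(0.0, -z)

def zero_one(z):
    return (z < 0).astype(float)

z = np.array([-2.0, -0.5, 0.0, 0.5, 2.0])
print(hinge(z))          # 3.0, 1.5, 1.0, 0.5, 0.0
print(cross_entropy(z))  # ~2.13, 0.97, 0.69, 0.47, 0.13 (smooth and convex)
print(zero_one(z))       # 1.0, 1.0, 0.0, 0.0, 0.0
\end{verbatim}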
\end{document}
Binary file added notes/2025/mvp/chapters/5-rt.pdf
Binary file not shown.