% File src/library/stats/vignettes/smoothings-splines.Rnw
% Part of the R package, https://www.R-project.org
% Copyright 2015-2016 Martin Maechler
% Distributed under GPL 2 or later

\documentclass[a4paper]{article}
%\VignetteIndexEntry{Smoothing Splines: Details on \texttt{smooth.spline()}}
%\VignettePackage{stats}
%\VignetteDepends{stats,utils,grDevices,graphics}
\usepackage[a4paper, text={15cm,23cm}]{geometry}
\setlength{\parindent}{0in}
\setlength{\parskip}{.1in}
% \usepackage{statsvign}
%% Stand-in macro definitions (these would normally come from a style file
%% such as statsvign.sty, commented out above):
\usepackage{amsmath, amssymb}
\newcommand*{\R}{\textsf{R}}
\newcommand*{\code}[1]{\texttt{#1}}
\newcommand*{\eps}{\varepsilon}
\newcommand*{\EE}{\mathbb{E}}
\newcommand*{\tr}{^{\mathsf{T}}}
\newcommand*{\norm}[1]{\left\|#1\right\|}
\newcommand*{\bY}{\mathbf{Y}}
\newcommand*{\bbeta}{\boldsymbol{\beta}}
\newcommand*{\Hbbeta}{\hat{\bbeta}}
\DeclareMathOperator*{\argmin}{arg\,min}
%
\title{Details on \R's \texttt{smooth.spline()}}
\author{Martin Maechler}
\begin{document}
\maketitle

<<>>=
library(stats)
library(utils)
library(grDevices)
ps.options(pointsize = 12)
options(width = 72)
@

\section*{Smoothing splines: penalized regression}\label{sec.smospl}

Given observations (our data) $(x_i,Y_i)\ (i=1,\ldots, n)$, a quite general
model for such data is
\begin{eqnarray}\label{ch3.mod.npreg}
  Y_i = m(x_i) + \eps_i,
\end{eqnarray}
where $\eps_1,\ldots ,\eps_n$ are i.i.d.\ with $\EE[\eps_i]= 0$, and
$m: \mathbb{R} \to \mathbb{R}$ is an ``arbitrary'' function.  The function
$m(\cdot)$ is called the nonparametric regression function; it satisfies
$m(x) = \EE[Y \mid x]$ and should fulfill some kind of smoothness condition.
One fruitful approach to estimating such a ``smooth'' function $m()$ is via
so-called ``smoothing splines'' (or their generalization, ``penalized
regression splines'').

\subsection*{Penalized sum of squares}
Consider the following problem: among all functions $m$ with continuous
second derivative, find the one which minimizes the penalized residual sum
of squares
\begin{eqnarray}\label{ch3.penrss}
  L_{\lambda}(m) &:=& \sum_{i=1}^n (Y_i - m(x_i))^2 +
  \lambda \int m''(t)^2 \; dt,
\end{eqnarray}
where $\lambda > 0$ is a smoothing parameter.  The first term measures
closeness to the data and the second term penalizes curvature
(``roughness'') of the function.  The two extreme cases are:
\begin{itemize}
\item $\lambda = 0$: Any function $m$ interpolating the data gives
  $L_0(m) = 0$; hence (\ref{ch3.penrss}) does require $\lambda > 0$.  In the
  limit $\lambda \to 0$, however, $\hat m_\lambda$ converges to the
  well-defined interpolating natural cubic spline.\footnote{We will see that
    taking the limit $\lambda \to 0$ directly is numerically problematic, and
    in practice you should rather use \code{spline()} for spline
    \emph{interpolation} instead of smoothing.}
\item $\lambda = \infty$: only functions with $m''(x) \equiv 0$, i.e., linear
  functions, avoid an infinite penalty, and hence the minimizer of
  (\ref{ch3.penrss}) is the least squares regression line.  Both extremes
  are illustrated in the small example below.
\end{itemize}
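As an ad hoc illustration of these two extremes (the simulated data and the
particular \code{spar} and \code{df} values below are arbitrary choices for
this sketch, not part of the formal treatment), we fit \code{smooth.spline()}
with a very small and a very large amount of smoothing and compare the
results with the interpolating spline from \code{spline()} and with the
least squares line:

<<extremes, fig=TRUE>>=
set.seed(7)
x <- sort(runif(50, 0, 10))
y <- sin(x) + rnorm(50, sd = 0.3)
ss.gcv   <- smooth.spline(x, y)               # lambda chosen by GCV (the default)
ss.rough <- smooth.spline(x, y, spar = 0.05)  # tiny penalty: a very rough fit
ss.lin   <- smooth.spline(x, y, df = 2)       # huge penalty: ~2 df, close to the LS line
plot(x, y, main = "smooth.spline(): small vs large amounts of smoothing")
lines(spline(x, y, n = 501, method = "natural"),
      col = "gray60", lty = 3)                # interpolating spline (lambda -> 0 limit)
lines(ss.rough, col = 2)
lines(ss.gcv,   col = 3, lwd = 2)
lines(ss.lin,   col = 4, lwd = 2)
abline(lm(y ~ x), col = 4, lty = 2)           # least squares line (lambda -> infinity limit)
@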
\subsection*{The smoothing spline solution}
Remarkably, the minimizer of (\ref{ch3.penrss}) is \emph{finite}-dimensional,
although the criterion is minimized over the infinite-dimensional Sobolev
space of functions $m$ for which $\int m''(t)^2\,dt$ is finite.
Let us assume for now that the data has its $x$ values sorted and unique,
% for non-unique x_i, take means of y's and use weights!
\[ x_1 < x_2 < \ldots < x_n. \]
The solution $\hat{m}_{\lambda}(\cdot)$ (i.e., the unique minimizer of
(\ref{ch3.penrss})) is a natural \textbf{cubic spline} with knots
$t_1,t_2,\dots,t_{n_k}$, which are the sorted unique values of
$\{x_1,x_2,\dots,x_n\}$.  That is, $\hat{m}$ is a piecewise cubic polynomial
in each interval $[t_j,t_{j+1})$ such that
$\hat{m}_{\lambda}^{(k)}\ (k=0,1,2)$ is continuous everywhere and fulfills
the ``natural'' boundary conditions
$\hat{m}''(t_1)= \hat{m}''(t_{n_k})= 0$.  For the $n_k-1$ cubic polynomials,
we'd need $(n_k-1)\cdot 4$ coefficients.
Since there are $(n_k-2)\cdot 3$ continuity conditions (at every ``inner
knot'', $j=2,\dots,n_k-1$) plus the 2 ``natural'' conditions, this leaves
$4(n_k-1) - [ 3(n_k-2) + 2 ] = n_k$ free parameters (the $\beta_j$'s below).
%%
Knowing that the solution is a natural cubic spline, it can be obtained by
linear algebra.  We represent
\begin{eqnarray}\label{ch3.basis}
  m_\lambda(x) = \sum_{j=1}^{n_k} \beta_j B_j(x),
\end{eqnarray}
where the $B_j(\cdot)$'s are basis functions for natural cubic splines.  The
unknown coefficients can then be estimated by penalized least squares, i.e.,
linear regression with a quadratic (ridge-type) penalty.  The criterion in
(\ref{ch3.penrss}) for $\hat{m}_{\lambda}$ as in (\ref{ch3.basis}) then
becomes
\begin{eqnarray*}
  \tilde{L}_{\lambda}(\bbeta) := L_{\lambda}(m) &=&
  \norm{\bY - X \bbeta}^2 + \lambda \bbeta\tr \Omega \bbeta,
\end{eqnarray*}
respectively, when not all observation weights $w_i$ are 1,
\begin{eqnarray}\label{L_W.def}
  \tilde{L}_{\lambda}(\bbeta) &=&
  (\bY - X \bbeta)\tr W (\bY - X \bbeta) + \lambda \bbeta\tr\Omega\bbeta,
\end{eqnarray}
where the design matrix $X$ has $j$th column $(B_j(x_1),\ldots,B_j(x_n))\tr$,
i.e.,
\begin{eqnarray*}
  X_{ij} &=& B_j(x_i) \ \ \mathrm{for}\ i=1,\dots,n, \\
  W &=& \mathtt{diag(w)}, \ \mathrm{i.e.,} \ W_{ij} = \mathbf{1}_{[i = j]}
  \cdot w_i, \hspace*{2em} \mathrm{and} \\
  \Omega_{jk} &=& \int B_j''(t) B_k''(t) \;dt, \ \ \mathrm{for}\
  j,k = 1,\dots,n_k.
\end{eqnarray*}
The solution $\Hbbeta = \argmin_{\bbeta} \tilde{L}_{\lambda}(\bbeta)$ can
then be derived by setting the gradient
$\frac{\partial}{\partial\bbeta} \tilde{L}_{\lambda}(\bbeta)$ to zero:
$\mathbf{0} = -2 X\tr W \bY + 2(X\tr W X + \lambda \Omega)\bbeta$, and hence
\begin{eqnarray}\label{ch3.sol}
  %\Hbbeta_{n\times 1} = ....
  \Hbbeta = (X\tr W X + \lambda \Omega)^{-1} X\tr W \bY.
\end{eqnarray}
% This can be computed efficiently using fast linear algebra, particularly
% when $X$ is a banded matrix.
When B-splines are used as basis functions $B_j$, both $X$ and $\Omega$ are
\emph{banded} matrices, i.e., zero apart from a ``band'' of a few central
diagonals.
As
\begin{eqnarray*}% \label{eq:fit}
  \hat{m}_{\lambda}(x) = \sum_{j=1}^{n_k} \hat{\beta}_j B_j(x),
\end{eqnarray*}
the fitted values are $\hat\bY = X \Hbbeta$, where
$\hat{Y}_i = \hat{m}_{\lambda}(x_i)\ (i=1,\ldots,n)$, and
\begin{eqnarray}
  \label{eq:Yhat.spline}
  \hat\bY = X \Hbbeta = \mathcal{S}_{\lambda} \bY, \ \ \mathrm{where}\ \
  \mathcal{S}_{\lambda} = X (X\tr W X + \lambda \Omega)^{-1} X\tr W .
\end{eqnarray}
In the unweighted case ($W = I$), the hat matrix $\mathcal{S}_{\lambda}$ is
symmetric, $\mathcal{S}_{\lambda} = \mathcal{S}_{\lambda}\tr$, which implies
elegant mathematical properties (an eigen-decomposition with real
eigenvalues); for general weights, $W^{1/2} \mathcal{S}_{\lambda} W^{-1/2}$
is symmetric, so the eigenvalues of $\mathcal{S}_{\lambda}$ are still real.

\section*{........}

<<>>=
@

\section*{Notes}
\begin{enumerate}
\item
\item
\end{enumerate}

\end{document}