% File src/library/stats/vignettes/smoothings-splines.Rnw
% Part of the R package, https://www.R-project.org
% Copyright 2015-2016 Martin Maechler
% Distributed under GPL 2 or later

\documentclass[a4paper]{article}
%\VignetteIndexEntry{Smoothing Splines: Details on \texttt{smooth.spline()}}
%\VignettePackage{stats}
%\VignetteDepends{stats,utils,grDevices,graphics}
\usepackage[a4paper, text={15cm,23cm}]{geometry}
\setlength{\parindent}{0in}
\setlength{\parskip}{.1in}
% \usepackage{statsvign}
%% Stand-in macro definitions (these would normally come from a style file
%% such as statsvign.sty, commented out above):
\usepackage{amsmath, amssymb}
\newcommand*{\R}{\textsf{R}}
\newcommand*{\code}[1]{\texttt{#1}}
\newcommand*{\eps}{\varepsilon}
\newcommand*{\EE}{\mathbb{E}}
\newcommand*{\tr}{^{\mathsf{T}}}
\newcommand*{\norm}[1]{\left\|#1\right\|}
\newcommand*{\bY}{\mathbf{Y}}
\newcommand*{\bbeta}{\boldsymbol{\beta}}
\newcommand*{\Hbbeta}{\hat{\bbeta}}
\DeclareMathOperator*{\argmin}{arg\,min}
%
\title{Details on \R's \texttt{smooth.spline()}}
\author{Martin Maechler}
\begin{document}
\maketitle

<<>>=
library(stats)
library(utils)
library(grDevices)
ps.options(pointsize = 12)
options(width = 72)
@

\section*{Smoothing splines: penalized regression}\label{sec.smospl}

Given observations (our data) $(x_i,Y_i)\ (i=1,\ldots, n)$, a quite general
model for such data is
\begin{eqnarray}\label{ch3.mod.npreg}
  Y_i = m(x_i) + \eps_i,
\end{eqnarray}
where $\eps_1,\ldots ,\eps_n$ are i.i.d.\ with $\EE[\eps_i]= 0$, and
$m: \mathbb{R} \to \mathbb{R}$ is an ``arbitrary'' function.  The function
$m(\cdot)$ is called the nonparametric regression function; it satisfies
$m(x) = \EE[Y \mid x]$ and should fulfill some kind of smoothness condition.
One fruitful approach to estimating such a ``smooth'' function $m()$ is via
so-called ``smoothing splines'' (or their generalization, ``penalized
regression splines'').

\subsection*{Penalized sum of squares}
Consider the following problem: among all functions $m$ with continuous
second derivative, find the one which minimizes the penalized residual sum
of squares
\begin{eqnarray}\label{ch3.penrss}
  L_{\lambda}(m) &:=& \sum_{i=1}^n (Y_i - m(x_i))^2 +
  \lambda \int m''(t)^2 \; dt,
\end{eqnarray}
where $\lambda > 0$ is a smoothing parameter.  The first term measures
closeness to the data and the second term penalizes curvature
(``roughness'') of the function.  The two extreme cases are:
\begin{itemize}
\item $\lambda = 0$: Any function $m$ interpolating the data gives
  $L_0(m) = 0$; hence (\ref{ch3.penrss}) does require $\lambda > 0$.  In the
  limit $\lambda \to 0$, however, $\hat m_\lambda$ converges to the
  well-defined interpolating natural cubic spline.\footnote{We will see that
    taking the limit $\lambda \to 0$ directly is numerically problematic, and
    in practice you should rather use \code{spline()} for spline
    \emph{interpolation} instead of smoothing.}
\item $\lambda = \infty$: only functions with $m''(x) \equiv 0$, i.e., linear
  functions, avoid an infinite penalty, and hence the minimizer of
  (\ref{ch3.penrss}) is the least squares regression line.  Both extremes
  are illustrated in the small example below.
\end{itemize}
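As an ad hoc illustration of these two extremes (the simulated data and the
particular \code{spar} and \code{df} values below are arbitrary choices for
this sketch, not part of the formal treatment), we fit \code{smooth.spline()}
with a very small and a very large amount of smoothing and compare the
results with the interpolating spline from \code{spline()} and with the
least squares line:

<<extremes, fig=TRUE>>=
set.seed(7)
x <- sort(runif(50, 0, 10))
y <- sin(x) + rnorm(50, sd = 0.3)
ss.gcv   <- smooth.spline(x, y)               # lambda chosen by GCV (the default)
ss.rough <- smooth.spline(x, y, spar = 0.05)  # tiny penalty: a very rough fit
ss.lin   <- smooth.spline(x, y, df = 2)       # huge penalty: ~2 df, close to the LS line
plot(x, y, main = "smooth.spline(): small vs large amounts of smoothing")
lines(spline(x, y, n = 501, method = "natural"),
      col = "gray60", lty = 3)                # interpolating spline (lambda -> 0 limit)
lines(ss.rough, col = 2)
lines(ss.gcv,   col = 3, lwd = 2)
lines(ss.lin,   col = 4, lwd = 2)
abline(lm(y ~ x), col = 4, lty = 2)           # least squares line (lambda -> infinity limit)
@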
\subsection*{The smoothing spline solution}
Remarkably, the minimizer of (\ref{ch3.penrss}) is \emph{finite}-dimensional,
although the criterion is minimized over the infinite-dimensional Sobolev
space of functions $m$ for which $\int m''(t)^2\,dt$ is finite.
Let us assume for now that the data has its $x$ values sorted and unique,
% for non-unique x_i, take means of y's and use weights!
\[ x_1 < x_2 < \ldots < x_n. \]
The solution $\hat{m}_{\lambda}(\cdot)$ (i.e., the unique minimizer of
(\ref{ch3.penrss})) is a natural \textbf{cubic spline} with knots
$t_1,t_2,\dots,t_{n_k}$, which are the sorted unique values of
$\{x_1,x_2,\dots,x_n\}$.  That is, $\hat{m}$ is a piecewise cubic polynomial
in each interval $[t_j,t_{j+1})$ such that
$\hat{m}_{\lambda}^{(k)}\ (k=0,1,2)$ is continuous everywhere and fulfills
the ``natural'' boundary conditions
$\hat{m}''(t_1)= \hat{m}''(t_{n_k})= 0$.  For the $n_k-1$ cubic polynomials,
we'd need $(n_k-1)\cdot 4$ coefficients.
Since there are $(n_k-2)\cdot 3$ continuity conditions (at every ``inner
knot'', $j=2,\dots,n_k-1$) plus the 2 ``natural'' conditions, this leaves
$4(n_k-1) - [ 3(n_k-2) + 2 ] = n_k$ free parameters (the $\beta_j$'s below).
%%
Knowing that the solution is a natural cubic spline, it can be obtained by
linear algebra.  We represent
\begin{eqnarray}\label{ch3.basis}
  m_\lambda(x) = \sum_{j=1}^{n_k} \beta_j B_j(x),
\end{eqnarray}
where the $B_j(\cdot)$'s are basis functions for natural cubic splines.  The
unknown coefficients can then be estimated by penalized least squares, i.e.,
linear regression with a quadratic (ridge-type) penalty.  The criterion in
(\ref{ch3.penrss}) for $\hat{m}_{\lambda}$ as in (\ref{ch3.basis}) then
becomes
\begin{eqnarray*}
  \tilde{L}_{\lambda}(\bbeta) := L_{\lambda}(m) &=&
  \norm{\bY - X \bbeta}^2 + \lambda \bbeta\tr \Omega \bbeta,
\end{eqnarray*}
respectively, when not all observation weights $w_i$ are 1,
\begin{eqnarray}\label{L_W.def}
  \tilde{L}_{\lambda}(\bbeta) &=&
  (\bY - X \bbeta)\tr W (\bY - X \bbeta) + \lambda \bbeta\tr\Omega\bbeta,
\end{eqnarray}
where the design matrix $X$ has $j$th column $(B_j(x_1),\ldots,B_j(x_n))\tr$,
i.e.,
\begin{eqnarray*}
  X_{ij} &=& B_j(x_i) \ \ \mathrm{for}\ i=1,\dots,n, \\
  W &=& \mathtt{diag(w)}, \ \mathrm{i.e.,} \ W_{ij} = \mathbf{1}_{[i = j]}
  \cdot w_i, \hspace*{2em} \mathrm{and} \\
  \Omega_{jk} &=& \int B_j''(t) B_k''(t) \;dt, \ \ \mathrm{for}\
  j,k = 1,\dots,n_k.
\end{eqnarray*}
The solution $\Hbbeta = \argmin_{\bbeta} \tilde{L}_{\lambda}(\bbeta)$ can
then be derived by setting the gradient
$\frac{\partial}{\partial\bbeta} \tilde{L}_{\lambda}(\bbeta)$ to zero:
$\mathbf{0} = -2 X\tr W \bY + 2(X\tr W X + \lambda \Omega)\bbeta$, and hence
\begin{eqnarray}\label{ch3.sol}
  %\Hbbeta_{n\times 1} = ....
  \Hbbeta = (X\tr W X + \lambda \Omega)^{-1} X\tr W \bY.
\end{eqnarray}
% This can be computed efficiently using fast linear algebra, particularly
% when $X$ is a banded matrix.
When B-splines are used as basis functions $B_j$, both $X$ and $\Omega$ are
\emph{banded} matrices, i.e., zero apart from a ``band'' of a few central
diagonals.
As
\begin{eqnarray*}% \label{eq:fit}
  \hat{m}_{\lambda}(x) = \sum_{j=1}^{n_k} \hat{\beta}_j B_j(x),
\end{eqnarray*}
the fitted values are $\hat\bY = X \Hbbeta$, where
$\hat{Y}_i = \hat{m}_{\lambda}(x_i)\ (i=1,\ldots,n)$, and
\begin{eqnarray}
  \label{eq:Yhat.spline}
  \hat\bY = X \Hbbeta = \mathcal{S}_{\lambda} \bY, \ \ \mathrm{where}\ \
  \mathcal{S}_{\lambda} = X (X\tr W X + \lambda \Omega)^{-1} X\tr W .
\end{eqnarray}
In the unweighted case ($W = I$), the hat matrix $\mathcal{S}_{\lambda}$ is
symmetric, $\mathcal{S}_{\lambda} = \mathcal{S}_{\lambda}\tr$, which implies
elegant mathematical properties (an eigen-decomposition with real
eigenvalues); for general weights, $W^{1/2} \mathcal{S}_{\lambda} W^{-1/2}$
is symmetric, so the eigenvalues of $\mathcal{S}_{\lambda}$ are still real.

\section*{........}

<<>>=
@

\section*{Notes}
\begin{enumerate}
\item
\item
\end{enumerate}

\end{document}