# Rscript for (nonparametric) regression

# (You need to install package 'car' beforehand)

# Load package 'car'
library(car)

# Load data set Prestige
data(Prestige)
Prestige

# Get information about data set
?Prestige
# education, prestige and other characteristics of 102 
# different occupations

# get the column names
names(Prestige)

# Sort the data based on income
ind <- order(Prestige$income)
sorted <- Prestige[ind,]
attach(sorted)

# Change one value to show the effect of an outlier:
prestige[30] <- 120
# note that R is case sensitive, and that the above command refers to the *column* "prestige"


########################################################
# Nonparametric regression with non-overlapping strips #
########################################################

# Make scatterplot
plot(income, prestige,
     main="Nonparametric regression using strips")

# Draw the cut-off values:
# abline(a,b) draws a line with intercept a and slope b
# abline(v=10) draws a vertical line with x-coordinate 10
# abline(h=10) draws a horizontal line with y-coordinate 10
# lty stands for line type, lty=2 gives a dashed line
abline(v=income[20],lty=2)
abline(v=income[40],lty=2)
abline(v=income[60],lty=2)
abline(v=income[80],lty=2)

# Compute the mean of income and prestige in each group, and 
# add these to the plot. We use the command 'c' (concatenate)
# to put the values together in a vector
xmeans <- c(mean(income[1:20]),mean(income[21:40]),
            mean(income[41:60]),mean(income[61:80]),
            mean(income[81:102]))
ymeans <- c(mean(prestige[1:20]),mean(prestige[21:40]),
            mean(prestige[41:60]),mean(prestige[61:80]),
            mean(prestige[81:102]))

# add the points to the plot
points(xmeans, ymeans, pch=3, col="red")

# connect the points by a line
lines(xmeans,ymeans,col="red")


####################################################
# Nonparametric regression with overlapping strips #
####################################################

# Make scatterplot
plot(income,prestige,main="Local averaging and loess")

# We now use local averaging (overlapping strips), with 
# 21 data points. For the first 11 strips, we average 
# the first 21 values of the data set, using a for-loop:
resy <- NULL
for(i in 1:11){
  resy <- c(resy,mean(prestige[1:21]))
}

# For the i=12 up to the 91, we average the values from 
# i-10 up to i+10:
for(i in 12:91){
  resy <- c(resy,mean(prestige[(i-10):(i+10)]))
}

# For the last 11 strips, we average the last 21 values 
# of the data set:
for (i in 92:102){
  resy <- c(resy,mean(prestige[82:102]))
}

lines(income,resy,lwd=2, col="red")

# identify outlier
abline(v=income[30],lty=2)

# add a smooth curve computed by loess
lines(loess.smooth(income,prestige),lwd=2,
      col="blue",type="l")