### reading in the dataset
## (use read.csv for comma-separated files or the sep option)
dat <- read.table(file="brainsize.txt",header=TRUE)
pdf(file="tmp.pdf",width=10,height=10)
plot(dat,pch=20,col=rgb(0.2,0.2,0.4,0.4))
dev.off()

#### taking a look at the data
dim(dat)
str(dat)
dat[1:8,]


## response Y is FSIQ (full-scale IQ), explanatory variables are
## Gender, Height, Weight, MRI_Count (brainsize)
Y <- dat$FSIQ
X <- dat[, c("Gender","Weight","Height","MRI_Count")]
## turn factor "Gender" into numeric dummy variable
X[,"Gender"] <- as.numeric( X[,"Gender"] == "Female")


## fit a linear model
linmod <- lm( Y ~ . , data=X)
summary(linmod)


## compute the coefficients "by hand" and add intercept column
colX <- colnames(X)
X <- cbind( rep(1,nrow(X)),  as.matrix(X))
colnames(X) <- c("Intercept", colX)
X[1:8,]

thetahat <- solve( t(X)%*%X, t(X)%*%Y)  
print(thetahat)
print(coef(linmod))


print(cbind( Y, X))


## Tukey-Anscombe plot of residuals versus fitted values
#pdf(file="brainsize_tukey.pdf",width=6,height=6)
plot(fitted(linmod) , residuals(linmod))
abline(h=0,lty=3)
#dev.off()