### reading in the dataset ## (use read.csv for comma-separated files or the sep option) dat <- read.table(file="brainsize.txt",header=TRUE) #### taking a look at the data dim(dat) str(dat) dat[1:8,] ## response Y is FSIQ (full-scale IQ), explanatory variables are ## Gender, Height, Weight, MRI_Count (brainsize) Y <- dat$FSIQ X <- dat[, c("Gender","Weight","Height","MRI_Count")] ## turn factor "Gender" into numeric dummy variable X[,"Gender"] <- as.numeric( X[,"Gender"] == "Female") ## add intercept column X <- cbind( rep(1,nrow(X)), as.matrix(X)) n <- nrow(X) ## number n of samples p <- ncol(X) ## number p of predictors (including intercept) ## do nsim simulations with a known theta value nsim <- 1000 thetahat <- matrix(nrow=nsim,ncol=p) for (sim in 1:nsim){ ## generate pseudo-real data with "true" theta=(0,0,-1,4,0) theta <- c(0,0,-1,4,0) Y <- X %*% theta + runif(n,-1,1) ## estimate theta and write into next row of matrix thetahat thetahat[sim,] <- solve( t(X)%*%X, t(X)%*%Y) } boxplot(thetahat) ## plot results par(mfrow=c(p,1)) for (k in 1:p){ hist( thetahat[,k],col=2,xlim=c(-6,6)) abline(v= theta[k],lwd=1,col=4) } ## compare predicted standard deviation with theoretical values sdrealized <- apply(thetahat,2,sd) sdtheory <- sqrt(diag(solve(t(X)%*%X))) * 0.5746 ## 0.54746 is sd of uniform[-1,1] distributed random variable plot( sdtheory,sdrealized,pch=20,col=rgb(0.2,0.2,0.4,0.6),cex=2) abline(c(0,1))