## Dimension-reduction (PCA via SVD) demonstrations on several data sets:
## microarray, climate, faces, stock returns, movie reviews, and MNIST.
## Each section loads a data set, computes a low-rank approximation, and
## visualizes the original data, the approximation, and the components.

library(fields)
library(randomForest)
library(NNLM)

## Plot a matrix as a color image, with row 1 at the top.
## x: numeric matrix; main: optional plot title.
plotim <- function(x, main = NULL) {
  image.plot(1:ncol(x), 1:nrow(x), t(x)[, nrow(x):1],
             col = tim.colors(100), main = main)
}

## Plot a length-784 vector as a 28x28 grayscale MNIST digit.
## Fix: `main` was previously accepted but never passed to image().
plotimmnist <- function(x, main = NULL) {
  x <- matrix(x, nrow = 28)
  image((1 - x)[, ncol(x):1], col = gray.colors(200), main = main)
}

## microarray data #######################
#####################################
load(file = "microarray/leukemia_new.rda")
X <- leukemia.x[, 1:500]
Y <- leukemia.y  ## fix: Y was commented out but is used below in order(Y)
X <- scale(X)
X <- sign(X) * sqrt(abs(X))  ## signed square-root transform of the scaled data
evd <- eigen(t(X) %*% X)
sv <- svd(X)
par(mfrow = c(4, 4), mar = rep(2, 4))
## full score and loadings matrices (without truncation)
A <- sv$u %*% diag(sv$d)  ## scores
H <- t(sv$v)              ## loadings
## for each rank q: plot the original data, the rank-q approximation,
## and the truncated score and loadings matrices
for (q in c(2, 5, 10, 40)) {
  Vq <- evd$vectors[, 1:q]
  plotim(X[order(Y), ], main = "original")
  plotim(sv$u[, 1:q] %*% diag(sv$d[1:q]) %*% t(sv$v[, 1:q]),
         main = "PCA approximation")
  Atrunc <- A
  Atrunc[, (q + 1):ncol(A)] <- 0
  Htrunc <- H
  Htrunc[(q + 1):nrow(Htrunc), ] <- 0
  plotim(Atrunc, main = "SCORES")
  plotim(Htrunc, main = "LOADINGS")
}

### climate data
load(file = "January.rda")
X <- TEMPm
## show the raw temperature fields (assumes TEMPm is a 3-way array with
## the third dimension indexing years -- confirm against the data file)
for (year in 1:16) {
  image.plot(X[, , year])
}
## one row per year, centered columns
X <- t(apply(TEMPm, 3, as.vector))
X <- scale(X, scale = FALSE)
for (year in 1:16) {
  image.plot(matrix(X[year, ], nrow = 144))
}
sv <- svd(X)
dd <- sv$d
q <- 30
dd[(q + 1):length(dd)] <- 0  ## keep only the first q singular values
Xhat <- sv$u %*% diag(dd) %*% t(sv$v)
par(mfrow = c(4, 4))
for (year in 1:8) {
  image.plot(matrix(X[year, ], nrow = 144),
             main = paste("original year", year))
  image.plot(matrix(Xhat[year, ], nrow = 144),
             main = paste("PCA approx year", year))
}
## first 16 right singular vectors as spatial fields
for (comp in 1:16) {
  image.plot(matrix(sv$v[, comp], nrow = 144))
}

## att faces ####################
#################################
## Plot a length-10304 vector as a 112x92 grayscale face image.
imp <- function(x, main = NULL) {
  image.plot(t(matrix(x, nrow = 112))[, 112:1],
             col = gray.colors(200), axes = FALSE, main = main)
}
load(file = "faces.rda")
Xa <- X
X <- t(apply(Xa, 3, as.vector))
## compute SVD up to 200 singular vectors
sv <- svd(X, nu = 200, nv = 200)
pictvec <- c(1, 50, 100, 200, 300, 400)  ## select these faces
qvec <- c(2, 5, 10, 30, 100, 200)  ## approximate with these low-rank approximations
## plot original data
par(mfrow = c(length(qvec) + 1, length(pictvec)))
for (pict in pictvec) {
  imp(X[pict, ])
}
## plot approximations for the values of q in qvec
for (qcomp in qvec) {
  Xhat <- sv$u[, 1:qcomp, drop = FALSE] %*%
    diag(sv$d[1:qcomp]) %*%
    t(sv$v[, 1:qcomp, drop = FALSE])
  for (pict in pictvec) {
    imp(Xhat[pict, ], main = qcomp)
  }
}
## show the eigenfaces (the eigenvectors)
par(mfrow = c(4, 4))
for (qc in 1:16) {
  imp(sv$v[, qc], main = qc)
}

## stock data #######################
#####################################
## load sp500 stock prices, compute log-returns and do a PCA
dat <- read.csv(file = "SP/sp500hst.txt", header = FALSE)
colnames(dat) <- c("date", "stock", "open", "high", "low", "close", "tmp")
## keep only stocks observed on all 245 trading days
agg <- aggregate(dat[, "date"], by = list(stock = dat[, "stock"]),
                 FUN = function(x) length(unique(x)))
selectstocks <- agg[agg[, 2] == 245, 1]
datred <- dat[dat[, "stock"] %in% selectstocks, c("date", "stock", "open")]
dates <- unique(datred[, "date"])
n <- length(dates)
p <- length(selectstocks)
## days x stocks matrix of opening prices
X <- matrix(nrow = n, ncol = p)
colnames(X) <- selectstocks
rownames(X) <- dates
for (i in seq_len(n)) {
  X[i, ] <- datred[which(datred[, "date"] == dates[i]), "open"]
}
RET <- apply(log(X), 2, diff)  ## daily log-returns
evd <- eigen(t(RET) %*% RET)
plot(cumsum(evd$values))
## project the returns onto the first 200 eigenvectors
Vq <- evd$vectors[, 1:200]
RETRED <- RET %*% Vq %*% t(Vq)
colnames(RETRED) <- colnames(RET)
matplot(RET, type = "l", lty = 1)
matplot(RETRED, type = "l", lty = 1)
matplot(apply(RET[, 1:10], 2, cumsum), type = "l")
matplot(apply(RETRED[, 1:10], 2, cumsum), type = "l")

### movie reviews ####################
######################################
## load movie review data
library(text2vec)
library(data.table)
data("movie_review")
setDT(movie_review)
setkey(movie_review, id)
set.seed(2016L)
all_ids <- movie_review$id
train_ids <- sample(all_ids, 4000)
test_ids <- setdiff(all_ids, train_ids)
train <- movie_review[J(train_ids)]
test <- movie_review[J(test_ids)]
prep_fun <- tolower
tok_fun <- word_tokenizer
it_train <- itoken(train$review,
                   preprocessor = prep_fun,
                   tokenizer = tok_fun,
                   ids = train$id,
                   progressbar = FALSE)
vocab <- create_vocabulary(it_train)
vectorizer <- vocab_vectorizer(vocab)
## data in matrix form in a bag-of-words approach
X <- create_dtm(it_train, vectorizer)
Y <- train[["sentiment"]]
## compute svd up to q terms
q <- 100
sv <- svd(X, nu = q, nv = q)
## computing svd takes a while -- if reloading later can store and load
## the results like here:
save(sv, file = "sv.rda")
load(file = "sv.rda")
## keep the scores
A <- sv$u[, 1:q] %*% diag(sv$d[1:q])
## use dimension q and record the OOB error rate
errvec <- rep(0.5, q)
for (dimc in 2:q) {
  ## train a Random Forest classifier on the first dimc score dimensions
  rf <- randomForest(A[, 1:dimc], as.factor(Y))
  print(rf)
  errvec[dimc] <- mean(rf$confusion[, 3])
  plot(errvec[1:dimc], type = "b")
}
## as comparison can check 100 random dimensions of the raw bag-of-words
## fix: previously sampled from 1:p where p was the stale stock count,
## not the number of columns of the current document-term matrix
rf <- randomForest(as.matrix(X[, sample(seq_len(ncol(X)), 100)]),
                   as.factor(Y))

############ MNIST data ##########
######################################
load(file = "mnist/train.RData")
X <- trainData
svM <- svd(X)
## cache the (slow) SVD so it can be reloaded later
save(svM, file = "svM.rda")
load(file = "svM.rda")
par(mfrow = c(4, 4))
for (i in 1:16) {
  x <- X[i, ]
  plotimmnist(x)
}
## proportion of variance explained by the leading singular values
par(mfrow = c(1, 1))
plot(cumsum(svM$d^2) / sum(svM$d^2))
par(mfrow = c(4, 4))
q <- 100
par(mfrow = c(4, 4))
## original digit next to its rank-q reconstruction
for (i in 1:8) {
  x <- X[i, ]
  xhat <- svM$u[i, 1:q] %*% diag(svM$d[1:q]) %*% t(svM$v[, 1:q])
  plotimmnist(x)
  plotimmnist(xhat)
}
## first eigenvectors ("eigendigits")
par(mfrow = c(4, 4))
for (k in 1:16) {
  atom <- svM$v[, k]
  plotimmnist(atom)
}