[Bioc-devel] ExperimentHub::GSE62944 outdated

Fri Jun 3 13:56:35 CEST 2016

FYI

That works for me, but maybe this is also of interest for others, so I
wonder if somebody of the Bioc annotation/experiment team (Sonali,
Valerie, Martin?) could update this accordingly for ExperimentHub?

Best,
Ludwig

-- 
Dr. Ludwig Geistlinger

Lehr- und Forschungseinheit für Bioinformatik
Institut für Informatik
Ludwig-Maximilians-Universität München
Amalienstrasse 17, 2. Stock, Büro A201
80333 München

Tel.: 089-2180-4067
eMail: Ludwig.Geistlinger at bio.ifi.lmu.de

> Hi Ludwig,
>
> In november I sent the updated recipe to Martin, but I think it was not
> updated yet.
>
> Anyway, you can do it yourself with the code here below:
>
>     library("GEOquery")
>     library("Biobase")
>
>     suppl <- GEOquery::getGEOSuppFiles("GSE62944")
>
>     setwd("GSE62944")
>
>     clinvar <-
>     read.delim("GSE62944_06_01_15_TCGA_24_548_Clinical_Variables_9264_Samples.txt.gz")
>     clinvar2 <- t(clinvar)
>
>     # add variable names
>     colnames(clinvar2) <- clinvar2[1,]
>     # and remove the 2nd abbreviation, with the CDE_ID too
>     clinvar3 <- clinvar2[-c(1:3),]
>
>     # substitute dots with dashes in the ids, to be consistent with
>     previous object
>     clinvar4 <- clinvar3
>     rownames(clinvar4) <- gsub("\\.","-",rownames(clinvar3))
>     clinvar4 <- as.data.frame(clinvar4)
>
>     CancerType <-
>     read.delim("GSE62944_06_01_15_TCGA_24_CancerType_Samples.txt.gz",
>                   header=FALSE, colClasses=c("character", "factor"),
>                   col.names=c("sample", "type"))
>     idx <- match(rownames(clinvar4), CancerType$sample)
>     # these are already nicely sorted
>     clinvar4$CancerType <- CancerType$type[idx]
>
>
>     countFile <-
>     "GSM1536837_06_01_15_TCGA_24.tumor_Rsubread_FeatureCounts.txt.gz"
>     untar("GSE62944_RAW.tar", countFile)
>
>     counts <- local({
>        data <- scan(countFile, what=character(), sep="\t", quote="")
>        m <- matrix(data, 9265)
>        dimnames(m) <- list(m[,1], m[1,])
>        m <- t(m[-1, -1])
>        mode(m) <- "integer"
>        m
>     })
>
>     # just to be sure
>     gplots::venn(list(colnames(counts),rownames(cl4))) # they are all
>     there, but not correctly sorted
>     head(colnames(counts))
>     head(rownames(clinvar4))
>
>     # re-sorting according to the counts object
>     cl5 <-
>     clinvar4[rownames(clinvar4)[match(colnames(counts),rownames(clinvar4))],]
>     head(rownames(cl5),20)
>     head(colnames(counts),20)
>
>     # as in your example
>     eset_new <- Biobase::ExpressionSet(counts, AnnotatedDataFrame(cl5))
>
>     # or as SummarizedExperiment
>     library("GenomicRanges")
>     se <- SummarizedExperiment(assays=list(counts))
>     colData(se) <- S4Vectors::DataFrame(cl5)
>
>     # data exploration to see how samples are related to each other
>     library("DESeq2")
>     ddsTCGA <- DESeqDataSet(se,design=~CancerType)
>
>     ddsTCGA <- estimateSizeFactors(ddsTCGA)
>     log2tcga <- log2(1+counts(ddsTCGA,normalized=TRUE))
>     se_log2tcga <- SummarizedExperiment(assays=list(log2tcga))
>     colData(se_log2tcga) <- colData(ddsTCGA) # the rlog transform takes
>     very long time, so just a quick and dirty check
>
>     pca_d4 <- function (x, intgroup = "condition", ntop = 500,
>     returnData = FALSE,title=NULL,
>                          pcX = 1, pcY = 2,text_labels=TRUE,point_size=3)
>     # customized principal components
>     {
>        library("DESeq2")
>        library("genefilter")
>        library("ggplot2")
>        rv <- rowVars(assay(x))
>        select <- order(rv, decreasing =
> TRUE)[seq_len(min(ntop,length(rv)))]
>        pca <- prcomp(t(assay(x)[select, ]))
>        percentVar <- pca$sdev^2/sum(pca$sdev^2)
>
>        intgroup.df <- as.data.frame(colData(x)[, intgroup, drop = FALSE])
>        group <- factor(apply(intgroup.df, 1, paste, collapse = " : "))
>        d <- data.frame(PC1 = pca$x[, pcX], PC2 = pca$x[, pcY], group =
>     group,
>                        intgroup.df, names = colnames(x))
>        colnames(d)[1] <- paste0("PC",pcX)
>        colnames(d)[2] <- paste0("PC",pcY)
>        if (returnData) {
>          attr(d, "percentVar") <- percentVar[1:2]
>          return(d)
>        }
>        # clever way of positioning the labels
>        d$hjust = ifelse((sign(d[,paste0("PC",pcX)])==1),0.9,0.1)# (1 +
>     varname.adjust * sign(PC1))/2)
>        g <- ggplot(data = d, aes_string(x = paste0("PC",pcX), y =
>     paste0("PC",pcY), color = "group")) +
>          geom_point(size = point_size) +
>          xlab(paste0("PC",pcX,": ", round(percentVar[pcX] * 100,digits =
>     2), "% variance")) +
>          ylab(paste0("PC",pcY,": ", round(percentVar[pcY] * 100,digits =
>     2), "% variance"))
>        if(text_labels) g <- g + geom_text(mapping =
>     aes(label=names,hjust=hjust, vjust=-0.5), show.legend = F)
>        if(!is.null(title)) g <- g + ggtitle(title)
>        g
>     }
>
>     pdf("allTCGA_diy.pdf",height=30,width=30)
>     pca_d4(se_log2tcga,intgroup="CancerType",text_labels = FALSE)
>     dev.off()
>
> If nothing changed in the data structure over there it should be
> correctly reproduced. Apologize if something indeed should *not work*, I
> did not touch the code that much afterwards.
> But for a starter, it might be covering your need.
>
> If you have any suggestion or improvement, I'm all ears ;)
>
> Best,
>
> Federico
>
> --
> Federico Marini, M.Sc.
> Institute of Medical Biostatistics, Epidemiology and Informatics (IMBEI)
> Division Biostatistics and Bioinformatics
> University Medical Center of the Johannes Gutenberg University Mainz
> Postal address: 55101 Mainz
> Office and delivery address: Obere Zahlbacher Str. 69, 55131 Mainz
>
> Phone: +49 (0) 6131 17-8108
> Fax: +49 (0) 6131 17-2968
> Web: www.imbei.uni-mainz.de
> E-Mail: marinif at uni-mainz.de
>
>