[BioC] DEXseq: Making exonCountSet object
Elena Sorokin
sorokin at wisc.edu
Sun Feb 12 00:52:29 CET 2012
Dear Simon,
Thanks for your reply, even on a Saturday. =)
You were right about the incorrect file path- that was problem. (To anybody else out there struggling with general issues getting their files into R, my code is below. Hope I save you some time).
Another issue I was hoping you might help me with: I have no differentially-expressed exons (p-value = NA for all exons) at the end of the analysis!! I followed your other recommendations, and have spent some time trying to figure this out, but am still unsure. The fact that there are no integers in the p-value column has me worried.
Maybe the problem is that the column names in counts(ecs) do not word-for-word match up with my rownames in the samples dataframe. I didn't know how to change the colnames in counts(ecs), so I left them as they were, noting that the sample identity seemed to have been preserved.
Any advice on how to change the count(ecs) headers, or other advice about my non-existent p-values would be appreciated! =)
Many thanks,
Elena
P.S. The graphics w/ gene models in DEXseq are very useful and convenient!
______________________________
library(DEXSeq)
options(digits=3)
setwd("C:\\Rdata\\DEXseq")
library(DEXSeq)
rm(list=ls())
# this GFF file is an output from Simon's script dexseq_prepare_annotations.py
annotationfile = file.path("/Rdata/DEXseq/realFioles/DEXSeq_annotations.gff")
annotationfile
[1] "/Rdata/DEXseq/realFioles/DEXSeq_annotations.gff"
> samples = data.frame(
+ condition = c("drug","drug","vehicle","vehicle"),
+ replicate = c(1:2,1:2),
+ row.names = c("drug1","drug2","veh1","veh2"),
+ stringsAsFactors = TRUE,
+ check.names = FALSE
+ )
> samples
condition replicate
drug1 drug 1
drug2 drug 2
veh1 vehicle 1
veh2 vehicle 2
> fullFilenames<- list.files("C:/Rdata/DEXseq/realFioles/",full.names=TRUE,pattern="counts.txt")
> fullFilenames
[1] "C:/Rdata/DEXseq/realFioles/drug1_counts.txt" "C:/Rdata/DEXseq/realFioles/drug2_counts.txt" "C:/Rdata/DEXseq/realFioles/veh1_counts.txt"
[4] "C:/Rdata/DEXseq/realFioles/veh2_counts.txt"
> ecs<- read.HTSeqCounts(countfiles = fullFilenames,design = samples,flattenedfile = annotationfile)
> head(counts(ecs))
C:/Rdata/DEXseq/realFioles/drug1_counts.txt C:/Rdata/DEXseq/realFioles/drug2_counts.txt C:/Rdata/DEXseq/realFioles/veh1_counts.txt
2L52.1:001 0 2 2
2L52.1:002 4 12 13
2L52.1:003 7 8 7
2L52.1:004 6 4 4
2L52.1:005 9 6 16
2L52.1:006 6 4 13
C:/Rdata/DEXseq/realFioles/veh2_counts.txt
2L52.1:001 4
2L52.1:002 20
2L52.1:003 8
2L52.1:004 7
2L52.1:005 14
2L52.1:006 12
> head(fData(ecs))
geneID exonID testable dispBeforeSharing dispFitted dispersion pvalue padjust chr start end strand transcripts
2L52.1:001 2L52.1 E001 FALSE NA 0.5145 0.5145 NA NA chrII 1867 1911 + 2L52.1
2L52.1:002 2L52.1 E002 TRUE 1.39e-01 0.0899 0.1391 NA NA chrII 2506 2694 + 2L52.1
2L52.1:003 2L52.1 E003 TRUE 9.56e-10 0.1436 0.1436 NA NA chrII 2738 2888 + 2L52.1
2L52.1:004 2L52.1 E004 TRUE 2.52e-10 0.2022 0.2022 NA NA chrII 2931 3036 + 2L52.1
2L52.1:005 2L52.1 E005 TRUE 2.16e-09 0.0979 0.0979 NA NA chrII 3406 3552 + 2L52.1
2L52.1:006 2L52.1 E006 TRUE 2.46e-09 0.1238 0.1238 NA NA chrII 3802 3984 + 2L52.1
# Size factors
> ecs<- estimateSizeFactors(ecs)
sizeFactors(ecs)
# Dispersion
> ecs<- estimateDispersions(ecs)
# Fit Dispersion
> ecs<- fitDispersionFunction(ecs)
# Plot Individual exons via mean expression
> meanvalues<- rowMeans(counts(ecs))
> plot(meanvalues, fData(ecs)$dispBeforeSharing, log="xy", main="mean vs CR dispersion")
> x<- 0.01:max(meanvalues)
> y<- ecs at dispFitCoefs[1] + ecs at dispFitCoefs[2] / x
> lines(x, y, col="red")
# Plot looks good
# Test for Expression Difference
> test<- testForDEU(ecs)
# Seems to work
> ecs<- estimatelog2FoldChanges(ecs)
> res1<- DEUresultTable(ecs)
head(res1)
geneID exonID dispersion pvalue padjust meanBase log2fold(drug/vehicle)
2L52.1:001 2L52.1 E001 0.5145 NA NA 2.01 -0.844
2L52.1:002 2L52.1 E002 0.1391 NA NA 12.25 -0.304
2L52.1:003 2L52.1 E003 0.1436 NA NA 7.45 0.738
2L52.1:004 2L52.1 E004 0.2022 NA NA 5.22 0.595
2L52.1:005 2L52.1 E005 0.0979 NA NA 11.18 -0.264
2L52.1:006 2L52.1 E006 0.1238 NA NA 8.71 -0.586
> table(res1$padjust< 0.1)
character(0)
# Are there any p-values?
> table(res1$pvalue< 1)
character(0)
More information about the Bioconductor
mailing list