> ## read in count data > > countdata <- read.table("Combined_Count_PrimaryTN.bed", header=F, sep="\t", stringsAsFactors=F) > colnames(countdata) <- c("chr", "start", "stop","N1","N2","N3","N4","N5","N6","N7","N8","T1","T2","T3","T4","T5","T6","T7","T8","rowno","peakid") > rownames(countdata) <- countdata$peakid > countdata.m <- as.matrix(countdata[,c(4:19)]) > ## read in GC content > > gccontent <- read.table("Combined_Count_PrimaryTN.bed_GCcontent", header=F, sep="\t", stringsAsFactors=F) > colnames(gccontent) <- c("chr", "start", "stop","N1","N2","N3","N4","N5","N6","N7","N8","T1","T2","T3","T4","T5","T6","T7","T8","rowno","peakid", + "pct_at","pct_gc","num_A","num_C","num_G","num_T","num_N","num_oth","seq_len") > gccontent <- gccontent[,c("peakid","pct_gc")] > rownames(gccontent) <- gccontent$peakid > gccontent$peakid <- NULL > ## calculate length of peak regions > gclength <- as.data.frame(countdata$stop-countdata$start) > rownames(gclength) <- countdata$peakid > colnames(gclength) <- "length" > ## create pheno data > id = c("N1","N2","N3","N4","N5","N6","N7","N8","T1","T2","T3","T4","T5","T6","T7","T8") > conditions = c("N","N","N","N","N","N","N","N","T","T","T","T","T","T","T","T") > replicates = c("1","2","3","4","5","6","8","9","1","2","3","4","5","6","7","8") > batch=c("1","1","1","2","2","3","3","3","1","1","1","2","2","3","3","3") > pheno = data.frame(id, conditions,batch,replicates) > rownames(pheno) <- colnames(countdata.m) > ## Create seqexpressionset > feature <- data.frame(gc = gccontent, length = gclength) > data <- newSeqExpressionSet(exprs = countdata.m,featureData = feature,phenoData = pheno) > ## Normalization > dataOffstWithin <- withinLaneNormalization(data, "pct_gc", which = "full",offset=T) > dataOffset <- betweenLaneNormalization(dataOffstWithin, which = "full",offset=T) > ## Differential expression analysis with GC normalisation > EDASeqNormFactors <- exp(-1 * offst(dataOffset)) > EDASeqNormFactors <- EDASeqNormFactors / mean(EDASeqNormFactors) > dds <- DESeqDataSetFromMatrix(countData = countdata.m, colData = pheno, design = ~replicates+conditions) > normalizationFactors(dds) <- EDASeqNormFactors > dds <- estimateDispersions(dds) > dds <- nbinomWaldTest(dds) > res <- results(dds) > jpeg("DispersionEstimate_GCcorrected.jpg") ; plotDispEsts(dds) ; dev.off() > ## Differential expression analysis without GC normalisation > dds_nongc <- DESeqDataSetFromMatrix(countData = countdata.m, colData = pheno, design = ~replicates+conditions) > dds2_nongc <- DESeq(dds_nongc) > res <- results(dds2_nongc) > jpeg("DispersionEstimate_NotGCcorrected.jpg") ; plotDispEsts(dds) ; dev.off()