######################################## ## bash # set scrolling of terminal to unlimited ######################################## ######################################## ############### R BASICS ############### # assignments ("<-" or "=") # vector # variables (data$counts) # dim() # head(,param) # sessionInfo() # ?rtfm ######################################## ######################################## source("http://www.bioconductor.org/biocLite.R") biocLite("pheatmap") library("edgeR") library("DESeq2") library("Rsubread") library("GenomicAlignments") library("BiocParallel") library("pheatmap") library("RColorBrewer") ######################################## ## bash # check number of CPUs less /proc/cpuinfo # check memory less /proc/meminfo ######################################## loadDir<-"/seq/common/data/Course_BioTopIVB/bamtest/" saveDir<-"/home/phpeters/biotopCourse/" alignFile <- "/seq/common/data/Course_BioTopIVB/bamtest/Tni_noninf_01.bam" alignFile <- paste0(loadDir,"Tni_noninf_01.bam") alignFiles <- c(paste0(loadDir,"Tni_noninf_01.bam")) annotFile <- paste0(loadDir,"Tni_transcripts.gtf") outFile <- paste0(saveDir,"counts.txt") file.exists(alignFile) file.exists(annotFile) file.exists(outFile) ######################################## ## bash # check the sam file (PG -> bowtie-command) less /seq/common/data/Course_BioTopIVB/bamtest/Tni_transcripts.gtf # check the gtf file (transcripts, gene_id) less /seq/common/data/Course_BioTopIVB/bamtest/Tni_transcripts.gtf ######################################## # featureCount # rtfm # https://www.rdocumentation.org/packages/Rsubread/versions/1.22.2/topics/featureCounts ?featureCounts features<-featureCounts(alignFile,annot.ext=annotFile,isGTFAnnotationFile = TRUE,GTF.featureType = "transcript", GTF.attrType = "gene_id", isPairedEnd=FALSE,strandSpecific=1,nthreads=16); # Running time : 0.11 minutes ## get some information summary(features) head(features$counts,3) colnames(features$counts)<-"noninf1" head(features$counts,3) # do it for all alignFiles<-c(paste0(loadDir,"Tni_noninf_01.bam"),paste0(loadDir,"Tni_noninf_02.bam"),paste0(loadDir,"Tni_noninf_03.bam"),paste0(loadDir,"Tni_noninf_04.bam"),paste0(loadDir,"Tni_inf_01.bam"),paste0(loadDir,"Tni_inf_02.bam"),paste0(loadDir,"Tni_inf_03.bam"),paste0(loadDir,"Tni_inf_04.bam")) file.exists(alignFiles) features<-featureCounts(alignFiles,annot.ext=annotFile,isGTFAnnotationFile = TRUE,GTF.featureType = "transcript", GTF.attrType = "gene_id", isPairedEnd=FALSE,strandSpecific=1,nthreads=16); colnames(features$counts)<-c("noninf1","noninf2","noninf3","noninf4","inf1","inf2","inf3","inf4") summary(features$counts) head(features$counts) # access only one column head(features$counts[,"noninf1"],3) head(features$counts[,1],3) # access only one row features$counts[1,] features$counts["GBKU01000001.1",] # save results outFile=paste0(saveDir,"countsAndLength.txt") write.table(x=data.frame(features$annotation[,c("GeneID","Length")],features$counts,stringsAsFactors=FALSE),file=outFile,quote=FALSE,sep="\t",row.names=FALSE) outFile=paste0(saveDir,"counts.txt") write.table(x=data.frame(features$annotation[,c("GeneID")],features$counts,stringsAsFactors=FALSE),file=outFile,quote=FALSE,sep="\t",row.names=FALSE) #################################################### #bash: nano saveDir,"counts.txt" #################################################### #################################################### ## Normalisation # TPM rate <-features$counts/features$annotation[,c("Length")] denom <- sum(rate)/1000000 tpm<-rate/denom head(tpm) #RPKM rate2 <-features$counts/(features$annotation[,c("Length")]/1000) denom2 <- sum(features$counts)/1000000 rpkm<-rate2/denom2 head(rpkm) # conversion RPKM to TPM tpmtest<-rpkm/(sum(rpkm)/1000000) head(tpmtest) # check sample differences I # heatmap sampleDists <- dist(t(tpm)) sampleDistMatrix <- as.matrix(sampleDists) pheatmap(sampleDistMatrix,clustering_distance_rows=sampleDists,clustering_distance_cols=sampleDists) colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) pheatmap(sampleDistMatrix,clustering_distance_rows=sampleDists,clustering_distance_cols=sampleDists,col=colors) # save plot pdf(file=paste0("heatmap.pdf")); pheatmap(sampleDistMatrix,clustering_distance_rows=sampleDists,clustering_distance_cols=sampleDists,col=colors) dev.off() # check sample differences II pairs(~tpm[,1]+tpm[,2]+tpm[,3]+tpm[,4]+tpm[,5]+tpm[,6]+tpm[,7]+tpm[,8],labels=colnames(features$counts)) # correlation cor(tpm[,1],tpm[,2]) panel.cor <- function(x, y, digits=2, prefix="", cex.cor){ usr <- par("usr"); on.exit(par(usr)) par(usr = c(0, 1, 0, 1)) r <- abs(cor(x, y)) txt <- format(c(r, 0.123456789), digits=digits)[1] txt <- paste(prefix, txt, sep="") if(missing(cex.cor)) cex <- 0.6/strwidth(txt) text(0.5, 0.5, txt, cex = cex) } pairs(~tpm[,1]+tpm[,2]+tpm[,3]+tpm[,4]+tpm[,5]+tpm[,6]+tpm[,7]+tpm[,8],lower.panel=panel.smooth, upper.panel=panel.cor,labels=colnames(features$counts)) #################################################### #################################################### ## Differential Expression Analysis # read in the reads countFile <- paste0(saveDir,"counts.txt") rawcountdata <- read.table(countFile, header=T, row.names=1) # take only the good samples countData<-rawcountdata[,c(1:3,5:7)] #################################################### # bash # set up the csv file (overview over the experiments) #################################################### csvFile<-paste0(saveDir,"Tni_experiments.csv") (sampleTable <- read.csv(csvFile,row.names=1)) names(countData)<-rownames(sampleTable) (group<-factor(sampleTable[,2])) # create edgeR object countsDGE<-DGEList(counts=as.matrix(countData),group=group) countsDGE$samples$group <- relevel(countsDGE$samples$group,ref="noninf") countsDGE colSums(countsDGE$counts) #normalizing by library size countsDGE <- calcNormFactors(countsDGE) countsDGE$samples plotMDS(countsDGE, method="bcv", col=as.numeric(countsDGE$samples$group)); legend("right", as.character(unique(countsDGE$samples$group)), col=1:3, pch=20) # estimate dispersion countsDGE_1 <- estimateCommonDisp(countsDGE, verbose=T) # For routine differential expresion analysis, we use empirical Bayes tagwise dispersions. Note that common dispersion needs to be estimated before estimating tagwise disp countsDGE_1 <- estimateTagwiseDisp(countsDGE_1) # plotBCV() plots the tagwise biological coefficient of variation (square root of dispersions) against log2-CPM. plotBCV(countsDGE_1) plotMeanVar(countsDGE_1, show.tagwise.vars=TRUE, NBline=TRUE,main="common") # set p-value parameter ALPHA=0.05 # test countsDGE_1ET <- exactTest(countsDGE_1, pair=c(1,2), dispersion="tagwise") countsDGE_1ET countsDGE_1ET_Dec <- decideTestsDGE(countsDGE_1ET, adjust.method="BH", p.value=ALPHA) summary(countsDGE_1ET_Dec) topTags(countsDGE_1ET, n=10) dim(topTags(countsDGE_1ET, n=Inf,p.value=ALPHA)) countsDGE_1tags <- rownames(countsDGE_1)[as.logical(countsDGE_1ET_Dec)] plotSmear(countsDGE_1ET, de.tags=countsDGE_1tags); abline(h = c(-2, 2), col = "blue") # and fit the model design <- model.matrix(~ 0 + countsDGE$samples$group) colnames(design) <- levels(countsDGE$samples$group) rownames(design) <- colnames(countsDGE) countsDGE_2 <- estimateGLMCommonDisp(countsDGE,design) countsDGE_2 <- estimateGLMTrendedDisp(countsDGE_2,design) #, method="power") # You can change method to "auto", "bin.spline", "power", "spline", "bin.loess".# The default is "auto" which chooses "bin.spline" when > 200 tags and "power" otherwise. countsDGE_2 <- estimateGLMTagwiseDisp(countsDGE_2,design) plotBCV(countsDGE_2) plotMeanVar(countsDGE_2, show.tagwise.vars=TRUE, NBline=TRUE,main="GLM") #Differential Expression # GLM countsDGE_2FIT <- glmFit(countsDGE_2, design) # compare (group 1 - group 2) to 0: # this is equivalent to comparing group 1 to group 2 countsDGE_2FITLRT <- glmLRT(countsDGE_2FIT,contrast=c(-1,1)) topTags(countsDGE_2FITLRT, n=10) dim(topTags(countsDGE_2FITLRT, n=Inf,p.value=ALPHA)) countsDGE_2FITLRT_Dec <- decideTestsDGE(countsDGE_2FITLRT, adjust.method="BH", p.value = ALPHA) summary(countsDGE_2FITLRT_Dec) countsDGE_2FITLRTtags <- rownames(countsDGE_2)[as.logical(countsDGE_2FITLRT_Dec)] plotSmear(countsDGE_2FITLRT, de.tags=countsDGE_2FITLRTtags); abline(h = c(-2, 2), col = "blue") # DESeq2 # put the countdata (with infos from above!) into a matrix and collect the column data countData_HostNoninf_vs_HostPACE<-as.matrix(countdata) (colData_HostNoninf_vs_HostPACE <- DataFrame(sampleTable)) # create DESeq object dds_HostNoninf_vs_HostPACE <- DESeqDataSetFromMatrix(countData=countData_HostNoninf_vs_HostPACE,colData=colData_HostNoninf_vs_HostPACE,design= ~ treat) # set reference level dds_HostNoninf_vs_HostPACE$treat<-relevel(dds_HostNoninf_vs_HostPACE$treat, ref=reffi) dds_HostNoninf_vs_HostPACE # make DESeq analysis dds_HostNoninf_vs_HostPACE_analysed <- DESeq(dds_HostNoninf_vs_HostPACE) # get the results res_dds_HostNoninf_vs_HostPACE_analysed <- results(dds_HostNoninf_vs_HostPACE_analysed) resOrdered_dds_HostNoninf_vs_HostPACE_analysed <- res_dds_HostNoninf_vs_HostPACE_analysed[order(res_dds_HostNoninf_vs_HostPACE_analysed$padj),] # get significant and non significant genes and write them to files sig_resOrdered_dds_HostNoninf_vs_HostPACE_analysed <- subset(resOrdered_dds_HostNoninf_vs_HostPACE_analysed, padj keep all) keep <- rowSums(cpm(countsDGE)>=CPM_THRESHOLD) >= SAMPLE_THRESHOLD countsDGE <- countsDGE[keep,] dim(countsDGE) #reset library sizes after filtering #realVec<-c(10193886,10780609,8567800,5904961,5608920,4050367) # realVec<-c(10341831,8426358,9010495,3730722,3571162,3051243) # plusVec<-c(0,0,0,12800520,10733629,7812879) #24 virus reads chen paper #plusVec<-c(0,0,0,9366234,9394251,21587056) #48 virus reads chen paper #countsDGE$samples$lib.size <- realVec countsDGE$samples$lib.size <- colSums(countsDGE$counts) #countsDGE$samples$lib.size <- colSums(countsDGE$counts)+plusVec colSums(countsDGE$counts) colSums(countsDGE.full$counts) plotMDS(countsDGE, method="bcv", col=as.numeric(countsDGE$samples$group)); legend("bottomleft", as.character(unique(countsDGE$samples$group)), col=1:3, pch=20) # estimate dispersion countsDGE_1 <- estimateCommonDisp(countsDGE, verbose=T) # For routine differential expresion analysis, we use empirical Bayes tagwise dispersions. Note that common dispersion needs to be estimated before estimating tagwise disp countsDGE_1 <- estimateTagwiseDisp(countsDGE_1) # plotBCV() plots the tagwise biological coefficient of variation (square root of dispersions) against log2-CPM. plotBCV(countsDGE_1) plotMeanVar(countsDGE_1, show.tagwise.vars=TRUE, NBline=TRUE,main="common") countsDGE_1ET <- exactTest(countsDGE_1, pair=c(1,2), dispersion="tagwise") topTags(countsDGE_1ET, n=10) dim(topTags(countsDGE_1ET, n=Inf,p.value=ALPHA)) countsDGE_1ET_Dec <- decideTestsDGE(countsDGE_1ET, adjust.method="BH", p.value=ALPHA) summary(countsDGE_1ET_Dec) countsDGE_1tags <- rownames(countsDGE_1)[as.logical(countsDGE_1ET_Dec)] plotSmear(countsDGE_1ET, de.tags=countsDGE_1tags); abline(h = c(-2, 2), col = "blue") out <- topTags(countsDGE_1ET, n=Inf,p.value=ALPHA) write.table(out$table, file=outFile1, row.names = TRUE, col.names = NA, quote=FALSE, sep = "\t") out2<-out$table["logFC"]>0 outup<-out[out2,] out2<-out$table["logFC"]<0 outdown<-out[out2,] write.table(outup$table, file=outFile1u, row.names = TRUE, col.names = NA, quote=FALSE, sep = "\t") write.table(outdown$table, file=outFile1d, row.names = TRUE, col.names = NA, quote=FALSE, sep = "\t") # and fit the model design <- model.matrix(~ 0 + countsDGE$samples$group) colnames(design) <- levels(countsDGE$samples$group) rownames(design) <- colnames(countsDGE) countsDGE_2 <- estimateGLMCommonDisp(countsDGE,design) countsDGE_2 <- estimateGLMTrendedDisp(countsDGE_2,design) #, method="power") # You can change method to "auto", "bin.spline", "power", "spline", "bin.loess".# The default is "auto" which chooses "bin.spline" when > 200 tags and "power" otherwise. countsDGE_2 <- estimateGLMTagwiseDisp(countsDGE_2,design) plotBCV(countsDGE_2) plotMeanVar(countsDGE_2, show.tagwise.vars=TRUE, NBline=TRUE,main="GLM") #Differential Expression # GLM countsDGE_2FIT <- glmFit(countsDGE_2, design) # compare (group 1 - group 2) to 0: # this is equivalent to comparing group 1 to group 2 countsDGE_2FITLRT <- glmLRT(countsDGE_2FIT,contrast=c(-1,1)) topTags(countsDGE_2FITLRT, n=10) dim(topTags(countsDGE_2FITLRT, n=Inf,p.value=ALPHA)) countsDGE_2FITLRT_Dec <- decideTestsDGE(countsDGE_2FITLRT, adjust.method="BH", p.value = ALPHA) summary(countsDGE_2FITLRT_Dec) countsDGE_2FITLRTtags <- rownames(countsDGE_2)[as.logical(countsDGE_2FITLRT_Dec)] plotSmear(countsDGE_2FITLRT, de.tags=countsDGE_2FITLRTtags); abline(h = c(-2, 2), col = "blue") out <- topTags(countsDGE_2FITLRT, n=Inf,p.value=ALPHA) write.table(out$table, file=outFile2, row.names = TRUE, col.names = NA, quote=FALSE, sep = "\t") out2<-out$table["logFC"]>0 outup<-out[out2,] out2<-out$table["logFC"]<0 outdown<-out[out2,] write.table(outup$table, file=outFile2u, row.names = TRUE, col.names = NA, quote=FALSE, sep = "\t") write.table(outdown$table, file=outFile2d, row.names = TRUE, col.names = NA, quote=FALSE, sep = "\t") ################################################################################################################ ################################################################################################################ ################################################################################################################ # DESeq2 # put the countdata (with infos from above!) into a matrix and collect the column data countData_HostNoninf_vs_HostPACE<-as.matrix(countdata) (colData_HostNoninf_vs_HostPACE <- DataFrame(sampleTable)) # create DESeq object dds_HostNoninf_vs_HostPACE <- DESeqDataSetFromMatrix(countData=countData_HostNoninf_vs_HostPACE,colData=colData_HostNoninf_vs_HostPACE,design= ~ treat) # set reference level dds_HostNoninf_vs_HostPACE$treat<-relevel(dds_HostNoninf_vs_HostPACE$treat, ref=reffi) dds_HostNoninf_vs_HostPACE # make DESeq analysis dds_HostNoninf_vs_HostPACE_analysed <- DESeq(dds_HostNoninf_vs_HostPACE) # get the results res_dds_HostNoninf_vs_HostPACE_analysed <- results(dds_HostNoninf_vs_HostPACE_analysed) resOrdered_dds_HostNoninf_vs_HostPACE_analysed <- res_dds_HostNoninf_vs_HostPACE_analysed[order(res_dds_HostNoninf_vs_HostPACE_analysed$padj),] # get significant and non significant genes and write them to files sig_resOrdered_dds_HostNoninf_vs_HostPACE_analysed <- subset(resOrdered_dds_HostNoninf_vs_HostPACE_analysed, padj0) nonsig_resOrdered_dds_HostNoninf_vs_HostPACE_analysed <- subset(resOrdered_dds_HostNoninf_vs_HostPACE_analysed, padj>=ALPHA) write.table(as.data.frame(sig_resOrdered_dds_HostNoninf_vs_HostPACE_analysed),file=outFile3,sep="\t",quote=F) write.table(as.data.frame(nonsig_resOrdered_dds_HostNoninf_vs_HostPACE_analysed),file=outFile4,sep="\t",quote=F) write.table(as.data.frame(up_sig_resOrdered_dds_HostNoninf_vs_HostPACE_analysed),file=outFile3u,sep="\t",quote=F) write.table(as.data.frame(down_sig_resOrdered_dds_HostNoninf_vs_HostPACE_analysed),file=outFile3d,sep="\t",quote=F) sig_resOrdered_dds_HostNoninf_vs_HostPACE_analysed nonsig_resOrdered_dds_HostNoninf_vs_HostPACE_analysed # print summary and plotMA the results summary(resOrdered_dds_HostNoninf_vs_HostPACE_analysed,alpha=ALPHA) #pdf(file='/project/explusi/phpeters/GEA/LFCRegulatedTni_24.pdf'); #png(filename='/project/explusi/phpeters/GEA/LFCRegulatedTni.png'); plotMA(resOrdered_dds_HostNoninf_vs_HostPACE_analysed,main="Regulated genes of Trichoplusia ni\n24h vs. 24h",alpha=ALPHA,ylim=c(-2,2)) #dev.off() #display the changes ## ----heatmap, dev="pdf", fig.width=5, fig.height=7----------------------- library("pheatmap") select <- order(rowMeans(counts(dds,normalized=TRUE)), decreasing=TRUE)[1:20] nt <- normTransform(dds) # defaults to log2(x+1) log2.norm.counts <- assay(nt)[select,] df <- as.data.frame(colData(dds)[,c("condition","type")]) pheatmap(log2.norm.counts, cluster_rows=FALSE, show_rownames=FALSE, cluster_cols=FALSE, annotation_col=df) pheatmap(assay(rld)[select,], cluster_rows=FALSE, show_rownames=FALSE, cluster_cols=FALSE, annotation_col=df) pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=FALSE, cluster_cols=FALSE, annotation_col=df) ### testing for specific logFC ## ----ddsNoPrior---------------------------------------------------------- ddsNoPrior <- DESeq(dds, betaPrior=FALSE) ## ----lfcThresh, fig.show="asis", fig.cap='MA-plots of tests of log2 fold change with respect to a threshold value. Going left to right across rows, the tests are for \\Robject{altHypothesis = "greaterAbs"}, \\Robject{"lessAbs"}, \\Robject{"greater"}, and \\Robject{"less"}.\\label{figure/lfcThresh-1}'---- par(mfrow=c(2,2),mar=c(2,2,1,1)) yl <- c(-2.5,2.5) resGA <- results(dds, lfcThreshold=.5, altHypothesis="greaterAbs") resLA <- results(ddsNoPrior, lfcThreshold=.5, altHypothesis="lessAbs") resG <- results(dds, lfcThreshold=.5, altHypothesis="greater") resL <- results(dds, lfcThreshold=.5, altHypothesis="less") plotMA(resGA, ylim=yl) abline(h=c(-.5,.5),col="dodgerblue",lwd=2) plotMA(resLA, ylim=yl) abline(h=c(-.5,.5),col="dodgerblue",lwd=2) plotMA(resG, ylim=yl) abline(h=.5,col="dodgerblue",lwd=2) plotMA(resL, ylim=yl) abline(h=-.5,col="dodgerblue",lwd=2) ## ----mcols--------------------------------------------------------------- mcols(dds,use.names=TRUE)[1:4,1:4] # here using substr() only for display purposes substr(names(mcols(dds)),1,10) mcols(mcols(dds), use.names=TRUE)[1:4,]