######################################## ## bash # set scrolling of terminal to unlimited ######################################## source("http://www.bioconductor.org/biocLite.R") biocLite("pheatmap") library("edgeR") library("DESeq2") library("Rsubread") library("GenomicAlignments") library("BiocParallel") library("RColorBrewer") alignFile<-"/seq/common/data/Course_BioTopIVB/bamtest/Tni_noninf_01.bam" annotFile<-"/seq/common/data/Course_BioTopIVB/bamtest/Tni_transcripts.gtf" outfile="/home/phpeters/biotopCourse/counts.txt" file.exists(alignFile) file.exists(annotFile) ######################################## ## bash # check number of CPUs less /proc/cpuinfo # check memory less /proc/meminfo # check the gtf file (transcripts, gene_id) less /seq/common/data/Course_BioTopIVB/bamtest/Tni_transcripts.gtf ######################################## # featureCount # rtfm # https://www.rdocumentation.org/packages/Rsubread/versions/1.22.2/topics/featureCounts ?featureCounts features<-featureCounts(alignFile,annot.ext=annotFile,isGTFAnnotationFile = TRUE,GTF.featureType = "transcript", GTF.attrType = "gene_id", isPairedEnd=FALSE,strandSpecific=1,nthreads=16); # Running time : 0.11 minutes ## get some information summary(features) head(features$counts,3) colnames(features$counts)<-"noninf1" head(features$counts,3) # do it for all workdir<-"/seq/common/data/Course_BioTopIVB/bamtest/" alignFiles<-c(paste0(workdir,"Tni_noninf_01.bam"),paste0(workdir,"Tni_inf_01.bam")) file.exists(alignFiles) features<-featureCounts(alignFiles,annot.ext=annotFile,isGTFAnnotationFile = TRUE,GTF.featureType = "transcript", GTF.attrType = "gene_id", isPairedEnd=FALSE,strandSpecific=1,nthreads=16); colnames(features$counts)<-c("noninf1","noninf2","noninf3","noninf4","inf1","inf2","inf3","inf4") summary(features$counts) head(features$counts) outfile="/home/phpeters/teaching/biotop_course/mapping/counts.txt" write.table(x=data.frame(bloppo$annotation[,c("GeneID","Length")],bloppo$counts,stringsAsFactors=FALSE),file=outfile,quote=FALSE,sep="\t",row.names=FALSE) # TPM calculation rate <-features$counts/features$annotation$Length denom <- sum(rate) tpm<-rate / denom *1e6 head(tpm) sampleDists <- dist(t(onlytpms)) sampleDistMatrix <- as.matrix(sampleDists) pheatmap(sampleDistMatrix,clustering_distance_rows=sampleDists,clustering_distance_cols=sampleDists,col=colors) # seperate inf: low read counts # misclustered non-inf: high read count, perhaps something fishy here # => both have to be checked with more # corellation panel.cor <- function(x, y, digits=2, prefix="", cex.cor){ usr <- par("usr"); on.exit(par(usr)) par(usr = c(0, 1, 0, 1)) r <- abs(cor(x, y)) txt <- format(c(r, 0.123456789), digits=digits)[1] txt <- paste(prefix, txt, sep="") if(missing(cex.cor)) cex <- 0.6/strwidth(txt) #text(0.5, 0.5, txt, cex = cex * r) text(0.5, 0.5, txt, cex = cex) } pairs(~onlytpms$noninf1+onlytpms$noninf2+onlytpms$noninf3+onlytpms$noninf4+onlytpms$inf1+onlytpms$inf2+onlytpms$inf3+onlytpms$inf4+onlytpms$noninf4, lower.panel=panel.smooth, upper.panel=panel.cor,labels=colnames(onlytpms)) ##### # tests countFile<-"/home/phpeters/Desktop/code2.txt" countdata <- read.table(countFile, header=T, row.names=1) csvFile<-"/home/phpeters/biotopCourse/counts2.csv" (sampleTable <- read.csv(csvFile,row.names=1)) names(countdata)<-rownames(sampleTable) group<-factor(sampleTable[,2]) sampleTable[,2] #[1] noninf inf #Levels: inf noninf edgeRCountDGE<-DGEList(counts=as.matrix(countdata),group=group) edgeRCountDGE$samples$group <- relevel(edgeRCountDGE$samples$group,ref="noninf") edgeRCountDGE edgeRCountDGE$samples$lib.size <- colSums(edgeRCountDGE$counts) #edgeRCountDGE$samples$lib.size <- colSums(edgeRCountDGE$counts)+plusVec colSums(edgeRCountDGE$counts)