########################################
## bash
# set scrolling of terminal to unlimited

########################################

source("http://www.bioconductor.org/biocLite.R")
biocLite("pheatmap")


library("edgeR")
library("DESeq2")
library("Rsubread")
library("GenomicAlignments")
library("BiocParallel")
library("RColorBrewer")


alignFile<-"/seq/common/data/Course_BioTopIVB/bamtest/Tni_noninf_01.bam"
annotFile<-"/seq/common/data/Course_BioTopIVB/bamtest/Tni_transcripts.gtf"
outfile="/home/phpeters/biotopCourse/counts.txt"
file.exists(alignFile)
file.exists(annotFile)

########################################
## bash
# check number of CPUs
less /proc/cpuinfo
# check memory
less /proc/meminfo

# check the gtf file (transcripts, gene_id)
less /seq/common/data/Course_BioTopIVB/bamtest/Tni_transcripts.gtf

########################################


# featureCount
# rtfm
# https://www.rdocumentation.org/packages/Rsubread/versions/1.22.2/topics/featureCounts

?featureCounts

features<-featureCounts(alignFile,annot.ext=annotFile,isGTFAnnotationFile = TRUE,GTF.featureType = "transcript", GTF.attrType = "gene_id", isPairedEnd=FALSE,strandSpecific=1,nthreads=16);
# Running time : 0.11 minutes

## get some information
summary(features)
head(features$counts,3)
colnames(features$counts)<-"noninf1"
head(features$counts,3)

# do it for all
workdir<-"/seq/common/data/Course_BioTopIVB/bamtest/"
alignFiles<-c(paste0(workdir,"Tni_noninf_01.bam"),paste0(workdir,"Tni_inf_01.bam"))
file.exists(alignFiles)

features<-featureCounts(alignFiles,annot.ext=annotFile,isGTFAnnotationFile = TRUE,GTF.featureType = "transcript", GTF.attrType = "gene_id", isPairedEnd=FALSE,strandSpecific=1,nthreads=16);

colnames(features$counts)<-c("noninf1","noninf2","noninf3","noninf4","inf1","inf2","inf3","inf4")

summary(features$counts)
head(features$counts)

outfile="/home/phpeters/teaching/biotop_course/mapping/counts.txt"
write.table(x=data.frame(bloppo$annotation[,c("GeneID","Length")],bloppo$counts,stringsAsFactors=FALSE),file=outfile,quote=FALSE,sep="\t",row.names=FALSE)


# TPM calculation

rate <-features$counts/features$annotation$Length
denom <- sum(rate)
tpm<-rate / denom *1e6
head(tpm)


sampleDists <- dist(t(onlytpms))
sampleDistMatrix <- as.matrix(sampleDists)
pheatmap(sampleDistMatrix,clustering_distance_rows=sampleDists,clustering_distance_cols=sampleDists,col=colors)

# seperate inf: low read counts
# misclustered non-inf: high read count, perhaps something fishy here
# => both have to be checked with more 


# corellation
panel.cor <- function(x, y, digits=2, prefix="", cex.cor){
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits=digits)[1]
txt <- paste(prefix, txt, sep="")
if(missing(cex.cor)) cex <- 0.6/strwidth(txt)
#text(0.5, 0.5, txt, cex = cex * r)
text(0.5, 0.5, txt, cex = cex)
}
pairs(~onlytpms$noninf1+onlytpms$noninf2+onlytpms$noninf3+onlytpms$noninf4+onlytpms$inf1+onlytpms$inf2+onlytpms$inf3+onlytpms$inf4+onlytpms$noninf4, lower.panel=panel.smooth, upper.panel=panel.cor,labels=colnames(onlytpms))


#####
# tests

countFile<-"/home/phpeters/Desktop/code2.txt"
countdata <- read.table(countFile, header=T, row.names=1)

csvFile<-"/home/phpeters/biotopCourse/counts2.csv"
(sampleTable <- read.csv(csvFile,row.names=1))


names(countdata)<-rownames(sampleTable)
group<-factor(sampleTable[,2])
sampleTable[,2]
#[1] noninf inf   
#Levels: inf noninf
edgeRCountDGE<-DGEList(counts=as.matrix(countdata),group=group)
edgeRCountDGE$samples$group <- relevel(edgeRCountDGE$samples$group,ref="noninf")
edgeRCountDGE

edgeRCountDGE$samples$lib.size <- colSums(edgeRCountDGE$counts)
#edgeRCountDGE$samples$lib.size <- colSums(edgeRCountDGE$counts)+plusVec
colSums(edgeRCountDGE$counts)