
# To work with latest R type R-3.0.3 in the terminal of R-3.1.2
# fast R for matrix multiplication/PCA: type  /usr/bin/R


##########################                    get gene count data

# Parse the tab-separated gene count table (one gene per row). check.names is
# switched off so the column headers stay exactly as written in the file.
RNA <- read.table(
  "/virdir/Backup/RP3_data/run1_count_files/genecounts/combined_gene_count_run_1.txt",
  header = TRUE,
  sep = "\t",
  check.names = FALSE
)

# Store the parsed table as an R object and read it straight back
# (presumably so a later session can start from load() and skip the parsing).
save(
  RNA,
  file = "/virdir/Backup/RP3_data/run1_count_files/genecounts/combined_gene_count_run_1_R_object.RData"
)
load("/virdir/Backup/RP3_data/run1_count_files/genecounts/combined_gene_count_run_1_R_object.RData")

##     get IDs
# getView.R by Maarten van Iterson; see https://git.lumc.nl/rp3/rscripts for
# the latest version.
source("/home/rjansen/R  scripts/getView.R")
freeze1 <- getView(design = "identifiers", view = "rnaseqFreeze1") # 2116 records

##  take Freeze 1 and replace with BIOS ID

# Keep only the columns whose header is listed as a Freeze 1 run id ...
RNA <- RNA[, is.element(colnames(RNA), freeze1$rnaseq_run_id)]
# ... and relabel each remaining column with the uuid recorded for its run id.
colnames(RNA) <- freeze1$uuid[
  match(colnames(RNA), freeze1$rnaseq_run_id)
]

##  remove genes with standard deviation = 0

# Keep only the genes whose counts vary across samples.
# The standard deviation is computed row by row with apply(): calling sd() on a
# whole matrix (as sd(t(RNA)) does) returns a single pooled value in R >= 3.0,
# which would make which() return 1 and silently keep only the first gene.
nz <- which(apply(as.matrix(RNA), 1, sd) != 0) # which() also skips NA sds
# Number of genes kept versus the total (shown when run interactively).
length(nz); nrow(RNA)
# drop = FALSE keeps the table two-dimensional even if a single gene remains.
RNA <- RNA[nz, , drop = FALSE]

##   TMM normalization per subject

library(edgeR)

# Wrap the counts in a DGEList and let edgeR estimate one normalization factor
# per sample.
D <- DGEList(counts = RNA)
d <- calcNormFactors(D)

# Per-sample scaling factor: library size times normalization factor, divided
# by the geometric mean of that product over all samples.
scalar <- d$samples$lib.size * d$samples$norm.factors
scalar <- scalar / exp(mean(log(scalar)))

# Repeat the per-sample factors down every row so the matrix lines up with the
# count matrix, then divide element by element.
scal.mat <- matrix(
  scalar,
  nrow = nrow(d$counts),
  ncol = length(scalar),
  byrow = TRUE
)
RNAs <- d$counts / scal.mat

# Persist both the raw Freeze 1 counts and the scaled counts.
save(
  RNA,
  file = "/virdir/Backup/RP3_data/run1_count_files/genecounts/gene_count_freeze1_R_object.RData"
)
save(
  RNAs,
  file = "/virdir/Backup/RP3_data/run1_count_files/genecounts/gene_count_freeze1_TMM_normalized_R_object.RData"
)

##########################                  get Exon count data

# Restore the previously saved exon count object.
# NOTE(review): the statements below use `RNA`, so this file presumably
# defines an object with that name -- confirm.
load("/virdir/Backup/pthoen/run1_exon_count_analysis/combined_exon_count_run_1_R_object.R")

##     get IDs
source("/home/rjansen/R  scripts/getView.R")
freeze1 <- getView(design = "identifiers", view = "rnaseqFreeze1") # 2116 records

##  take Freeze 1 and replace with BIOS ID

# Turn every "." in the column headers into "-" so the headers can be compared
# with freeze1$rnaseq_run_id. gsub() is vectorised, so no element-wise loop is
# needed (and nothing breaks when there are no column names at all).
idtemp <- gsub(".", "-", colnames(RNA), fixed = TRUE)

# Membership mask, computed once and reused for both the columns and the ids.
keep <- idtemp %in% freeze1$rnaseq_run_id

# drop = FALSE keeps RNA two-dimensional even when only one column matches;
# otherwise the colnames() assignment below would fail on a plain vector.
RNA <- RNA[, keep, drop = FALSE]
idtemp <- idtemp[keep]

# Relabel each remaining column with the uuid recorded for its run id.
colnames(RNA) <- freeze1$uuid[match(idtemp, freeze1$rnaseq_run_id)]

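# ##  remove exons with standard deviation = 0 (none in exon counts)
# 
# # sd() is applied per row: on a whole matrix, sd() in R >= 3.0 returns one pooled value
# nz=which(apply(as.matrix(RNA),1,sd)!=0);length(nz);nrow(RNA)
# RNA=RNA[nz,]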

##   TMM normalization per subject

library(edgeR)

# Wrap the counts in a DGEList and let edgeR estimate one normalization factor
# per sample.
D <- DGEList(counts = RNA)
d <- calcNormFactors(D)

# Per-sample scaling factor: library size times normalization factor, divided
# by the geometric mean of that product over all samples.
scalar <- d$samples$lib.size * d$samples$norm.factors
scalar <- scalar / exp(mean(log(scalar)))

# Repeat the per-sample factors down every row so the matrix lines up with the
# count matrix, then divide element by element.
scal.mat <- matrix(
  scalar,
  nrow = nrow(d$counts),
  ncol = length(scalar),
  byrow = TRUE
)
RNAs <- d$counts / scal.mat

# Persist both the raw Freeze 1 counts and the scaled counts.
save(
  RNA,
  file = "/virdir/Backup/RP3_data/run1_count_files/exoncounts/exon_count_freeze1_R_object.RData"
)
save(
  RNAs,
  file = "/virdir/Backup/RP3_data/run1_count_files/exoncounts/exon_count_freeze1_TMM_normalized_R_object.RData"
)

##########################           get transcript counts

# Parse the tab-separated transcript count table. as.is = TRUE leaves character
# columns as character, and check.names = FALSE keeps the headers untouched.
RNA <- read.table(
  "/virdir/Backup/RP3_data/run1_count_files/transcriptcounts/combined_transcript_count_run_1.txt",
  header = TRUE,
  sep = "\t",
  as.is = TRUE,
  check.names = FALSE
)

# Store the parsed table as an R object and read it straight back
# (presumably so a later session can start from load() and skip the parsing).
save(
  RNA,
  file = "/virdir/Backup/RP3_data/run1_count_files/transcriptcounts/combined_transcript_count_run_1.RData"
)
load("/virdir/Backup/RP3_data/run1_count_files/transcriptcounts/combined_transcript_count_run_1.RData")

##     get IDs
source("/home/rjansen/R  scripts/getView.R")
freeze1 <- getView(design = "identifiers", view = "rnaseqFreeze1") # 2116 records

##  take Freeze 1 and replace with BIOS ID

# Use the first column as row names, then remove it from the data columns.
row.names(RNA) <- RNA[[1]]
RNA <- RNA[, -1]

# Keep only the columns whose header is listed as a Freeze 1 run id ...
RNA <- RNA[, is.element(colnames(RNA), freeze1$rnaseq_run_id)]
# ... and relabel each remaining column with the uuid recorded for its run id.
colnames(RNA) <- freeze1$uuid[
  match(colnames(RNA), freeze1$rnaseq_run_id)
]

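# ##  remove transcripts with standard deviation = 0 (none)
# 
# # sd() is applied per row: on a whole matrix, sd() in R >= 3.0 returns one pooled value
# nz=which(apply(as.matrix(RNA),1,sd)!=0);length(nz);nrow(RNA)
# RNA=RNA[nz,]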

##   TMM normalization per subject

library(edgeR)

# Wrap the counts in a DGEList and let edgeR estimate one normalization factor
# per sample.
D <- DGEList(counts = RNA)
d <- calcNormFactors(D)

# Per-sample scaling factor: library size times normalization factor, divided
# by the geometric mean of that product over all samples.
scalar <- d$samples$lib.size * d$samples$norm.factors
scalar <- scalar / exp(mean(log(scalar)))

# Repeat the per-sample factors down every row so the matrix lines up with the
# count matrix, then divide element by element.
scal.mat <- matrix(
  scalar,
  nrow = nrow(d$counts),
  ncol = length(scalar),
  byrow = TRUE
)
RNAs <- d$counts / scal.mat

# Persist both the raw Freeze 1 counts and the scaled counts.
save(
  RNA,
  file = "/virdir/Backup/RP3_data/run1_count_files/transcriptcounts/transcript_count_freeze1_R_object.RData"
)
save(
  RNAs,
  file = "/virdir/Backup/RP3_data/run1_count_files/transcriptcounts/transcript_count_freeze1_TMM_normalized_R_object.RData"
)

##########################   get Phenotypes and technical covariates

  
source("/home/rjansen/R  scripts/getView.R")

# RNA-seq sample sheets: fetch, keep as an R object, and export as a
# tab-separated text file (unquoted, with a header row, without row names).
S <- getView(design = "samplesheets", view = "rnaseq") # 4039 records
save(S, file = "/virdir/Backup/RP3_data/Phenotypes/rna_seq_sample_sheets.RData")
write.table(
  S,
  file = "/virdir/Backup/RP3_data/Phenotypes/rna_seq_sample_sheets.csv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)

# Phenotypes: same treatment as the sample sheets.
P <- getView(view = "allPhenotypes", design = "phenotypes") # 4564
save(P, file = "/virdir/Backup/RP3_data/Phenotypes/BIOS_Phenotypes.RData")
write.table(
  P,
  file = "/virdir/Backup/RP3_data/Phenotypes/BIOS_Phenotypes.csv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)

# Identifier table. NOTE: the name `F` shadows R's shorthand for FALSE; it is
# kept because the object is saved under this name below.
F <- getView(view = "getIds", design = "identifiers") # 6037 records

# get lane and index from run id
# Column 5 is used as the run id; it is split on "-" once for all records, and
# the 2nd and 3rd pieces are taken as lane and index (NA when a piece is
# missing). as.character() guards against the column being a factor.
rid <- as.character(F[, 5])
parts <- strsplit(rid, "-")
lane <- vapply(parts, function(p) p[2], character(1), USE.NAMES = FALSE)
index <- vapply(parts, function(p) p[3], character(1), USE.NAMES = FALSE)

# Append the two derived columns to the identifier table.
F <- as.data.frame(cbind(F, lane, index))

# Keep as an R object and export as a tab-separated text file.
save(F, file = "/virdir/Backup/RP3_data/Phenotypes/BIOS_IDs.RData")
write.table(
  F,
  file = "/virdir/Backup/RP3_data/Phenotypes/BIOS_IDs.csv",
  col.names = TRUE,
  row.names = FALSE,
  quote = FALSE,
  sep = "\t"
)
  
  