##################### Differential Analysis Expression #########################
################################################################################

### ---------------------Packages
#common packages
packages <- c("optparse", # to read arguments from a command line
              "ggplot2", # nice plots
              "gridExtra", # multiple plots on one page
              "grid", # multiple plots on one page for MAplots
              "RColorBrewer", # color in plots
              "mixOmics", # for heatmap plot
              "reshape2",  # reshape grouped data
              "knitr", # create a pdf file with images
              "markdown", # create a pdf file with images
              "RJSONIO") # work with JSON file

for(package in packages){
  # if package is installed locally, load
  if (!(package %in% rownames(installed.packages()))) {
    install.packages(package, repos="http://cran.univ-paris1.fr")
  }
  do.call('library', list(package))
} 


#Bioconductor package
bioPackages <- c("edgeR", # RNAseq data normalization
                 "HTSFilter") # performs an independant filtering

for(package in bioPackages){
  # if package is installed locally, load
  if (!(package %in% rownames(installed.packages()))) {
    source("http://www.bioconductor.org/biocLite.R")
    biocLite(package)
  }
  do.call('library', list(package))
} 


### ---------------------Parameters

#############
###### Parameter initialization when the script is launched from command line
############
option_list = list(
  make_option(c("-f", "--file"), type = "character", default = NULL, 
              help = "tabulated raw count matrix with library name as header
                      (e.g. #gene_id     lib1    lib2...)", metavar = "character"),
  make_option(c("-n", "--norm"), type = "character", default = NULL, 
              help = "file with normalized factors with library name
              (e.g. sample.name	 lib.size	 norm.factors). This file is obtained 
              with script 'Normalization.R'", 
              metavar = "character"),
  make_option(c("-o", "--out"), type = "character", default = NULL, 
              help = "folder path where results are stored", 
              metavar = "character"),
  make_option(c("--pool1"), type = "character", default = NULL,
              help = "library name in pool 1 separeted by ','", 
              metavar = "character"),
  make_option(c("--pool2"), type = "character", default = NULL,
              help = "library name in pool 2 separeted by ','", 
              metavar = "character"),
  make_option(c("--filter"), type = "logical", default = TRUE,
              help = "if TRUE low expressed genes are removed [default=%default]", 
              metavar = "character"),
  make_option(c("--alpha"), type = "double", default = 0.05,
              help = "significance level of the tests (i.e. acceptable rate of 
              false-positive in the list of differentially expressed genes) 
              [default=%default]", 
              metavar = "character"),
  make_option(c("--correct"), type = "character", default = "BH",
              help = "method used to adjust p-values for multiple testing 
              ('BH', 'BY' or 'fdr') [default=%default]", metavar = "character"),
  make_option(c("--MAplots"), type = "logical", default = FALSE,
              help = "if TRUE all MA plots are saved [default=%default]", 
              metavar = "character")
); 

opt_parser = OptionParser(usage="Imports data, converts them into a DGEList object 
                                  (edgeR), does tests to find differentially
                                  expressed genes and produces diagnostic plots.",
                          option_list = option_list);
opt = parse_args(opt_parser);


##############
#### Parameter initialization when the script is launched from R or Rstudio
##file: tabulated raw count matrix with library name as header
#(e.g. #gene_id     lib1    lib2...)
##norm: file with normalized factors with library name
#(e.g. sample.name	 lib.size	 norm.factors). This file is obtained with script 
#'Normalization.R'
##out: folder path where results are stored
##pool1: library name in pool 1 separeted by ', '
##pool2: library name in pool 2 separeted by ', '
##filter: if TRUE low expressed genes are removed
##alpha: significance level of the tests (i.e. acceptable rate of 
#false-positive in the list of differentially expressed genes)
##correct: method used to adjust p-values for multiple testing 
#('BH', 'BY' or 'fdr')
##MAplots: if TRUE all MA plots are saved 
##############

#opt=list()
#opt$file <- XXX # for example "C:/My documents/raw_count.csv"
#opt$norm <- XXX # for example "C:/My documents/RLE_info.csv"
#opt$out <- XXX # for example "C:/My documents/results"
#opt$pool1 <- XXX # for example "lib1, lib2"
#opt$pool2 <- XXX # for example  "lib3, lib4"
#opt$alpha <- 0.05
#opt$filter <- TRUE
#opt$correct <- "BH"
#opt$MAplots <- FALSE


#Directory, count table file name or conditions are needed to continue
if (is.null(opt$file) || is.null(opt$norm) || is.null(opt$out) || 
    is.null(opt$pool1) || is.null(opt$pool2)) {
  print_help(opt_parser)
  stop("count table file name, norm, output directory or conditions missing.\n", 
       call. = FALSE)
}

#multiple testing correction
if (!(opt$correct %in% c("BH", "bonferroni"))) {
  print_help(opt_parser)
  stop("you can't use this correction to adjust p-values.\n", call. = FALSE)
}

#Create output folder if it doesn't exist
ifelse(!dir.exists(opt$out), dir.create(opt$out), FALSE)

#Stop the script if files doesn't exist
if (!file.exists(normalizePath(opt$file)) | 
    !file.exists(normalizePath(file.path(opt$norm), 
                               mustWork = FALSE))) {
  print_help(opt_parser)
  stop("count table file or file with normalization factor doesn't exist.\n", 
       call. = FALSE)
}


#####################################################
### ---------------------Input data and preprocessing
#####################################################
#Raw and normalized count
PC_before <- read.table(normalizePath(opt$file), header = TRUE, 
                        row.names = 1, comment.char="")

norm_fact <- read.table(normalizePath(file.path(opt$norm), 
                                      mustWork = FALSE), header = TRUE)

#Conditions
pool1 <- data.frame(lib_name = unlist(strsplit(opt$pool1, ',')),
                    conditions = rep(1, length(opt$pool1)))
pool2 <- data.frame(lib_name = unlist(strsplit(opt$pool2, ',')),
                    conditions = rep(2, length(opt$pool2)))

if (nrow(pool1) < 1 || nrow(pool2) < 1) {
  opt$filter <- FALSE
  opt$MAplots <- FALSE
}

sample_info <- rbind(pool1, pool2)
PC_before <- PC_before[, colnames(PC_before) %in% as.character(sample_info$lib_name)]
PC_before <- PC_before[, match(colnames(PC_before), sample_info$lib_name)]
norm_fact <- norm_fact[as.character(norm_fact$sample.name) %in% 
                         as.character(sample_info$lib_name), ]

#DGEList object
dge_before <- DGEList(PC_before, remove.zeros = TRUE, 
                      group = sample_info$conditions)
dge_after <- DGEList(PC_before, remove.zeros = TRUE, 
                     group = sample_info$conditions,
                     norm.factors = norm_fact$norm.factors)

#Pseudo-counts
PC_before_log <- log2(dge_before$counts + 1)
PC_after_log <- log2(cpm(dge_after) + 1)


##############################################
### ---------------------Plots : normalization
##############################################

#save images in a pdf file
pdf(file.path(opt$out, "de_images.pdf"), title = "Plots from differential analysis",
    width = 12, height = 10)

PC_before_log_long <- melt(PC_before_log)
colnames(PC_before_log_long) <- c("name", "lib", "log2CPM")
PC_before_log_long <- merge(PC_before_log_long, sample_info, by.x = "lib", 
                            by.y = "lib_name")


PC_after_log_long <- melt(PC_after_log)
colnames(PC_after_log_long) <- c("name", "lib", "log2CPM")
PC_after_log_long <- merge(PC_after_log_long, sample_info, by.x = "lib", 
                           by.y = "lib_name")

#### boxplot
PC_boxplot <- function(dat, tit, filename) {
  p <- ggplot(dat, aes(x=lib, y=log2CPM)) + 
    geom_boxplot(aes(fill = factor(conditions))) + theme_bw() + xlab(label = "") + 
    ylab(label = expression(log[2]*"(counts + 1)")) + ggtitle(tit) +
    scale_fill_manual(name = "Conditions", values = c("steelblue3", "springgreen3"))
  print(p)
}

PC_boxplot(PC_before_log_long, "Boxplot on raw pseudo-counts", 
           "rawcount_boxplot.png")
PC_boxplot(PC_after_log_long, "Boxplot on pseudo-counts after normalization", 
           "normcount_boxplot.png")


#### MDS
if (nrow(pool1) > 1 && nrow(pool2) > 1) {
  PC_MDS <- function(dat, tit, filename) {
    dev.new()
    mds <- data.frame(plotMDS(dat)$cmdscale.out)
    dev.off()
    mds$lib.name <- rownames(mds)
    p <- ggplot(mds, aes(x = X1, y = X2, label = lib.name, 
                    color = as.factor(sample_info$conditions))) + 
      geom_point(size=4) + geom_text(hjust=0.5, vjust=-0.5) + theme_bw() + 
      xlab(label = "Dimension 1") + ylab(label = "Dimension 2") + 
      ggtitle(tit) + scale_color_manual(name = "Conditions", 
                                        values = c("steelblue3", "springgreen3"))
    print(p)
    
    write.table(mds, file = normalizePath(file.path(opt$out, 
                                                       paste0(filename,".csv")), 
                                             mustWork = FALSE), sep = "\t", 
                row.names = TRUE, quote = FALSE)
  }
  
  PC_MDS(PC_before_log, "PCA plot on raw pseudo-counts", "rawcount_MDS")
  PC_MDS(PC_after_log, "PCA plot on pseudo-counts after normalization", 
         "normcount_MDS")
}


#### density
PC_density <- function(dat, tit, filename) {
  p <- ggplot(dat, aes(x=log2CPM, group = lib)) + 
    geom_density(aes(colour = factor(conditions))) + 
    theme_bw() + xlab(label = expression(log[2]*"(counts + 1)")) + 
    ylab(label = "Density") + ggtitle(tit) + 
    guides(color=guide_legend(title="Conditions")) + 
    scale_color_manual(name = "Conditions", 
                       values = c("steelblue3", "springgreen3"))
  print(p)
}

PC_density(PC_before_log_long, "Density plot on raw pseudo-counts", 
           "rawcount_density.png")
PC_density(PC_after_log_long, "Density plot on pseudo-counts after normalization", 
           "normcount_density.png")


#save data used for boxplots and density plots
#Raw counts
write.table(PC_before_log_long, file = normalizePath(file.path(opt$out, 
                                                   "raw_boxplot_density.csv"), 
                                                     mustWork = FALSE), sep = "\t", 
            row.names = TRUE, quote = FALSE)

#Normalized counts
write.table(PC_after_log_long, file = normalizePath(file.path(opt$out, 
                                            paste0("norm_boxplot_density.csv")), 
                                                    mustWork = FALSE), sep = "\t", 
            row.names = TRUE, quote = FALSE)


#### MAplots
MAplot_name <- character()
pos <- 1
if (opt$MAplots && nrow(pool1) > 1 && nrow(pool2) > 1) {
  ifelse(!dir.exists(file.path(opt$out, "MAplots")), 
         dir.create(file.path(opt$out, "MAplots")), FALSE)
  
  PC_MAplot <- function(lib1, lib2, tit, filename) {
    avalues <- (lib1 + lib2)/2
    mvalues <- (lib1 - lib2)
    MA <- data.frame(avalues = avalues, mvalues = mvalues)
    p <- ggplot(MA, aes(x=avalues, y=mvalues)) +
      geom_point(size=1) + theme_bw() + ggtitle(tit) + xlab("A") + ylab("M") + 
      geom_hline(yintercept = 0, col = "red") + 
      theme(text = element_text(size=5))
    #print(p)

    return(list(avalues,mvalues,p))
  }
  
  a_before <- data.frame(name=rownames(PC_before_log))
  m_before <- data.frame(name=rownames(PC_before_log))
  a_after <- data.frame(name=rownames(PC_after_log))
  m_after <- data.frame(name=rownames(PC_after_log))
  
  for(cond in unique(sample_info$conditions)) {
    I <- sum(sample_info$conditions == cond)-1
    J <- sum(sample_info$conditions == cond)
    
    grid.newpage()
    
    # Create layout : nrow = 2, ncol = 2
    pushViewport(viewport(layout = grid.layout(I, I*2)))
    
    pos_graph <- 1
    for(i in 1:I) {
      pos_graph <- i*2 - 1
      for(j in (i+1):J) {
        col_name <- paste0("cond",cond, "_", 
                           sample_info$lib_name[sample_info$conditions == cond][i], 
                           "vs", 
                           sample_info$lib_name[sample_info$conditions == cond][j])
        bef <- PC_MAplot(PC_before_log[, sample_info$conditions == cond][, i], 
                  PC_before_log[, sample_info$conditions == cond][, j], 
                  paste0("MA plot on raw pseudo-counts: \n", 
                         sample_info$lib_name[sample_info$conditions == cond][i],
                         " vs. ", 
                         sample_info$lib_name[sample_info$conditions == cond][j]),
                  paste0("raw_MAplot_", cond, "_", i, "vs", j, ".png"))
        
        MAplot_name[pos] <- paste0("raw_MAplot_", cond, "_", i, "vs", j, ".png")
        pos <- pos + 1
        
        a_before <- cbind(a_before, bef[[1]])
        m_before <- cbind(m_before, bef[[2]])
        colnames(a_before)[ncol(a_before)] <- col_name
        colnames(m_before)[ncol(m_before)] <- col_name
        
        print(bef[[3]], vp=viewport(layout.pos.row = i, layout.pos.col = pos_graph))
        
        aft <- PC_MAplot(PC_after_log[, sample_info$conditions == cond][, i], 
                  PC_after_log[, sample_info$conditions == cond][, j], 
                  paste0("MA plot on normalized pseudo-counts: \n", 
                         sample_info$lib_name[sample_info$conditions == cond][i],
                         " vs. ", 
                         sample_info$lib_name[sample_info$conditions == cond][j]),
                  paste0("norm_MAplot_", cond, "_", i, "vs", j, ".png"))
        
        MAplot_name[pos] <- paste0("norm_MAplot_", cond, "_", i, "vs", j, ".png")
        pos <- pos + 1
        
        a_after <- cbind(a_after, plot_name=aft[[1]])
        m_after <- cbind(m_after, plot_name=aft[[2]])
        colnames(a_after)[ncol(a_after)] <- col_name
        colnames(m_after)[ncol(m_after)] <- col_name
        
        print(aft[[3]], vp=viewport(layout.pos.row = i, layout.pos.col = pos_graph+1))
        
        pos_graph <- pos_graph + 2
      }
    }

  }
  
  write.table(a_before, file = normalizePath(file.path(opt$out, "MAplots",
                                                       paste0("raw_MAplot_a.csv")), 
                                             mustWork = FALSE), sep = "\t", 
              row.names = FALSE, quote = FALSE)
  
  write.table(m_before, file = normalizePath(file.path(opt$out, "MAplots",
                                                       paste0("raw_MAplot_m.csv")), 
                                             mustWork = FALSE), sep = "\t", 
              row.names = FALSE, quote = FALSE)
  
  write.table(a_after, file = normalizePath(file.path(opt$out, "MAplots",
                                                      paste0("norm_MAplot_a.csv")), 
                                            mustWork = FALSE), sep = "\t", 
              row.names = FALSE, quote = FALSE)
  
  write.table(a_after, file = normalizePath(file.path(opt$out, "MAplots",
                                                      paste0("norm_MAplot_m.csv")), 
                                            mustWork = FALSE), sep = "\t", 
              row.names = FALSE, quote = FALSE)
}


#########################################################
### ---------------------Differential expression analysis
#########################################################

if (nrow(pool1) > 1 && nrow(pool2) > 1) { # at least TWO replicats by conditions
  #Estimate dispersion
  dge_after <- estimateCommonDisp(dge_after)
  dge_after <- estimateTagwiseDisp(dge_after)
  
  #Independant filtering
  dge_after_filt <- dge_after
  if (opt$filter && nrow(pool1) > 1 && nrow(pool2) > 1) {
    dge_after_filt <- HTSFilter(dge_after, plot = FALSE)$filteredData
  }
  
  ### ---------------------Test with edgeR
  dge_after_res_nofilter <- exactTest(dge_after)
  res_nofilter <- topTags(dge_after_res_nofilter, 
                          n = nrow(dge_after_res_nofilter$table))$table
  
  dge_after_res <- exactTest(dge_after_filt)
  res <- topTags(dge_after_res, n = nrow(dge_after_res$table), 
                 adjust.method = opt$correct, sort.by = "none")$table
  colnames(res)[4] <- "FDR"
  
} else { # Less than two replicats for one or two conditions
  ### ---------------------Fisher exact test
  PC_after <- cpm(dge_after)
  
  fisher <- function(gene, dat) {
    dat<-t(rowsum(t(dat), group = as.factor(dge_after$samples$group)))
    t <- rbind(dat[gene, ], colSums(dat[-gene, ])) 
    return(fisher.test(t)$p.value)
  }

  pval_nofilter <- apply(data.frame(1:nrow(PC_after)), 1, fisher, PC_after)
  padj_nofilter <- p.adjust(pval_nofilter)
  
  #LFC
  if (sum(dge_after$samples$group == 1) > 1){
    LFC1 <- log2(rowMeans(PC_after[,dge_after$samples$group == 1]) + 0.125)
  } else {
    LFC1 <- log2(PC_after[,dge_after$samples$group == 1] + 0.125)
  }
  if (sum(dge_after$samples$group == 2) > 1){
    LFC2 <- log2(rowMeans(PC_after[,dge_after$samples$group == 2]) + 0.125)
  } else {
    LFC2 <- log2(PC_after[,dge_after$samples$group == 2] + 0.125)
  }
  
  res_nofilter <- data.frame(logFC = LFC2 - LFC1, 
                             logCPM = aveLogCPM(dge_after), PValue = pval_nofilter, 
                             FDR = padj_nofilter)
  rownames(res_nofilter) <- rownames(PC_after)
  res <- res_nofilter
  
}


######################################
### ---------------------Results files
######################################
## DE genes
diff <- res[, -2]
colnames(diff)[3] <- "padj"
diff <- diff[diff$padj < opt$alpha,]

# if "up" gene is over-expressed in pool 2
diff$overExpr <- ifelse(diff$logFC > 0, "pool1", "pool2")

# sort by adjusted p-value
diff <- diff[order(diff$padj), ]

# save .csv
write.table(cbind("#name" = rownames(diff), diff), 
            file = normalizePath(file.path(opt$out, "resDEG.csv"), 
                                 mustWork = FALSE), sep = "\t", 
            row.names = FALSE, quote = FALSE)

# save .json
diffbis<-cbind(name=rownames(diff[1:min(20,nrow(diff)),]),diff[1:min(20,nrow(diff)),])
rownames(diffbis)<-NULL
top20<-apply(diffbis,1,function(x) as.list(x))

## number of overexpressed genes in JSON file
if (nrow(pool1) > 1 && nrow(pool2) > 1){
  test <- "edgeR"
} else {
  test <- "fisher"
}

nbtot <- nrow(diff)
nb1 <- nrow(diff[diff$overExpr == "pool1",])
nb2 <- nrow(diff[diff$overExpr == "pool2",])
nbcount0 <- nrow(PC_before)-nrow(dge_before$counts)

if (opt$filter && nrow(pool1) > 1 && nrow(pool2) > 1){
  nbfilt <- nrow(dge_after$counts) - nrow(dge_after_filt$counts)
} else {
  nbfilt <- -1
}

stat <- list("test" = test, 
             "overexp_both_pools" = nbtot, 
             "over_expressed_pool1 " = nb1,
             "over_expressed_pool2 " = nb2,
             "no_count" = nbcount0,
             "filtered" = nbfilt)


write(gsub(" ", "", toJSON(list(stat=stat, top20=top20), collapse = "")), 
            file = normalizePath(file.path(opt$out, "results.json"), 
                                 mustWork = FALSE))
write.table(t(as.data.frame(stat)), 
            file = normalizePath(file.path(opt$out, "statistics.txt"), 
                                 mustWork = FALSE), quote = FALSE, col.names = FALSE)

####################################
### ---------------------Plots : DEG
####################################

#### histogram
h1 <- ggplot(data=res_nofilter, aes(PValue)) + geom_histogram() + 
  theme_bw() + xlab(label = "p-value") + 
  ylab(label = "Count") + ggtitle("Histogram on raw p-value without filter")

h2 <- ggplot(data=res_nofilter, aes(FDR)) + geom_histogram() + 
  theme_bw() + xlab(label = "p-value") + 
  ylab(label = "Count") + ggtitle("Histogram on adjust p-value without filter")


write.table(cbind(Pvalue = res_nofilter$PValue, adjPvalue = res_nofilter$FDR), 
            file = normalizePath(file.path(opt$out, "histogram_nofilter.csv"), 
                                 mustWork = FALSE), sep = "\t", 
            row.names = FALSE, quote = FALSE)

if (opt$filter) {
  h3 <- ggplot(data=res, aes(FDR)) + geom_histogram() + 
    theme_bw() + xlab(label = "p-value") + 
    ylab(label = "Count") + ggtitle("Histogram on adjust p-value with filter")
  
  write.table(cbind(adjPvalue = res$FDR), 
              file = normalizePath(file.path(opt$out, "histogram_filter.csv"), 
                                   mustWork = FALSE), sep = "\t", 
              row.names = FALSE, quote = FALSE)
  
  grid.arrange(h1, h2, h3, ncol=3)
  
} else {
  grid.arrange(h1, h2, ncol=2)
}


#### MAplots
res$DEG <- ifelse(res$FDR < opt$alpha, "Yes", "No")

if (nrow(pool1) > 1 | nrow(pool2) > 1) {
  p <- ggplot(res, aes(x=logCPM, y=logFC, group=DEG)) +
    geom_point(aes(color=as.factor(DEG)), size=3)+
    theme_bw() + scale_color_manual(values=c('#000000','#CC0000')) + 
    guides(color=guide_legend(title="DEG ?")) + 
    ggtitle("MA plot on normalized pseudo-counts")
  print(p)
  write.table(cbind(A = res$logCPM, M = res$logFC, DEG = res$DEG), 
              file = normalizePath(file.path(opt$out, "DE_MAplot.csv"), 
                                   mustWork = FALSE), sep = "\t", 
              row.names = FALSE, quote = FALSE)
} else {
  avalues <- (PC_after_log[,1] + PC_after_log[,2])/2
  mvalues <- (PC_after_log[,1] - PC_after_log[,2])
  p <- ggplot(data.frame(PC_after_log), aes(x=avalues, y=mvalues)) +
    geom_point(aes(color=as.factor(res$DEG)), size=3)+
    theme_bw() + scale_color_manual(values=c('#000000','#CC0000')) + 
    guides(color=guide_legend(title="DEG ?")) + 
    ggtitle("MA plot on normalized pseudo-counts") +
    xlab("A") + ylab("M")
  print(p)
  write.table(cbind(A = avalues, M = mvalues, DEG = res$DEG), 
              file = normalizePath(file.path(opt$out, "DE_MAplot.csv"), 
                                   mustWork = FALSE), sep = "\t", 
              row.names = FALSE, quote = FALSE)
}


#### heatmap and PCA plot
if (nrow(pool1) > 1 && nrow(pool2) > 1 && nrow(diff) > 0) {
  #heatmap
  y <- cpm(dge_after_filt, log = TRUE, prior.count = 1)
  selY <- y[rownames(res)[res$FDR < opt$alpha],]
  
  cimColor <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)[255:1]
  
  cim(t(selY), color=cimColor, symkey=FALSE, 
      main = "Heatmap on differentialy expressed genes")
  
  write.table(t(selY), 
              file = normalizePath(file.path(opt$out, "DE_heatmap.csv"), 
                                   mustWork = FALSE), sep = "\t", 
              row.names = FALSE, quote = FALSE)
  
  #PCA plot
  PC_MDS(PC_after_log[rownames(PC_after_log) %in% rownames(diff),], 
         "PCA plot on differentialy expressed genes", "DE_MDS")

}


dev.off()