##################### Differential Analysis Expression ######################### ################################################################################ ### ---------------------Packages #common packages packages <- c("optparse", # to read arguments from a command line "ggplot2", # nice plots "gridExtra", # multiple plots on one page "grid", # multiple plots on one page for MAplots "RColorBrewer", # color in plots "mixOmics", # for heatmap plot "reshape2", # reshape grouped data "knitr", # create a pdf file with images "markdown", # create a pdf file with images "RJSONIO") # work with JSON file for(package in packages){ # if package is installed locally, load if (!(package %in% rownames(installed.packages()))) { install.packages(package, repos="http://cran.univ-paris1.fr") } do.call('library', list(package)) } #Bioconductor package bioPackages <- c("edgeR", # RNAseq data normalization "HTSFilter") # performs an independant filtering for(package in bioPackages){ # if package is installed locally, load if (!(package %in% rownames(installed.packages()))) { source("http://www.bioconductor.org/biocLite.R") biocLite(package) } do.call('library', list(package)) } ### ---------------------Parameters ############# ###### Parameter initialization when the script is launched from command line ############ option_list = list( make_option(c("-f", "--file"), type = "character", default = NULL, help = "tabulated raw count matrix with library name as header (e.g. #gene_id lib1 lib2...)", metavar = "character"), make_option(c("-n", "--norm"), type = "character", default = NULL, help = "file with normalized factors with library name (e.g. sample.name lib.size norm.factors). This file is obtained with script 'Normalization.R'", metavar = "character"), make_option(c("-o", "--out"), type = "character", default = NULL, help = "folder path where results are stored", metavar = "character"), make_option(c("--pool1"), type = "character", default = NULL, help = "library name in pool 1 separeted by ','", metavar = "character"), make_option(c("--pool2"), type = "character", default = NULL, help = "library name in pool 2 separeted by ','", metavar = "character"), make_option(c("--filter"), type = "logical", default = TRUE, help = "if TRUE low expressed genes are removed [default=%default]", metavar = "character"), make_option(c("--alpha"), type = "double", default = 0.05, help = "significance level of the tests (i.e. acceptable rate of false-positive in the list of differentially expressed genes) [default=%default]", metavar = "character"), make_option(c("--correct"), type = "character", default = "BH", help = "method used to adjust p-values for multiple testing ('BH', 'BY' or 'fdr') [default=%default]", metavar = "character"), make_option(c("--MAplots"), type = "logical", default = FALSE, help = "if TRUE all MA plots are saved [default=%default]", metavar = "character") ); opt_parser = OptionParser(usage="Imports data, converts them into a DGEList object (edgeR), does tests to find differentially expressed genes and produces diagnostic plots.", option_list = option_list); opt = parse_args(opt_parser); ############## #### Parameter initialization when the script is launched from R or Rstudio ##file: tabulated raw count matrix with library name as header #(e.g. #gene_id lib1 lib2...) ##norm: file with normalized factors with library name #(e.g. sample.name lib.size norm.factors). This file is obtained with script #'Normalization.R' ##out: folder path where results are stored ##pool1: library name in pool 1 separeted by ', ' ##pool2: library name in pool 2 separeted by ', ' ##filter: if TRUE low expressed genes are removed ##alpha: significance level of the tests (i.e. acceptable rate of #false-positive in the list of differentially expressed genes) ##correct: method used to adjust p-values for multiple testing #('BH', 'BY' or 'fdr') ##MAplots: if TRUE all MA plots are saved ############## #opt=list() #opt$file <- XXX # for example "C:/My documents/raw_count.csv" #opt$norm <- XXX # for example "C:/My documents/RLE_info.csv" #opt$out <- XXX # for example "C:/My documents/results" #opt$pool1 <- XXX # for example "lib1, lib2" #opt$pool2 <- XXX # for example "lib3, lib4" #opt$alpha <- 0.05 #opt$filter <- TRUE #opt$correct <- "BH" #opt$MAplots <- FALSE #Directory, count table file name or conditions are needed to continue if (is.null(opt$file) || is.null(opt$norm) || is.null(opt$out) || is.null(opt$pool1) || is.null(opt$pool2)) { print_help(opt_parser) stop("count table file name, norm, output directory or conditions missing.\n", call. = FALSE) } #multiple testing correction if (!(opt$correct %in% c("BH", "bonferroni"))) { print_help(opt_parser) stop("you can't use this correction to adjust p-values.\n", call. = FALSE) } #Create output folder if it doesn't exist ifelse(!dir.exists(opt$out), dir.create(opt$out), FALSE) #Stop the script if files doesn't exist if (!file.exists(normalizePath(opt$file)) | !file.exists(normalizePath(file.path(opt$norm), mustWork = FALSE))) { print_help(opt_parser) stop("count table file or file with normalization factor doesn't exist.\n", call. = FALSE) } ##################################################### ### ---------------------Input data and preprocessing ##################################################### #Raw and normalized count PC_before <- read.table(normalizePath(opt$file), header = TRUE, row.names = 1, comment.char="") norm_fact <- read.table(normalizePath(file.path(opt$norm), mustWork = FALSE), header = TRUE) #Conditions pool1 <- data.frame(lib_name = unlist(strsplit(opt$pool1, ',')), conditions = rep(1, length(opt$pool1))) pool2 <- data.frame(lib_name = unlist(strsplit(opt$pool2, ',')), conditions = rep(2, length(opt$pool2))) if (nrow(pool1) < 1 || nrow(pool2) < 1) { opt$filter <- FALSE opt$MAplots <- FALSE } sample_info <- rbind(pool1, pool2) PC_before <- PC_before[, colnames(PC_before) %in% as.character(sample_info$lib_name)] PC_before <- PC_before[, match(colnames(PC_before), sample_info$lib_name)] norm_fact <- norm_fact[as.character(norm_fact$sample.name) %in% as.character(sample_info$lib_name), ] #DGEList object dge_before <- DGEList(PC_before, remove.zeros = TRUE, group = sample_info$conditions) dge_after <- DGEList(PC_before, remove.zeros = TRUE, group = sample_info$conditions, norm.factors = norm_fact$norm.factors) #Pseudo-counts PC_before_log <- log2(dge_before$counts + 1) PC_after_log <- log2(cpm(dge_after) + 1) ############################################## ### ---------------------Plots : normalization ############################################## #save images in a pdf file pdf(file.path(opt$out, "de_images.pdf"), title = "Plots from differential analysis", width = 12, height = 10) PC_before_log_long <- melt(PC_before_log) colnames(PC_before_log_long) <- c("name", "lib", "log2CPM") PC_before_log_long <- merge(PC_before_log_long, sample_info, by.x = "lib", by.y = "lib_name") PC_after_log_long <- melt(PC_after_log) colnames(PC_after_log_long) <- c("name", "lib", "log2CPM") PC_after_log_long <- merge(PC_after_log_long, sample_info, by.x = "lib", by.y = "lib_name") #### boxplot PC_boxplot <- function(dat, tit, filename) { p <- ggplot(dat, aes(x=lib, y=log2CPM)) + geom_boxplot(aes(fill = factor(conditions))) + theme_bw() + xlab(label = "") + ylab(label = expression(log[2]*"(counts + 1)")) + ggtitle(tit) + scale_fill_manual(name = "Conditions", values = c("steelblue3", "springgreen3")) print(p) } PC_boxplot(PC_before_log_long, "Boxplot on raw pseudo-counts", "rawcount_boxplot.png") PC_boxplot(PC_after_log_long, "Boxplot on pseudo-counts after normalization", "normcount_boxplot.png") #### MDS if (nrow(pool1) > 1 && nrow(pool2) > 1) { PC_MDS <- function(dat, tit, filename) { dev.new() mds <- data.frame(plotMDS(dat)$cmdscale.out) dev.off() mds$lib.name <- rownames(mds) p <- ggplot(mds, aes(x = X1, y = X2, label = lib.name, color = as.factor(sample_info$conditions))) + geom_point(size=4) + geom_text(hjust=0.5, vjust=-0.5) + theme_bw() + xlab(label = "Dimension 1") + ylab(label = "Dimension 2") + ggtitle(tit) + scale_color_manual(name = "Conditions", values = c("steelblue3", "springgreen3")) print(p) write.table(mds, file = normalizePath(file.path(opt$out, paste0(filename,".csv")), mustWork = FALSE), sep = "\t", row.names = TRUE, quote = FALSE) } PC_MDS(PC_before_log, "PCA plot on raw pseudo-counts", "rawcount_MDS") PC_MDS(PC_after_log, "PCA plot on pseudo-counts after normalization", "normcount_MDS") } #### density PC_density <- function(dat, tit, filename) { p <- ggplot(dat, aes(x=log2CPM, group = lib)) + geom_density(aes(colour = factor(conditions))) + theme_bw() + xlab(label = expression(log[2]*"(counts + 1)")) + ylab(label = "Density") + ggtitle(tit) + guides(color=guide_legend(title="Conditions")) + scale_color_manual(name = "Conditions", values = c("steelblue3", "springgreen3")) print(p) } PC_density(PC_before_log_long, "Density plot on raw pseudo-counts", "rawcount_density.png") PC_density(PC_after_log_long, "Density plot on pseudo-counts after normalization", "normcount_density.png") #save data used for boxplots and density plots #Raw counts write.table(PC_before_log_long, file = normalizePath(file.path(opt$out, "raw_boxplot_density.csv"), mustWork = FALSE), sep = "\t", row.names = TRUE, quote = FALSE) #Normalized counts write.table(PC_after_log_long, file = normalizePath(file.path(opt$out, paste0("norm_boxplot_density.csv")), mustWork = FALSE), sep = "\t", row.names = TRUE, quote = FALSE) #### MAplots MAplot_name <- character() pos <- 1 if (opt$MAplots && nrow(pool1) > 1 && nrow(pool2) > 1) { ifelse(!dir.exists(file.path(opt$out, "MAplots")), dir.create(file.path(opt$out, "MAplots")), FALSE) PC_MAplot <- function(lib1, lib2, tit, filename) { avalues <- (lib1 + lib2)/2 mvalues <- (lib1 - lib2) MA <- data.frame(avalues = avalues, mvalues = mvalues) p <- ggplot(MA, aes(x=avalues, y=mvalues)) + geom_point(size=1) + theme_bw() + ggtitle(tit) + xlab("A") + ylab("M") + geom_hline(yintercept = 0, col = "red") + theme(text = element_text(size=5)) #print(p) return(list(avalues,mvalues,p)) } a_before <- data.frame(name=rownames(PC_before_log)) m_before <- data.frame(name=rownames(PC_before_log)) a_after <- data.frame(name=rownames(PC_after_log)) m_after <- data.frame(name=rownames(PC_after_log)) for(cond in unique(sample_info$conditions)) { I <- sum(sample_info$conditions == cond)-1 J <- sum(sample_info$conditions == cond) grid.newpage() # Create layout : nrow = 2, ncol = 2 pushViewport(viewport(layout = grid.layout(I, I*2))) pos_graph <- 1 for(i in 1:I) { pos_graph <- i*2 - 1 for(j in (i+1):J) { col_name <- paste0("cond",cond, "_", sample_info$lib_name[sample_info$conditions == cond][i], "vs", sample_info$lib_name[sample_info$conditions == cond][j]) bef <- PC_MAplot(PC_before_log[, sample_info$conditions == cond][, i], PC_before_log[, sample_info$conditions == cond][, j], paste0("MA plot on raw pseudo-counts: \n", sample_info$lib_name[sample_info$conditions == cond][i], " vs. ", sample_info$lib_name[sample_info$conditions == cond][j]), paste0("raw_MAplot_", cond, "_", i, "vs", j, ".png")) MAplot_name[pos] <- paste0("raw_MAplot_", cond, "_", i, "vs", j, ".png") pos <- pos + 1 a_before <- cbind(a_before, bef[[1]]) m_before <- cbind(m_before, bef[[2]]) colnames(a_before)[ncol(a_before)] <- col_name colnames(m_before)[ncol(m_before)] <- col_name print(bef[[3]], vp=viewport(layout.pos.row = i, layout.pos.col = pos_graph)) aft <- PC_MAplot(PC_after_log[, sample_info$conditions == cond][, i], PC_after_log[, sample_info$conditions == cond][, j], paste0("MA plot on normalized pseudo-counts: \n", sample_info$lib_name[sample_info$conditions == cond][i], " vs. ", sample_info$lib_name[sample_info$conditions == cond][j]), paste0("norm_MAplot_", cond, "_", i, "vs", j, ".png")) MAplot_name[pos] <- paste0("norm_MAplot_", cond, "_", i, "vs", j, ".png") pos <- pos + 1 a_after <- cbind(a_after, plot_name=aft[[1]]) m_after <- cbind(m_after, plot_name=aft[[2]]) colnames(a_after)[ncol(a_after)] <- col_name colnames(m_after)[ncol(m_after)] <- col_name print(aft[[3]], vp=viewport(layout.pos.row = i, layout.pos.col = pos_graph+1)) pos_graph <- pos_graph + 2 } } } write.table(a_before, file = normalizePath(file.path(opt$out, "MAplots", paste0("raw_MAplot_a.csv")), mustWork = FALSE), sep = "\t", row.names = FALSE, quote = FALSE) write.table(m_before, file = normalizePath(file.path(opt$out, "MAplots", paste0("raw_MAplot_m.csv")), mustWork = FALSE), sep = "\t", row.names = FALSE, quote = FALSE) write.table(a_after, file = normalizePath(file.path(opt$out, "MAplots", paste0("norm_MAplot_a.csv")), mustWork = FALSE), sep = "\t", row.names = FALSE, quote = FALSE) write.table(a_after, file = normalizePath(file.path(opt$out, "MAplots", paste0("norm_MAplot_m.csv")), mustWork = FALSE), sep = "\t", row.names = FALSE, quote = FALSE) } ######################################################### ### ---------------------Differential expression analysis ######################################################### if (nrow(pool1) > 1 && nrow(pool2) > 1) { # at least TWO replicats by conditions #Estimate dispersion dge_after <- estimateCommonDisp(dge_after) dge_after <- estimateTagwiseDisp(dge_after) #Independant filtering dge_after_filt <- dge_after if (opt$filter && nrow(pool1) > 1 && nrow(pool2) > 1) { dge_after_filt <- HTSFilter(dge_after, plot = FALSE)$filteredData } ### ---------------------Test with edgeR dge_after_res_nofilter <- exactTest(dge_after) res_nofilter <- topTags(dge_after_res_nofilter, n = nrow(dge_after_res_nofilter$table))$table dge_after_res <- exactTest(dge_after_filt) res <- topTags(dge_after_res, n = nrow(dge_after_res$table), adjust.method = opt$correct, sort.by = "none")$table colnames(res)[4] <- "FDR" } else { # Less than two replicats for one or two conditions ### ---------------------Fisher exact test PC_after <- cpm(dge_after) fisher <- function(gene, dat) { dat<-t(rowsum(t(dat), group = as.factor(dge_after$samples$group))) t <- rbind(dat[gene, ], colSums(dat[-gene, ])) return(fisher.test(t)$p.value) } pval_nofilter <- apply(data.frame(1:nrow(PC_after)), 1, fisher, PC_after) padj_nofilter <- p.adjust(pval_nofilter) #LFC if (sum(dge_after$samples$group == 1) > 1){ LFC1 <- log2(rowMeans(PC_after[,dge_after$samples$group == 1]) + 0.125) } else { LFC1 <- log2(PC_after[,dge_after$samples$group == 1] + 0.125) } if (sum(dge_after$samples$group == 2) > 1){ LFC2 <- log2(rowMeans(PC_after[,dge_after$samples$group == 2]) + 0.125) } else { LFC2 <- log2(PC_after[,dge_after$samples$group == 2] + 0.125) } res_nofilter <- data.frame(logFC = LFC2 - LFC1, logCPM = aveLogCPM(dge_after), PValue = pval_nofilter, FDR = padj_nofilter) rownames(res_nofilter) <- rownames(PC_after) res <- res_nofilter } ###################################### ### ---------------------Results files ###################################### ## DE genes diff <- res[, -2] colnames(diff)[3] <- "padj" diff <- diff[diff$padj < opt$alpha,] # if "up" gene is over-expressed in pool 2 diff$overExpr <- ifelse(diff$logFC > 0, "pool1", "pool2") # sort by adjusted p-value diff <- diff[order(diff$padj), ] # save .csv write.table(cbind("#name" = rownames(diff), diff), file = normalizePath(file.path(opt$out, "resDEG.csv"), mustWork = FALSE), sep = "\t", row.names = FALSE, quote = FALSE) # save .json diffbis<-cbind(name=rownames(diff[1:min(20,nrow(diff)),]),diff[1:min(20,nrow(diff)),]) rownames(diffbis)<-NULL top20<-apply(diffbis,1,function(x) as.list(x)) ## number of overexpressed genes in JSON file if (nrow(pool1) > 1 && nrow(pool2) > 1){ test <- "edgeR" } else { test <- "fisher" } nbtot <- nrow(diff) nb1 <- nrow(diff[diff$overExpr == "pool1",]) nb2 <- nrow(diff[diff$overExpr == "pool2",]) nbcount0 <- nrow(PC_before)-nrow(dge_before$counts) if (opt$filter && nrow(pool1) > 1 && nrow(pool2) > 1){ nbfilt <- nrow(dge_after$counts) - nrow(dge_after_filt$counts) } else { nbfilt <- -1 } stat <- list("test" = test, "overexp_both_pools" = nbtot, "over_expressed_pool1 " = nb1, "over_expressed_pool2 " = nb2, "no_count" = nbcount0, "filtered" = nbfilt) write(gsub(" ", "", toJSON(list(stat=stat, top20=top20), collapse = "")), file = normalizePath(file.path(opt$out, "results.json"), mustWork = FALSE)) write.table(t(as.data.frame(stat)), file = normalizePath(file.path(opt$out, "statistics.txt"), mustWork = FALSE), quote = FALSE, col.names = FALSE) #################################### ### ---------------------Plots : DEG #################################### #### histogram h1 <- ggplot(data=res_nofilter, aes(PValue)) + geom_histogram() + theme_bw() + xlab(label = "p-value") + ylab(label = "Count") + ggtitle("Histogram on raw p-value without filter") h2 <- ggplot(data=res_nofilter, aes(FDR)) + geom_histogram() + theme_bw() + xlab(label = "p-value") + ylab(label = "Count") + ggtitle("Histogram on adjust p-value without filter") write.table(cbind(Pvalue = res_nofilter$PValue, adjPvalue = res_nofilter$FDR), file = normalizePath(file.path(opt$out, "histogram_nofilter.csv"), mustWork = FALSE), sep = "\t", row.names = FALSE, quote = FALSE) if (opt$filter) { h3 <- ggplot(data=res, aes(FDR)) + geom_histogram() + theme_bw() + xlab(label = "p-value") + ylab(label = "Count") + ggtitle("Histogram on adjust p-value with filter") write.table(cbind(adjPvalue = res$FDR), file = normalizePath(file.path(opt$out, "histogram_filter.csv"), mustWork = FALSE), sep = "\t", row.names = FALSE, quote = FALSE) grid.arrange(h1, h2, h3, ncol=3) } else { grid.arrange(h1, h2, ncol=2) } #### MAplots res$DEG <- ifelse(res$FDR < opt$alpha, "Yes", "No") if (nrow(pool1) > 1 | nrow(pool2) > 1) { p <- ggplot(res, aes(x=logCPM, y=logFC, group=DEG)) + geom_point(aes(color=as.factor(DEG)), size=3)+ theme_bw() + scale_color_manual(values=c('#000000','#CC0000')) + guides(color=guide_legend(title="DEG ?")) + ggtitle("MA plot on normalized pseudo-counts") print(p) write.table(cbind(A = res$logCPM, M = res$logFC, DEG = res$DEG), file = normalizePath(file.path(opt$out, "DE_MAplot.csv"), mustWork = FALSE), sep = "\t", row.names = FALSE, quote = FALSE) } else { avalues <- (PC_after_log[,1] + PC_after_log[,2])/2 mvalues <- (PC_after_log[,1] - PC_after_log[,2]) p <- ggplot(data.frame(PC_after_log), aes(x=avalues, y=mvalues)) + geom_point(aes(color=as.factor(res$DEG)), size=3)+ theme_bw() + scale_color_manual(values=c('#000000','#CC0000')) + guides(color=guide_legend(title="DEG ?")) + ggtitle("MA plot on normalized pseudo-counts") + xlab("A") + ylab("M") print(p) write.table(cbind(A = avalues, M = mvalues, DEG = res$DEG), file = normalizePath(file.path(opt$out, "DE_MAplot.csv"), mustWork = FALSE), sep = "\t", row.names = FALSE, quote = FALSE) } #### heatmap and PCA plot if (nrow(pool1) > 1 && nrow(pool2) > 1 && nrow(diff) > 0) { #heatmap y <- cpm(dge_after_filt, log = TRUE, prior.count = 1) selY <- y[rownames(res)[res$FDR < opt$alpha],] cimColor <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)[255:1] cim(t(selY), color=cimColor, symkey=FALSE, main = "Heatmap on differentialy expressed genes") write.table(t(selY), file = normalizePath(file.path(opt$out, "DE_heatmap.csv"), mustWork = FALSE), sep = "\t", row.names = FALSE, quote = FALSE) #PCA plot PC_MDS(PC_after_log[rownames(PC_after_log) %in% rownames(diff),], "PCA plot on differentialy expressed genes", "DE_MDS") } dev.off()