################################ Normalization ################################# ################################################################################ ### ---------------------Packages #common packages packages <- c("optparse", # to read arguments from a command line "ggplot2", # nice plots "reshape2") # reshape grouped data for(package in packages){ # if package is installed locally, load if (!(package %in% rownames(installed.packages()))) { install.packages(package, repos="http://cran.univ-paris1.fr") } do.call('library', list(package)) } #Bioconductor package if (!("edgeR" %in% rownames(installed.packages()))) { source("http://www.bioconductor.org/biocLite.R") biocLite("edgeR") } library(edgeR) # RNAseq data normalization ### ---------------------Parameters ############# ###### Parameter initialization when the script is launched from command line ############ option_list = list( make_option(c("-f", "--file"), type = "character", default = NULL, help = "tabulated raw count matrix with library name as header (e.g. #gene_id lib1 lib2)", metavar = "character"), make_option(c("-o", "--out"), type = "character", default = NULL, help = "folder path where results are stored", metavar = "character") ); opt_parser = OptionParser(usage="Imports data, converts them into a DGEList object (edgeR), does all normalization methods and produces diagnostic plots.", option_list = option_list); opt = parse_args(opt_parser); ############## ## Parameter initialization when the script is launched from R or Rstudio #file: tabulated raw count matrix with library name as header #(e.g. gene_id lib1 lib2) #out: folder path where results are stored ############## #opt=list() #opt$file=XXX # for example "C:/My documents/raw_count.csv" #opt$out=XXX # for example"C:/My documents/results" #Directory and count table file name are needed to continue if (is.null(opt$file) | is.null(opt$out)) { print_help(opt_parser) stop("count table file name or output directory missing.\n", call. = FALSE) } #Create output folder if it didn't exist ifelse(!dir.exists(opt$out), dir.create(opt$out), FALSE) ##################################################### ### ---------------------Input data and preprocessing ##################################################### #Raw count raw_count_table <- read.table(normalizePath(opt$file), header = TRUE, row.names = 1, comment.char="") #DGEList object dge <- DGEList(raw_count_table, remove.zeros = TRUE) #--Pseudo counts before normalization PC_before <- dge$counts PC_before_log <- log2(dge$counts + 1) ##############-Plots before normalization PC_before_log_long <- melt(PC_before_log) colnames(PC_before_log_long) <- c("name", "lib", "log2CPM") ## boxplot ggplot(PC_before_log_long, aes(x=lib, y=log2CPM)) + geom_boxplot(fill='gray', color="black") + theme_bw() + xlab(label = "") + ylab(label = expression(log[2]*"(counts + 1)")) + ggtitle("Boxplot on raw pseudo-counts") ggsave(filename = normalizePath(file.path(opt$out, "rawcount_boxplot.svg"), mustWork = FALSE), width = 11, height = 7) ## MDS if (ncol(PC_before) > 2){ MDS_PC <- data.frame(plotMDS(PC_before_log)$cmdscale.out) ggplot(MDS_PC, aes(x = X1, y = X2, label = rownames(MDS_PC))) + geom_point(size=4) + geom_text(hjust=0.5, vjust=-0.5) + theme_bw() + xlab(label = "Dimension 1") + ylab(label = "Dimension 2") + ggtitle("PCA plot on raw pseudo-counts") ggsave(filename = normalizePath(file.path(opt$out, "rawcount_MDS.svg"), mustWork = FALSE), width = 11, height = 7) } ## density ggplot(PC_before_log_long, aes(x=log2CPM, colour=lib)) + geom_density() + theme_bw() + xlab(label = expression(log[2]*"(counts + 1)")) + ylab(label = "Density") + ggtitle("Density plot on raw pseudo-counts") + guides(color=guide_legend(title="Library")) ggsave(filename = normalizePath(file.path(opt$out, "rawcount_density.svg"), mustWork = FALSE), width = 8, height = 6) #save data used for plots write.table(PC_before_log_long, file = normalizePath(file.path(opt$out, "raw_boxplot_density.csv"), mustWork = FALSE), sep = "\t", row.names = TRUE, quote = FALSE) if (ncol(PC_before) > 2){ write.table(MDS_PC, file = normalizePath(file.path(opt$out, "raw_MDS.csv"), mustWork = FALSE), sep = "\t", row.names = TRUE, quote = FALSE) } ####################################### ### --------------------- Normalization ####################################### normalization <- function(dge, method) { #norm factor dgeNorm <- calcNormFactors(dge, method = method) #Pseudo-counts PC_after <- cpm(dgeNorm) PC_after_log <- log2(PC_after + 1) #Normalized data PC_after <- cbind("#name" = rownames(PC_after), PC_after) sample_info <- data.frame(sample.name=rownames(dgeNorm$samples), dgeNorm$samples[,c("lib.size","norm.factors")]) write.table(PC_after, file = normalizePath(file.path(opt$out, paste0(method, "_pseudocounts.csv")), mustWork = FALSE), sep = "\t", row.names = FALSE, quote = FALSE) write.table(sample_info, file = normalizePath(file.path(opt$out, paste0(method, "_info.txt")), mustWork = FALSE), sep = "\t", row.names = FALSE, quote = FALSE) ##### Plots PC_after_log_long <- melt(PC_after_log) colnames(PC_after_log_long) <- c("name", "lib", "log2CPM") #title tit <- method if (method == "upperquartile") { tit <- "upper-quartile" } #boxplot ggplot(PC_after_log_long, aes(x=lib, y=log2CPM)) + geom_boxplot(fill='gray', color="black") + theme_bw() + xlab(label = "") + ylab(label = expression(log[2]*"(counts + 1)")) + ggtitle(paste0("Boxplot on pseudo-counts after normalization (method = ", tit, ")")) ggsave(filename = normalizePath(file.path(opt$out, paste0(method, "_boxplot.svg")), mustWork = FALSE), width = 8, height = 6) #MDS if (ncol(PC_before) > 2){ MDS_PC <- data.frame(plotMDS(PC_after_log)$cmdscale.out) ggplot(MDS_PC, aes(x = X1, y = X2, label = rownames(MDS_PC))) + geom_point(size=4) + geom_text(hjust=0.5, vjust=-0.5) + theme_bw() + xlab(label = "Dimension 1") + ylab(label = "Dimension 2") + ggtitle(paste0("PCA plot on pseudo-counts after normalization (method = ", tit, ")")) ggsave(filename = normalizePath(file.path(opt$out, paste0(method, "_MDS.svg")), mustWork = FALSE), width = 11, height = 7) } #density ggplot(PC_after_log_long, aes(x=log2CPM, colour=lib)) + geom_density() + theme_bw() + xlab(label = expression(log[2]*"(counts + 1)")) + ylab(label = "Density") + ggtitle(paste0("Density plot on pseudo-counts after normalization (method = ", tit, ")")) + guides(color=guide_legend(title="Library")) ggsave(filename = normalizePath(file.path(opt$out, paste0(method, "_density.svg")), mustWork = FALSE), width = 8, height = 6) #save data used for plots write.table(PC_after_log_long, file = normalizePath(file.path(opt$out, paste0(method, "_boxplot_density.csv")), mustWork = FALSE), sep = "\t", row.names = TRUE, quote = FALSE) if (ncol(PC_before) > 2){ write.table(MDS_PC, file = normalizePath(file.path(opt$out, paste0(method, "_MDS.csv")), mustWork = FALSE), sep = "\t", row.names = TRUE, quote = FALSE) } } normalization(dge, "upperquartile") normalization(dge, "RLE") normalization(dge, "TMM")