## Set of graphical methods for phyloseq objects (mainly related to ordination, ## and library size after normalisation) require(gridBase) rarecurve2 <- function (x, step = 1, sample, xlab = "Sample Size", ylab = "Species", label = TRUE, col = "black", ...) ## See documentation for vegan rarecurve, col is now used to define ## custom colors for lines and panels { tot <- rowSums(x) S <- specnumber(x) nr <- nrow(x) out <- lapply(seq_len(nr), function(i) { n <- seq(1, tot[i], by = step) if (n[length(n)] != tot[i]) n <- c(n, tot[i]) drop(rarefy(x[i, ], n)) }) Nmax <- sapply(out, function(x) max(attr(x, "Subsample"))) Smax <- sapply(out, max) plot(c(1, max(Nmax)), c(1, max(Smax)), xlab = xlab, ylab = ylab, type = "n", ...) if (!missing(sample)) { abline(v = sample) rare <- sapply(out, function(z) approx(x = attr(z, "Subsample"), y = z, xout = sample, rule = 1)$y) abline(h = rare, lwd = 0.5) } for (ln in seq_len(length(out))) { color <- col[((ln-1) %% length(col)) + 1] N <- attr(out[[ln]], "Subsample") lines(N, out[[ln]], col = color, ...) } if (label) { ordilabel(cbind(tot, S), labels = rownames(x), col = col, ...) } invisible(out) } ## Plot composition at taxaRank2 level within taxa taxaSet1 at taxaRank1 level ## Restricts plot to numberOfTaxa taxa plot_composition <- function(physeq, taxaRank1 = "Phylum", taxaSet1 = "Proteobacteria", taxaRank2 = "Family", numberOfTaxa = 9, fill = NULL, x = "Sample", y = "Abundance", facet_grid = NULL) { ## Args: ## - physeq: phyloseq class object ## - taxaRank1: taxonomic level in which to do the first subsetting ## - taxaSet1: subset of level taxaRank1 to use ## - taxaRank2: taxonomic level used to agglomerate ## - numberOfTaxa: number of (top) taxa to keep at level taxaRank2 ## ## Returns: ## - ggplot2 graphics ggdata <- ggformat(physeq, taxaRank1, taxaSet1, taxaRank2, numberOfTaxa) p <- ggplot(ggdata, aes_string(x = x, y = y, fill = fill, color = fill, group = "Sample")) ## Manually change color scale to assign grey to "Unknown" (if any) if (!is.null(fill) && any(c("Unknown", "Other") %in% unique(ggdata[, fill]))) { ranks <- as.character(unique(ggdata[, fill])) ranks <- ranks[ ! ranks %in% c("Unknown", "Other")] colvals <- c(gg_color_hue(length(ranks)), "grey45", "black") names(colvals) <- c(ranks, "Unknown", "Other") ## Now add the manually re-scaled layer with Unassigned as grey p <- p + scale_fill_manual(values=colvals) + scale_color_manual(values = colvals) } p <- p + geom_bar(stat = "identity", position = "stack") if ( !is.null(facet_grid)) { p <- p + facet_grid(facets = facet_grid, scales = "free_x") } p <- p + theme(axis.text.x=element_text(angle=90), axis.title.x=element_blank()) p <- p + ggtitle(paste("Composition within", taxaSet1, "(", numberOfTaxa, "top", taxaRank2, ")")) return(p) } correct_levels <- function(physeq, DF, map.var) { oldLevels <- character(0) if (any(DF[ , map.var] == "samples")) { oldLevels <- unique(tax_table(physeq)[ , map.var]) } if (any(DF[ , map.var] == "taxa")) { oldLevels <- levels(get_variable(physeq, map.var)) } newLevels <- setdiff(levels(DF[ , map.var]), oldLevels) DF[ , map.var] <- factor(DF[ , map.var], levels = c(oldLevels, newLevels)) return(DF) } ## Find numberOfTaxa most abundant taxa at taxaRank level and return their relative ## abundance in all sample top_taxa_abundance<- function(physeq, numberOfTaxa = 9, raw = FALSE) { ## Args: ## - physeq: phyloseq class object ## - raw: (Required). Defaults to 'FALSE', logical. Should abundances be transformed to ## frequencies before sorting otus from most to less abundant. ## - numberOfTaxa: number of (top) taxa to keep ## ## Returns: ## - x: Data frame with numberOfTaxa rows (one per final otu) and column 'Abundance' standing ## for total counts (if raw = TRUE) or average frequency ## in samples of physeq. Rownames of x are either otus' names (if taxaRank = NULL) or names ## corresponding to taxonomic rank 'TaxaRank'. All NA ranks are assigned to 'Unassigned'. stopifnot(!is.null(tax_table(physeq, FALSE))) otutab <- otu_table(physeq) if ( !taxa_are_rows(otutab) ) {otutab = t(otutab)} otutab <- as(otutab, "matrix") if (raw) { Abundance <- rowSums(otutab) } else { otutab <- apply(otutab, 2, function(x) x / sum(x)) Abundance <- rowMeans(otutab) } ## Get top taxa mdf <- data.frame(OTU = names(Abundance), Abundance = Abundance) ## Add taxonomic information tax <- as(tax_table(physeq), "matrix") tax <- data.frame(OTU = rownames(tax), tax) mdf <- merge(mdf, tax, by.x = "OTU") ## Keep only numberOfTaxa top taxa topTaxa <- names(sort(Abundance, decreasing = TRUE))[1:numberOfTaxa] mdf <- mdf[ match(topTaxa, mdf$OTU), ] mdf$OTU <- factor(mdf$OTU, levels = unique(mdf$OTU)) return(mdf) } ## Find numberOfTaxa most abundant taxa at taxaRank level top_taxa <- function(physeq, taxaRank, numberOfTaxa = 9) { stopifnot(!is.null(tax_table(physeq, FALSE))) otutab <- otu_table(physeq) if ( !taxa_are_rows(otutab) ) {otutab = t(otutab)} otutab <- as(otutab, "matrix") otutab <- apply(otutab, 2, function(x) x / sum(x)) ## Subset to OTUs belonging to taxaSet1 to fasten process stopifnot(taxaRank %in% colnames(tax_table(physeq))) mdf <- melt(data = otutab, varnames = c("OTU", "Sample")) colnames(mdf)[3] <- "Abundance" mdf <- mdf[mdf$Abundance > 0, ] ## Add taxonomic information tax <- as(tax_table(physeq), "matrix") tax <- data.frame(OTU = rownames(tax), tax) mdf <- merge(mdf, tax, by.x = "OTU") ## Aggregate by taxaRank and recover most abundant taxa abundanceByTaxa <- aggregate(as.formula(paste("Abundance ~", taxaRank)), data = mdf, FUN = sum) ii <- order(abundanceByTaxa$Abundance, decreasing = TRUE) ## Keep only numberOfTaxa top taxa topTaxa <- (abundanceByTaxa[ii, taxaRank])[1:min(numberOfTaxa, nrow(abundanceByTaxa))] return(as(topTaxa, "character")) } ## Find numberOfConditions most abundant conditions in variable top_conditions <- function(physeq, variable, numberOfConditions = 9) { stopifnot(!is.null(sample_data(physeq, FALSE))) var <- get_variable(physeq, variable) sortedCond <- names(sort(table(var), decreasing = TRUE)) topCond <- sortedCond[1:min(numberOfConditions, length(sortedCond))] return(topCond) } ## Aggregate coordinates by replicate to have "mean" coordinates ## for a given replicate condition. replicate_means <- function(DF, replicate) { Axis1 <- colnames(DF)[1] Axis2 <- colnames(DF)[2] res1 <- aggregate(as.formula(paste(Axis1, "~", replicate)), data = DF, FUN = mean) res2 <- aggregate(as.formula(paste(Axis2, "~", replicate)), data = DF, FUN = mean) res <- merge(res1, res2, by.x = replicate, sort = FALSE) return(res) } ## Custom modification of plot_ordination to label outliers species and add mean location ## of samples aggregated by variable. If shape maps to more than 9 symbols, ## use only symbols corresponding to 9 most abundant symbols plot_biplot <- function(physeq, ordination, axes=c(1, 2), color = NULL, replicate = color, shape = NULL, label = NULL, title = NULL, outlier = NULL) { DF <- plot_ordination(physeq, ordination, "biplot", axes, color, shape, label, title, TRUE) ## Retrieve correct levels if(!is.null(shape)) { DF <- correct_levels(physeq, DF, shape) } if(!is.null(color)) { DF <- correct_levels(physeq, DF, color) } ## Keep only top 9 shapes if (!is.null(shape) & length(levels(DF[ ,shape])) > (9+1)) { if ( !"taxa" %in% levels(DF[, shape])) { DF[ , shape] <- factor(DF[ ,shape], levels = c(top_taxa(physeq, shape), "samples")) } else { DF[ , shape] <- factor(DF[ ,shape], levels = c(top_conditions(physeq, shape), "taxa")) } } ## Name dimensions x <- colnames(DF)[1] y <- colnames(DF)[2] ## label outliers if ( !is.null(outlier) ) { outliers <- taxa_names(physeq)[taxa_sums(physeq) > sum(otu_table(physeq)) * outlier] } ## Get mean of each batch of replicate if (!is.null(replicate)) { sampleCoordinates <- replicate_means(DF, replicate) } ## Mapping section p <- ggplot(DF, aes_string(x = x, y = y)) ## Plot building (do not include aes in main figure, allows displays of further layers) if ( is.null(color) ) { ## Rename color title in legend p <- p + geom_point(aes_string(color = "id.type", shape = shape), na.rm = TRUE) p <- update_labels(p, list(colour = "type")) } else { p <- p + geom_point(aes_string(size = "id.type", color = color, shape = shape), na.rm = TRUE) ## Check if variable is discrete if( is.discrete(DF[, color]) ){ colvals <- gg_color_hue(length(levels(as(DF[, color], "factor")))) names(colvals) <- levels(as(DF[, color], "factor")) ## Now make the taxa or samples dark grey colvals[names(colvals) %in% c("samples", "taxa")] <- "grey45" ## Now add the manually re-scaled layer with taxa/samples as grey p <- p + scale_colour_manual(values=colvals) } ## Adjust size so that samples are bigger than taxa by default. p <- p + scale_size_manual("type", values=c(samples=5, taxa=2)) } ## Adjust shape scale shape.names <- levels(DF[, shape]) shape.scale <- 17:(17 + length(shape.names) - 1) names(shape.scale) <- shape.names shape.scale["samples"] <- 16 p <- p + scale_shape_manual(values = shape.scale) ## Add the text labels if( !is.null(label) ){ label_map <- aes_string(x=x, y=y, color = color, label=label, na.rm=TRUE) p <- p + geom_text(label_map, data=DF[ !is.na(DF[ , label]), ], size=3, vjust=1.5, na.rm=TRUE) } ## Add replicates if( !is.null(replicate) ){ rep_map <- aes_string(x=x, y=y, label=replicate, na.rm=TRUE) if (replicate == color) { repcols <- colvals[sampleCoordinates[ , replicate]] } else { repcols <- "black" } p <- p + geom_text(rep_map, data = sampleCoordinates, colour = repcols, size=4, vjust=1.5, na.rm=TRUE) } ## Optionally add a title to the plot if( !is.null(title) ){ p <- p + ggtitle(title) } ## Add fraction variability to axis labels, if available if( length(extract_eigenvalue(ordination)[axes]) > 0 ){ ## Only attempt to add fraction variability ## if extract_eigenvalue returns something eigvec = extract_eigenvalue(ordination) ## Fraction variability, fracvar fracvar = eigvec[axes] / sum(eigvec) ## Percent variability, percvar percvar = round(100*fracvar, 1) ## The string to add to each axis label, strivar ## Start with the curent axis labels in the plot strivar = as(c(p$label$x, p$label$y), "character") ## paste the percent variability string at the end strivar = paste0(strivar, " [", percvar, "%]") ## Update the x-label and y-label p = p + xlab(strivar[1]) + ylab(strivar[2]) } ## Add outliers attribute if ( !is.null(outlier) ) {attr(p, "outliers") <- outliers} if ( !is.null(replicate) ) {attr(p, "repcols") <- repcols} ## return the ggplot object return(p) } ## Custom modification of plot_ordination to add mean location ## of samples aggregated by variable. plot_samples <- function(physeq, ordination, axes=c(1, 2), color = NULL, replicate = color, shape = NULL, label = NULL, title = NULL) { DF <- plot_ordination(physeq, ordination, "samples", axes, color, shape, label, title, TRUE) ## Retrieve correct levels if(!is.null(shape)) { DF <- correct_levels(physeq, DF, shape) } if(!is.null(color)) { DF <- correct_levels(physeq, DF, color) } ## Name dimensions x <- colnames(DF)[1] y <- colnames(DF)[2] ## Get mean of each batch of replicate if (!is.null(replicate)) { sampleCoordinates <- replicate_means(DF, replicate) } ## Mapping section p <- ggplot(DF, aes_string(x = x, y = y)) ## Plot building (do not include aes in main figure, allows displays of further layers) p <- p + geom_point(aes_string(color = color, shape = shape), na.rm = TRUE) ## Add the text labels if( !is.null(label) ){ label_map <- aes_string(x=x, y=y, label=label, color = color, na.rm=TRUE) p <- p + geom_text(label_map, data=DF[!is.na(DF[ , label]) , ], size=3, vjust=1.5, na.rm=TRUE) } ## Add replicates if( !is.null(replicate) ){ if (color == replicate) { ## map color to replicate if color and replicate grouping agree rep_map <- aes_string(x=x, y=y, label=replicate, color = replicate, na.rm=TRUE) } else { ## don't set color aes rep_map <- aes_string(x=x, y=y, label=replicate, na.rm=TRUE) } p <- p + geom_text(rep_map, data = sampleCoordinates, size=4, vjust=1.5, na.rm=TRUE) } ## Optionally add a title to the plot if( !is.null(title) ){ p <- p + ggtitle(title) } ## Add fraction variability to axis labels, if available if( length(extract_eigenvalue(ordination)[axes]) > 0 ){ ## Only attempt to add fraction variability ## if extract_eigenvalue returns something eigvec = extract_eigenvalue(ordination) ## Fraction variability, fracvar fracvar = eigvec[axes] / sum(eigvec) ## Percent variability, percvar percvar = round(100*fracvar, 1) ## The string to add to each axis label, strivar ## Start with the curent axis labels in the plot strivar = as(c(p$label$x, p$label$y), "character") ## paste the percent variability string at the end strivar = paste0(strivar, " [", percvar, "%]") ## Update the x-label and y-label p = p + xlab(strivar[1]) + ylab(strivar[2]) } ## return the ggplot object return(p) } ## ggplot hue color scale gg_color_hue <- function(n) { hues = seq(15, 375, length=n+1) hcl(h=hues, l=65, c=100)[1:n] } ## Plotting fonction once library sizes have been estimated ggnorm <- function(physeq, cds, x = "X.SampleID", color = NULL, title = NULL) { ## Args: ## - cds: Count data set (class eSet of BioConductor) ## - x: ggplot x aesthetics, used for plotting ## - color: ggplot color aes, used for plotting ## - title: optional, plot title ## - physeq: phyloseq class object from which metadata are extracted ## ## Returns: ## - ggplot2 figure with distribution of normalized log2(counts+1) on left, colored by ## color and mean/variance function on right panel. countdf <- counts(cds, normalize = TRUE) countdf <- log2(countdf+1) countdf[countdf == 0] <- NA countdf <- melt(countdf, varnames = c("OTU", "X.SampleID")) countdf <- countdf[!is.na(countdf$value), ] countdf <- merge(countdf, as(sample_data(physeq), "data.frame")) p <- ggplot(countdf, aes_string(x = x, y = "value", color = color)) + geom_boxplot() p <- p + theme(axis.text.x = element_text(angle = 90)) + scale_x_discrete("Sample") p <- p + scale_y_discrete("log2(Normalized counts + 1)") if (!is.null(title)) { p <- p + ggtitle(title) } ## Open graphical device par(no.readonly=TRUE) plot.new() ## setup layout gl <- grid.layout(nrow=1, ncol=2, widths=unit(c(1,1), 'null')) ## grid.show.layout(gl) ## setup viewports vp.1 <- viewport(layout.pos.col=1) # boxplot vp.2 <- viewport(layout.pos.col=2) # mean - sd plot ## init layout pushViewport(viewport(layout=gl)) ## Access the first viewport pushViewport(vp.1) ## print our ggplot graphics here print(p, newpage=FALSE) ## done with the viewport popViewport() ## Move to the second viewport pushViewport(vp.2) ## start new base graphics in second viewport par(new=TRUE, fig=gridFIG()) meanSdPlot(log2(counts(cds, normalized = TRUE)[, ] + 1), ranks = FALSE, main = title) ## done with the viewport popViewport() } ## return data frame for relative abundance plots in ggplot2 ## Return relative abundance of top NumberOfTaxa OTUs at the taxaRank2 level ## within taxaSet1 at taxaRank1 level ggformat <- function(physeq, taxaRank1 = "Phylum", taxaSet1 = "Proteobacteria", taxaRank2 = "Family", numberOfTaxa = 9) { ## Args: ## - physeq: phyloseq class object ## - taxaRank1: taxonomic level in which to do the first subsetting ## - taxaSet1: subset of level taxaRank1 to use ## - taxaRank2: taxonomic level used to agglomerate ## - numberOfTaxa: number of (top) taxa to keep at level taxaRank2 ## ## Returns: ## - data frame with taxonomic, sample and otu counts informations melted together ## Enforce orientation and transform count to relative abundances stopifnot(!is.null(sample_data(physeq, FALSE)), !is.null(tax_table(physeq, FALSE))) otutab <- otu_table(physeq) if ( !taxa_are_rows(otutab) ) { otutab = t(otutab) } otutab <- as(otutab, "matrix") otutab <- apply(otutab, 2, function(x) x / sum(x)) ## Subset to OTUs belonging to taxaSet1 to fasten process stopifnot(all(c(taxaRank1, taxaRank2) %in% colnames(tax_table(physeq)))) otutab <- otutab[tax_table(physeq)[ , taxaRank1] %in% taxaSet1, , drop = FALSE] if (nrow(otutab) == 0) { stop(paste("No otu belongs to", paste(taxaSet1, collapse = ","), "\n", "at taxonomic level", taxaRank1)) } mdf <- melt(data = otutab, varnames = c("OTU", "Sample")) colnames(mdf)[3] <- "Abundance" ## mdf <- mdf[mdf$Abundance > 0, ] ## Remove absent taxa ## Add taxonomic information and replace NA and unclassified Unknown tax <- as(tax_table(physeq), "matrix") tax[is.na(tax)] <- "Unknown" tax[tax %in% c("", "unclassified", "Unclassified")] <- "Unknown" tax <- data.frame(OTU = rownames(tax), tax) mdf <- merge(mdf, tax, by.x = "OTU") ## Aggregate by taxaRank2 mdf <- aggregate(as.formula(paste("Abundance ~ Sample +", taxaRank2)), data = mdf, FUN = sum) topTaxa <- aggregate(as.formula(paste("Abundance ~ ", taxaRank2)), data = mdf, FUN = sum) ## Keep only numberOfTaxa top taxa and aggregate the rest as "Other" topTax <- as.character(topTaxa[ order(topTaxa[ , "Abundance"], decreasing = TRUE), taxaRank2]) topTax <- topTax[topTax != "Unknown"] topTax <- topTax[1:min(length(topTax), numberOfTaxa)] ## Change to character mdf[ , taxaRank2] <- as.character(mdf[ , taxaRank2]) ii <- (mdf[ , taxaRank2] %in% c(topTax, "Unknown")) mdf[!ii , taxaRank2] <- "Other" mdf <- aggregate(as.formula(paste("Abundance ~ Sample +", taxaRank2)), data = mdf, FUN = sum) mdf[, taxaRank2] <- factor(mdf[, taxaRank2], levels = c(sort(topTax), "Unknown", "Other")) ## Add sample data.frame sdf <- as(sample_data(physeq), "data.frame") sdf$Sample <- sample_names(physeq) mdf <- merge(mdf, sdf, by.x = "Sample") ## Sort the entries by abundance to produce nice stacked bars in ggplot mdf <- mdf[ order(mdf[ , taxaRank2], mdf$Abundance, decreasing = TRUE), ] return(mdf) } ################################################################################ # Define S3 generic extract_eigenvalue function # Function is used by `plot_scree` to get the eigenvalue vector from different # types of ordination objects. # Used S3 generic in this case because many ordination objects, the input, are # not formally-defined S4 classes, but vaguely-/un-defined S3. #' @keywords internal extract_eigenvalue = function(ordination) UseMethod("extract_eigenvalue", ordination) # Default is to return NULL (e.g. for NMDS, or non-supported ordinations/classes). extract_eigenvalue.default = function(ordination) NULL # for pcoa objects extract_eigenvalue.pcoa = function(ordination) ordination$values$Relative_eig # for CCA objects extract_eigenvalue.cca = function(ordination) c(ordination$CCA$eig, ordination$CA$eig) # for RDA objects extract_eigenvalue.rda = function(ordination) c(ordination$CCA$eig, ordination$CA$eig) # for dpcoa objects extract_eigenvalue.dpcoa = function(ordination) ordination$eig # for decorana (dca) objects extract_eigenvalue.decorana = function(ordination) ordination$evals # for pca (edgePCA) objects extract_eigenvalue.pca = function(ordination) ordination$values$Eigenvalues