##################### ##################### ## ## ## ABOUT PHYLOSEQ ## ## ## ##################### ##################### ## ----Install phyloseq from bioconductor repos --------------------------- ## ## try http if https is not available ## source("http://bioconductor.org/biocLite.R") ## biocLite("phyloseq") ## ---- Install phyloseq source -------------------------------------- ## install.packages("devtools") ## If not installed previously ## library("devtools") ## install_github("phyloseq", "joey711") ## ---- Phyloseq Vignettes -------------------------------------------------------- ## vignette("phyloseq-basics") ## vignette("phyloseq-analysis") ## ----load-packages, cache = FALSE, message = FALSE, tidy = FALSE--------- library(phyloseq) source("https://raw.githubusercontent.com/mahendra-mariadassou/phyloseq-extended/master/load-extra-functions.R") ## ----load additionnal packages ==> already done in load-extra-functions.R ## library(ggplot2) ## library(plyr) ## library(reshape2) ## library(grid) ## library(gridExtra) ## library(ape) ## library(scales) ## check that you are in the correct working directory getwd() ## use "Files" panel to change or setwd() ## import data biomfile <- "data/chaillou/chaillou.biom" food1 <-import_biom(biomfile) food1 sampledata <-read.csv("data/chaillou/sample_data.tsv", sep="\t", row.names = 1) sample_data(food1) <- sampledata food1 treefile <- read.tree("data/chaillou/tree.nwk") phy_tree(food1) <- treefile food1 ## import data biomfile2 <- "data/chaillou/chaillou_with_sam_data.biom" ## treefile <- read.tree("data/chaillou/tree.nwk") ## if not already loaded food <- import_biom(biomfile2, treefile, parseFunction = parse_taxonomy_greengenes) food ##################### ## ## ## DATA STRUCTURE ## ## ## ##################### ## Phyloseq components head(otu_table(food)) head(tax_table(food)) head(tax_table(food1)) head(sample_data(food)) phy_tree(food) ## Accessors ntaxa(food) nsamples(food) head(sample_names(food)) head(taxa_names(food)) head(sample_sums(food)) head(taxa_sums(food)) rank_names(food) rank_names(food1) sample_variables(food) head(get_variable(food)) get_variable(food, varName = "EnvType") levels(factor(get_variable(food, varName = "EnvType"))) get_sample(food, i = "otu_00520") get_taxa(food, i = "MVT0.LOT07") ## sample library sizes sample_sums(food)[c("MVT0.LOT01", "MVT0.LOT07", "MVT0.LOT09")] ## Otu overall abundances taxa_sums(food)[c("otu_00520", "otu_00569", "otu_00527")] ######################### ## ## ## DATA MANIPULATION ## ## (MODIFICATION) ## ## ## ######################### ## change taxonomic rank names new_rank <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species") colnames(tax_table(food1)) <- new_rank rank_names(food1) ## or head(tax_table(food)) ## change level order for EnvType correct.order <- c("BoeufHache", "VeauHache", "DesLardons", "MerguezVolaille", "SaumonFume", "FiletSaumon", "FiletCabillaud", "Crevette") sample_data(food)$EnvType <- factor(sample_data(food)$EnvType, levels = correct.order) levels(get_variable(food, "EnvType")) ## Change otu count, or taxonomic affiliation otu_table(food)["otu_00520", "DLT0.LOT08"] <- 0 tax_table(food)["otu_00520", "Species"] <- "Ornithinolytica" ######################### ## ## ## DATA MANIPULATION ## ## (FILTER, SMOOTHING) ## ## ## ######################### ## Filters, subset, prune samplesToKeep <- sample_names(food)[1:10] prune_samples(samplesToKeep, food) subset_samples(food, EnvType %in% c("DesLardons", "MerguezVolaille")) small.food <- subset_taxa(food, Phylum == "Firmicutes" & Class == "Bacilli") head(tax_table(small.food)[ , c("Phylum", "Class", "Order")]) ## Unique combinations (Phylum, Class) unique(tax_table(small.food)[ , c("Phylum", "Class")]) ## aggregation mergedData <- tax_glom(food, "Phylum") ntaxa(mergedData) ## number of different phyla tax_table(mergedData)[1:2, c("Phylum", "Order", "Class")] ## rarefaction and count transformation foodRare <- rarefy_even_depth(food, rngseed = 1121983) sample_sums(foodRare)[1:5] count_to_prop <- function(x) { return( x / sum(x) )} foodTrans <- transform_sample_counts(food, count_to_prop) sample_sums(foodTrans)[1:5] ## should be 1 ######################### ######################### ## ## ## DATA VISUALIZATION ## ## ## ######################### ######################### ################### ## ## ## COMPOSITION ## ## ## ################### ## plot_bar (v1) p <- plot_bar(food) plot(p) ## plot_bar (v2) p <- plot_bar(food, fill = "Phylum") + facet_wrap(~EnvType, scales = "free_x", nrow = 1) plot(p) ## plot_composition (Phylum within Bacteria) p <- plot_composition(food, "Kingdom", "Bacteria", "Phylum", 5, fill = "Phylum") p <- p + facet_wrap(~EnvType, scales = "free_x", nrow = 1) plot(p) ## plot_composition (Family within Proteobacteria) p <- plot_composition(food, "Phylum", "Proteobacteria", "Family", 9, fill = "Family") p <- p + facet_wrap(~EnvType, scales = "free_x", nrow = 1) plot(p) ##################### ## ## ## ALPHA DIVERSITY ## ## RICHNESS ## ## ## ##################### ## plot_richness plot_richness(food) p <- plot_richness(food, x = "EnvType", color = "EnvType", measures = c("Observed", "Chao1", "Shannon", "Simpson", "InvSimpson")) plot(p) p <- p + geom_boxplot(aes(fill = EnvType), alpha = 0.2) + theme_bw() + geom_point() + theme(axis.text.x = element_blank()) plot(p) plot_richness(food, x = "FoodType", color = "FoodType", measures = c("Observed", "Chao1", "Shannon", "Simpson", "InvSimpson")) + geom_boxplot(aes(fill = FoodType), alpha = 0.2) + theme_bw() + geom_point() + theme(axis.text.x = element_blank()) ## plot_richness (after trimming) plot_richness(prune_taxa(taxa_sums(food) >= 500, food), color = "EnvType", measures = c("Observed", "Chao1", "Shannon", "Simpson", "InvSimpson"), x = "EnvType") + geom_boxplot(aes(fill = EnvType), alpha = 0.2) + theme_bw() + geom_point() + theme(axis.text.x = element_blank()) ## Numeric values of alpha diversity indices alpha.diversity <- estimate_richness(food, measures = c("Observed", "Chao1", "Shannon")) head(alpha.diversity) ## export diversity to text file ## write.table(alpha.diversity, "myfile.txt") ## Anova on observed richness data <- cbind(sample_data(food), alpha.diversity) food.anova <- aov(Observed ~ EnvType, data) summary(food.anova) ## significant effect of environment type on richness food.anova <- aov(Shannon ~ EnvType, data) summary(food.anova) #################### ## ## ## BETA DIVERSITY ## ## DISTANCES ## ## ## #################### ## Generic method for distance dist.bc <- distance(food, method = "bray") ## Bray-Curtis ## all available distances distanceMethodList ## compute other matrix distances dist.jac <- distance(food, "cc") dist.uf <- distance(food, method = "unifrac") ## Unifrac dist.wuf <- distance(food, method = "wunifrac") ## Weighted Unifrac ## some code to visualize distances sampleOrder <- levels(reorder(sample_names(food), as.numeric(get_variable(food, "EnvType")))) plot_dist_as_heatmap <- function(dist, order = sampleOrder, title = NULL) { data <- melt(as(dist, "matrix")) colnames(data) <- c("x", "y", "distance") if (!is.null(order)) { data$x <- factor(data$x, levels = order) data$y <- factor(data$y, levels = order) } p <- ggplot(data, aes(x = x, y = y, fill = distance)) + geom_tile() p <- p + theme(axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank() ) p <- p + scale_fill_continuous(limits = c(0, 1)) if (!is.null(title)) { p <- p + ggtitle(title) } return(p) } ## Compare Bray-Curtis (bray) and Jaccard (cc) p1 <- plot_dist_as_heatmap(dist.bc, title = "Bray-Curtis") plot(p1) p2 <- plot_dist_as_heatmap(dist.jac, title = "Jaccard") grid.arrange(p1, p2, ncol = 2, nrow = 1) ## Compare unifrac and wunifrac p3 <- plot_dist_as_heatmap(dist.uf, title = "Unifrac") p4 <- plot_dist_as_heatmap(dist.wuf, title = "wUnifrac") grid.arrange(p3, p4, ncol = 2, nrow = 1) ## compare Bray-Curtis, Jaccard, Unifrac and wUnifrac grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2) ################ ## ## ## ORDINATION ## ## ## ################ ## Ordination (MDS/PCoA is the same thing) ord <- ordinate(food, method = "MDS", distance = "bray") ## Alternative way to compute ordinations ## dist.bc <- distance(food, method = "bray") ## ord <- ordinate(food, method = "MDS", distance = dist.bc) ## plot_ordination plot_ordination(food,ord) p <- plot_ordination(food, ord, color = "EnvType") p <- p + theme_bw() + ggtitle("MDS + BC") plot(p) ## plot_samples (add group means to the plot) p <- plot_samples(food, ord, color = "EnvType", replicate = "EnvType") p <- p + theme_bw() + ggtitle("MDS + BC") plot(p) ## Compare ordinations obtained from Bray-Curtis, Jaccard, Unifrac and wUnifrac distances p1 <- plot_samples(food, ordinate(food, "MDS", "bray"), color = "EnvType") + ggtitle("MDS + BC") + theme_bw() p2 <- plot_samples(food, ordinate(food, "MDS", "cc"), color = "EnvType") + ggtitle("MDS + Jaccard") + theme_bw() p3 <- plot_samples(food, ordinate(food, "MDS", "unifrac"), color = "EnvType") + ggtitle("MDS + UF") + theme_bw() p4 <- plot_samples(food, ordinate(food, "MDS", "wunifrac"), color = "EnvType") + ggtitle("MDS + wUF") + theme_bw() grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2) ################ ## ## ## CLUSTERING ## ## ## ################ ## General method for clustering samples ## clustering <- hclust(distance.matrix, method = "linkage.function") ## plot(clustering) ## Clustering of samples (from UniFrac distances) using different linkage method par(mfcol = c(1,3)) ## To plot the three clustering trees side-by-side plot(hclust(dist.uf, method = "complete")) plot(hclust(dist.uf, method = "ward.D2")) plot(hclust(dist.uf, method = "single")) ## Better dendrograms, color leaves (samples) according to EnvType ## Env types envtype <- get_variable(food, "EnvType") ## automatic color palette: one color per different sample type palette <- hue_pal()(length(levels(envtype))) ## Map sample type to color tipColor <- col_factor(palette, levels = levels(envtype))(envtype) ## Change hclust object to phylo object and plot clust.uf <- as.phylo(hclust(dist.uf, method = "ward.D2")) par(mfcol = c(1,1)) plot(clust.uf, tip.color = tipColor, direction = "downwards", main = "Ward linkage") ## To plot each linkage method side-by-side par(mfcol= c(1,3)) clust.uf <- as.phylo(hclust(dist.uf, method = "single")) plot(clust.uf, tip.color = tipColor, direction = "downwards", main = "Single linkage") clust.uf <- as.phylo(hclust(dist.uf, method = "ward.D2")) plot(clust.uf, tip.color = tipColor, direction = "downwards", main = "Ward linkage") clust.uf <- as.phylo(hclust(dist.uf, method = "complete")) plot(clust.uf, tip.color = tipColor, direction = "downwards", main = "Complete linkage") ################ ## ## ## HEATMAP ## ## ## ################ ## heatmap (v1) p <- plot_heatmap(food) plot(p) ## heatmap (v2, with more usual colors) p <- plot_heatmap(food, low = "yellow", high = "red", na.value = "white") plot(p) ## heatmap (v3, with more usual colors and samples organized by EnvType) p <- plot_heatmap(food, low = "yellow", high = "red", na.value = "white") p <- p + facet_grid(~EnvType, scales = "free_x") plot(p) ## heatmap (v4, alternative color scheme) p <- plot_heatmap(food) p <- p + scale_fill_gradient2(low = "#1a9850", mid = "#ffffbf", high = "#d73027", na.value = "white", trans = log_trans(4), midpoint = log(100, base = 4)) p <- p + facet_grid(~EnvType, scales = "free_x") plot(p) ################## ## ## ## MULTIVARIATE ## ## ANOVA ## ## ## ################## ## MANOVA with CAP (Canonical Analysis of Principal Coordinates) ## CAP through capscale metadata <- as(sample_data(food), "data.frame") ## convert sample_data to data.frame cap <- capscale(dist.uf ~ EnvType, data = metadata) cap anova <- anova(cap, permutations = 999) print(anova) ## MANOVA with CAP (Canonical Analysis of Principal Coordinates) metadata <- as(sample_data(food), "data.frame") adonis(dist.uf ~ EnvType, data = metadata, perm = 9999) ##################### ##################### ## ## ## ADVANCED USE ## ## ## ##################### ##################### ################## ## ## ## MANUAL ## ## IMPORT ## ## ## ################## ## Import raw data in R sampledata <- read.csv("data/manual/sampledata.tsv", sep = "\t", row.names = 1) taxtable <- read.csv("data/manual/taxtable.tsv", sep = "\t", row.names = 1) otutable <- read.csv("data/manual/otutable.tsv", sep = "\t", row.names = 1) tree <- read.tree("data/manual/tree.phy") ## Convert to R base data types taxtable <- as.matrix(taxtable) otutable <- as.matrix(otutable) ## Convert to phyloseq base data types sampledata <- sample_data(sampledata) taxtable <- tax_table(taxtable) otutable <- otu_table(otutable, taxa_are_rows = TRUE) ## Check name consistency (samples) all(colnames(otutable) %in% rownames(sampledata)) ## OTU table and metadata ## Check name consistency (taxa) all(rownames(otutable) %in% rownames(taxtable)) ## OTU table and Taxonomy table all(rownames(otutable) %in% tree$tip.label) ## OTU table and tree leaves ## Construct phyloseq object manualData <- phyloseq(sampledata, otutable, taxtable, tree) manualData ################## ## ## ## ADVANCED ## ## FILTERS ## ## ## ################## ## Sample wise condition (example I): filter taxa present in at least 5 samples condition <- function(x) { x > 0 } taxaToKeep <- genefilter_sample(food, condition, 5) prune_taxa(taxaToKeep, food) ## Sample wise condition (example II): filter taxa in the top 250 taxa present in at least 3 samples condition <- function(x) { order(x, decreasing = TRUE) <= 250 } taxaToKeep <- genefilter_sample(food, condition, 3) prune_taxa(taxaToKeep, food) ## sample wide condition (example I): filter taxa present in at least 5 samples condition <- function(x) { sum(x > 0) >= 5 } taxaToKeep <- filter_taxa(food, condition) prune_taxa(taxaToKeep, food) ## sample wide condition (example II): filter taxa present with global abundance at least 100 condition <- function(x) { sum(x) >= 100 } taxaToKeep <- filter_taxa(food, condition) prune_taxa(taxaToKeep, food) ################## ## ## ## ADVANCED ## ## SMOOTHING ## ## ## ################## ## Merge all samples according to some metadata mergedData <- merge_samples(food, "EnvType") mergedData sample_names(mergedData) ## Beware, some information is lost in the process sample_data(mergedData)[1:2, ] sample_data(food)[1:2, ] ## Merge taxa at at given height in the tree mergedData <- tip_glom(food, h = 0.3) ################## ## ## ## RAREFACTION ## ## CURVES ## ## ## ################## ## Rarefy samples to same depth and compute rarefaction curves food <- rarefy_even_depth(food, rngseed = 1121983) p <- ggrare(food, step = 1000, color = "EnvType", se = FALSE) p <- p + facet_wrap(~EnvType, ncol = 4) + theme_bw() plot(p) ################### ## ## ## CUSTOM COLOR ## ## PALETTE ## ## ## ################### # check EnvType order levels(sample_data(food)$EnvType) # if you want to change the order ## correct.order <- c("BoeufHache", "VeauHache", "DesLardons", ## "MerguezVolaille", "SaumonFume", "FiletSaumon", ## "FiletCabillaud", "Crevette") ##sample_data(food)$EnvType <- factor(sample_data(food)$EnvType, ## levels = correct.order) # desired color palette foodPalette <- c('#61001f','#b2182b','#d6604d','#f4a582','#92c5de','#4393c3','#2166ac','#053061') # associated EnvType level names(foodPalette) <- levels(sample_data(food)$EnvType) foodPalette ## Try this palette on clustering ## Map sample type to color tipColor <- col_factor(foodPalette, levels = levels(envtype))(envtype) ## Change hclust object to phylo object and plot clust.uf <- as.phylo(hclust(dist.uf, method = "ward.D2")) par(mfcol = c(1,1)) plot(clust.uf, tip.color = tipColor, direction = "downwards", main = "Ward linkage") ## Try this on any ggplot food <- rarefy_even_depth(food, rngseed = 1121983) p <- ggrare(food, step = 1000, color = "EnvType", se = FALSE) p <- p + facet_wrap(~EnvType, ncol = 4) + theme_bw() p <- p + scale_color_manual(values = foodPalette) plot(p) ################### ## ## ## CUSTOM ORDER ## ## OTU IN CLUST ## ## ## ################### # custom otu order in function of prevalence prevalence <- estimate_prevalence(food,"EnvType") # get in a different format to estimate correlations correlationData <- estimate_prevalence(food, "EnvType", format = "wide") correlationData <- t(correlationData) correlation <- cor(correlationData, method="pearson") ##clustering sample and order otu according to the tree otu.clust <- hclust(as.dist(1-correlation), method = "complete") otuOrder <- otu.clust$labels[otu.clust$order] ## plot p <- plot_heatmap(food, taxa.order = otuOrder) p <- p + facet_grid(~EnvType, scales = "free", space = "free") p <- p + scale_fill_gradient2 (low="#1a9850", mid = "#ffffbf", high="#d73027", na.value = "white", trans = log_trans(4), midpoint = log(100,base= 4 )) plot(p)