#!/usr/bin/env Rscript ############################################################################### # # # Copyright (c) 2013 J. Craig Venter Institute. # # All rights reserved. # # # ############################################################################### # # # This program is free software: you can redistribute it and/or modify # # it under the terms of the GNU General Public License as published by # # the Free Software Foundation, either version 3 of the License, or # # (at your option) any later version. # # # # This program is distributed in the hope that it will be useful, # # but WITHOUT ANY WARRANTY; without even the implied warranty of # # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # # GNU General Public License for more details. # # # # You should have received a copy of the GNU General Public License # # along with this program. If not, see . # # # ############################################################################### ############################################################################### library('getopt') params=c( "input_file", "i", 1, "character", "output_file", "o", 1, "character", "hclust_method", "m", 1, "character", "dist_convert", "d", 2, "character", #optional conversion of similarity to distance matrix using offset provided "threshold_clusters", "h", 2, "character", #optional threshold of clusters to generate "num_clusters", "n", 2, "character" #optional number of clusters to generate ) opt=getopt(spec=matrix(params, ncol=4, byrow=TRUE), debug=FALSE) script_name=unlist(strsplit(commandArgs(FALSE)[4],"=")[1])[2] usage = paste ( "\nUsage:\n\n", script_name, "\n", " -i \n", " -o \n", " -n \n", " -h \n", " -d \n", " -m \n", "\n", "This script will generate genome medoids for the given number or threshold specified.\n", "\n", "\n") if((!length(opt$input_file)) | (!length(opt$output_file))){ cat(usage) q(status=-1) } ############################################################################### OutputFileName <- opt$output_file #OutputFileNamePDF <- paste(OutputFileName, ".pdf", sep="") #OutputFileNameTXT <- paste(OutputFileName, ".txt", sep="") OutputFileNameMED <- paste(OutputFileName, ".medoids", sep="") OutputFileNameNWK <- paste(OutputFileName, ".newick", sep="") options(width=160) #sink(OutputFileNameTXT) InputFileName <- opt$input_file cat("\nInput File Name: ", InputFileName, "\n\n") cat("\nOutput File Name: ", OutputFileName, "\n\n") if (is.null(opt$num_clusters)) { if (is.null(opt$threshold_clusters)) { cat("\nMust specify either -n or -h\n") cat(usage) q(status=-1) } else { ThreshMedoids <- as.numeric(opt$threshold_clusters) cat("\nThreshold of medoids to generate: ", ThreshMedoids, "\n\n") } } else { if (is.null(opt$threshold_clusters)) { NumMedoids <- as.integer(opt$num_clusters) cat("\nNumber of medoids to generate: ", NumMedoids, "\n\n") } else { cat("\nCannot specify both -n and -h\n") cat(usage) q(status=-1) } } HclustMethod <- opt$hclust_method cat("\nHclust Method: ", HclustMethod, "\n\n") ############################################################################### ############################################################################### # Load data - assumes all values in table are numeric to save memory on input colClasses="numeric" # header=TRUE assumes there is a header line with column genome names #row.names=1 specifies that the first column is a genome name for the row distance_dataframe <- read.delim(InputFileName, sep="\t", header=TRUE, check.names=FALSE, comment.char="", quote="", row.names=1, strip.white=TRUE) #print(distance_dataframe) num_genomes <- nrow(distance_dataframe) width_pdf <- ceiling(num_genomes / 3) if (width_pdf < 7) { width_pdf <- 7 } #pdf(OutputFileNamePDF, width=width_pdf) distance_matrix <- as.matrix(distance_dataframe) #print(distance_matrix) if (!is.null(opt$dist_convert)) { cat("\nConverting similarity matrix to distance matrix using threshold: ", opt$dist_convert, "\n\n") distance_matrix <- as.numeric(opt$dist_convert) - distance_matrix } dist_mat <- as.dist(distance_matrix) if (HclustMethod == "nj") { library('ape') nj_tree <- nj(dist_mat) write.tree(nj_tree,file=OutputFileNameNWK) q() } else { clus <- hclust(dist_mat, method = HclustMethod) } # This snippet of R code should take a complete linkage hclust object and generate a Newick formatted string (label:branch_length,label:branch_length) # where label is either a leaf node label or recursively replaced by a subtree to generate a binary tree # hclust is generating distances at each merge step # assumptions are that this distance should be equally divided between the two clusters (subtrees) being merged, # each subtree already has a partial distance equal to its height and leaf nodes have zero height # an initial join of two leaf nodes results in a subtree with two equal length branches of 1/2 the merge distance # new branch lengths equal 1/2 the merge distance - the subtree height num_labels <- nrow(clus$labels) num_merges <- nrow(clus$merge) subtree_height <- numeric(num_merges) merge_labels <- character(num_merges) for (i in 1:num_merges) { height <- clus$height[i] / 2.0 subtree_height[i] <- height if (clus$merge[i,1] < 0) { label1 <- clus$labels[-1 * clus$merge[i,1]] height1 <- height } else { label1 <- merge_labels[clus$merge[i,1]] height1 <- height - subtree_height[clus$merge[i,1]] } if (clus$merge[i,2] < 0) { label2 <- clus$labels[-1 * clus$merge[i,2]] height2 <- height } else { label2 <- merge_labels[clus$merge[i,2]] height2 <- height - subtree_height[clus$merge[i,2]] } merge_labels[i] <- paste("(", label1, ":", height1, ",", label2, ":", height2, ")", sep="") } write(merge_labels[num_merges],file=OutputFileNameNWK) #plot(clus) #plot(as.dendrogram(clus)) #print(clus) #print(clus$merge) #print(clus$height) #print(clus$labels) if (is.null(opt$num_clusters)) { clusters <- cutree(clus, h=ThreshMedoids) } else { clusters <- cutree(clus, k=NumMedoids) } #print(clusters) sum_check <- 0 NumMedoids <- max(clusters) size_cluster <- numeric(NumMedoids) for (i in 1:NumMedoids) { size_cluster[i] <- sum(clusters == i) sum_check <- sum_check + size_cluster[i] } #print(size_cluster) if (sum_check != num_genomes) { print("Sum of cluster sizes not equalt to number of genomes") print(sum_check) print(num_genomes) } cat(file=OutputFileNameMED, append=FALSE, "") clust.medoid = function(i, pdistmat, pclusters) { ind = (pclusters == i) if (sum(ind) <= 1) { return (rownames(distance_matrix)[ind]) ## medoid of a single object is the object } else { return(names(which.min(rowSums( pdistmat[ind, ind] )))) } } medoids <- sapply(unique(clusters), clust.medoid, distance_matrix, clusters) #print(medoids) for (i in 1:NumMedoids) { cat(medoids[i], size_cluster[i], file=OutputFileNameMED, append=TRUE, sep="\t") cat("\t", file=OutputFileNameMED, append=TRUE) cat(clus$labels[clusters == i], file=OutputFileNameMED, append=TRUE, sep=",") cat("\n", file=OutputFileNameMED, append=TRUE) }