# R version 2.9 # Arguments: # - groupNum: number of cluster # - inFilename: an input filename, the file is a tab-delimited text file # whose rows and columns are indexed by the experiment IDs and whose cells # contains the pairwise experiment dissimilarity scores # - outResFilename: an output filename, the file contains clustering # results with silhouette scores # - outIdFilename: an output filename, the file whose rows are experiment IDs # for a cluster, one cluster per row groupNum<-gsub("groupNum=","",grep("groupNum=",commandArgs(),value=T)) groupNum<-as.numeric(groupNum) inFilename<-gsub("inFilename=","",grep("inFilename=",commandArgs(),value=T)) outResFilename<-gsub("outResFilename=","",grep("outResFilename=",commandArgs(),value=T)) outIdFilename<-gsub("outIdFilename=","",grep("outIdFilename=",commandArgs(),value=T)) # Load cluster library library(cluster) packageDescription("cluster") ############################################## # write clustering results ############################################## writeStudyIDs<-function(res, filename, gNum) { for(i in 1:gNum) { ids<-rownames(subset(res,res[,1]==i)) write(ids,filename,ncolumn=length(ids), append=TRUE, sep="\t") } } ############################################## # load matrix file ############################################## data<-read.table(inFilename,strip.white=TRUE, sep ="\t", header=TRUE) # Retrieve data without studyID column subset<-as.matrix(data[,2:ncol(data)]) # Calculate distance matrix using default distance measure method matrix_dist<-as.dist(subset) # Hierarchical cluster analysis # The agglomeration method should be (an unambiguous abbreviation of) one of # "ward", "single", "complete", "average", "mcquitty", "median" or "centroid". hclust<-hclust(matrix_dist, method="average") ####################################################################################### # cut a hierarchical cluster tree into several groups by specifying the desired number ####################################################################################### hclustK<-cutree(hclust, k=groupNum) # Compute or Extract Silhouette Information from Clustering sil.hclust<-silhouette(x=hclustK, dist=matrix_dist) sorted_sil<-sortSilhouette(sil.hclust) # create a matrix to store result with study IDs rowNum<-nrow(sorted_sil) colNum<-ncol(sorted_sil)+1 resData <- matrix(0, nrow=rowNum, ncol=colNum) # copy cluster data to new created matrix and add study ids from loaded data for(i in 1:rowNum) { resData[i,1]<-colnames(subset)[as.integer(rownames(sorted_sil)[i])] for(j in 2:colNum) { resData[i,j]<-sorted_sil[i,j-1] } } colnames(resData)<-c("Study_id","cluster","neighbor","sil_width") # write result in a file write.table(resData, outResFilename, quote=FALSE, sep="\t", row.names = FALSE) cbind(hclustK) # write experiment IDs for each cluster per row in a file writeStudyIDs(cbind(hclustK),outIdFilename, groupNum)