library(cluster) library(fpc) library(seriation) data1 = read.table(file='data.csv', sep=',', header=T) group <- data1$group data1$group <- NULL kdata <- na.omit(data1) kdata <- scale(kdata) wss <- (nrow(kdata)-1)*sum(apply(kdata,2,var)) for (i in 2:15) wss[i] <- sum(kmeans(kdata, centers=i)$withinss) print('Testing K-Means solutions') plot(1:15, wss, type="b", xlab="Number of Clusters", ylab="Within Groups Sum of Squares") suggest <- pamk(kdata) print('Determining suggested number of clusters based on cluster distortion') print(suggest$nc) choose.clust <- function(){readline("What clustering solution would you like to use? ")} clust.level <- as.integer(choose.clust()) fit <- kmeans(kdata, clust.level) aggregate(kdata, by=list(fit$cluster), FUN=mean) clust.out <- fit$cluster kclust <- as.matrix(cbind(clust.out, kdata)) write.table(kclust, file="kmeans_out.csv", sep=",") print('Applying K-means cluster level and appending files') # BAR PLOT par(ask=TRUE) print('Creating bar chart of clusters by Group') b.plot <- table(clust.out, group) b.plot.mat <- as.matrix(b.plot) b.plot.per <- prop.table(b.plot.mat, margin=2)*100 barplot(b.plot.per, main="K-means Clusters by Group", ylim=c(0,100), ylab="Percent", beside=TRUE, cex.names=0.5, col=rainbow(clust.level)) # PLOT CONFIDENCE INTERVALS OVER MDS PLOT par(ask=TRUE) print('Plotting confidence intervals over first two MDS dimensions') clusplot(kdata, fit$cluster, shade=F, labels=0, lines=0, color=T) # DISSIMILARITY PLOT print('Creating dissimilarity plot') d <- dist(kdata,method='euclidean') l <- fit$cluster res <- dissplot(d,labels=l,options=list(main='Dissimilarity Plot - Standard')) #OUTPUT print('Sending all output to PDF file') pdf(file="kmeans_output.pdf") plot(1:15, wss, type="b", xlab="Number of Clusters", ylab="Within Groups Sum of Squares") barplot(b.plot.per, main="K-means Clusters by Group", ylim=c(0,100), ylab="Percent", beside=TRUE, cex.names=0.5, col=rainbow(clust.level)) clusplot(kdata, fit$cluster, shade=F, labels=0, lines=0, color=T) plot(res, options=list(main='Dissimilarity Plot - Standard')) dev.off()