Wednesday 22 May 2019

Clustering: a toy example


# Toy data: customer age vs. spend
library(ggplot2)
df <- data.frame(age = c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39, 40, 41, 42, 44, 46, 47, 48, 49, 54),
                 spend = c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44, 27, 29, 20, 28, 21, 30, 31, 23, 24))

# Scatter plot of the raw data
ggplot(df, aes(x = age, y = spend)) + geom_point()
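
To see clustering at work on this toy data, here is a minimal k-means sketch (the choice of two clusters, the seed, and the scaling step are assumptions added for illustration):

# k-means on the scaled toy data; k = 2 is an assumption for illustration
set.seed(42)
toy_km <- kmeans(scale(df), centers = 2, nstart = 25)

# Colour the scatter plot by the assigned cluster
ggplot(df, aes(x = age, y = spend, color = factor(toy_km$cluster))) +
  geom_point() +
  labs(color = "cluster")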



Clustering in R

library(tidyverse)   # data manipulation
library(cluster)     # clustering algorithms
library(factoextra)  # clustering visualization

#################################

View(USArrests)
df <- USArrests
df <- na.omit(df)  # remove any missing values
df <- scale(df)    # standardise each variable to mean 0 and sd 1
head(df)
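
Because k-means relies on distances, the scaling step matters. A quick sanity check (a small sketch, not in the original notes) that each column now has mean 0 and standard deviation 1:

# Scaled columns should have mean ~0 and sd 1
round(colMeans(df), 10)
apply(df, 2, sd)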

#################################

distance <- get_dist(df)  # pairwise (Euclidean) distances between states

# Heat map of the distance matrix
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

#################################


# k-means with 2 clusters; nstart runs several random starts and keeps the best solution
k2 <- kmeans(df, centers = 2, nstart = 25)
str(k2)

fviz_cluster(k2, data = df)
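
A quick look at what the fitted kmeans object contains (a small sketch; these components are part of the standard kmeans return value):

# Cluster sizes and the (scaled) cluster centres
k2$size
k2$centers

# Per-state cluster assignments
head(k2$cluster)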

#################################


# Label each state with its cluster and plot Murder against UrbanPop
library(dplyr)
library(magrittr)
df %>%
  as_tibble() %>%
  mutate(cluster = k2$cluster,
         state = row.names(USArrests)) %>%
  ggplot(aes(UrbanPop, Murder, color = factor(cluster), label = state)) +
  geom_text()
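
To interpret the clusters on the original (unscaled) variables, the per-cluster means can be computed; a sketch using dplyr (this summary step is an addition, not part of the original notes):

# Mean of each variable per cluster, on the original scale
USArrests %>%
  mutate(cluster = k2$cluster) %>%
  group_by(cluster) %>%
  summarise(across(everything(), mean))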

###################################




k3 <- kmeans(df, centers = 3, nstart = 25)
k4 <- kmeans(df, centers = 4, nstart = 25)
k5 <- kmeans(df, centers = 5, nstart = 25)

# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = df) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point",  data = df) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point",  data = df) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point",  data = df) + ggtitle("k = 5")

library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow = 2)

######################################

Clustering optimization: choosing the number of clusters





set.seed(123)

# Gap statistic: compares within-cluster dispersion to B reference (null) datasets for k = 1..K.max
gap_stat <- clusGap(df, FUN = kmeans, nstart = 25,
                    K.max = 10, B = 50)

# Print the result
print(gap_stat, method = "firstmax")

fviz_gap_stat(gap_stat)
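
The gap statistic is only one way to choose k. factoextra also provides the elbow (total within-cluster sum of squares) and average silhouette methods; the two calls below are additions for comparison, not part of the original notes:

# Elbow method: look for a bend in the within-cluster sum of squares
fviz_nbclust(df, kmeans, method = "wss")

# Average silhouette method: higher average width is better
fviz_nbclust(df, kmeans, method = "silhouette")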

# Fit the final model with the chosen number of clusters (e.g. k = 4) and inspect the assignments
final <- kmeans(df, centers = 4, nstart = 25)
final$cluster

# Inspect the states assigned to cluster 1
View(df[final$cluster == 1, ])
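
The final solution can be visualised the same way as the earlier fits (a small sketch reusing fviz_cluster):

# Visualise the final k-means solution
fviz_cluster(final, data = df)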

#############################################

Hierarchical Cluster Analysis

# Dissimilarity matrix
d <- dist(df, method = "euclidean")

# Hierarchical clustering using Complete Linkage
hc1 <- hclust(d, method = "complete" )

# Plot the obtained dendrogram
plot(hc1, cex = 0.6, hang = -1)
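
To turn the dendrogram into concrete cluster assignments, the tree can be cut at a chosen number of groups; a sketch (cutting at k = 4 is an assumption for illustration):

# Cut the tree into 4 groups and count the states in each
sub_grp <- cutree(hc1, k = 4)
table(sub_grp)

# Draw the corresponding rectangles on the dendrogram just plotted
rect.hclust(hc1, k = 4, border = 2:5)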

##########################################

# Compute distance matrix
res.dist <- dist(df, method = "euclidean")

# Compute 2 hierarchical clusterings
hc1 <- hclust(res.dist, method = "complete")
hc2 <- hclust(res.dist, method = "ward.D2")

# Create two dendrograms
dend1 <- as.dendrogram(hc1)
dend2 <- as.dendrogram(hc2)

# tanglegram() comes from the dendextend package
library(dendextend)
tanglegram(dend1, dend2)
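
dendextend can also quantify how well the two trees agree; a short sketch using its entanglement measure (this check is an addition to the original notes):

# Entanglement: 0 means the dendrograms are perfectly aligned, 1 means fully entangled
entanglement(dend1, dend2)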





