Unsupervised Learning in R

Table of Contents

1. Unsupervised Learning in R

1.1 Welcome to the course (video)

1.2 Identify clustering problems

1.3 Introduction to k-means clustering (video)

1.4 k-means clustering

Instruction:

# Create the k-means model: km.out
km.out <- kmeans(x, centers = 3, nstart = 20)

# Inspect the result
summary(km.out)

1.5 Results of kmeans()

Instruction:

# Print the cluster membership component of the model
km.out$cluster

# Print the km.out object
print(km.out)

1.6 Visualizing and interpreting results of kmeans()

Instruction:

# Scatter plot of x
plot(x, col = km.out$cluster,
     main = "k-means with 3 clusters",
     xlab = "",
     ylab = "")

1.7 How kmeans() works and practical matters (video)

1.8 Handling random algorithms

Instruction:

# Set up 2 x 3 plotting grid
par(mfrow = c(2, 3))

# Set seed
set.seed(1)

for (i in 1:6) {
  # Run kmeans() on x with three clusters and one start
  km.out <- kmeans(x, centers = 3, nstart = 1)
  
  # Plot clusters
  plot(x, col = km.out$cluster, 
       main = km.out$tot.withinss, 
       xlab = "", ylab = "")
}
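
After the loop, it is good housekeeping to restore the default single-panel layout (a small addition, not part of the original exercise):

# Reset the plotting grid
par(mfrow = c(1, 1))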

1.9 Selecting number of clusters

Instruction:

# Initialize total within sum of squares error: wss
wss <- 0

# For 1 to 15 cluster centers
for (i in 1:15) {
  km.out <- kmeans(x, centers = i, nstart = 20)
  # Save total within sum of squares to wss variable
  wss[i] <- km.out$tot.withinss
}

# Plot total within sum of squares vs. number of clusters
plot(1:15, wss, type = "b", 
     xlab = "Number of Clusters", 
     ylab = "Within groups sum of squares")

# Set k equal to the number of clusters corresponding to the elbow location
k <- 2  # 3 is probably OK, too

1.10 Introduction to the Pokemon data (video)

1.11 Practical matters: working with real data

Instruction:

# Initialize total within sum of squares error: wss
wss <- 0

# Look over 1 to 15 possible clusters
for (i in 1:15) {
  # Fit the model: km.out
  km.out <- kmeans(pokemon, centers = i, nstart = 20, iter.max = 50)
  # Save the within cluster sum of squares
  wss[i] <- km.out$tot.withinss
}

# Produce a scree plot
plot(1:15, wss, type = "b", 
     xlab = "Number of Clusters", 
     ylab = "Within groups sum of squares")

# Select number of clusters
k <- 2

# Build model with k clusters: km.out
km.out <- kmeans(pokemon, centers = k, nstart = 20, iter.max = 50)

# View the resulting model
km.out

# Plot of Defense vs. Speed by cluster membership
plot(pokemon[, c("Defense", "Speed")],
     col = km.out$cluster,
     main = paste("k-means clustering of Pokemon with", k, "clusters"),
     xlab = "Defense", ylab = "Speed")

1.12 Review of k-means clustering (video)

2. Hierarchical Clustering

2.1 Introduction to hierarchical clustering (video)

2.2 Hierarchical clustering with results

Instruction:

# Create hierarchical clustering model: hclust.out
hclust.out <- hclust(dist(x))

# Inspect the result
summary(hclust.out)
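
A summary() of an hclust object only lists its components; the more informative view is the dendrogram itself (a one-line addition, reusing hclust.out from above):

# Plot the dendrogram
plot(hclust.out)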

2.3 Selecting number of clusters (video)

2.4 Interpreting dendrogram
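
This exercise is about reading cluster counts off the dendrogram at different cut heights. A minimal sketch (reusing hclust.out from 2.2; the height 7 matches the cut used in 2.5):

# Draw the dendrogram with a candidate cut height
plot(hclust.out)
abline(h = 7, col = "red")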

2.5 Cutting the tree

Instruction:

# Cut by height
cutree(hclust.out, h = 7)


# Cut by number of clusters
cutree(hclust.out, k = 3)

2.6 Clustering linkage and practical matters (video)

2.7 Linkage method

Instruction:

# Cluster using complete linkage: hclust.complete
hclust.complete <- hclust(dist(x), method = "complete")

# Cluster using average linkage: hclust.average
hclust.average <- hclust(dist(x), method = "average")

# Cluster using single linkage: hclust.single
hclust.single <- hclust(dist(x), method = "single")

# Plot dendrogram of hclust.complete
plot(hclust.complete, main = "Complete")

# Plot dendrogram of hclust.average
plot(hclust.average, main = "Average")

# Plot dendrogram of hclust.single
plot(hclust.single, main = "Single")

2.8 Comparing linkage methods
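
One hedged way to compare the three linkages from 2.7 is to cut each tree into the same number of clusters and look at how balanced the cluster sizes are (complete and average linkage tend to produce more balanced clusters than single linkage):

# Cluster sizes under each linkage at k = 3
table(cutree(hclust.complete, k = 3))
table(cutree(hclust.average, k = 3))
table(cutree(hclust.single, k = 3))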

2.9 Practical matters: scaling

Instruction:

# View column means
colMeans(pokemon)

# View column standard deviations
apply(pokemon, 2, sd)

# Scale the data
pokemon.scaled <- scale(pokemon)

# Create hierarchical clustering model: hclust.pokemon
hclust.pokemon <- hclust(dist(pokemon.scaled), method = "complete")

2.10 Comparing kmeans() and hclust()

Instruction:

# Apply cutree() to hclust.pokemon: cut.pokemon
cut.pokemon <- cutree(hclust.pokemon, k = 3)

# k-means model for comparison (assumed here, since the original notes
# never define km.pokemon; the course fits it with 3 centers)
km.pokemon <- kmeans(pokemon, centers = 3, nstart = 20, iter.max = 50)

# Compare methods
table(km.pokemon$cluster, cut.pokemon)

2.11 Review of hierarchical clustering (video)

3. Dimensionality Reduction with PCA

3.1 Introduction to PCA

3.2 PCA using prcomp()

Instruction:

# Perform scaled PCA: pr.out
pr.out <- prcomp(x = pokemon, scale. = TRUE, center = TRUE)

# Inspect model output
summary(pr.out)

3.3 Results of PCA

3.4 Additional results of PCA
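
These two exercises explore what prcomp() returns. A quick look at the main components of the fitted object (reusing pr.out from 3.2):

# Loadings: one column of weights per principal component
pr.out$rotation

# Standard deviation of each principal component
pr.out$sdev

# Means and standard deviations used for centering and scaling
pr.out$center
pr.out$scale

# The original data projected onto the components (scores)
head(pr.out$x)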

3.5 Visualizing and interpreting PCA results (video)

3.6 Interpreting biplots (1)

3.7 Interpreting biplots (2)
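
Both biplot exercises read from the same kind of plot, generated with one call (reusing pr.out from 3.2). Observations appear as points in the space of the first two components, and each original variable as an arrow given by its loadings:

# Biplot of the first two principal components
biplot(pr.out)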

3.8 Variance explained

Instruction:

# Variability of each principal component: pr.var
pr.var <- pr.out$sdev^2

# Variance explained by each principal component: pve
pve <- pr.var / sum(pr.var)

3.9 Visualize variance explained

Instruction:

# Plot variance explained for each principal component
plot(pve, xlab = "Principal Component",
     ylab = "Proportion of Variance Explained",
     ylim = c(0, 1), type = "b")

# Plot cumulative proportion of variance explained
plot(cumsum(pve), xlab = "Principal Component",
     ylab = "Cumulative Proportion of Variance Explained",
     ylim = c(0, 1), type = "b")

3.10 Practical issues with PCA (video)

3.11 Practical issues: scaling

Instruction:

# Mean of each variable
colMeans(pokemon)

# Standard deviation of each variable
apply(pokemon, 2, sd)

# PCA model with scaling: pr.with.scaling
pr.with.scaling <- prcomp(pokemon, scale. = TRUE)

# PCA model without scaling: pr.without.scaling
pr.without.scaling <- prcomp(pokemon, scale. = FALSE)

# Create biplots of both for comparison
biplot(pr.with.scaling)
biplot(pr.without.scaling)

3.12 Additional uses of PCA and wrap-up (video)

4. Putting It All Together with a Case Study

4.1 Introduction to the case study (video)

4.2 Preparing the data

Instruction:

url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1903/datasets/WisconsinCancer.csv"

# Download the data: wisc.df
wisc.df <- read.csv(url)

# Convert the features of the data: wisc.data
wisc.data <- as.matrix(wisc.df[, 3:32])

# Set the row names of wisc.data
row.names(wisc.data) <- wisc.df$id

# Create diagnosis vector
diagnosis <- as.numeric(wisc.df$diagnosis == "M")

4.3 Exploratory data analysis
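
The original notes give no code for this step; a minimal sketch of the usual first checks (column names such as radius_mean follow the WisconsinCancer.csv file loaded in 4.2):

# Number of observations and features
dim(wisc.data)

# How many observations are malignant?
table(diagnosis)

# How many features are suffixed with _mean?
sum(grepl("_mean$", colnames(wisc.data)))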

4.4 Performing PCA

Instruction:

# Check column means and standard deviations
colMeans(wisc.data)
apply(wisc.data, 2, sd)

# Execute PCA, scaling if appropriate: wisc.pr
wisc.pr <- prcomp(wisc.data, scale. = TRUE, center = TRUE)

# Look at summary of results
summary(wisc.pr)

4.5 Interpreting PCA results

Instruction:

# Create a biplot of wisc.pr
biplot(wisc.pr)

# Scatter plot observations by components 1 and 2
plot(wisc.pr$x[, c(1, 2)], col = (diagnosis + 1), 
     xlab = "PC1", ylab = "PC2")

# Repeat for components 1 and 3
plot(wisc.pr$x[, c(1, 3)], col = (diagnosis + 1), 
     xlab = "PC1", ylab = "PC3")

# Do additional data exploration of your choosing below (optional)
plot(wisc.pr$x[, c(2, 3)], col = (diagnosis + 1), 
     xlab = "PC2", ylab = "PC3")

4.6 Variance explained

Instruction:

# Set up 1 x 2 plotting grid
par(mfrow = c(1, 2))

# Calculate variability of each component
pr.var <- wisc.pr$sdev^2

# Variance explained by each principal component: pve
pve <- pr.var / sum(pr.var)


# Plot variance explained for each principal component
plot(pve, xlab = "Principal Component", 
     ylab = "Proportion of Variance Explained", 
     ylim = c(0, 1), type = "b")

# Plot cumulative proportion of variance explained
plot(cumsum(pve), xlab = "Principal Component", 
     ylab = "Cumulative Proportion of Variance Explained", 
     ylim = c(0, 1), type = "b")

4.7 Communicating PCA results
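
This exercise asks about the component loadings and about how many components are needed to explain a given share of variance. A hedged sketch of where to look (wisc.pr and pve are defined in the sections above):

# Loadings of each feature on the first principal component
wisc.pr$rotation[, 1]

# Number of components needed for at least 80% of the variance
which(cumsum(pve) >= 0.80)[1]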

4.8 PCA review and next steps (video)

4.9 Hierarchical clustering of case data

Instruction:

# Scale the wisc.data data: data.scaled
data.scaled <- scale(wisc.data)

# Calculate the (Euclidean) distances: data.dist
data.dist <- dist(data.scaled)

# Create a hierarchical clustering model: wisc.hclust
wisc.hclust <- hclust(data.dist, method = "complete")

4.10 Results of hierarchical clustering
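
This exercise asks at what height the dendrogram must be cut to obtain 4 clusters; the answer is read off the plot (the height drawn below is an illustration, not the required value):

# Inspect the dendrogram and a candidate cut height
plot(wisc.hclust)
abline(h = 20, col = "red")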

4.11 Selecting number of clusters

Instruction:

# Cut tree so that it has 4 clusters: wisc.hclust.clusters
wisc.hclust.clusters <- cutree(wisc.hclust, k = 4)

# Compare cluster membership to actual diagnoses
table(wisc.hclust.clusters, diagnosis)

4.12 k-means clustering and comparing results

Instruction:

# Create a k-means model on wisc.data: wisc.km
wisc.km <- kmeans(scale(wisc.data), centers = 2, nstart = 20)

# Compare k-means to actual diagnoses
table(wisc.km$cluster, diagnosis)
# Count the minority diagnosis in each cluster, i.e. the "disagreements"
sum(apply(table(wisc.km$cluster, diagnosis), 1, min))

# Compare k-means to hierarchical clustering
table(wisc.hclust.clusters, wisc.km$cluster)
# Same minority count, now between the two clusterings
sum(apply(table(wisc.hclust.clusters, wisc.km$cluster), 1, min))

4.13 Clustering on PCA results

Instruction:

# Create a hierarchical clustering model: wisc.pr.hclust
wisc.pr.hclust <- hclust(dist(wisc.pr$x[, 1:7]), method = "complete")

# Cut model into 4 clusters: wisc.pr.hclust.clusters
wisc.pr.hclust.clusters <- cutree(wisc.pr.hclust, k = 4)

# Compare to actual diagnoses
table(diagnosis, wisc.pr.hclust.clusters)

# Compare to k-means and hierarchical
table(diagnosis, wisc.hclust.clusters)
table(diagnosis, wisc.km$cluster)

4.14 Wrap-up and review
