![copyright.gif](https://i-blog.csdnimg.cn/blog_migrate/f4bb33a44367c6812a5edbbde4d0b37e.gif)
相似性度量
相似性和相异性被许多数据挖掘技术所使用,如聚类、最近邻分类、异常检测等。不同组样本之间的相似度是样本间差异程度的数值度量,两组样本越相似,它们的相异度就越低,相似度越高。通常用各种“距离”来衡量样本(观测值)的相似性,用相似系数来衡量指标(变量)的相似性。
原理详细讲解和网页(JS)计算实现,见银河统计相似性度量 - 数据挖掘算法。
R和Python计算实现见下文。
目录概览
A) "距离"计算
1、欧氏距离(Euclidean Distance)
2、曼哈顿距离(绝对值距离)(Manhattan Distance)
3、切比雪夫距离(Chebyshev Distance)
4、闵氏距离(Minkowski Distance)
5、马氏距离(Mahalanobis Distance)
B) 相似系数计算
1、皮尔逊相关系数(Pearson Correlation Coefficient)
2、斯皮尔曼秩相关系数(Spearman Rank Correlation)
3、肯德尔秩相关系数(Kendall Rank Correlation)
4、余弦相似度(Cosine Similarity)
A) "距离"计算函数封装
B) 相似系数计算函数封装
Data - 10名学生六门课程成绩表
序号 | 概率论 | 统计学 | 英语 | 政治 | 数据挖掘 | 线性代数 |
---|---|---|---|---|---|---|
1 | 67 | 63 | 73 | 75 | 44 | 91 |
2 | 74 | 69 | 66 | 94 | 81 | 55 |
3 | 76 | 93 | 93 | 79 | 71 | 27 |
4 | 65 | 38 | 85 | 85 | 61 | 45 |
5 | 80 | 39 | 48 | 75 | 41 | 52 |
6 | 72 | 80 | 70 | 88 | 86 | 43 |
7 | 60 | 50 | 91 | 95 | 42 | 64 |
8 | 77 | 49 | 69 | 50 | 89 | 55 |
9 | 65 | 89 | 50 | 70 | 99 | 85 |
10 | 78 | 41 | 55 | 89 | 71 | 28 |
A) "距离"计算
1、欧氏距离(Euclidean Distance)
Code
# Demo: Euclidean distance between the scores of students 3 and 5.
# Copy the score table (including its header row) to the clipboard first.
options(digits = 4)
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
mydata <- read.table("clipboard", header = TRUE)
class(mydata)
dim(mydata)
head(mydata)
# Scores of student 3 (columns 2-7 hold the six course scores)
A <- mydata[3, 2:7]
# Scores of student 5
B <- mydata[5, 2:7]
# Stack both students into a two-row table and compute the distance.
x <- rbind(A, B)
# diag / upper only change how the dist object is printed, not its value.
D35 <- dist(x, method = "euclidean", diag = FALSE, upper = FALSE)
D35
D35 <- dist(x, method = "euclidean", diag = TRUE, upper = TRUE)
D35
D35 <- dist(x, method = "euclidean", diag = FALSE, upper = TRUE)
D35
D35 <- dist(x, method = "euclidean", diag = TRUE, upper = FALSE)
D35
class(D35)
cat("欧氏距离 =", D35, "\n")
Result
>
> options(digits=4)
> mydata <- read.table("clipboard",header=T)
> class(mydata)
[1] "data.frame"
> dim(mydata)
[1] 10 7
> head(mydata)
序号 概率论 统计学 英语 政治 数据挖掘 线性代数
1 1 67 63 73 75 44 91
2 2 74 69 66 94 81 55
3 3 76 93 93 79 71 27
4 4 65 38 85 85 61 45
5 5 80 39 48 75 41 52
6 6 72 80 70 88 86 43
> # 第3名学生成绩
> A <- mydata[3,2:7]
> A
概率论 统计学 英语 政治 数据挖掘 线性代数
3 76 93 93 79 71 27
> # 第5名学生成绩
> B <- mydata[5,2:7]
> B
概率论 统计学 英语 政治 数据挖掘 线性代数
5 80 39 48 75 41 52
> # 第3名和第5名学生成绩之间的欧氏距离
> x <- rbind(A, B)
> x
概率论 统计学 英语 政治 数据挖掘 线性代数
3 76 93 93 79 71 27
5 80 39 48 75 41 52
> D35 <- dist(x, method = "euclidean", diag = FALSE, upper = FALSE)
> D35
3
5 80.61
> D35 <- dist(x, method = "euclidean", diag = TRUE, upper = TRUE)
> D35
3 5
3 0.00 80.61
5 80.61 0.00
> D35 <- dist(x, method = "euclidean", diag = FALSE, upper = TRUE)
> D35
3 5
3 80.61
5 80.61
> D35 <- dist(x, method = "euclidean", diag = TRUE, upper = FALSE)
> D35
3 5
3 0.00
5 80.61 0.00
> class(D35)
[1] "dist"
> cat("欧氏距离 =", D35, "\n")
欧氏距离 = 80.61
>
Explanation
1.读取"剪贴板"中的数据到R变量mydata中。【首先,复制数据Data,然后,运行Code程序!】
mydata <- read.table("clipboard",header=T)
2.距离计算
dist(x, method = "euclidean", diag = FALSE, upper = FALSE, p = 2)
其中
x 是样本矩阵或者数据框;
method表示计算哪种距离;
diag为TRUE的时候给出对角线上的距离;
upper为TRUE的时候给出上三角矩阵上的值。
method的取值有:
euclidean 欧几里德距离(欧氏距离)(Euclidean Distance)
manhattan 曼哈顿距离(绝对值距离)(Manhattan Distance)
maximum 切比雪夫距离(Chebyshev Distance)
minkowski 闵可夫斯基距离(Minkowski Distance)(要指定p值)
canberra 兰氏距离(Canberra Distance)
2、曼哈顿距离(绝对值距离)(Manhattan Distance)
Code
# Demo: Manhattan (absolute-value) distance between students 3 and 5.
# Copy the score table (including its header row) to the clipboard first.
options(digits = 4)
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
mydata <- read.table("clipboard", header = TRUE)
class(mydata)
dim(mydata)
head(mydata)
# Scores of student 3 (columns 2-7 hold the six course scores)
A <- mydata[3, 2:7]
# Scores of student 5
B <- mydata[5, 2:7]
# Stack both students into a two-row table and compute the distance.
x <- rbind(A, B)
# diag / upper only change how the dist object is printed, not its value.
D35 <- dist(x, method = "manhattan", diag = FALSE, upper = FALSE)
D35
D35 <- dist(x, method = "manhattan", diag = TRUE, upper = TRUE)
D35
D35 <- dist(x, method = "manhattan", diag = FALSE, upper = TRUE)
D35
D35 <- dist(x, method = "manhattan", diag = TRUE, upper = FALSE)
D35
class(D35)
cat("曼哈顿距离 =", D35, "\n")
Result
>
> D35 <- dist(x, method = "manhattan", diag = FALSE, upper = FALSE)
> D35
3
5 162
> class(D35)
[1] "dist"
> cat("曼哈顿距离 =", D35, "\n")
曼哈顿距离 = 162
>
3、切比雪夫距离(Chebyshev Distance)
Code
# Demo: Chebyshev (maximum) distance between students 3 and 5.
# Copy the score table (including its header row) to the clipboard first.
options(digits = 4)
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
mydata <- read.table("clipboard", header = TRUE)
class(mydata)
dim(mydata)
head(mydata)
# Scores of student 3 (columns 2-7 hold the six course scores)
A <- mydata[3, 2:7]
# Scores of student 5
B <- mydata[5, 2:7]
# Stack both students into a two-row table and compute the distance.
x <- rbind(A, B)
# diag / upper only change how the dist object is printed, not its value.
D35 <- dist(x, method = "maximum", diag = FALSE, upper = FALSE)
D35
D35 <- dist(x, method = "maximum", diag = TRUE, upper = TRUE)
D35
D35 <- dist(x, method = "maximum", diag = FALSE, upper = TRUE)
D35
D35 <- dist(x, method = "maximum", diag = TRUE, upper = FALSE)
D35
class(D35)
cat("切比雪夫距离 =", D35, "\n")
Result
>
> D35 <- dist(x, method = "maximum", diag = FALSE, upper = FALSE)
> D35
3
5 54
> class(D35)
[1] "dist"
> cat("切比雪夫距离 =", D35, "\n")
切比雪夫距离 = 54
>
4、闵氏距离(Minkowski Distance)
Code
# Demo: Minkowski distance (power p = 1.5) between students 3 and 5.
# Copy the score table (including its header row) to the clipboard first.
options(digits = 4)
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
mydata <- read.table("clipboard", header = TRUE)
class(mydata)
dim(mydata)
head(mydata)
# Scores of student 3 (columns 2-7 hold the six course scores)
A <- mydata[3, 2:7]
# Scores of student 5
B <- mydata[5, 2:7]
# Stack both students into a two-row table and compute the distance.
x <- rbind(A, B)
# p is the power of the Minkowski distance; diag / upper only affect printing.
D35 <- dist(x, method = "minkowski", diag = FALSE, upper = FALSE, p = 1.5)
D35
D35 <- dist(x, method = "minkowski", diag = TRUE, upper = TRUE, p = 1.5)
D35
D35 <- dist(x, method = "minkowski", diag = FALSE, upper = TRUE, p = 1.5)
D35
D35 <- dist(x, method = "minkowski", diag = TRUE, upper = FALSE, p = 1.5)
D35
class(D35)
cat("闵可夫斯基距离 =", D35, "\n")
Result
>
> D35 <- dist(x, method = "minkowski", diag = FALSE, upper = FALSE, p = 1.5)
> D35
3
5 100.267
> class(D35)
[1] "dist"
> cat("闵可夫斯基距离 =", D35, "\n")
闵可夫斯基距离 = 100.267
>
5、马氏距离(Mahalanobis Distance)
Code
# 马氏距离函数
#' Mahalanobis distance between two observations
#'
#' @param A,B The two observations to compare: numeric vectors, or single-row
#'   data frames / matrices (both are flattened to plain numeric vectors).
#' @param C Matrix or data frame of samples (one column per variable) whose
#'   sample covariance `cov(C)` defines the metric.
#' @return A 1 x 1 matrix holding sqrt(t(A - B) %*% solve(cov(C)) %*% (A - B)).
Mahalanobis_Distance <- function(A, B, C) {
  # Flatten first so that data-frame rows such as mydata[3, 2:7] work as well
  # as plain numeric vectors.
  delta <- as.numeric(unlist(A, use.names = FALSE)) -
    as.numeric(unlist(B, use.names = FALSE))
  sigma <- cov(C)
  if (length(delta) != ncol(sigma)) {
    stop("A and B must supply one value per column of C", call. = FALSE)
  }
  # Solve sigma %*% z = delta directly instead of forming the explicit inverse.
  result <- sqrt(crossprod(delta, solve(sigma, delta)))
  result
}
# Demo: Mahalanobis distance between students 3 and 5, using the covariance
# of all students' scores.
# Copy the score table (including its header row) to the clipboard first.
options(digits = 4)
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
mydata <- read.table("clipboard", header = TRUE)
class(mydata)
dim(mydata)
head(mydata)
# Scores of student 3 (columns 2-7 hold the six course scores)
A <- mydata[3, 2:7]
A
# Scores of student 5
B <- mydata[5, 2:7]
B
# Scores of all students (drop the first column, the row index)
C <- mydata[, -1]
C
# Convert to plain numeric vectors / a matrix before the matrix algebra.
A <- as.numeric(A)
B <- as.numeric(B)
C <- as.matrix(C)
result <- Mahalanobis_Distance(A, B, C)
cat("马氏距离 =", result, "\n")
Result
>
> A <- as.numeric(A)
> B <- as.numeric(B)
> C <- as.matrix(C)
> result <- Mahalanobis_Distance(A,B,C)
> cat("马氏距离 =", result, "\n")
马氏距离 = 3.841
>
B) "相似系数"计算
1、皮尔逊相关系数(Pearson Correlation Coefficient)
Code
# Demo: Pearson correlation between the scores of students 3 and 5.
# Copy the score table (including its header row) to the clipboard first.
options(digits = 4)
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
mydata <- read.table("clipboard", header = TRUE)
class(mydata)
dim(mydata)
head(mydata)
# Scores of student 3 as a plain numeric vector (columns 2-7 are the scores)
A <- as.numeric(mydata[3, 2:7])
A
# Scores of student 5 as a plain numeric vector
B <- as.numeric(mydata[5, 2:7])
B
# One column per student; cor() returns the 2 x 2 correlation matrix.
x <- data.frame(A, B)
x
result <- cor(x, method = "pearson")
# The off-diagonal entry is the correlation between A and B.
cat("皮尔逊相关系数 =", result[1, 2], "\n")
Result
>
> result <- cor(x, method=c("pearson"))
> cat("皮尔逊相关系数 =", result[1,2], "\n")
皮尔逊相关系数 = -0.04686
>
2、斯皮尔曼秩相关系数(Spearman Rank Correlation)
Code
# Demo: Spearman rank correlation between the scores of students 3 and 5.
# Copy the score table (including its header row) to the clipboard first.
options(digits = 4)
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
mydata <- read.table("clipboard", header = TRUE)
class(mydata)
dim(mydata)
head(mydata)
# Scores of student 3 as a plain numeric vector (columns 2-7 are the scores)
A <- as.numeric(mydata[3, 2:7])
A
# Scores of student 5 as a plain numeric vector
B <- as.numeric(mydata[5, 2:7])
B
# One column per student; cor() returns the 2 x 2 correlation matrix.
x <- data.frame(A, B)
x
result <- cor(x, method = "spearman")
# The off-diagonal entry is the correlation between A and B.
cat("斯皮尔曼秩相关系数 =", result[1, 2], "\n")
Result
>
> result <- cor(x, method=c("spearman"))
> cat("斯皮尔曼秩相关系数 =", result[1,2], "\n")
斯皮尔曼秩相关系数 = -0.3189
>
3、肯德尔秩相关系数(Kendall Rank Correlation)
Code
# Demo: Kendall rank correlation between the scores of students 3 and 5.
# Copy the score table (including its header row) to the clipboard first.
options(digits = 4)
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
mydata <- read.table("clipboard", header = TRUE)
class(mydata)
dim(mydata)
head(mydata)
# Scores of student 3 as a plain numeric vector (columns 2-7 are the scores)
A <- as.numeric(mydata[3, 2:7])
A
# Scores of student 5 as a plain numeric vector
B <- as.numeric(mydata[5, 2:7])
B
# One column per student; cor() returns the 2 x 2 correlation matrix.
x <- data.frame(A, B)
x
result <- cor(x, method = "kendall")
# The off-diagonal entry is the correlation between A and B.
cat("肯德尔秩相关系数 =", result[1, 2], "\n")
Result
>
> result <- cor(x, method=c("kendall"))
> cat("肯德尔秩相关系数 =", result[1,2], "\n")
肯德尔秩相关系数 = -0.276
>
4、余弦相似度(Cosine Similarity)
Code
# 余弦相似度函数
#' Cosine similarity of two numeric vectors
#'
#' @param A,B Numeric vectors of equal length.
#' @return A 1 x 1 matrix: the inner product of A and B divided by the
#'   square root of the product of their sums of squares.
Cosine_Similarity <- function(A, B) {
  inner_product <- t(A) %*% B
  norm_product <- sqrt(sum(A^2) * sum(B^2))
  inner_product / norm_product
}
# Demo: cosine similarity between the scores of students 3 and 5.
# Copy the score table (including its header row) to the clipboard first.
options(digits = 4)
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
mydata <- read.table("clipboard", header = TRUE)
class(mydata)
dim(mydata)
head(mydata)
# Scores of student 3 as a plain numeric vector (columns 2-7 are the scores)
A <- as.numeric(mydata[3, 2:7])
A
# Scores of student 5 as a plain numeric vector
B <- as.numeric(mydata[5, 2:7])
B
result <- Cosine_Similarity(A, B)
cat("余弦相似度 =", result, "\n")
Result
>
> result <- Cosine_Similarity(A,B)
> cat("余弦相似度 =", result, "\n")
余弦相似度 = 0.9162
>
A) "距离"计算函数封装
# euclidean 欧几里德距离(欧氏距离)(Euclidean Distance)
# manhattan 曼哈顿距离(绝对值距离)(Manhattan Distance)
# maximum 切比雪夫距离(Chebyshev Distance)
# minkowski 闵可夫斯基距离(Minkowski Distance)(要指定p值)
# mahalanobis 马氏距离(Mahalanobis Distance)
#' Distance between two observations, selected by name
#'
#' @param A,B The two observations (numeric vectors or single-row data frames;
#'   the "mahalanobis" branch needs plain numeric vectors).
#' @param oType One of "euclidean", "manhattan", "maximum", "minkowski" or
#'   "mahalanobis".
#' @param C Sample matrix used for the covariance; required when
#'   `oType = "mahalanobis"`, ignored otherwise.
#' @param P Power of the Minkowski distance; only used when
#'   `oType = "minkowski"`. When NULL, 2 is used (the default of `dist()`).
#' @return A `dist` object for the four `dist()`-based types, or a 1 x 1
#'   matrix for "mahalanobis".
Similarity_Distance <- function(A, B, oType, C = NULL, P = NULL) {
  if (oType %in% c("euclidean", "manhattan", "maximum", "minkowski")) {
    # All dist()-based measures work on the same two-row table.
    x <- rbind(A, B)
    result <- switch(oType,
      euclidean = dist(x, method = "euclidean", diag = TRUE, upper = FALSE),
      manhattan = dist(x, method = "manhattan", diag = TRUE, upper = FALSE),
      maximum = dist(x, method = "maximum", diag = TRUE, upper = FALSE),
      minkowski = {
        # Passing p = NULL to dist() fails, so fall back to dist()'s default.
        if (is.null(P)) {
          P <- 2
        }
        dist(x, method = "minkowski", diag = TRUE, upper = FALSE, p = P)
      }
    )
  } else if (oType == "mahalanobis") {
    if (is.null(C)) {
      stop("oType = 'mahalanobis' requires C (the sample matrix)",
           call. = FALSE)
    }
    delta <- A - B
    result <- sqrt(delta %*% solve(cov(C)) %*% t(t(delta)))
  } else {
    stop("Error, Please Checking !!!")
  }
  result
}
# Example calls, one per supported oType.
# NOTE(review): A, B and C are not defined here; presumably they are the
# numeric score vectors and score matrix built in the earlier sections and
# must already exist in the workspace.
Similarity_Distance(A, B, oType='euclidean')
Similarity_Distance(A, B, oType='manhattan')
Similarity_Distance(A, B, oType='maximum')
# P is the power of the Minkowski distance.
Similarity_Distance(A, B, oType='minkowski', P=1.5)
# C supplies the samples from which the covariance matrix is estimated.
Similarity_Distance(A, B, oType='mahalanobis', C=C)
B) 相似系数计算函数封装
# pearson 皮尔逊相关系数(Pearson Correlation Coefficient)
# spearman 斯皮尔曼秩相关系数(Spearman Rank Correlation)
# kendall 肯德尔秩相关系数(Kendall Rank Correlation)
# cosine 余弦相似度(Cosine Similarity)
#' Similarity coefficient between two numeric vectors, selected by name
#'
#' @param A,B Numeric vectors of equal length.
#' @param oType One of "pearson", "spearman", "kendall" or "cosine".
#' @return For the three correlation types, the 2 x 2 correlation matrix of
#'   the columns A and B as returned by `cor()`; for "cosine", a 1 x 1 matrix.
#'   Any other `oType` raises an error.
Similarity_coefficient <- function(A, B, oType) {
  correlation_types <- c("pearson", "spearman", "kendall")
  if (oType %in% correlation_types) {
    # The three correlation measures differ only in the method handed to cor().
    return(cor(data.frame(A, B), method = oType))
  }
  if (oType == "cosine") {
    return(t(A) %*% B / sqrt(sum(A^2) * sum(B^2)))
  }
  stop("Error, Please Checking !!!")
}
# Example calls, one per supported oType.
# NOTE(review): A and B are not defined here; presumably they are the numeric
# score vectors built in the earlier sections and must already exist in the
# workspace.
Similarity_coefficient(A, B, oType='pearson')
Similarity_coefficient(A, B, oType='spearman')
Similarity_coefficient(A, B, oType='kendall')
Similarity_coefficient(A, B, oType='cosine')