An Industrial-Grade Distributed DBSCAN Implementation (Code Walkthrough)

DBSCAN

1. Generate sample points
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import numpy as np
import pandas as pd
from sklearn import datasets
from pyspark import StorageLevel  # needed for the persist() calls below

# DBSCAN parameters
eps = 0.2
min_samples = 20

X,_ = datasets.make_moons(500,noise = 0.1,random_state=1)
pdf = pd.DataFrame(X,columns = ['feature1','feature2'])
pdf.plot.scatter('feature1','feature2', s = 100, alpha = 0.6, title = 'dataset by make_moons')

pdf.to_csv("./data/moon_dataset.csv",sep = "\t",index = False)

# Convert to a Spark DataFrame
# (alternatively: dfdata = spark.createDataFrame(pdf))
dfdata = spark.read.option("header","true")\
  .option("inferSchema","true") \
  .option("delimiter", "\t") \
  .csv("data/moon_dataset.csv")

# Pack the point coordinates into an array column and add a unique id column

dfinput = spark.createDataFrame(dfdata.selectExpr("array(feature1,feature2) as point") \
    .rdd.map(lambda row:row["point"]).zipWithIndex()).toDF("point","id") \
    .selectExpr("id","point").persist(StorageLevel.MEMORY_AND_DISK)

dfinput.show() 


2. Broadcast KDTrees in batches to obtain candidate neighbor pairs
import numpy as np
from pyspark.sql import types as T
from pyspark.sql import functions as F
from pyspark.sql import Row, DataFrame
from sklearn.neighbors import KDTree

sc = spark.sparkContext  # used for the broadcast calls below

rdd_input = dfinput.rdd.repartition(20).persist(StorageLevel.MEMORY_AND_DISK)

# Create an empty DataFrame to accumulate the raw neighbor pairs
schema = T.StructType([
        T.StructField("m_id", T.LongType(), True),
        T.StructField("s_id", T.LongType(), True),
        T.StructField("m_point", T.ArrayType(T.DoubleType(),False), True),
        T.StructField("s_point", T.ArrayType(T.DoubleType(),False), True)])
 
dfpair_raw = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)


# Broadcast the KD-trees batch by batch
partition_cnt = 10
dfmaster = dfinput.repartition(partition_cnt)

for i in range(0, partition_cnt):
    # bind i as a default argument so the closure is not affected by late binding
    rdd_master_i = dfmaster.rdd.mapPartitionsWithIndex(
        lambda idx, iterator, i=i: iterator if (idx == i) else iter([]))
    master_i = rdd_master_i.collect()
    idxs_i = [x["id"] for x in master_i]
    points_i = [x["point"] for x in master_i]
    tree_i = KDTree(np.array(points_i), leaf_size=40, metric='minkowski')  # build the KD-tree for this batch
    broad_i = sc.broadcast((idxs_i, points_i, tree_i))

    # bind the current broadcast via a default argument so each batch keeps its own KD-tree
    def fn(iterator, broad_i=broad_i):
        list_res = []  # rows of (m_id, s_id, m_point, s_point)
        idxs_i, points_i, tree_i = broad_i.value
        for row in iterator:
            s_id = row["id"]
            s_point = row["point"]
            index = tree_i.query_radius(np.array([s_point]), r=2*eps)[0]  # coarse query: all points within 2*eps
            for j in index:
                list_res.append([idxs_i[j], s_id, points_i[j], s_point])
        return iter(list_res)

    dfpair_raw_i = spark.createDataFrame(rdd_input.mapPartitions(fn)).toDF("m_id","s_id","m_point","s_point")
    dfpair_raw = dfpair_raw.union(dfpair_raw_i)
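After the loop, dfpair_raw is still a lazy union of all batches. If you want to materialize and inspect the candidate pairs before applying the radius filter in the next step, a small optional check looks like this (illustrative only):

dfpair_raw = dfpair_raw.persist(StorageLevel.MEMORY_AND_DISK)
print("raw candidate pairs:", dfpair_raw.count())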
    
3. Keep only valid neighbor pairs within the DBSCAN radius: dfpair
# Keep only pairs whose distance is within the DBSCAN neighborhood radius eps

# Register a Euclidean distance UDF with an explicit double return type
spark.udf.register("distance", lambda p, q: ((p[0]-q[0])**2 + (p[1]-q[1])**2)**0.5, "double")
dfpair = dfpair_raw.where("distance(s_point,m_point) < " + str(eps)) \
    .persist(StorageLevel.MEMORY_AND_DISK)

dfpair.show()
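As a side note, the same radius filter can equivalently be written with Spark SQL built-ins instead of the Python UDF, which avoids per-row Python serialization overhead. A minimal sketch, assuming the 2-D array layout of s_point / m_point used throughout this post:

# equivalent to the UDF-based filter above, using built-in sqrt/pow and array indexing
dfpair = dfpair_raw.where(
    "sqrt(pow(s_point[0]-m_point[0], 2) + pow(s_point[1]-m_point[1], 2)) < " + str(eps)
).persist(StorageLevel.MEMORY_AND_DISK)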


4. Build temporary clusters: dfcore
dfcore = dfpair.groupBy("s_id").agg(
  F.first("s_point").alias("s_point"),
  F.count("m_id").alias("neighbour_cnt"),
  F.collect_list("m_id").alias("neighbour_ids")
).where("neighbour_cnt>="+str(min_samples)) \
 .persist(StorageLevel.MEMORY_AND_DISK)

dfcore.show(3)
+----+--------------------+-------------+--------------------+
|s_id|             s_point|neighbour_cnt|       neighbour_ids|
+----+--------------------+-------------+--------------------+
|  26|[0.95199382446206...|           25|[150, 463, 300, 5...|
| 418|[0.04187413307127...|           22|[367, 454, 226, 4...|
|  65|[0.46872165251145...|           30|[45, 402, 44, 456...|
+----+--------------------+-------------+--------------------+
only showing top 3 rows
5. Obtain the core-point information of the temporary clusters
# keep only neighbor pairs whose s_id is a core point
dfpair_join = dfcore.selectExpr("s_id").join(dfpair,["s_id"],"inner")
# ... and whose m_id is also a core point
df_fids = dfcore.selectExpr("s_id as m_id")
dfpair_core = df_fids.join(dfpair_join,["m_id"],"inner")
# for each core point, collect the set of core points in its eps-neighborhood
rdd_core = dfpair_core.groupBy("s_id").agg(
  F.min("m_id").alias("min_core_id"),
  F.collect_set("m_id").alias("core_id_set")
).rdd.map(lambda row: (row["min_core_id"], set(row["core_id_set"])))

rdd_core.persist(StorageLevel.MEMORY_AND_DISK)

print("before_dbscan, rdd_core.count() = ",rdd_core.count())
6. Merge rdd_core step by step across partitions; rdd_core elements are (min_core_id, core_id_set)
# Merge function: union temporary clusters that share at least one core point
def mergeSets(list_set):
    result = []
    while  len(list_set)>0 :
        cur_set = list_set.pop(0)
        intersect_idxs = [i for i in list(range(len(list_set)-1,-1,-1)) if cur_set&list_set[i]]
        while  intersect_idxs :
            for idx in intersect_idxs:
                cur_set = cur_set|list_set[idx]

            for idx in intersect_idxs:
                list_set.pop(idx)
                
            intersect_idxs = [i for i in list(range(len(list_set)-1,-1,-1)) if cur_set&list_set[i]]
        
        result = result+[cur_set]
    return result
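A quick local sanity check of mergeSets on a toy input (purely illustrative; the ids are made up):

print(mergeSets([{1, 2}, {2, 3}, {7, 8}, {3, 4}]))
# expected output: [{1, 2, 3, 4}, {7, 8}]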

# Partition rdd_core and merge within each partition, repeatedly reducing the number of
# partitions until everything is merged in a single partition.
# If the data is too large to merge into one partition, stopping at several partitions
# still gives an approximate result.
# rdd: (min_core_id, core_id_set)
def mergeRDD(rdd,partition_cnt):
    def fn(iterator):
        list_set = [x[1] for x in iterator]
        list_set_merged = mergeSets(list_set)
        merged_core = [(min(x),x) for x in list_set_merged] 
        return(iter(merged_core))
    rdd_merged = rdd.partitionBy(partition_cnt).mapPartitions(fn)
    return rdd_merged


# Adjust the partition counts and the number of iterations to the actual data size
for pcnt in (16,8,4,1):
    rdd_core = mergeRDD(rdd_core,pcnt)
    
rdd_core.persist(StorageLevel.MEMORY_AND_DISK)

print("after dbscan: rdd_core.count()=",rdd_core.count())
after dbscan: rdd_core.count()= 2
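Optionally, you can peek at how many core points each merged cluster contains (an illustrative check; rdd_core holds (min_core_id, core_id_set) pairs):

for min_id, id_set in rdd_core.collect():
    print("cluster root:", min_id, "core points:", len(id_set))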
7. Attach a cluster id to every core point
dfcluster_ids = spark.createDataFrame(
    rdd_core.flatMap(lambda t: [(t[0], s_id) for s_id in t[1]])).toDF("cluster_id","s_id")

dfclusters =  dfcore.join(dfcluster_ids, "s_id", "left")

dfclusters.show() 
+----+--------------------+-------------+--------------------+----------+
|s_id|             s_point|neighbour_cnt|       neighbour_ids|cluster_id|
+----+--------------------+-------------+--------------------+----------+
|  26|[0.95199382446206...|           25|[150, 463, 300, 5...|         2|
|  65|[0.46872165251145...|           30|[45, 402, 44, 456...|         0|
| 418|[0.04187413307127...|           22|[367, 454, 226, 4...|         0|
| 293|[0.74589456598500...|           30|[231, 293, 153, 3...|         2|
| 243|[-0.7132555992338...|           21|[243, 482, 174, 1...|         2|
| 278|[-0.8841688633151...|           27|[453, 310, 196, 9...|         2|
| 367|[0.00547311527928...|           24|[367, 437, 454, 2...|         0|
|  19|[-0.2040816479108...|           25|[206, 124, 194, 2...|         2|
|  54|[1.86506527195881...|           22|[331, 116, 92, 54...|         0|
| 296|[1.43490708002292...|           22|[212, 199, 473, 3...|         0|
|   0|[0.31655567612451...|           22|[315, 46, 456, 42...|         0|
| 348|[0.77799441414636...|           25|[348, 402, 374, 4...|         0|
| 415|[-0.4510104506178...|           28|[363, 407, 273, 2...|         2|
| 112|[1.38118745635267...|           28|[212, 199, 473, 3...|         0|
| 113|[1.95088315015933...|           26|[306, 255, 447, 2...|         0|
| 167|[0.39542492867803...|           22|[286, 179, 109, 1...|         2|
| 385|[-0.2769033877846...|           25|[363, 407, 122, 2...|         2|
| 237|[0.08078546751286...|           29|[367, 437, 46, 23...|         0|
| 347|[-0.7336250327143...|           21|[482, 174, 196, 9...|         2|
| 330|[0.71478678633618...|           27|[231, 293, 153, 3...|         2|
+----+--------------------+-------------+--------------------+----------+
only showing top 20 rows

8. Find each cluster's representative core point and member count
rdd_cluster = dfclusters.rdd.map(
    lambda row: (row["cluster_id"],(row["s_point"],row["neighbour_cnt"],set(row["neighbour_ids"])))
)

# keep the core point with the largest neighborhood as the cluster representative,
# and union the member id sets of all core points in the cluster
def reduce_fn(a,b):
    id_set = a[2]|b[2]
    result = (a[0],a[1],id_set) if a[1]>=b[1] else (b[0],b[1],id_set)
    return result

rdd_result = rdd_cluster.reduceByKey(reduce_fn)

def map_fn(t):
    cluster_id = t[0]
    representation_point = t[1][0]
    neighbour_points_cnt = t[1][1]
    id_set = list(t[1][2])
    cluster_points_cnt = len(id_set)
    return (cluster_id,representation_point,neighbour_points_cnt,cluster_points_cnt,id_set)

dfresult = spark.createDataFrame(rdd_result.map(map_fn)
    ).toDF("cluster_id","representation_point","neighbour_points_cnt","cluster_points_cnt","cluster_points_ids")

dfresult.persist(StorageLevel.MEMORY_AND_DISK)

dfresult.show(3)

+----------+--------------------+--------------------+------------------+--------------------+
|cluster_id|representation_point|neighbour_points_cnt|cluster_points_cnt|  cluster_points_ids|
+----------+--------------------+--------------------+------------------+--------------------+
|         0|[1.95163238902570...|                  32|               242|[0, 1, 4, 5, 6, 1...|
|         2|[0.95067226301300...|                  34|               241|[2, 3, 7, 9, 11, ...|
+----------+--------------------+--------------------+------------------+--------------------+

Note the following about the result:

The number of clusters is 2.

The number of noise points is 500 - 242 - 241 = 17.

This is exactly the same as the result obtained by calling sklearn.
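For reference, a minimal sketch of that sklearn cross-check on the same pdf generated in step 1 (local, non-distributed; sklearn.cluster.DBSCAN labels noise as -1):

from sklearn.cluster import DBSCAN

labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(pdf[['feature1', 'feature2']].values)
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)  # number of clusters, excluding noise
n_noise = int((labels == -1).sum())                         # number of noise points
print("sklearn clusters:", n_clusters, "noise points:", n_noise)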


9. Assign a cluster id to every point; noise points get cluster id -1
rdd_clusterid = dfresult.select("cluster_id","cluster_points_ids").rdd.flatMap(
    lambda t: [(x,t["cluster_id"]) for x in t["cluster_points_ids"]])

df_clusterid = spark.createDataFrame(rdd_clusterid).toDF("id","cluster_id")
dfoutput_raw = dfinput.join(df_clusterid,"id","left")
dfoutput = dfoutput_raw.na.fill(-1)  # points not assigned to any cluster are noise: cluster_id = -1

dfoutput = dfoutput.selectExpr("id","cluster_id","point[0] as feature1","point[1] as feature2")
dfoutput.persist(StorageLevel.MEMORY_AND_DISK)
dfoutput.show()

+---+----------+--------------------+--------------------+
| id|cluster_id|            feature1|            feature2|
+---+----------+--------------------+--------------------+
| 26|         2|  0.9519938244620684|  0.2552474492493959|
| 29|         2|  -0.863622604833635|   0.756640145262391|
|474|         2| -0.4885096982719171|  1.0491748634589007|
| 65|         0|  0.4687216525114598| -0.3609345154042032|
|191|         2|  -1.110792648099675| 0.18780773522847397|
|418|         0| 0.04187413307127004| 0.25141384401180206|
|222|         0|  1.5899557693512685|-0.42942807171107017|
|270|         0|   2.178538623657351| 0.44807664826862253|
|293|         2|  0.7458945659850041|  0.5914004203001728|
|243|         2| -0.7132555992338488|  0.8089869542594612|
|278|         2| -0.8841688633151701|  0.5147890731484406|
|367|         0|0.005473115279280807| 0.12361319219864111|
|442|         2|  0.8889028924942911|0.028497180983055058|
| 19|         2| -0.2040816479108034|  0.9856890760075208|
| 54|         0|  1.8650652719588168|-0.13541631999968182|
|296|         0|   1.434907080022921| -0.4713434821495917|
|  0|         0|  0.3165556761245117|-0.04942181785843226|
|277|         0| 0.08946739589070024|-0.22831869307482952|
|287|         0|  1.7814103104861185|-0.33109829058582907|
|348|         0|   0.777994414146364| -0.4522352978300379|
+---+----------+--------------------+--------------------+
only showing top 20 rows
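A quick sanity check on the label distribution (optional; cluster_id -1 counts the noise points):

dfoutput.groupBy("cluster_id").count().orderBy("cluster_id").show()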
10. Save and visualize the results
dfoutput.write.format("csv") \
  .option("header","true") \
  .option("delimiter", "\t") \
  .save("data/dbscan_output.csv")
pdfoutput = dfoutput.toPandas()

pdfoutput.plot.scatter('feature1','feature2', s = 100,
    c = list(pdfoutput['cluster_id']),cmap = 'rainbow',colorbar = False,
    alpha = 0.6,title = 'pyspark DBSCAN cluster result');

[Figure: pyspark DBSCAN cluster result scatter plot (./data/pyspark_dbscan_img.png)]
