An Industrial-Scale Implementation of DBSCAN Density Clustering with PySpark
1. Generate sample points
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import numpy as np
import pandas as pd
from sklearn import datasets

# set the DBSCAN parameters
eps = 0.2
min_samples = 20

# generate two interleaving half-moons and save them for the Spark job
X, _ = datasets.make_moons(500, noise=0.1, random_state=1)
pdf = pd.DataFrame(X, columns=['feature1', 'feature2'])
pdf.plot.scatter('feature1', 'feature2', s=100, alpha=0.6, title='dataset by make_moon')
pdf.to_csv("./data/moon_dataset.csv", sep="\t", index=False)
# load the dataset as a Spark DataFrame
# (alternatively: dfdata = spark.createDataFrame(pdf))
from pyspark import StorageLevel

dfdata = spark.read.option("header", "true") \
    .option("inferSchema", "true") \
    .option("delimiter", "\t") \
    .csv("data/moon_dataset.csv")

# pack each point's coordinates into an array and add a unique id column
dfinput = spark.createDataFrame(dfdata.selectExpr("array(feature1,feature2) as point") \
    .rdd.map(lambda row: row["point"]).zipWithIndex()).toDF("point", "id") \
    .selectExpr("id", "point").persist(StorageLevel.MEMORY_AND_DISK)
dfinput.show()
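For reference, zipWithIndex pairs every RDD element with a unique, consecutive index across partitions; a minimal local sketch of its behavior:

# a minimal sketch of zipWithIndex
spark.sparkContext.parallelize(["a", "b", "c"], 2).zipWithIndex().collect()
# -> [('a', 0), ('b', 1), ('c', 2)]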
2. Broadcast per-batch KDTrees to obtain candidate neighbor pairs
import numpy as np
from pyspark.sql import types as T
from pyspark.sql import functions as F
from pyspark.sql import Row, DataFrame
from sklearn.neighbors import KDTree

sc = spark.sparkContext
rdd_input = dfinput.rdd.repartition(20).persist(StorageLevel.MEMORY_AND_DISK)

# create an empty DataFrame to accumulate the candidate pairs
schema = T.StructType([
    T.StructField("m_id", T.LongType(), True),
    T.StructField("s_id", T.LongType(), True),
    T.StructField("m_point", T.ArrayType(T.DoubleType(), False), True),
    T.StructField("s_point", T.ArrayType(T.DoubleType(), False), True)])
dfpair_raw = spark.createDataFrame(sc.emptyRDD(), schema)

# broadcast one batch of "master" points at a time, so that each KDTree
# built on the driver stays small enough to broadcast
partition_cnt = 10
dfmaster = dfinput.repartition(partition_cnt)
for i in range(0, partition_cnt):
    # collect only partition i to the driver; bind i (and broad_i below) as
    # default arguments so each closure captures the current loop value
    rdd_master_i = dfmaster.rdd.mapPartitionsWithIndex(
        lambda idx, iterator, i=i: iterator if idx == i else iter([]))
    master_i = rdd_master_i.collect()
    idxs_i = [x["id"] for x in master_i]
    points_i = [x["point"] for x in master_i]
    tree_i = KDTree(np.array(points_i), leaf_size=40, metric='minkowski')  # build the KDTree
    broad_i = sc.broadcast((idxs_i, points_i, tree_i))

    def fn(iterator, broad_i=broad_i):
        list_res = []  # rows of (m_id, s_id, m_point, s_point)
        idxs_i, points_i, tree_i = broad_i.value
        for row in iterator:
            s_id = row["id"]
            s_point = row["point"]
            # query the KDTree for master points within 2*eps; the exact
            # eps filter is applied in step 3
            index = tree_i.query_radius(np.array([s_point]), r=2*eps)[0]
            for j in index:
                list_res.append([idxs_i[j], s_id, points_i[j], s_point])
        return iter(list_res)

    dfpair_raw_i = spark.createDataFrame(rdd_input.mapPartitions(fn)) \
        .toDF("m_id", "s_id", "m_point", "s_point")
    dfpair_raw = dfpair_raw.union(dfpair_raw_i)
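To make the neighbor query above concrete, here is a minimal local sketch of KDTree.query_radius with hypothetical points (minkowski with the default p=2 is the Euclidean metric):

import numpy as np
from sklearn.neighbors import KDTree
pts = np.array([[0.0, 0.0], [0.1, 0.0], [1.0, 1.0]])
tree = KDTree(pts, leaf_size=40, metric='minkowski')
tree.query_radius(np.array([[0.0, 0.0]]), r=0.4)
# -> [array([0, 1])] : per query point, the indices of points within radius 0.4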
3. Keep only the pairs within the DBSCAN neighborhood radius: dfpair
# keep only the candidate pairs whose distance is below the DBSCAN radius eps;
# register the UDF with an explicit DoubleType so the comparison is numeric
spark.udf.register("distance",
                   lambda p, q: float(((p[0]-q[0])**2 + (p[1]-q[1])**2)**0.5),
                   T.DoubleType())
dfpair = dfpair_raw.where("distance(s_point,m_point) < " + str(eps)) \
    .persist(StorageLevel.MEMORY_AND_DISK)
dfpair.show()
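For reference, the same filter can also be written with the DataFrame API instead of a registered SQL UDF; a minimal equivalent sketch:

from pyspark.sql import functions as F, types as T
dist_udf = F.udf(lambda p, q: float(((p[0]-q[0])**2 + (p[1]-q[1])**2)**0.5), T.DoubleType())
dfpair_alt = dfpair_raw.where(dist_udf("s_point", "m_point") < eps)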
4. Build the temporary clusters dfcore
# a point is a core point if its eps-neighborhood holds at least min_samples
# points (query_radius returns the query point itself, so the count includes
# the point, matching sklearn's min_samples semantics)
dfcore = dfpair.groupBy("s_id").agg(
    F.first("s_point").alias("s_point"),
    F.count("m_id").alias("neighbour_cnt"),
    F.collect_list("m_id").alias("neighbour_ids")
).where("neighbour_cnt >= " + str(min_samples)) \
    .persist(StorageLevel.MEMORY_AND_DISK)
dfcore.show(3)
+----+--------------------+-------------+--------------------+
|s_id| s_point|neighbour_cnt| neighbour_ids|
+----+--------------------+-------------+--------------------+
| 26|[0.95199382446206...| 25|[150, 463, 300, 5...|
| 418|[0.04187413307127...| 22|[367, 454, 226, 4...|
| 65|[0.46872165251145...| 30|[45, 402, 44, 456...|
+----+--------------------+-------------+--------------------+
only showing top 3 rows
5. Extract the core-point information of the temporary clusters
# keep only the pairs whose s_id is a core point ...
dfpair_join = dfcore.selectExpr("s_id").join(dfpair, ["s_id"], "inner")
# ... and whose m_id is also a core point
df_fids = dfcore.selectExpr("s_id as m_id")
dfpair_core = df_fids.join(dfpair_join, ["m_id"], "inner")
# for each core point, collect the set of core points in its neighborhood,
# keyed by the smallest core id in that set
rdd_core = dfpair_core.groupBy("s_id").agg(
    F.min("m_id").alias("min_core_id"),
    F.collect_set("m_id").alias("core_id_set")
).rdd.map(lambda row: (row["min_core_id"], set(row["core_id_set"])))
rdd_core.persist(StorageLevel.MEMORY_AND_DISK)
print("before dbscan, rdd_core.count() = ", rdd_core.count())
6. Merge rdd_core partition by partition, step by step; each element is (min_core_id, core_id_set)
# merge function: union the temporary clusters that share any core point
def mergeSets(list_set):
    result = []
    while len(list_set) > 0:
        cur_set = list_set.pop(0)
        intersect_idxs = [i for i in range(len(list_set)-1, -1, -1) if cur_set & list_set[i]]
        while intersect_idxs:
            for idx in intersect_idxs:
                cur_set = cur_set | list_set[idx]
            for idx in intersect_idxs:
                list_set.pop(idx)  # indices are descending, so popping is safe
            intersect_idxs = [i for i in range(len(list_set)-1, -1, -1) if cur_set & list_set[i]]
        result = result + [cur_set]
    return result
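A quick local check of mergeSets on hypothetical sets, before using it inside mapPartitions:

mergeSets([{1, 2}, {2, 3}, {4, 5}, {5, 6}, {9}])
# -> [{1, 2, 3}, {4, 5, 6}, {9}] : sets sharing any element are merged transitively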
# partition rdd_core by key and merge within each partition, then repeat with
# fewer and fewer partitions until everything is merged in a single partition.
# If the data is too large to merge into one partition, stopping at several
# partitions yields an approximate result.
# rdd: (min_core_id, core_id_set)
def mergeRDD(rdd, partition_cnt):
    def fn(iterator):
        list_set = [x[1] for x in iterator]
        list_set_merged = mergeSets(list_set)
        merged_core = [(min(x), x) for x in list_set_merged]
        return iter(merged_core)
    rdd_merged = rdd.partitionBy(partition_cnt).mapPartitions(fn)
    return rdd_merged

# adjust the partition counts and the number of rounds to the actual data size
for pcnt in (16, 8, 4, 1):
    rdd_core = mergeRDD(rdd_core, pcnt)
rdd_core.persist(StorageLevel.MEMORY_AND_DISK)
print("after dbscan: rdd_core.count()=", rdd_core.count())
after dbscan: rdd_core.count()= 2
7. Attach its cluster id to every core point
# explode each merged cluster (min_core_id, core_id_set) into (cluster_id, s_id) rows
dfcluster_ids = spark.createDataFrame(
    rdd_core.flatMap(lambda t: [(t[0], s_id) for s_id in t[1]])).toDF("cluster_id", "s_id")
dfclusters = dfcore.join(dfcluster_ids, "s_id", "left")
dfclusters.show()
+----+--------------------+-------------+--------------------+----------+
|s_id| s_point|neighbour_cnt| neighbour_ids|cluster_id|
+----+--------------------+-------------+--------------------+----------+
| 26|[0.95199382446206...| 25|[150, 463, 300, 5...| 2|
| 65|[0.46872165251145...| 30|[45, 402, 44, 456...| 0|
| 418|[0.04187413307127...| 22|[367, 454, 226, 4...| 0|
| 293|[0.74589456598500...| 30|[231, 293, 153, 3...| 2|
| 243|[-0.7132555992338...| 21|[243, 482, 174, 1...| 2|
| 278|[-0.8841688633151...| 27|[453, 310, 196, 9...| 2|
| 367|[0.00547311527928...| 24|[367, 437, 454, 2...| 0|
| 19|[-0.2040816479108...| 25|[206, 124, 194, 2...| 2|
| 54|[1.86506527195881...| 22|[331, 116, 92, 54...| 0|
| 296|[1.43490708002292...| 22|[212, 199, 473, 3...| 0|
| 0|[0.31655567612451...| 22|[315, 46, 456, 42...| 0|
| 348|[0.77799441414636...| 25|[348, 402, 374, 4...| 0|
| 415|[-0.4510104506178...| 28|[363, 407, 273, 2...| 2|
| 112|[1.38118745635267...| 28|[212, 199, 473, 3...| 0|
| 113|[1.95088315015933...| 26|[306, 255, 447, 2...| 0|
| 167|[0.39542492867803...| 22|[286, 179, 109, 1...| 2|
| 385|[-0.2769033877846...| 25|[363, 407, 122, 2...| 2|
| 237|[0.08078546751286...| 29|[367, 437, 46, 23...| 0|
| 347|[-0.7336250327143...| 21|[482, 174, 196, 9...| 2|
| 330|[0.71478678633618...| 27|[231, 293, 153, 3...| 2|
+----+--------------------+-------------+--------------------+----------+
only showing top 20 rows
8. Find each cluster's representative core point and the number of points in the cluster
rdd_cluster = dfclusters.rdd.map(
    lambda row: (row["cluster_id"], (row["s_point"], row["neighbour_cnt"], set(row["neighbour_ids"])))
)

# keep the core point with the most neighbors as the cluster representative,
# and union the neighbor id sets to obtain the full cluster membership
def reduce_fn(a, b):
    id_set = a[2] | b[2]
    result = (a[0], a[1], id_set) if a[1] >= b[1] else (b[0], b[1], id_set)
    return result

rdd_result = rdd_cluster.reduceByKey(reduce_fn)

def map_fn(t):
    cluster_id = t[0]
    representation_point = t[1][0]
    neighbour_points_cnt = t[1][1]
    id_set = list(t[1][2])
    cluster_points_cnt = len(id_set)
    return (cluster_id, representation_point, neighbour_points_cnt, cluster_points_cnt, id_set)

dfresult = spark.createDataFrame(rdd_result.map(map_fn)) \
    .toDF("cluster_id", "representation_point", "neighbour_points_cnt",
          "cluster_points_cnt", "cluster_points_ids")
dfresult.persist(StorageLevel.MEMORY_AND_DISK)
dfresult.show(3)
+----------+--------------------+--------------------+------------------+--------------------+
|cluster_id|representation_point|neighbour_points_cnt|cluster_points_cnt| cluster_points_ids|
+----------+--------------------+--------------------+------------------+--------------------+
| 0|[1.95163238902570...| 32| 242|[0, 1, 4, 5, 6, 1...|
| 2|[0.95067226301300...| 34| 241|[2, 3, 7, 9, 11, ...|
+----------+--------------------+--------------------+------------------+--------------------+
Note that in this result the number of clusters is 2 and the number of noise points is 500 - 242 - 241 = 17, which exactly matches the result of calling DBSCAN in sklearn.
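This claim can be cross-checked locally with sklearn's reference implementation on the same pandas DataFrame; a minimal sketch:

from sklearn.cluster import DBSCAN
labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(pdf[['feature1', 'feature2']].values)
print("clusters:", len(set(labels)) - (1 if -1 in labels else 0))  # 2
print("noise points:", int(np.sum(labels == -1)))                  # 17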
9. Assign a cluster id to every point, with noise points assigned cluster id -1
# explode (cluster_id, cluster_points_ids) into (id, cluster_id) rows
rdd_clusterid = dfresult.select("cluster_id", "cluster_points_ids").rdd.flatMap(
    lambda t: [(x, t["cluster_id"]) for x in t["cluster_points_ids"]])
df_clusterid = spark.createDataFrame(rdd_clusterid).toDF("id", "cluster_id")

# points that joined no cluster are noise; fill their cluster_id with -1
dfoutput_raw = dfinput.join(df_clusterid, "id", "left")
dfoutput = dfoutput_raw.na.fill(-1)
dfoutput = dfoutput.selectExpr("id", "cluster_id", "point[0] as feature1", "point[1] as feature2")
dfoutput.persist(StorageLevel.MEMORY_AND_DISK)
dfoutput.show()
+---+----------+--------------------+--------------------+
| id|cluster_id| feature1| feature2|
+---+----------+--------------------+--------------------+
| 26| 2| 0.9519938244620684| 0.2552474492493959|
| 29| 2| -0.863622604833635| 0.756640145262391|
|474| 2| -0.4885096982719171| 1.0491748634589007|
| 65| 0| 0.4687216525114598| -0.3609345154042032|
|191| 2| -1.110792648099675| 0.18780773522847397|
|418| 0| 0.04187413307127004| 0.25141384401180206|
|222| 0| 1.5899557693512685|-0.42942807171107017|
|270| 0| 2.178538623657351| 0.44807664826862253|
|293| 2| 0.7458945659850041| 0.5914004203001728|
|243| 2| -0.7132555992338488| 0.8089869542594612|
|278| 2| -0.8841688633151701| 0.5147890731484406|
|367| 0|0.005473115279280807| 0.12361319219864111|
|442| 2| 0.8889028924942911|0.028497180983055058|
| 19| 2| -0.2040816479108034| 0.9856890760075208|
| 54| 0| 1.8650652719588168|-0.13541631999968182|
|296| 0| 1.434907080022921| -0.4713434821495917|
| 0| 0| 0.3165556761245117|-0.04942181785843226|
|277| 0| 0.08946739589070024|-0.22831869307482952|
|287| 0| 1.7814103104861185|-0.33109829058582907|
|348| 0| 0.777994414146364| -0.4522352978300379|
+---+----------+--------------------+--------------------+
only showing top 20 rows
10. Save and visualize the results
# inferSchema is a read option with no effect on write, so it is dropped here
dfoutput.write.format("csv") \
    .option("header", "true") \
    .option("delimiter", "\t") \
    .save("data/dbscan_output.csv")

pdfoutput = dfoutput.toPandas()
pdfoutput.plot.scatter('feature1', 'feature2', s=100,
    c=list(pdfoutput['cluster_id']), cmap='rainbow', colorbar=False,
    alpha=0.6, title='pyspark DBSCAN cluster result');
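Optionally, the saved result can be read back with the same options used for writing, e.g. to verify the per-cluster point counts; a minimal sketch:

dfcheck = spark.read.option("header", "true") \
    .option("inferSchema", "true") \
    .option("delimiter", "\t") \
    .csv("data/dbscan_output.csv")
dfcheck.groupBy("cluster_id").count().show()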