# SVD dimensionality reduction followed by K-means clustering of trend profiles
# PySpark version
# py_spark
from pyspark.sql.types import *
from pyspark.sql.functions import *
import numpy as np
import pandas as pd
from numpy import linalg as la
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')
# 建表
# sql0 = """
# create table if not exists xx_dev.table
# (
# a bigint
# ,b bigint
# ,c string
# ) partitioned by (dt string)
# """
# spark.sql(sql0)
# print('建表成功')
# Fetch the source rows.
sql = """ select * from table_b """
df_pyspark = spark.sql(sql)
# print('sql run success')
# Pivot in Spark first and only then convert to pandas, so that the pandas
# side receives the already aggregated wide table (one row per dr_id, one
# column per tim_gp value) instead of the long-format rows.
# NOTE: `sum` here is expected to be pyspark.sql.functions.sum, brought in by
# the wildcard import at the top of the file, not the Python builtin.
df_pivot = (
    df_pyspark
    .groupBy("dr_id")
    .pivot("tim_gp")
    .agg(sum("online_min"))
    .na.fill(0)  # dr_id / tim_gp combinations with no rows become 0
)
data1 = df_pivot.toPandas()
# Debug output. These lines were written with SQL-style `--` markers, which
# Python evaluates as a double unary minus on print()'s None result and
# raises TypeError; they are real comments now.
# print('df pivot success')
# print(data1.info())
# print(data1.head(10))
# SVD dimensionality reduction
def svd_data(data, energy_threshold=0.95):
    """Describe every row by its correlation with the leading SVD components.

    The first column of ``data`` is skipped and every remaining column is
    used as the numeric matrix that is decomposed with ``np.linalg.svd``.
    The number of right-singular vectors kept is the count of cumulative
    squared-singular-value ratios strictly below ``energy_threshold``, plus
    one. Each row is then replaced by its Pearson correlation with each kept
    vector.

    Args:
        data: pandas DataFrame containing a ``dr_id`` column. All columns
            after the first one must be numeric.
        energy_threshold: cumulative share of squared singular values used
            to decide how many components to keep. Defaults to 0.95.

    Returns:
        A tuple ``(svd_model_data, df_km_model_data)``:
        ``svd_model_data`` is the numeric matrix taken from ``data``;
        ``df_km_model_data`` is a DataFrame with columns ``dr_id``, ``v0``,
        ``v1``, ... sharing the index of ``data``. Missing values (for
        example the undefined correlation of a constant row) are replaced
        by 0.
    """
    svd_model_data = data.iloc[:, 1:].values
    _, singular_values, v = np.linalg.svd(svd_model_data, full_matrices=False)

    # Running share of the total squared singular values.
    cumulative_energy = np.cumsum(singular_values ** 2)
    with np.errstate(divide='ignore', invalid='ignore'):
        # An all-zero matrix gives 0/0 here; the resulting NaNs compare as
        # "not below the threshold", which leaves a single component.
        energy_ratio = cumulative_energy / cumulative_energy[-1]
    v_num = int(np.count_nonzero(energy_ratio < energy_threshold)) + 1
    # Never ask for more components than the decomposition produced.
    v_num = min(v_num, len(singular_values))

    # Pearson correlation of every row with every kept component, computed
    # in one shot: centre both sides, then divide the dot products by the
    # product of the centred norms.
    components = v[:v_num]
    centered_rows = svd_model_data - svd_model_data.mean(axis=1, keepdims=True)
    centered_components = components - components.mean(axis=1, keepdims=True)
    covariance = np.dot(centered_rows, centered_components.T)
    scale = np.outer(
        np.linalg.norm(centered_rows, axis=1),
        np.linalg.norm(centered_components, axis=1),
    )
    with np.errstate(divide='ignore', invalid='ignore'):
        # Zero-variance rows yield 0/0 -> NaN, filled with 0 further down.
        corr = np.clip(covariance / scale, -1.0, 1.0)

    feature_names = ['v' + str(j) for j in range(v_num)]
    features = pd.DataFrame(corr, index=data.index, columns=feature_names)
    df_km_model_data = pd.concat([data['dr_id'], features], axis=1).fillna(0)
    return svd_model_data, df_km_model_data
# K-means clustering
def km_data(data, n, model_data=None, random_state=None):
    """Cluster the rows with K-means and give every cluster a peak label.

    Args:
        data: pandas DataFrame with a ``dr_id`` column. Every column after
            the first one is used as a clustering feature.
        n: number of clusters.
        model_data: 2-D array with one row per row of ``data``; it is
            averaged per cluster to build the cluster profiles. When
            omitted, the module-level ``svd_model_data`` is used, which is
            what this function always read before the parameter existed.
        random_state: forwarded to ``KMeans`` so a run can be reproduced.
            ``None`` keeps the previous unseeded behaviour.

    Returns:
        A tuple ``(kmeans_group_out, re_gp_id, data_output)``:
        ``kmeans_group_out`` is the array of cluster labels;
        ``re_gp_id`` is a DataFrame with columns ``group_id`` and
        ``peak_name``; ``data_output`` is a DataFrame with columns
        ``dr_id``, ``group_id`` and ``peak_name``.

    Raises:
        ValueError: if ``model_data`` and ``data`` have different row counts.
    """
    if model_data is None:
        # Backward compatibility with callers that rely on the global.
        model_data = svd_model_data
    model_data = np.asarray(model_data)

    n_clusters = n
    df_km_model_data = data
    if model_data.shape[0] != len(df_km_model_data):
        raise ValueError('model_data and data must have the same number of rows')

    x_train = df_km_model_data.iloc[:, 1:].values
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(x_train)
    kmeans_group_out = kmeans.labels_

    df_km_output = df_km_model_data[['dr_id']].assign(group_id=kmeans_group_out)

    # Mean profile of each cluster, one row per cluster id.
    group_means = np.vstack([
        model_data[kmeans_group_out == group, :].mean(axis=0)
        for group in range(n_clusters)
    ])
    re_gp = pd.DataFrame(group_means)

    # Clusters with the highest and the lowest overall mean are labelled
    # last; the others are ordered by the column position of their maximum,
    # latest position first.
    overall_mean = re_gp.mean(axis=1)
    g0 = overall_mean.idxmax()
    g1 = overall_mean.idxmin()
    by_peak_position = re_gp.idxmax(axis=1).sort_values(ascending=False).index
    ordered_ids = [int(g) for g in by_peak_position if g != g0 and g != g1]
    ordered_ids.append(int(g0))
    if g1 != g0:
        # Both coincide for n == 1 or identical profiles; the unconditional
        # append used to list that cluster twice.
        ordered_ids.append(int(g1))

    # fe_a, fe_b, ... — one name per cluster rather than a fixed list of six,
    # which padded the table with NaN rows whenever n != 6.
    peak_names = [
        'fe_' + (chr(ord('a') + pos) if pos < 26 else str(pos))
        for pos in range(len(ordered_ids))
    ]
    re_gp_id = pd.DataFrame(
        {'group_id': ordered_ids, 'peak_name': peak_names},
        columns=['group_id', 'peak_name'],
    )
    # A former `re_gp_id.sort_values(...)` call discarded its result; it was
    # dropped, so the rows stay in labelling order as they always were.

    data_output = pd.merge(df_km_output[['dr_id', 'group_id']], re_gp_id, on='group_id')
    return kmeans_group_out, re_gp_id, data_output
# Run the pipeline.
# km_data looks up the module-level name `svd_model_data` when it is called
# this way, so that name must be bound before the call.
svd_model_data, df_km_model_data = svd_data(data1)
kmeans_group_out, re_gp_id, data_output = km_data(df_km_model_data, 6)
# Debug output. Written as `#` comments: the former SQL-style `--` prefix
# executed print() and then raised TypeError on the unary minus.
# print(re_gp_id)
# print(data_output.head(10))
# Persist the result.
schema = StructType([StructField("a", LongType()),
                     StructField("b", IntegerType()),
                     StructField("c", StringType())])
# The INSERT below selects a, b, c, so the temp view has to expose those
# names. data_output carries dr_id, group_id, peak_name in that order; the
# columns are cast and renamed to line up with the schema fields.
# NOTE(review): assumes dr_id is integral (LongType) and that a/b/c map to
# dr_id/group_id/peak_name in the target table — confirm against its DDL.
output_pdf = data_output.astype({'dr_id': 'int64', 'group_id': 'int32'})
output_pdf.columns = [field.name for field in schema.fields]
spark_data_output = spark.createDataFrame(output_pdf, schema=schema)
# print('df转spark_df完成')
# createOrReplaceTempView replaces the deprecated registerTempTable.
spark_data_output.createOrReplaceTempView("tmp_table")
# print('存表完成')
# Write into Hive. '${BIZ_DATE_LINE}' is left untouched; it is presumably
# substituted by the job scheduler before this script runs.
spark.sql('''
INSERT OVERWRITE table xx.table partition(dt= '${BIZ_DATE_LINE}')
select a,b,c from tmp_table
''')