# Imports (originally collapsed onto a single line, which is a syntax error).
from pyspark.sql import SparkSession
import pandas as pd
from sklearn import preprocessing
In [7]:
# Create the SparkSession — the entry point for the DataFrame/SQL APIs.
# (In the collapsed original the code sat on the same physical line as the
# leading comment and was therefore commented out.)
spark = SparkSession.builder \
    .master("local") \
    .appName("Marketing strategy") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
In [8]:
# Read the CSV file: encoding='gbk' because it contains Chinese text,
# header=True to take column names from the first row, inferSchema=True so
# numeric columns get numeric types.
# Use a raw string for the Windows path so backslashes can never be
# interpreted as escape sequences (e.g. "\t", "\n").
tradFlow = spark.read.csv(
    r"D:\python\RFM_TRAD_FLOW.csv",
    inferSchema=True,
    header=True,
    encoding='gbk',
)
In [9]:
# Preview the first 20 transaction rows (transID, cumid, time, amount, type).
tradFlow.show(20)
+-------+-----+----------------+------+----------+--------------+ |transID|cumid| time|amount|type_label| type| +-------+-----+----------------+------+----------+--------------+ | 9407|10001|14JUN09:17:58:34| 199.0| 正常| Normal| | 9625|10001|16JUN09:15:09:13| 369.0| 正常| Normal| | 11837|10001|01JUL09:14:50:36| 369.0| 正常| Normal| | 26629|10001|14DEC09:18:05:32| 359.0| 正常| Normal| | 30850|10001|12APR10:13:02:20| 399.0| 正常| Normal| | 32007|10001|04MAY10:16:45:58| 269.0| 正常| Normal| | 36637|10001|04JUN10:20:03:06| 0.0| 赠送| Presented| | 43108|10001|06JUL10:16:56:40| 381.0| 正常| Normal| | 43877|10001|10JUL10:20:41:54|-399.0| 退货|returned_goods| | 46081|10001|23JUL10:16:35:45| 0.0| 赠送| Presented| | 51650|10001|05SEP10:15:53:48| 0.0| 赠送| Presented| | 52266|10001|11SEP10:17:17:51| 0.0| 赠送| Presented| | 52769|10001|17SEP10:12:47:43| 120.0| 正常| Normal| | 36050|10001|01JUN10:18:13:35|-295.0| 退货|returned_goods| | 43207|10001|07JUL10:20:05:36| 0.0| 赠送| Presented| | 14812|10001|21JUL09:09:31:26| 36.0| 正常| Normal| | 24752|10001|12OCT09:10:39:32| 300.0| 特价| Special_offer| | 24779|10001|12OCT09:10:59:13| 120.0| 特价| Special_offer| | 30394|10001|31MAR10:20:29:48| 0.0| 赠送| Presented| | 33073|10001|14MAY10:13:31:23| 239.0| 正常| Normal| +-------+-----+----------------+------+----------+--------------+ only showing top 20 rows
In [10]:
# Register the DataFrame as a temporary view so it can be queried with Spark SQL.
tradFlow.createOrReplaceTempView('tradFlow')
In [11]:
# F metric: transaction counts per customer and type, reflecting each
# customer's preference for discounted products.
data_tradFlow1 = spark.sql(
    "select cumid,type,count(transID) as count "
    "from tradFlow "
    "group by cumid,type "
    "order by cumid,type"
)
# Pivot the type values into columns; combinations a customer never had
# come back null, so fill them with 0.
data_tradFlow2 = (
    data_tradFlow1.groupBy('cumid')
    .pivot("type", ["Normal", "Presented", "Special_offer", "returned_goods"])
    .sum("count")
    .na.fill(0)
)
# Special_scale = share of special-offer purchases among paid transactions
# (Normal + Special_offer).
# NOTE(review): the division yields null when Normal + Special_offer == 0 —
# presumably every customer has at least one paid transaction; verify.
data_tradFlow6 = data_tradFlow2.withColumn(
    'Special_scale',
    data_tradFlow2['Special_offer']
    / (data_tradFlow2['Special_offer'] + data_tradFlow2['Normal']),
)
data_tradFlow6.show()
+-----+------+---------+-------------+--------------+-------------------+ |cumid|Normal|Presented|Special_offer|returned_goods| Special_scale| +-----+------+---------+-------------+--------------+-------------------+ |10206| 10| 6| 2| 2|0.16666666666666666| |20135| 21| 9| 3| 0| 0.125| |40011| 14| 3| 1| 0|0.06666666666666667| |40107| 18| 6| 5| 1|0.21739130434782608| |10081| 14| 8| 0| 0| 0.0| |10121| 14| 12| 1| 2|0.06666666666666667| |10230| 11| 8| 1| 0|0.08333333333333333| |20268| 12| 5| 0| 1| 0.0| |30183| 14| 8| 0| 0| 0.0| |40021| 10| 6| 0| 2| 0.0| |40199| 12| 9| 2| 1|0.14285714285714285| |20020| 11| 4| 2| 1|0.15384615384615385| |20029| 8| 3| 2| 1| 0.2| |20052| 8| 6| 0| 0| 0.0| |20120| 12| 4| 1| 0|0.07692307692307693| |20134| 16| 5| 2| 2| 0.1111111111111111| |20247| 17| 6| 1| 0|0.05555555555555555| |20170| 10| 11| 2| 1|0.16666666666666666| |40132| 8| 6| 1| 1| 0.1111111111111111| |40165| 17| 7| 0| 1| 0.0| +-----+------+---------+-------------+--------------+-------------------+ only showing top 20 rows
In [12]:
# M metric: total transaction amount per customer and type, reflecting
# customer monetary value.
data_tradFlow3 = spark.sql(
    "select cumid,type,sum(amount) as total "
    "from tradFlow "
    "group by cumid,type "
    "order by cumid,type"
)
# Pivot type totals into columns and fill absent combinations with 0.
data_tradFlow4 = (
    data_tradFlow3.groupBy('cumid')
    .pivot("type", ["Normal", "Presented", "Special_offer", "returned_goods"])
    .sum("total")
    .na.fill(0)
)
# Net amount = paid purchases plus (negative) returns; Presented rows are
# free gifts (amount 0) and are deliberately excluded.
data_tradFlow5 = data_tradFlow4.withColumn(
    'amount',
    data_tradFlow4['Normal']
    + data_tradFlow4['Special_offer']
    + data_tradFlow4['returned_goods'],
)
data_tradFlow5.show()
+-----+------+---------+-------------+--------------+------+ |cumid|Normal|Presented|Special_offer|returned_goods|amount| +-----+------+---------+-------------+--------------+------+ |10206|2479.0| 0.0| 352.0| -800.0|2031.0| |20135|3356.0| 0.0| 399.0| 0.0|3755.0| |40011|2422.0| 0.0| 20.0| 0.0|2442.0| |40107|4005.0| 0.0| 464.0| -399.0|4070.0| |10081|3201.0| 0.0| 0.0| 0.0|3201.0| |10121|3378.0| 0.0| 224.0| -413.0|3189.0| |10230|2385.0| 0.0| 242.0| 0.0|2627.0| |20268|2487.0| 0.0| 0.0| -314.0|2173.0| |30183|3121.0| 0.0| 0.0| 0.0|3121.0| |40021|1992.0| 0.0| 0.0| -311.0|1681.0| |40199|3127.0| 0.0| 323.0| -239.0|3211.0| |20020|3029.0| 0.0| 181.0| -249.0|2961.0| |20029|1499.0| 0.0| 216.0| -299.0|1416.0| |20052|1184.0| 0.0| 0.0| 0.0|1184.0| |20120|1967.0| 0.0| 140.0| 0.0|2107.0| |20134|4158.0| 0.0| 110.0| -653.0|3615.0| |20247|4342.0| 0.0| 159.0| 0.0|4501.0| |20170|1348.0| 0.0| 398.0| -349.0|1397.0| |40132|1809.0| 0.0| 100.0| -269.0|1640.0| |40165|3302.0| 0.0| 0.0| -359.0|2943.0| +-----+------+---------+-------------+--------------+------+ only showing top 20 rows
In [13]:
# Compute R (recency) to reflect customer activity.
# Timestamp format in the data: 14JUN09:17:58:34 (ddMONyy:HH:MM:SS).
# TODO: not implemented yet.
In [14]:
#将M、F、R整合成一个DataFrame data_tradFlow5.createOrReplaceTempView("data_tradFlow5") data_tradFlow6.createOrReplaceTempView("data_tradFlow6") data_tradFlow7 = spark.sql("select a.cumid,a.amount,b.Special_scale \ from data_tradFlow5 as a,data_tradFlow6 as b \ where a.cumid = b.cumid \ order by cumid") data_tradFlow7.show()
+-----+------+--------------------+ |cumid|amount| Special_scale| +-----+------+--------------------+ |10001|3334.0| 0.11764705882352941| |10002|1652.0| 0.0| |10003|3435.0| 0.0625| |10004|3312.0| 0.11764705882352941| |10005|2119.0| 0.0| |10006|1859.0| 0.09090909090909091| |10007|3961.0| 0.0625| |10008|3855.0| 0.15| |10009|3257.0| 0.0| |10010|2333.0| 0.2727272727272727| |10011|3589.0|0.058823529411764705| |10012|4676.0| 0.15| |10013|2826.0| 0.0625| |10014|4721.0| 0.13636363636363635| |10015|3724.0| 0.0625| |10016|4053.0| 0.0625| |10017|1793.0| 0.21428571428571427| |10018|2904.0| 0.0| |10019|3754.0| 0.125| |10020|1630.0| 0.0| +-----+------+--------------------+ only showing top 20 rows
In [20]:
# Pull the joined result into pandas for local modelling, then rename the
# columns: amount -> value, Special_scale -> interest.
data_df = data_tradFlow7.toPandas()
# BUG FIX: rename_axis(mapper, axis=...) for relabeling columns was
# deprecated in pandas 0.21 and later removed; rename(columns=...) is the
# supported API for renaming columns.
data_df.rename(columns={"amount": "value", "Special_scale": "interest"}, inplace=True)
data_df.head()
Out[20]:
cumid | value | interest | |
---|---|---|---|
0 | 10001 | 3334.0 | 0.117647 |
1 | 10002 | 1652.0 | 0.000000 |
2 | 10003 | 3435.0 | 0.062500 |
3 | 10004 | 3312.0 | 0.117647 |
4 | 10005 | 2119.0 | 0.000000 |
In [80]:
# Build the segmentation model and label target customers.


def _binarize_at_median(series):
    """Return a 1-column DataFrame: 1.0 where the value is strictly above
    the series' 2-quantile boundary (median), else 0.0."""
    # qcut(..., retbins=True) returns (categories, bin_edges); edge [1] is
    # the boundary between the two equal-frequency bins.
    threshold = pd.qcut(series, 2, retbins=True)[1][1]
    binarizer = preprocessing.Binarizer(threshold=threshold)
    return pd.DataFrame(binarizer.transform(series.values.reshape(-1, 1)))


# Binarize interest (preference for special offers) and value (net spend),
# indexing both by customer id.
interest_q = _binarize_at_median(data_df['interest'])
interest_q.index = data_df.cumid
interest_q.columns = ['interest']

value_q = _binarize_at_median(data_df['value'])
value_q.index = data_df.cumid
value_q.columns = ['value']

# Join the two binary flags on cumid.
analysis = pd.concat([interest_q, value_q], axis=1)
analysis.head()

# Segment labels keyed by (interest, value); float flags (0.0/1.0) hash
# equal to the int keys.
label = {
    (0, 0): '无兴趣-低价值',
    (1, 0): '有兴趣-低价值',
    (0, 1): '无兴趣-高价值',
    (1, 1): '有兴趣-高价值',
}
# Look up each row's label by column name — positional Series indexing
# (x[0], x[1]) is deprecated in modern pandas.
analysis['label'] = analysis.apply(
    lambda row: label[row['interest'], row['value']], axis=1
)
analysis
Out[80]:
interest | value | label | |
---|---|---|---|
cumid | |||
10001 | 1.0 | 1.0 | 有兴趣-高价值 |
10002 | 0.0 | 0.0 | 无兴趣-低价值 |
10003 | 0.0 | 1.0 | 无兴趣-高价值 |
10004 | 1.0 | 1.0 | 有兴趣-高价值 |
10005 | 0.0 | 0.0 | 无兴趣-低价值 |
10006 | 1.0 | 0.0 | 有兴趣-低价值 |
10007 | 0.0 | 1.0 | 无兴趣-高价值 |
10008 | 1.0 | 1.0 | 有兴趣-高价值 |
10009 | 0.0 | 1.0 | 无兴趣-高价值 |
10010 | 1.0 | 0.0 | 有兴趣-低价值 |
10011 | 0.0 | 1.0 | 无兴趣-高价值 |
10012 | 1.0 | 1.0 | 有兴趣-高价值 |
10013 | 0.0 | 0.0 | 无兴趣-低价值 |
10014 | 1.0 | 1.0 | 有兴趣-高价值 |
10015 | 0.0 | 1.0 | 无兴趣-高价值 |
10016 | 0.0 | 1.0 | 无兴趣-高价值 |
10017 | 1.0 | 0.0 | 有兴趣-低价值 |
10018 | 0.0 | 0.0 | 无兴趣-低价值 |
10019 | 1.0 | 1.0 | 有兴趣-高价值 |
10020 | 0.0 | 0.0 | 无兴趣-低价值 |
10021 | 1.0 | 0.0 | 有兴趣-低价值 |
10022 | 0.0 | 0.0 | 无兴趣-低价值 |
10023 | 0.0 | 1.0 | 无兴趣-高价值 |
10024 | 1.0 | 1.0 | 有兴趣-高价值 |
10025 | 1.0 | 0.0 | 有兴趣-低价值 |
10026 | 0.0 | 0.0 | 无兴趣-低价值 |
10027 | 1.0 | 1.0 | 有兴趣-高价值 |
10028 | 1.0 | 0.0 | 有兴趣-低价值 |
10029 | 0.0 | 0.0 | 无兴趣-低价值 |
10030 | 1.0 | 0.0 | 有兴趣-低价值 |
... | ... | ... | ... |
40271 | 1.0 | 1.0 | 有兴趣-高价值 |
40272 | 0.0 | 0.0 | 无兴趣-低价值 |
40273 | 0.0 | 0.0 | 无兴趣-低价值 |
40274 | 1.0 | 0.0 | 有兴趣-低价值 |
40275 | 1.0 | 0.0 | 有兴趣-低价值 |
40276 | 1.0 | 1.0 | 有兴趣-高价值 |
40277 | 1.0 | 0.0 | 有兴趣-低价值 |
40278 | 0.0 | 1.0 | 无兴趣-高价值 |
40279 | 1.0 | 0.0 | 有兴趣-低价值 |
40280 | 1.0 | 0.0 | 有兴趣-低价值 |
40281 | 0.0 | 1.0 | 无兴趣-高价值 |
40282 | 1.0 | 0.0 | 有兴趣-低价值 |
40283 | 0.0 | 1.0 | 无兴趣-高价值 |
40284 | 1.0 | 1.0 | 有兴趣-高价值 |
40285 | 1.0 | 1.0 | 有兴趣-高价值 |
40286 | 1.0 | 0.0 | 有兴趣-低价值 |
40287 | 0.0 | 0.0 | 无兴趣-低价值 |
40288 | 0.0 | 0.0 | 无兴趣-低价值 |
40289 | 0.0 | 1.0 | 无兴趣-高价值 |
40290 | 0.0 | 0.0 | 无兴趣-低价值 |
40291 | 0.0 | 1.0 | 无兴趣-高价值 |
40292 | 0.0 | 1.0 | 无兴趣-高价值 |
40293 | 1.0 | 1.0 | 有兴趣-高价值 |
40294 | 0.0 | 1.0 | 无兴趣-高价值 |
40295 | 0.0 | 1.0 | 无兴趣-高价值 |
40296 | 0.0 | 0.0 | 无兴趣-低价值 |
40297 | 1.0 | 1.0 | 有兴趣-高价值 |
40298 | 1.0 | 0.0 | 有兴趣-低价值 |
40299 | 0.0 | 1.0 | 无兴趣-高价值 |
40300 | 1.0 | 0.0 | 有兴趣-低价值 |
1200 rows × 3 columns