import pandas
# Load the data set used by the correlation examples below.
wa = pandas.read_csv('D://Python projects//reference data//5.6//data.csv')

# Correlation between two individual columns.
population = wa['人口']
illiteracy_rate = wa['文盲率']
population.corr(illiteracy_rate)

# Pairwise correlation between several columns.
# Several columns are selected at once by indexing with a list of names.
corr_columns = ['人口', '平均收入', '文盲率', '超市购物率', '网上购物率']
wa[corr_columns]
wa[corr_columns].corr()
# Result: (output not reproduced here)
# RFM analysis
import pandas
import numpy
# Load the transaction records used for the RFM analysis.
RFM_data = pandas.read_csv('D://Python projects//reference data//5.7//data.csv')

# Normalise the deal timestamps; they are parsed with a year/month/day format.
deal_time = pandas.to_datetime(RFM_data['DealDateTime'], format='%Y/%m/%d')
RFM_data['DealDateTime'] = deal_time

# Time elapsed between now and each deal, keeping only the whole-day part.
elapsed = pandas.to_datetime('today') - RFM_data['DealDateTime']
RFM_data['DateDiff'] = elapsed.dt.days
# Per-customer recency, frequency and monetary value.
#
# The dict-renaming form ``.agg({'name': func})`` on a single grouped column
# was removed in pandas 1.0 (it raises SpecificationError), so each metric is
# computed with the matching reduction and named through ``to_frame``.
customer_groups = RFM_data.groupby(by=['CustomerID'])

# Recency: smallest number of days since a deal.
R_agg = customer_groups['DateDiff'].min().to_frame('Rencentagg')
# Frequency: number of order rows per customer.
F_agg = customer_groups['OrderID'].size().to_frame('Frequencyagg')
# Monetary: total sales per customer.
M_agg = customer_groups['Sales'].sum().to_frame('Moneyagg')

# Combine the three metrics into one table indexed by CustomerID.
data_agg = R_agg.join(F_agg).join(M_agg)
# Score each metric from 1 to 5 according to the quintile it falls into.
def _quintile_score(values, labels):
    """Bucket *values* into five quantile-based bins and label each bucket.

    The bin edges are the 0/20/40/60/80/100 % quantiles of *values*, taken
    with ``interpolation='nearest'`` so every edge is an observed value.
    The lowest edge is then replaced by 0: ``pandas.cut`` bins are open on
    the left, so keeping the minimum as the first edge would leave the
    smallest value without a bin.

    NOTE(review): values <= 0 still fall outside every bin and come back as
    NaN, and ``pandas.cut`` raises ValueError when two edges coincide.

    Args:
        values: numeric Series to score.
        labels: five labels, ordered from the lowest bin to the highest.

    Returns:
        Categorical Series aligned with *values*.
    """
    edges = values.quantile(
        q=[0, 0.2, 0.4, 0.6, 0.8, 1],
        interpolation='nearest',
    ).tolist()
    edges[0] = 0
    return pandas.cut(values, edges, labels=labels)


# Recency labels run 5..1 (fewer days since the last deal scores higher);
# frequency and monetary labels run 1..5.
R_result = _quintile_score(data_agg['Rencentagg'], labels=[5, 4, 3, 2, 1])
F_result = _quintile_score(data_agg['Frequencyagg'], labels=[1, 2, 3, 4, 5])
M_result = _quintile_score(data_agg['Moneyagg'], labels=[1, 2, 3, 4, 5])

# Attach the three scores to the aggregated table.
data_agg['R_result'] = R_result
data_agg['F_result'] = F_result
data_agg['M_result'] = M_result
# Combine the three scores into a single RFM value:
# R in the hundreds place, F in the tens place, M in the units place.
data_agg['RFM'] = (
    100 * data_agg['R_result'].astype(int)
    + 10 * data_agg['F_result'].astype(int)
    + data_agg['M_result'].astype(int)
)

# Split customers into 8 classes using evenly spaced quantiles of the RFM
# value. The sixth edge is 0.625 (5/8); the previous 0.675 made the fifth
# class too wide and the sixth too narrow.
bins = data_agg.RFM.quantile(
    q=[0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1],
    interpolation='nearest')
# pandas.cut bins are open on the left, so lower the first edge to 0 to keep
# the smallest RFM value inside the first class.
bins.iloc[0] = 0
labels = [1, 2, 3, 4, 5, 6, 7, 8]
data_agg['level'] = pandas.cut(
    data_agg.RFM,
    bins,
    labels=labels)
# Reset the index so the current index becomes an ordinary column again.
data_agg = data_agg.reset_index()

# Inspect how customers are distributed over the classes.
# DataFrame.sort was removed from pandas; sort_values is its replacement.
# The sorted frame is not stored, so data_agg keeps its current order.
data_agg.sort_values(
    ['level', 'RFM'],
    ascending=[True, True])

# Number of customers per level. The dict-renaming form of ``agg`` was
# removed in pandas 1.0, so the count is taken with ``size`` and named via
# ``to_frame``. The result is likewise not stored.
data_agg.groupby(
    by=['level']
)['CustomerID'].size().to_frame('size')