基于python的幂律分布中帕累托分布拟合

用分箱数据拟合

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
df=pd.read_excel(r'C:\Users\apache\Desktop\pareto_data.xlsx')
df.head()
df.sample(10)
df.columns
df.dtypes
df.describe().T
countmeanstdmin25%50%75%max
Freq_per_year75.035.74333322.6684750.516.50000035.00000053.5000081.0
lost_Prob75.00.1622830.2056480.00.0918680.1072060.145231.0
lost75.09468.56000062971.4674480.01.0000006.00000099.50000540540.0
existing75.017566.64000078662.6798380.05.00000055.000000784.00000624735.0
# missing values process
df1=df[~(df.lost_Prob==0)]
df1.describe().T
countmeanstdmin25%50%75%max
Freq_per_year63.030.20238120.0277960.50000013.50000029.00000045.50000081.0
lost_Prob63.00.1931940.2107320.0555560.0994030.1120630.1666671.0
lost63.011272.09523868645.8015781.0000002.00000015.000000170.000000540540.0
existing63.020911.93650885524.1913100.00000013.500000113.0000001419.000000624735.0
fig,ax=plt.subplots(3,1,figsize=(15,7))
ax[0].scatter(df1.Freq_per_year,df1.lost_Prob)
ax[1].scatter(df1[df1.Freq_per_year<=50].Freq_per_year,df1[df1.Freq_per_year<=50].lost_Prob)
ax[2].scatter(df1[df1.Freq_per_year<=35].Freq_per_year,df1[df1.Freq_per_year<=35].lost_Prob)
plt.show()

在这里插入图片描述在这里插入图片描述
在这里插入图片描述在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传

df2=df1[df1.Freq_per_year<=35]
df2.describe().T
countmeanstdmin25%50%75%max
Freq_per_year38.016.67763210.8380130.5000007.25000016.50000025.75000035.000000
lost_Prob38.00.1231470.0677280.0657890.0970680.1070220.1175920.463873
lost38.018686.44736888056.3515235.00000017.50000099.500000800.750000540540.000000
existing38.034662.210526108476.75835655.000000181.750000784.0000007073.750000624735.000000
fit.distribution_compare('power_law','exponential')
# help(fit.distribution_compare)
# distribution_compare(dist1, dist2, nested=None, **kwargs) method of powerlaw.Fit instance
#     Returns the loglikelihood ratio, and its p-value, between the two
#     distribution fits, ![在这里插入图片描述](https://img-blog.csdnimg.cn/07ed29b129de4b81877f0bb363d3ebf6.png#pic_center)
assuming the candidate distributions are nested.
#     Returns
#     -------
#     R : float
#         Loglikelihood ratio of the two distributions' fit to the data. If
#         greater than 0, the first distribution is preferred. If less than
#         0, the second distribution is preferred.
#     p : float
#         Significance of R
# 5.508748079440679>0 
(5.508748079440679, 0.061570114437107006)
import powerlaw
powerlaw
fit=powerlaw.Fit(df2.lost_Prob,descrete=False)
Calculating best minimal value for power law fit
xmin progress: 97%
print(fit.alpha,'\n',fit.xmin)
4.830091504927531 
 0.0915032679738562
plt.figure(figsize=(7,3))
powerlaw.plot_pdf(df2.lost_Prob,color ='r') 
plt.show()
plt.figure(figsize=(7,3))
powerlaw.plot_cdf(df2.lost_Prob,color ='b')
plt.show()

外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传

外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传

# pdf:
#     p(x)=C*x**-alpha,s.t.C=(alpha-1)/xmin**(-alpha+1),when x<=300
df2['pred_lost_Prob']=(fit.alpha-1)/fit.xmin**(-fit.alpha+1)*df2.Freq_per_year**-fit.alpha
df2['pred_error']=df2.lost_Prob-df2.pred_lost_Prob
print(df2.pred_error.describe())
count    38.000000
mean      0.122832
std       0.066159
min       0.065789
25%       0.097068
50%       0.107022
75%       0.117592
max       0.452407
Name: pred_error, dtype: float64


C:\Users\apache\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
C:\Users\apache\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
fig,ax=plt.subplots(1,1,figsize=(18,10))
ax.plot(df2.Freq_per_year,df2.pred_lost_Prob,label='predict',color='r')
ax.legend(loc='upper left')
#x同轴
ax2=ax.twinx()
ax2.plot(df2.Freq_per_year,df2.lost_Prob,label='actual') 
ax2.legend(loc='upper right')
plt.tight_layout()
# plt.grid()
plt.show()

外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传

# df.lost_Prob.plot()
# plt.show()
# plt.scatter(df.Freq_per_year,df.lost_Prob)
# plt.show()
df30=df.head(30)
# df30
# plt.scatter(df30.Freq_per_year,df30.lost_Prob)
# plt.show()
# 单行多列或者单列多行按照一维数组方式引用子图ax[0]...
fig,ax=plt.subplots(1,3,figsize=(18,3))
ax[0].scatter(df30.Freq_per_year,df30.lost_Prob,c='r')
ax[1].plot(df30.Freq_per_year,df30.lost_Prob)
ax[2].plot(df.Freq_per_year,df.lost_Prob)
plt.tight_layout
plt.show()

外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传

# !pip install powerlaw
import powerlaw
powerlaw
# help(powerlaw)
<module 'powerlaw' from 'C:\\Users\\apache\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\powerlaw.py'>
fit=powerlaw.Fit(df30.lost_Prob,descrete=False)
# fit=powerlaw.Fit(df.lost_Prob)
# help(powerlaw.Fit)
Calculating best minimal value for power law fit
xmin progress: 96%
print('alpha=',fit.alpha,'\n')
# help(fit)
print('xmin=',fit.xmin,'\n')
df30.lost_Prob.describe().T
alpha= 4.158527091405096 

xmin= 0.08629441624365482 






count    30.000000
mean      0.129028
std       0.074869
min       0.086294
25%       0.097399
50%       0.107495
75%       0.118141
max       0.463873
Name: lost_Prob, dtype: float64
fit.distribution_compare('power_law','exponential')
# help(fit.distribution_compare)
# distribution_compare(dist1, dist2, nested=None, **kwargs) method of powerlaw.Fit instance
#     Returns the loglikelihood ratio, and its p-value, between the two
#     distribution fits, assuming the candidate distributions are nested.
(5.508748079440679, 0.061570114437107006)
plt.figure(figsize=(9,4))
powerlaw.plot_pdf(df30.lost_Prob,color ='r') 
# plt.show()
powerlaw.plot_cdf(df30.lost_Prob,color ='b')
plt.show()

外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传

用不分箱数据拟合

import pandas as pd
df_raw=pd.read_excel(r'C:\Users\apache\Desktop\df_raw.xlsx')
df_raw.head()
Freqlostexisting
01483938473166
12104579199597
2341374117802
342171078646
451286557635
# generate lost freq that similar as prob
df_raw['lost_prob']=df_raw.apply(lambda d:d.lost/(d.lost+d.existing),axis=1)
df_raw.head()
Freqlostexistinglost_prob
014839384731660.505627
121045791995970.343811
23413741178020.259926
3421710786460.216330
4512865576350.182482
## considering missing and zero values 
print(df_raw.describe(),'\n','vs')
df_raw[df_raw.lost_prob==0]
# 删除lost prob为0的行
df_raw1=df_raw.loc[~(df_raw.lost_prob==0)]
# 删除0值所在的行
# df_raw[~(df_raw==0).any(axis=1)]
print(df_raw1.describe())
print(df_raw2.describe())
              Freq           lost       existing   lost_prob
count   609.000000     609.000000     609.000000  609.000000
mean    323.100164    1165.523810    2162.656814    0.025200
std     211.096171   20148.630371   21872.040822    0.050174
min       1.000000       0.000000       1.000000    0.000000
25%     153.000000       0.000000       2.000000    0.000000
50%     305.000000       0.000000      10.000000    0.000000
75%     463.000000       3.000000      94.000000    0.040323
max    1253.000000  483938.000000  473166.000000    0.505627 
 vs
             Freq           lost       existing   lost_prob
count  254.000000     254.000000     254.000000  254.000000
mean   138.622047    2794.503937    5175.377953    0.060420
std     93.278658   31161.435154   33675.021899    0.062558
min      1.000000       1.000000       1.000000    0.006410
25%     64.250000       1.000000      46.250000    0.031533
50%    127.500000       5.000000     152.500000    0.045663
75%    196.500000      34.500000     781.250000    0.063387
max    487.000000  483938.000000  473166.000000    0.505627
             Freq           lost       existing   lost_prob  pred_lost_prob
count  240.000000     240.000000     240.000000  240.000000    2.400000e+02
mean   125.850000    2957.458333    5476.933333    0.052623    4.920171e-06
std     77.713146   32053.579689   34623.367223    0.047232    6.827182e-05
min      1.000000       1.000000       8.000000    0.006410    1.699603e-12
25%     60.750000       2.000000      56.750000    0.031199    9.079929e-12
50%    120.500000       7.000000     184.000000    0.043609    4.117756e-11
75%    184.250000      44.000000     857.750000    0.058824    4.717054e-10
max    295.000000  483938.000000  473166.000000    0.505627    1.054178e-03
import matplotlib.pyplot as plt
import seaborn as sbn
sbn.set_style('whitegrid')
fig,ax=plt.subplots(4,1,figsize=(15,9))
ax[0].scatter(df_raw.Freq,df_raw.lost_prob)
ax[1].scatter(df_raw1.Freq,df_raw1.lost_prob)
ax[2].scatter(df_raw1[df_raw1.Freq<300].Freq,df_raw1[df_raw1.Freq<300].lost_prob)
ax[3].hist(df_raw1[df_raw1.Freq<300].lost_prob,bins=15)
plt.show()
# 当消费频次不超过300次,流失概率没有出现翘尾震荡,可以用powerlaw分布拟合

外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传

df_raw2=df_raw1[df_raw1.Freq<300]
print(df_raw2.head())
import powerlaw
fit=powerlaw.Fit(df_raw2.lost_prob,descrete=False)
   Freq    lost  existing  lost_prob
0     1  483938    473166   0.505627
1     2  104579    199597   0.343811
2     3   41374    117802   0.259926
3     4   21710     78646   0.216330
4     5   12865     57635   0.182482
Calculating best minimal value for power law fit
xmin progress: 99%

9%

print(fit.alpha)
print(fit.xmin)
3.559979143336282
0.047619047619047616
# pdf:
#     p(x)=C*x**-alpha,s.t.C=(alpha-1)/xmin**(-alpha+1),when x<=300
# 预测的会员流失概率与会员消费活跃度的关系为:
#     lost_prob=(3.56-1)/0.0476**(-3.56+1)*Freq**-3.56

df_raw2['pred_lost_prob']=((3.56-1)/0.0476**(-3.56+1))*df_raw2.Freq**-3.56
df_raw2
C:\Users\apache\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Freqlostexistinglost_probpred_lost_prob
014839384731660.5056271.054178e-03
121045791995970.3438118.938136e-05
23413741178020.2599262.110387e-05
3421710786460.2163307.578447e-06
4512865576350.1824823.424369e-06
..................
2882891150.0625001.828595e-12
2902911160.0588241.784246e-12
291292180.1111111.762588e-12
2932941160.0588241.720273e-12
2942951120.0769231.699603e-12

240 rows × 5 columns

# 比较预测值与实际值
# plt.plot(df_raw2.lost_prob)
plt.figure(figsize=(16,3))
plt.plot(df_raw2.pred_lost_prob,color='r')
plt.show()

外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传

# 生成1w个[0,1]上均匀分布的随机数
rnd_list = np.random.uniform(0,1,10**5)
rnd_list.shape
(100000,)
a, m = 3, 2 
s = (np.random.pareto(a, 1000) + 1) * m
plt.hist(np.random.pareto(a,1000),bins=10)
plt.show()

外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传

plt.plot(s)
plt.show()
plt.hist(s,bins=15)
plt.show()

外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传

外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传

X与alpha值域对应的曲线形状模拟

import numpy as np
import matplotlib.pyplot as plt
x1=np.linspace(0,1,10)
x2=np.linspace(1,3,10)
x3=np.linspace(-3,0,10)

fig,ax=plt.subplots(6,1,figsize=(15,20))
ax[0].plot(x1,x1**3)
ax[1].plot(x2,x2**3)
ax[2].plot(x1,x1**-3)
ax[3].plot(x2,x2**-3)
ax[4].plot(x3,x3**3)
ax[5].plot(x3,x3**-3)
plt.show()
C:\Users\apache\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py:8: RuntimeWarning: divide by zero encountered in power
  
C:\Users\apache\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py:11: RuntimeWarning: divide by zero encountered in power
  # This is added back by InteractiveShellApp.init_path()

外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传


---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-17-a56dda00e550> in <module>()
----> 1 for item in range(0,100,0.5):
      2     print(item)


TypeError: 'float' object cannot be interpreted as an integer
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值