背景
《艾伯特贝叶斯思维:统计建模的Python学习法.pdf》以面向对象的方式给出代码。作为一本介绍统计思想的书,我们只需要了解其逻辑即可(PS:其实是懒得去看书中代码的组织过程),因此,这里给出读这本书时自己写的代码。
简单复述
总结第五章决策分析的处理过程的简单逻辑:
1、基于历史数据得到各选手所面对商品的价格分布,以及选手出价与真实价格之差(价差)的分布;2、选手见到商品后猜测一个价格,并按下面的过程得到商品真实价格的后验分布——先验分布:商品价格的历史分布;似然度:由第一步的价差分布给出;后验分布:商品真实价格的分布;3、基于游戏规则计算己方出不同价格时的预期收益;4、出预期收益最大的价格。
导入常见模块
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib. pyplot import plot as plot
import sklearn
import seaborn as sns
import sys
import patsy
sys. path. append( r"C:\Users\Administrator\PycharmProjects\QY_TS_Quant" )
from QY_plot import *
plt. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
sns. set ( font= 'SimHei' , font_scale= 1.25 , style= "ticks" , rc= { "xtick.major.size" : 3 , "ytick.major.size" : 3 } )
% matplotlib inline
% config InlineBackend. figure_format= "retina"
% config InlineBackend. rc = { "figure.figsize" : ( 7.5 , 4.5 ) }
from IPython. core. interactiveshell import InteractiveShell
InteractiveShell. ast_node_interactivity = "all"
导入数据
# Download the 2011 and 2012 showcase files and stack them into one frame.
url_2011 = "http://thinkbayes.com/showcases.2011.csv"
url_2012 = "http://thinkbayes.com/showcases.2012.csv"


def _read_showcases(url, skiprows, year):
    """Read one raw showcase file and return it with one show per row.

    The first column of the raw file is used as the index and the table is
    transposed, so its entries (Showcase/Bid/Difference) become the columns.
    ``skiprows`` lists the raw line numbers to drop while parsing
    (presumably blank or non-data lines -- verify against the raw file).
    A constant ``year`` column is added and the index is reset to 0..n-1.
    """
    raw = pd.read_csv(url, index_col=0, skiprows=skiprows)
    return raw.T.assign(year=year).reset_index(drop=True)


df_2011 = _read_showcases(url_2011, [1, 2, 5, 8], 2011)
df_2012 = _read_showcases(url_2012, [1, 4, 7], 2012)
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
# call and works on both old and new pandas versions.
df = pd.concat([df_2011, df_2012], ignore_index=True)
df_2011. head( )
Showcase 1 Showcase 2 Bid 1 Bid 2 Difference 1 Difference 2 year 0 50969 45429 42000 34000 8969 11429 2011 1 21901 34061 14000 59900 7901 -25839 2011 2 32815 53186 32000 45000 815 8186 2011 3 44432 31428 27000 38000 17432 -6572 2011 4 24273 22320 18750 23000 5523 -680 2011
df_2012. head( )
Showcase 1 Showcase 2 Bid 1 Bid 2 Difference 1 Difference 2 year 0 40811 62485 22000 40500 18811 21985 2012 1 31259 32972 10000 21000 21259 11972 2012 2 41943 24755 39500 22500 2443 2255 2012 3 24946 34665 21513 32000 3433 2665 2012 4 26257 48635 23420 35000 2837 13635 2012
展品价格分布
# Summary statistics of every column per year, transposed so the years are
# the columns; `minus` is the last year column minus the one before it
# (2012 - 2011 here). Only the rows of the "Show..." columns are kept.
df. groupby( "year" ) . describe( ) . T. assign( minus= lambda x: x. diff( axis= 1 ) . iloc[ : , - 1 ] ) . filter ( like= "Show" , axis= 0 )
year 2011 2012 minus Showcase 1 count 191.000000 122.000000 -69.000000 mean 29482.738220 31578.172131 2095.433911 std 6713.205663 7628.472048 915.266384 min 19563.000000 20869.000000 1306.000000 25% 24470.500000 25684.500000 1214.000000 50% 28762.000000 29402.500000 640.500000 75% 32740.500000 36599.500000 3859.000000 max 54579.000000 58342.000000 3763.000000 Showcase 2 count 191.000000 122.000000 -69.000000 mean 30668.146597 31641.868852 973.722256 std 8003.103747 8728.088463 724.984716 min 18349.000000 19290.000000 941.000000 25% 25310.500000 24900.250000 -410.250000 50% 29460.000000 29530.500000 70.500000 75% 34191.000000 35764.000000 1573.000000 max 71628.000000 66996.000000 -4632.000000
df. filter ( like= "Show" ) . describe( )
Showcase 1 Showcase 2 count 313.000000 313.000000 mean 30299.488818 31047.680511 std 7145.705405 8293.059002 min 19563.000000 18349.000000 25% 24866.000000 25264.000000 50% 28958.000000 29488.000000 75% 34428.000000 34665.000000 max 58342.000000 71628.000000
list ( filter ( lambda x: x. startswith( "Show" ) , df. columns) )
['Showcase 1', 'Showcase 2']
# Draw the distribution of each showcase price in its own panel, with a red
# reference line at 30000 on both panels.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; kept
# here because the seaborn version this notebook targets is not visible.
fig, axes = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(18, 6))
showcase_cols = [name for name in df.columns if name.startswith("Show")]
for i, col in enumerate(showcase_cols):
    panel = axes[i]
    sns.distplot(df[col], ax=panel);
    panel.axvline(30000, c="r");
fig.suptitle("$2011-2012$正确商品价格分布", fontsize=15);
fig.subplots_adjust(wspace=0.05);
价差分析
# Cumulative histogram of each "Diff..." column, one panel per column, with
# a red line at 0 separating negative from non-negative differences.
fig, axes = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(18, 6))
diff_cols = [name for name in df.columns if name.startswith("Diff")]
for i, col in enumerate(diff_cols):
    panel = axes[i]
    panel.hist(df[col], cumulative=True)
    panel.axvline(0., c="r");
fig.suptitle("$2011-2012$价差分析", fontsize=15);
fig.subplots_adjust(wspace=0.05);
# Per "Diff..." column: the share of rows whose difference is negative (True)
# versus non-negative (False). In the sample rows shown above, a negative
# difference occurs when the bid is higher than the showcase price.
( df. filter ( like= "Diff" ) < 0 ) . apply ( lambda x: x. value_counts( normalize= True ) )
Difference 1 Difference 2 False 0.753994 0.709265 True 0.246006 0.290735
选手一对商品真实价格分析
选手一根据商品信息猜测商品价格为20000元,下面结合历史信息,推断商品真实价格的后验分布。
# `import scipy as sp` alone does not guarantee that the `stats` submodule is
# loaded; import it explicitly so `sp.stats` does not depend on another
# library having imported it first.
import scipy.stats

# Kernel density estimate of the historical "Showcase 1" prices, evaluated
# and plotted on a grid of 101 candidate prices between 0 and 75000.
kde = sp.stats.gaussian_kde(df["Showcase 1"])
h = np.linspace(0, 75000, 101)
plt.plot(h, kde.evaluate(h))
[<matplotlib.lines.Line2D at 0x1e2396cb7b8>]
h[ kde. evaluate( h) . argmax( ) ]
27750.0
# Prior: KDE density of the showcase price at every grid point.
p_h = kde.evaluate(h)
# Likelihood of a guess of 20000 for every candidate price: the guessing
# error (price - 20000) under a zero-mean normal whose standard deviation is
# that of the "Difference 1" column.
guess_error = h - 20000
error_sigma = df["Difference 1"].std()
p_dh = sp.stats.norm.pdf(guess_error, loc=0, scale=error_sigma)
def normlize(z):
    """Scale ``z`` so that its entries sum to 1.

    Parameters
    ----------
    z : array-like
        Unnormalized weights, e.g. prior times likelihood on a grid.

    Returns
    -------
    ``z`` divided by the sum of all its entries.

    Raises
    ------
    ValueError
        If the entries sum to zero. The division would otherwise yield
        NaN/inf values that silently propagate into later ``argmax`` and
        expectation computations.
    """
    total = np.sum(z)
    # Only guard the scalar case; non-scalar totals keep the original behavior.
    if np.ndim(total) == 0 and total == 0:
        raise ValueError("cannot normalize: the weights sum to zero")
    return z / total
# Posterior over the price grid: likelihood times prior, rescaled to sum to 1.
p_hd = normlize(p_dh * p_h)
# The prior is rescaled the same way so both curves share one scale.
prior_on_grid = normlize(kde.evaluate(h))
plt.plot(h, p_hd, h, prior_on_grid)
plt.legend(["后验分布", "先验分布"])
plt.title("选手一预测价格为20000时,结合历史展品信息与误差分布的后验分布");
h[ p_hd. argmax( ) ]
24000.0
np. sum ( p_hd * h)
25103.012921487145
最优报价分析
plt. plot( h, p_hd)
[<matplotlib.lines.Line2D at 0x1e2399a3048>]
def cdf(x):
    """Empirical CDF of the "Difference 2" column evaluated at ``x``.

    Returns the fraction of rows in ``df`` whose "Difference 2" value is
    less than or equal to ``x``.

    The comparison is inclusive, as a CDF requires. With a strict ``<``,
    ``cdf(-1)`` would not count a difference of exactly -1, and
    ``1 - cdf(diff)`` would count a tie at ``diff`` as strictly larger.
    """
    return np.sum(df["Difference 2"] <= x) / len(df)
# Expected gain for every candidate bid on the grid `h`, averaged over the
# posterior `p_hd` of the price; one entry of `return_s` per bid.
return_s = []
# cdf(-1) does not depend on the bid or the price, so evaluate it once
# instead of once per (bid, price) pair.
probwin_1 = cdf(-1)
for bid in h:
    Er = 0.
    for price, prob in zip(h, p_hd):
        diff = price - bid
        if diff < 0:
            # Bid above the price: contributes nothing to the expected gain.
            continue
        # Chance of winning: cdf(-1) plus the share of "Difference 2" values
        # beyond this bid's own difference.
        probwin_2 = 1 - cdf(diff)
        probwin = probwin_1 + probwin_2
        if diff <= 250:
            # The payoff is doubled when the bid is within 250 of the price.
            Er += 2 * price * prob * probwin
        else:
            Er += price * prob * probwin
    return_s.append(Er)
plt. plot( h, return_s)
[<matplotlib.lines.Line2D at 0x1e239e44dd8>]
# Best (expected gain, bid) pair: tuples compare on the gain first, so this
# picks the bid with the highest expected gain (ties go to the larger bid).
max ( zip ( return_s, h) )
(16666.56998160178, 21000.0)