import pandas as pd
# Load the App Store dataset; the path is relative to the working directory.
app = pd.read_csv(filepath_or_buffer='11-w1-applestore.csv')
# Print column names, dtypes and non-null counts as a first sanity check.
app.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7197 entries, 0 to 7196
Data columns (total 11 columns):
Unnamed: 0 7197 non-null int64
id 7197 non-null int64
track_name 7197 non-null object
size_bytes 7197 non-null int64
price 7197 non-null float64
rating_count_tot 7197 non-null int64
user_rating 7197 non-null float64
prime_genre 7197 non-null object
sup_devices 7197 non-null int64
ipadSc_urls 7197 non-null int64
lang 7197 non-null int64
dtypes: float64(2), int64(7), object(2)
memory usage: 618.6+ KB
# Preview the first five rows (the default for head) to see real values.
app.head(n=5)
Unnamed: 0 id track_name size_bytes price rating_count_tot user_rating prime_genre sup_devices ipadSc_urls lang 0 0 281656475 PAC-MAN Premium 100788224 3.99 21292 4.0 Games 38 5 10 1 1 281796108 Evernote - stay organized 158578688 0.00 161065 4.0 Productivity 37 5 23 2 2 281940292 WeatherBug - Local Weather, Radar, Maps, Alerts 100524032 0.00 188583 3.5 Weather 37 5 3 3 3 282614216 eBay: Best App to Buy, Sell, Save! Online Shop... 128512000 0.00 262241 4.0 Shopping 37 5 9 4 4 282935706 Bible 92774400 0.00 985920 4.5 Reference 37 5 45
# 'Unnamed: 0' mirrors the row index in the head() output, so remove it from
# the frame in place.
app.drop(columns=['Unnamed: 0'], inplace=True)
# Summary statistics for the remaining numeric columns.
app.describe()
id size_bytes price rating_count_tot user_rating sup_devices ipadSc_urls lang count 7.197000e+03 7.197000e+03 7197.000000 7.197000e+03 7197.000000 7197.000000 7197.000000 7197.000000 mean 8.631310e+08 1.991345e+08 1.726218 1.289291e+04 3.526956 37.361817 3.707100 5.434903 std 2.712368e+08 3.592069e+08 5.833006 7.573941e+04 1.517948 3.737715 1.986005 7.919593 min 2.816565e+08 5.898240e+05 0.000000 0.000000e+00 0.000000 9.000000 0.000000 0.000000 25% 6.000937e+08 4.692275e+07 0.000000 2.800000e+01 3.500000 37.000000 3.000000 1.000000 50% 9.781482e+08 9.715302e+07 0.000000 3.000000e+02 4.000000 37.000000 5.000000 1.000000 75% 1.082310e+09 1.819249e+08 1.990000 2.793000e+03 4.500000 38.000000 5.000000 8.000000 max 1.188376e+09 4.025970e+09 299.990000 2.974676e+06 5.000000 47.000000 5.000000 75.000000
# Convert the byte count to mebibytes (1 MiB = 1024 * 1024 bytes) so the sizes
# are easier to read.
app['size_mb'] = app['size_bytes'].div(1024 * 1024.0)
app['size_mb'].describe()
count 7197.000000
mean 189.909414
std 342.566408
min 0.562500
25% 44.749023
50% 92.652344
75% 173.497070
max 3839.463867
Name: size_mb, dtype: float64
# Flag paid apps: 1 when the price is above zero, 0 otherwise.
# A vectorized comparison replaces the per-row Python lambda; the explicit
# int64 cast keeps the column dtype identical on every platform.
app['paid'] = (app['price'] > 0).astype('int64')
app['paid'].describe()
count 7197.000000
mean 0.436432
std 0.495977
min 0.000000
25% 0.000000
50% 0.000000
75% 1.000000
max 1.000000
Name: paid, dtype: float64
# Count how many apps sit at each price point, most frequent first.
app['price'].value_counts()
0.00 4056
0.99 728
2.99 683
1.99 621
4.99 394
3.99 277
6.99 166
9.99 81
5.99 52
7.99 33
14.99 21
19.99 13
8.99 9
24.99 8
13.99 6
11.99 6
29.99 6
12.99 5
15.99 4
59.99 3
17.99 3
22.99 2
23.99 2
20.99 2
27.99 2
16.99 2
49.99 2
39.99 2
74.99 1
18.99 1
34.99 1
99.99 1
299.99 1
47.99 1
21.99 1
249.99 1
Name: price, dtype: int64
# Keep apps priced at 49.99 or less; the price counts above show only a handful
# of apps beyond that. .copy() makes the subset an independent frame, so the
# column assignments that follow are unambiguous and cannot raise
# SettingWithCopyWarning.
app = app[app['price'] <= 49.99].copy()

# Left-closed price bands (right=False): [0, 2), [2, 10), [10, 300).
bins = [0, 2, 10, 300]
labels = ['<2', '<10', '<300']
app['price_new'] = pd.cut(app['price'], bins, right=False, labels=labels)
# Price summary statistics within each band.
app.groupby(['price_new'])['price'].describe()
count mean std min 25% 50% 75% max price_new <2 5405.0 0.361981 0.675318 0.00 0.00 0.00 0.00 1.99 <10 1695.0 4.565811 1.864034 2.99 2.99 3.99 4.99 9.99 <300 90.0 20.256667 8.245939 11.99 14.99 17.99 23.99 49.99
# Price summary statistics per app category.
app.groupby('prime_genre')['price'].describe()
count mean std min 25% 50% 75% max prime_genre Book 112.0 1.790536 3.342210 0.0 0.0 0.00 2.99 27.99 Business 56.0 4.136429 7.154403 0.0 0.0 2.99 4.99 49.99 Catalogs 10.0 0.799000 2.526660 0.0 0.0 0.00 0.00 7.99 Education 449.0 2.572004 2.669539 0.0 0.0 2.99 2.99 24.99 Entertainment 535.0 0.889701 1.454022 0.0 0.0 0.00 1.99 9.99 Finance 104.0 0.421154 1.108990 0.0 0.0 0.00 0.00 5.99 Food & Drink 63.0 1.552381 3.972119 0.0 0.0 0.00 1.49 27.99 Games 3862.0 1.432923 2.486609 0.0 0.0 0.00 1.99 29.99 Health & Fitness 180.0 1.916444 2.052378 0.0 0.0 1.99 2.99 9.99 Lifestyle 144.0 0.885417 1.478410 0.0 0.0 0.00 1.24 4.99 Medical 23.0 8.776087 10.788269 0.0 0.0 3.99 16.99 34.99 Music 138.0 4.835435 8.915667 0.0 0.0 0.99 4.99 49.99 Navigation 45.0 2.550000 4.487584 0.0 0.0 0.99 2.99 20.99 News 75.0 0.517733 1.127771 0.0 0.0 0.00 0.00 3.99 Photo & Video 349.0 1.473295 2.280703 0.0 0.0 0.99 1.99 22.99 Productivity 177.0 3.790113 4.965777 0.0 0.0 1.99 4.99 29.99 Reference 64.0 4.836875 8.285100 0.0 0.0 1.99 4.99 47.99 Shopping 122.0 0.016311 0.180166 0.0 0.0 0.00 0.00 1.99 Social Networking 167.0 0.339880 1.142210 0.0 0.0 0.00 0.00 9.99 Sports 114.0 0.953070 2.419084 0.0 0.0 0.00 0.99 19.99 Travel 81.0 1.120370 2.183772 0.0 0.0 0.00 0.99 9.99 Utilities 248.0 1.647621 2.628541 0.0 0.0 0.99 1.99 24.99 Weather 72.0 1.605417 1.831316 0.0 0.0 0.99 2.99 9.99
# Distribution of the total number of ratings per app.
app['rating_count_tot'].describe()
count 7.190000e+03
mean 1.290515e+04
std 7.577526e+04
min 0.000000e+00
25% 2.725000e+01
50% 3.005000e+02
75% 2.796750e+03
max 2.974676e+06
Name: rating_count_tot, dtype: float64
# Left-closed rating-count bands (right=False):
# [0, 1000), [1000, 5000), [5000, 100000), [100000, 5000000).
bins = [0, 1000, 5000, 100000, 5000000]
app['rating_new'] = pd.cut(x=app['rating_count_tot'], bins=bins, right=False)
# Compare price statistics across the rating-count bands.
app.groupby('rating_new')['price'].describe()
count mean std min 25% 50% 75% max rating_new [0, 1000) 4587.0 1.798696 3.324682 0.0 0.0 0.0 2.99 49.99 [1000, 5000) 1193.0 1.740721 3.203853 0.0 0.0 0.0 2.99 39.99 [5000, 100000) 1192.0 0.963549 1.984895 0.0 0.0 0.0 0.99 14.99 [100000, 5000000) 218.0 0.196376 0.925160 0.0 0.0 0.0 0.00 7.99
import matplotlib. pyplot as plt
import seaborn as sns
% matplotlib inline
# Line plot of user rating across app categories.
# relplot is a figure-level function: it creates its own figure, so a preceding
# plt.figure(figsize=...) only leaves an empty figure behind and does not
# resize the plot. Size the grid through height/aspect instead
# (20 in tall, 1.5 * 20 = 30 in wide, i.e. the intended 30 x 20 figure).
sns.relplot(x="prime_genre", y="user_rating", kind='line',
            data=app, height=20, aspect=1.5)
<seaborn.axisgrid.FacetGrid at 0x197e75416d8>
<Figure size 2160x1440 with 0 Axes>
# Restrict to apps priced at 9.99 or less and plot the price distribution.
app1 = app.loc[app['price'] <= 9.99]
sns.distplot(a=app1['price'])
<matplotlib.axes._subplots.AxesSubplot at 0x197e7afd4e0>
# Box plot of price per category, restricted to paid apps (paid == 1).
plt.figure(figsize=(10, 8))
sns.boxplot(data=app.loc[app['paid'] == 1], x='price', y='prime_genre')
<matplotlib.axes._subplots.AxesSubplot at 0x197e7905320>
# The five categories with the most apps according to the per-genre table.
top5 = ['Games', 'Entertainment', 'Education', 'Photo & Video',
        'Utilities']
app5 = app[app.prime_genre.isin(top5)]
plt.figure(figsize=(10, 8))
# Build the paid filter from app5 itself. A mask taken from the full `app`
# frame has a different index, which makes pandas reindex it and emit
# "Boolean Series key will be reindexed to match DataFrame index".
sns.boxplot(x='price', y='prime_genre', data=app5[app5['paid'] == 1])
C:\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
after removing the cwd from sys.path.
<matplotlib.axes._subplots.AxesSubplot at 0x197e97f3518>
# Scatter of user rating against price for every app in the frame.
sns.scatterplot(data=app, x='price', y='user_rating')
<matplotlib.axes._subplots.AxesSubplot at 0x197e81ae710>
# Bar plot of user rating per genre, split by free/paid, for the five listed
# genres only.
top5 = ['Games', 'Entertainment', 'Education', 'Photo & Video', 'Utilities']
app5 = app.loc[app['prime_genre'].isin(top5)]
plt.figure(figsize=(10, 8))
sns.barplot(data=app5, x='prime_genre', y='user_rating', hue='paid')
<matplotlib.axes._subplots.AxesSubplot at 0x197e7fece48>
# Horizontal count plot of apps per genre, split by free/paid. The bars follow
# the index of value_counts(), which lists genres from most to least frequent.
plt.figure(figsize=(20, 10))
sns.countplot(
    data=app,
    y='prime_genre',
    hue='paid',
    order=app['prime_genre'].value_counts().index,
)
# Enlarge the tick labels so they stay readable on the wide figure.
plt.tick_params(labelsize=20)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-1-1652bf40fd57> in <module>()
8 #使用order对数据进行排序(从大到小)
9
---> 10 plt.figure(figsize=(20,10))
11 sns.countplot(y='prime_genre',hue='paid',data=app,order=app['prime_genre'].value_counts())
12 plt.tick_params(labelsize=20) #调整文字大小
NameError: name 'plt' is not defined
# Left-closed rating bands (right=False):
# [0, 0.1) holds the 0.0 ratings (presumably apps without a rating -- confirm),
# then [0.1, 2.5), [2.5, 4.5) and [4.5, 5.1).
# The top edge must lie above 5: with an edge of exactly 5 the last band is
# [4.5, 5), which leaves every 5.0-rated app unbinned (NaN) and silently drops
# it from the grouped statistics and plots.
bins = [0, 0.1, 2.5, 4.5, 5.1]
app['rating_level'] = pd.cut(app['user_rating'], bins, right=False)
# Rating summary statistics within each band.
app.groupby(['rating_level'])['user_rating'].describe()
count mean std min 25% 50% 75% max rating_level [0.0, 0.1) 929.0 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0 [0.1, 2.5) 206.0 1.650485 0.400213 1.0 1.5 2.0 2.0 2.0 [2.5, 4.5) 2903.0 3.646056 0.467987 2.5 3.5 4.0 4.0 4.0 [4.5, 5.0) 2660.0 4.500000 0.000000 4.5 4.5 4.5 4.5 4.5
# Number of apps in each rating band, shown separately for free (0) and
# paid (1) apps.
sns.countplot(data=app, x='paid', hue='rating_level')
<matplotlib.axes._subplots.AxesSubplot at 0x197e9577f28>
# Pairwise correlation between user rating, price and size in MB.
q4 = ['user_rating', 'price', 'size_mb']
app.loc[:, q4].corr()
user_rating price size_mb user_rating 1.000000 0.073237 0.066160 price 0.073237 1.000000 0.314386 size_mb 0.066160 0.314386 1.000000
# Heatmap of the correlation matrix of the q4 columns.
sns.heatmap(data=app[q4].corr())
<matplotlib.axes._subplots.AxesSubplot at 0x197e95ce6d8>