利用Python进行数据分析的学习笔记——chap9

数据聚合与分组运算

GroupBy技术

import numpy as np
import pandas as pd
from pandas import DataFrame,Series
df = DataFrame({'key1':['a','a','b','b','a'],
               'key2':['one','two','one','two','one'],
               'data1':np.random.randn(5),
               'data2':np.random.randn(5)})
df
key1key2data1data2
0aone-0.074122-0.571432
1atwo0.347874-0.794645
2bone0.399766-0.596056
3btwo1.209857-0.266257
4aone-0.0011750.180895
#根据key1进行分组,并计算data1列的平均值。
grouped = df['data1'].groupby(df['key1'])
grouped
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001CCD8450910>
grouped.mean()
key1
a    0.090859
b    0.804812
Name: data1, dtype: float64
means = df['data1'].groupby([df['key1'],df['key2']]).mean()
means
key1  key2
a     one    -0.037649
      two     0.347874
b     one     0.399766
      two     1.209857
Name: data1, dtype: float64
means.unstack()
key2onetwo
key1
a-0.0376490.347874
b0.3997661.209857
states = np.array(['Ohio','California','California','Ohio','Ohio'])
years = np.array([2005,2005,2006,2005,2006])
df['data1'].groupby([states,years]).mean()
California  2005    0.347874
            2006    0.399766
Ohio        2005    0.567867
            2006   -0.001175
Name: data1, dtype: float64
df.groupby('key1').mean()
data1data2
key1
a0.090859-0.395061
b0.804812-0.431157
df.groupby(['key1','key2']).mean()
data1data2
key1key2
aone-0.037649-0.195268
two0.347874-0.794645
bone0.399766-0.596056
two1.209857-0.266257
#GroupBy的size方法,可以返回一个含有分组大小的Series。目前,分组键中的任何缺失值都会被排除在结果之外。
df.groupby(['key1','key2']).size()
key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

对分组进行迭代

for name,group in df.groupby('key1'):
    print(name)
    print(group)
a
  key1 key2     data1     data2
0    a  one -0.074122 -0.571432
1    a  two  0.347874 -0.794645
4    a  one -0.001175  0.180895
b
  key1 key2     data1     data2
2    b  one  0.399766 -0.596056
3    b  two  1.209857 -0.266257
for (k1,k2),group in df.groupby(['key1','key2']):
    print(k1,k2)
    print(group)
a one
  key1 key2     data1     data2
0    a  one -0.074122 -0.571432
4    a  one -0.001175  0.180895
a two
  key1 key2     data1     data2
1    a  two  0.347874 -0.794645
b one
  key1 key2     data1     data2
2    b  one  0.399766 -0.596056
b two
  key1 key2     data1     data2
3    b  two  1.209857 -0.266257
pieces = dict(list(df.groupby('key1')))
pieces['b']
key1key2data1data2
2bone0.399766-0.596056
3btwo1.209857-0.266257
df.dtypes
key1      object
key2      object
data1    float64
data2    float64
dtype: object
grouped = df.groupby(df.dtypes,axis=1)
dict(list(grouped))
{dtype('float64'):       data1     data2
 0 -0.074122 -0.571432
 1  0.347874 -0.794645
 2  0.399766 -0.596056
 3  1.209857 -0.266257
 4 -0.001175  0.180895,
 dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

选取一个或一组列

df.groupby('key1')['data1']
df.groupby('key1')[['data2']]
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001CCCF3F55A0>
df.groupby(['key1','key2'])[['data2']].mean()
data2
key1key2
aone-0.195268
two-0.794645
bone-0.596056
two-0.266257
s_grouped = df.groupby(['key1','key2'])['data2']
s_grouped
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001CCD8452DA0>
s_grouped.mean()
key1  key2
a     one    -0.195268
      two    -0.794645
b     one    -0.596056
      two    -0.266257
Name: data2, dtype: float64

通过字典或Series进行分组

people = DataFrame(np.random.randn(5,5),
                  columns=['a','b','c','d','e'],
                  index=['Joe','Steve','Wes','Jim','Travis'])
people.loc[2:3,['b','c']] = np.nan#添加几个NA值
people
C:\windows\ FutureWarning: Slicing a positional slice with .loc is not supported, and will raise TypeError in a future version.  Use .loc with labels or .iloc with positions instead.
  people.loc[2:3,['b','c']] = np.nan#添加几个NA值
abcde
Joe0.3093271.6581071.146959-0.1234710.159285
Steve1.380735-0.7032450.158134-1.6029581.455772
Wes-0.766580NaNNaN0.0744621.430541
Jim-0.6156662.578830-0.0027660.885567-0.375239
Travis-0.0335341.1581130.6373271.4735470.373215
mapping = {'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}
by_column = people.groupby(mapping,axis=1)
by_column.sum()
bluered
Joe1.0234882.126719
Steve-1.4448242.133263
Wes0.0744620.663960
Jim0.8828001.587925
Travis2.1108741.497794
map_series = Series(mapping)
map_series
a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object
people.groupby(map_series,axis=1).count()
bluered
Joe23
Steve23
Wes12
Jim23
Travis23

通过函数进行分组

people.groupby(len).sum()
abcde
3-1.0729204.2369371.1441930.8365581.214587
51.380735-0.7032450.158134-1.6029581.455772
6-0.0335341.1581130.6373271.4735470.373215
key_list = ['one','one','one','two','two']
people.groupby([len,key_list]).min()
abcde
3one-0.7665801.6581071.146959-0.1234710.159285
two-0.6156662.578830-0.0027660.885567-0.375239
5one1.380735-0.7032450.158134-1.6029581.455772
6two-0.0335341.1581130.6373271.4735470.373215

根据索引级别分组

columns = pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],
                                    [1,3,5,1,3]],names=['cty','tenor'])
hier_df = DataFrame(np.random.randn(4,5),columns=columns)
hier_df
ctyUSJP
tenor13513
00.971689-0.2070270.6415281.197729-0.800907
10.906871-0.0872880.204273-0.0093740.637842
20.649755-0.800055-0.057130-1.0872000.435762
3-0.6187370.325816-0.702310-0.519860-0.101653
hier_df.groupby(level='cty',axis=1).count()
ctyJPUS
023
123
223
323

数据聚合

grouped = df.groupby('key1')
#如果传入的百分位上没有值,则quantile会进行线性插值
grouped['data1'].quantile(0.9)
key1
a    0.278064
b    1.128848
Name: data1, dtype: float64
def peak_to_peak(arr):
    return arr.max() - arr.min()
grouped.agg(peak_to_peak)
C:\windows\TFutureWarning: ['key2'] did not aggregate successfully. If any error is raised this will raise in a future version of pandas. Drop these columns/ops to avoid this warning.
  grouped.agg(peak_to_peak)
data1data2
key1
a0.4219960.975541
b0.8100900.329799
grouped.describe()
data1data2
countmeanstdmin25%50%75%maxcountmeanstdmin25%50%75%max
key1
a3.00.0908590.22555-0.074122-0.037649-0.0011750.1733490.3478743.0-0.3950610.511126-0.794645-0.683039-0.571432-0.1952680.180895
b2.00.8048120.572820.3997660.6022890.8048121.0073341.2098572.0-0.4311570.233203-0.596056-0.513606-0.431157-0.348707-0.266257
#有个知识点
import matplotlib.pyplot as plt
from pylab import *
img = plt.imread('经过优化的GroupBy的方法.png')
imshow(img)

在这里插入图片描述

tips = pd.read_csv("E:\\python_study_files\\python\\pydata-book-2nd-edition\\examples\\tips.csv")
#添加“小费占总额百分比”的列
tips['tip_pct'] = tips['tip']/tips['total_bill']
tips[:6]
total_billtipsmokerdaytimesizetip_pct
016.991.01NoSunDinner20.059447
110.341.66NoSunDinner30.160542
221.013.50NoSunDinner30.166587
323.683.31NoSunDinner20.139780
424.593.61NoSunDinner40.146808
525.294.71NoSunDinner40.186240

面向列的多函数应用

grouped = tips.groupby(['day','smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64
grouped_pct.agg(['mean','std',peak_to_peak])
meanstdpeak_to_peak
daysmoker
FriNo0.1516500.0281230.067349
Yes0.1747830.0512930.159925
SatNo0.1580480.0397670.235193
Yes0.1479060.0613750.290095
SunNo0.1601130.0423470.193226
Yes0.1872500.1541340.644685
ThurNo0.1602980.0387740.193350
Yes0.1638630.0393890.151240
#由(name,function)组成的列表,第一个元素会被用作列名
grouped_pct.agg([('foo','mean'),('bar',np.std)])
foobar
daysmoker
FriNo0.1516500.028123
Yes0.1747830.051293
SatNo0.1580480.039767
Yes0.1479060.061375
SunNo0.1601130.042347
Yes0.1872500.154134
ThurNo0.1602980.038774
Yes0.1638630.039389
functions = ['count','mean','max']
result = grouped['tip_pct','total_bill'].agg(functions)
result
C:\windowFutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
  result = grouped['tip_pct','total_bill'].agg(functions)
tip_pcttotal_bill
countmeanmaxcountmeanmax
daysmoker
FriNo40.1516500.187735418.42000022.75
Yes150.1747830.2634801516.81333340.17
SatNo450.1580480.2919904519.66177848.33
Yes420.1479060.3257334221.27666750.81
SunNo570.1601130.2526725720.50666748.17
Yes190.1872500.7103451924.12000045.35
ThurNo450.1602980.2663124517.11311141.19
Yes170.1638630.2412551719.19058843.11
result['tip_pct']
countmeanmax
daysmoker
FriNo40.1516500.187735
Yes150.1747830.263480
SatNo450.1580480.291990
Yes420.1479060.325733
SunNo570.1601130.252672
Yes190.1872500.710345
ThurNo450.1602980.266312
Yes170.1638630.241255
ftuples = [('Durchschnitt','mean'),('Abweichung',np.var)]
grouped['tip_pct','total_bill'].agg(ftuples)
C:\windowsFutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
  grouped['tip_pct','total_bill'].agg(ftuples)
tip_pcttotal_bill
DurchschnittAbweichungDurchschnittAbweichung
daysmoker
FriNo0.1516500.00079118.42000025.596333
Yes0.1747830.00263116.81333382.562438
SatNo0.1580480.00158119.66177879.908965
Yes0.1479060.00376721.276667101.387535
SunNo0.1601130.00179320.50666766.099980
Yes0.1872500.02375724.120000109.046044
ThurNo0.1602980.00150317.11311159.625081
Yes0.1638630.00155119.19058869.808518
#对不同的列应用不同的函数
grouped.agg({'tip':np.max,'size':'sum'})
tipsize
daysmoker
FriNo3.509
Yes4.7331
SatNo9.00115
Yes10.00104
SunNo6.00167
Yes6.5049
ThurNo6.70112
Yes5.0040
grouped.agg({'tip_pct':['min','max','mean','std'],
            'size':'sum'})
tip_pctsize
minmaxmeanstdsum
daysmoker
FriNo0.1203850.1877350.1516500.0281239
Yes0.1035550.2634800.1747830.05129331
SatNo0.0567970.2919900.1580480.039767115
Yes0.0356380.3257330.1479060.061375104
SunNo0.0594470.2526720.1601130.042347167
Yes0.0656600.7103450.1872500.15413449
ThurNo0.0729610.2663120.1602980.038774112
Yes0.0900140.2412550.1638630.03938940

以“无索引”的形式返回聚合数据

tips.groupby(['day','smoker'],as_index=False).mean()
daysmokertotal_billtipsizetip_pct
0FriNo18.4200002.8125002.2500000.151650
1FriYes16.8133332.7140002.0666670.174783
2SatNo19.6617783.1028892.5555560.158048
3SatYes21.2766672.8754762.4761900.147906
4SunNo20.5066673.1678952.9298250.160113
5SunYes24.1200003.5168422.5789470.187250
6ThurNo17.1131112.6737782.4888890.160298
7ThurYes19.1905883.0300002.3529410.163863

分组级运算和转换

#添加一个用于存放各索引分组平均值的列
k1_means = df.groupby('key1').mean().add_prefix('mean_')
k1_means
mean_data1mean_data2
key1
a0.090859-0.395061
b0.804812-0.431157
pd.merge(df,k1_means,left_on='key1',right_index=True)
key1key2data1data2mean_data1mean_data2
0aone-0.074122-0.5714320.090859-0.395061
1atwo0.347874-0.7946450.090859-0.395061
4aone-0.0011750.1808950.090859-0.395061
2bone0.399766-0.5960560.804812-0.431157
3btwo1.209857-0.2662570.804812-0.431157
key = ['one','two','one','two','one']
people.groupby(key).mean()
abcde
one-0.1635961.4081100.8921430.4748460.654347
two0.3825340.9377920.077684-0.3586950.540267
#transform会将一个函数应用到各个分组
people.groupby(key).transform(np.mean)
abcde
Joe-0.1635961.4081100.8921430.4748460.654347
Steve0.3825340.9377920.077684-0.3586950.540267
Wes-0.1635961.4081100.8921430.4748460.654347
Jim0.3825340.9377920.077684-0.3586950.540267
Travis-0.1635961.4081100.8921430.4748460.654347
#从各组中减去平均值
def demean(arr):
    return arr-arr.mean()
demeaned = people.groupby(key).transform(demean)
demeaned
abcde
Joe0.4729230.2499970.254816-0.598317-0.495062
Steve0.998201-1.6410380.080450-1.2442620.915506
Wes-0.602985NaNNaN-0.4003840.776194
Jim-0.9982011.641038-0.0804501.244262-0.915506
Travis0.130062-0.249997-0.2548160.998701-0.281132
demeaned.groupby(key).mean()
abcde
one2.775558e-170.000000e+00-5.551115e-177.401487e-17-1.110223e-16
two0.000000e+001.110223e-16-6.938894e-180.000000e+00-5.551115e-17

apply:一般性的“拆分——应用——合并”

#在指定列找到最大值,然后把这个值所在的行选取出来
#将sort_index()改为sort_values()即可
def top(df,n=5,column='tip_pct'):
    return df.sort_values(by=column)[-n:]
top(tips,n=6)
total_billtipsmokerdaytimesizetip_pct
10914.314.00YesSatDinner20.279525
18323.176.50YesSunDinner40.280535
23211.613.39NoSatDinner20.291990
673.071.00YesSatDinner10.325733
1789.604.00YesSunDinner20.416667
1727.255.15YesSunDinner20.710345
tips.groupby('smoker').apply(top)
total_billtipsmokerdaytimesizetip_pct
smoker
No8824.715.85NoThurLunch20.236746
18520.695.00NoSunDinner50.241663
5110.292.60NoSunDinner20.252672
1497.512.00NoThurLunch20.266312
23211.613.39NoSatDinner20.291990
Yes10914.314.00YesSatDinner20.279525
18323.176.50YesSunDinner40.280535
673.071.00YesSatDinner10.325733
1789.604.00YesSunDinner20.416667
1727.255.15YesSunDinner20.710345
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill')
total_billtipsmokerdaytimesizetip_pct
smokerday
NoFri9422.753.25NoFriDinner20.142857
Sat21248.339.00NoSatDinner40.186220
Sun15648.175.00NoSunDinner60.103799
Thur14241.195.00NoThurLunch50.121389
YesFri9540.174.73YesFriDinner40.117750
Sat17050.8110.00YesSatDinner30.196812
Sun18245.353.50YesSunDinner30.077178
Thur19743.115.00YesThurLunch40.115982
result = tips.groupby('smoker')['tip_pct'].describe()
result
countmeanstdmin25%50%75%max
smoker
No151.00.1593280.0399100.0567970.1369060.1556250.1850140.291990
Yes93.00.1631960.0851190.0356380.1067710.1538460.1950590.710345
result.unstack('smoker')
       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

当调用describe之类的方法时,实际上只是应用了以下两条代码的快捷方式:
f = lambda x: x.describe()
grouped.apply(f)

禁止分组键

tips.groupby('smoker',group_keys=False).apply(top)
total_billtipsmokerdaytimesizetip_pct
8824.715.85NoThurLunch20.236746
18520.695.00NoSunDinner50.241663
5110.292.60NoSunDinner20.252672
1497.512.00NoThurLunch20.266312
23211.613.39NoSatDinner20.291990
10914.314.00YesSatDinner20.279525
18323.176.50YesSunDinner40.280535
673.071.00YesSatDinner10.325733
1789.604.00YesSunDinner20.416667
1727.255.15YesSunDinner20.710345

分位数和桶分析

frame = DataFrame({'data1':np.random.randn(1000),
                  'data2':np.random.randn(1000)})
factor = pd.cut(frame.data1,4)
factor[:10]
0    (-1.448, 0.107]
1    (-1.448, 0.107]
2    (-1.448, 0.107]
3    (-1.448, 0.107]
4     (0.107, 1.663]
5     (0.107, 1.663]
6     (0.107, 1.663]
7    (-1.448, 0.107]
8    (-1.448, 0.107]
9     (0.107, 1.663]
Name: data1, dtype: category
Categories (4, interval[float64, right]): [(-3.01, -1.448] < (-1.448, 0.107] < (0.107, 1.663] < (1.663, 3.218]]
def get_stats(group):
    return {'min':group.min(),'max':group.max(),
           'count':group.count(),'mean':group.mean()}
grouped = frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()
#区间大小相等
minmaxcountmean
data1
(-3.01, -1.448]-2.6149102.36804670.0-0.092146
(-1.448, 0.107]-2.5349622.783160479.00.009041
(0.107, 1.663]-3.0737712.513553398.0-0.091291
(1.663, 3.218]-2.6990802.37363453.0-0.099021
#数据点数量相等,使用qcut
#返回分位数编号
grouping = pd.qcut(frame.data1,10,labels=False)
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()
minmaxcountmean
data1
0-2.6149102.783160100.00.006906
1-2.5349622.490249100.0-0.101695
2-2.0158622.261854100.00.084059
3-2.2509662.509572100.0-0.000924
4-2.0687472.425219100.00.119523
5-2.9134922.032037100.0-0.233505
6-2.4320551.983781100.0-0.038541
7-2.3391642.046824100.0-0.096358
8-3.0737712.235941100.0-0.091584
9-2.6990802.513553100.0-0.084895

示例:用特定于分组的值填充缺失值

#用平均值填充NA值
s = Series(np.random.randn(6))
s[::2] = np.nan
s
0         NaN
1    0.209858
2         NaN
3    1.379023
4         NaN
5   -0.743300
dtype: float64
s.fillna(s.mean())
0    0.281860
1    0.209858
2    0.281860
3    1.379023
4    0.281860
5   -0.743300
dtype: float64
states = ['Ohio','New York','Vermont','Florida',
         'Oregon','Nevada','California','Idaho']
group_key = ['East']*4+['West']*4
data = Series(np.random.randn(8),index=states)
data[['Vermont','Nevada','Idaho']] = np.nan
data
Ohio          0.155978
New York     -0.133767
Vermont            NaN
Florida      -0.765162
Oregon        0.682524
Nevada             NaN
California    0.730390
Idaho              NaN
dtype: float64
data.groupby(group_key).mean()
East   -0.247650
West    0.706457
dtype: float64
fill_mean = lambda g:g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)
Ohio          0.155978
New York     -0.133767
Vermont      -0.247650
Florida      -0.765162
Oregon        0.682524
Nevada        0.706457
California    0.730390
Idaho         0.706457
dtype: float64
fill_values = {'East':0.5,'West':-1}
fill_func = lambda g:g.fillna(fill_values[g.name])
data.groupby(group_key).apply(fill_func)
Ohio          0.155978
New York     -0.133767
Vermont       0.500000
Florida      -0.765162
Oregon        0.682524
Nevada       -1.000000
California    0.730390
Idaho        -1.000000
dtype: float64

示例:随机采样和排列

抽取的一个办法:选取np.random.permutation(N)的前K个元素,其中N为完整数据的大小,K为期望的样本大小。

#红桃(Hearts)、黑桃(Spades)、梅花(Clubs)、方片(Diamonds)
suits = ['H','S','C','D']
#python2中,range()返回的是list,可以将两个range()直接相加,如range(5)+range(10) ;python3中,range()成了一个class
card_val = (list(range(1,11))+ [10] * 3) * 4
base_names = ['A'] + list(range(2,11)) + ['J','K','Q']
cards = []
for suit in ['H','S','C','D']:
    cards.extend(str(num) + suit for num in base_names)
deck = Series(card_val,index=cards)
deck[:13]
AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
dtype: int64
def draw(deck,n=5):
    return deck.take(np.random.permutation(len(deck))[:n])
draw(deck)
7H     7
4D     4
8H     8
QC    10
4S     4
dtype: int64
#从每种花色中随机抽取两张牌
get_suit = lambda card:card[-1]#只要最后一个字母
deck.groupby(get_suit).apply(draw,n=2)
C  AC      1
   JC     10
D  5D      5
   8D      8
H  10H    10
   JH     10
S  9S      9
   5S      5
dtype: int64
#另一种办法
deck.groupby(get_suit,group_keys=False).apply(draw,n=2)
10C    10
AC      1
KD     10
10D    10
3H      3
9H      9
5S      5
8S      8
dtype: int64

示例:分组加权平均数和相关系数

df = DataFrame({'category':['a','a','a','a','b','b','b','b'],
               'data':np.random.randn(8),
               'weights':np.random.randn(8)})
df
categorydataweights
0a0.591317-1.032939
1a-0.5896920.436704
2a-0.1288482.257153
3a-0.7746260.811910
4b-2.0506791.144802
5b1.2161110.736471
6b-0.8013660.139008
7b-1.577430-0.576198
grouped = df.groupby('category')
get_wavg = lambda g: np.average(g['data'],weights=g['weights'])
grouped.apply(get_wavg)
category
a   -0.723088
b   -0.453212
dtype: float64
close_px = pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\stock_px.csv",parse_dates=True,index_col=0)
close_px
AAAAPLGEIBMJNJMSFTPEPSPXXOM
1990-02-014.987.862.8716.794.270.516.04328.796.12
1990-02-025.048.002.8716.894.370.516.09330.926.24
1990-02-055.078.182.8717.324.340.516.05331.856.25
1990-02-065.018.122.8817.564.320.516.15329.666.23
1990-02-075.047.772.9117.934.380.516.17333.756.33
..............................
2011-10-1010.09388.8116.14186.6264.4326.9461.871194.8976.28
2011-10-1110.30400.2916.14185.0063.9627.0060.951195.5476.27
2011-10-1210.05402.1916.40186.1264.3326.9662.701207.2577.16
2011-10-1310.10408.4316.22186.8264.2327.1862.361203.6676.37
2011-10-1410.26422.0016.60190.5364.7227.2762.241224.5878.11

5472 rows × 9 columns

close_px[-4:]
AAAAPLGEIBMJNJMSFTPEPSPXXOM
2011-10-1110.30400.2916.14185.0063.9627.0060.951195.5476.27
2011-10-1210.05402.1916.40186.1264.3326.9662.701207.2577.16
2011-10-1310.10408.4316.22186.8264.2327.1862.361203.6676.37
2011-10-1410.26422.0016.60190.5364.7227.2762.241224.5878.11
#计算日收益率与SPX之间的年度相关系数
rets = close_px.pct_change().dropna()
spx_corr = lambda x:x.corrwith(x['SPX'])
by_year = rets.groupby(lambda x:x.year)
by_year.apply(spx_corr)
AAAAPLGEIBMJNJMSFTPEPSPXXOM
19900.5950240.5450670.7521870.7383610.8011450.5866910.7831681.00.517586
19910.4535740.3653150.7596070.5570460.6464010.5242250.6417751.00.569335
19920.3981800.4987320.6326850.2622320.5157400.4923450.4738711.00.318408
19930.2590690.2385780.4472570.2112690.4515030.4253770.3850891.00.318952
19940.4285490.2684200.5729960.3851620.3729620.4365850.4505161.00.395078
19950.2915320.1618290.5191260.4163900.3157330.4536600.4131441.00.368752
19960.2923440.1914820.7507240.3884970.5692320.5640150.4214771.00.538736
19970.5644270.2114350.8275120.6468230.7035380.6061710.5093441.00.695653
19980.5338020.3798830.8152430.6239820.5919880.6987730.4942131.00.369264
19990.0990330.4255840.7109280.4861670.5170610.6313150.3365931.00.315383
20000.2653590.4401610.6103620.4451140.1897650.5380050.0775251.00.084163
20010.6240690.5771520.7946320.6960380.1114930.6964470.1339751.00.336869
20020.7480210.5805480.8223730.7164900.5847580.7847280.4872111.00.759933
20030.6904660.5455820.7776430.7417750.5623990.7505340.5414871.00.662775
20040.5914850.3742830.7286260.6017400.3546900.5885310.4668541.00.557742
20050.5642670.4675400.6756370.5168460.4447280.5623740.4895591.00.631010
20060.4876380.4282670.6123880.5986360.3940260.4061260.3350541.00.518514
20070.6424270.5081180.7969450.6039060.5684230.6587700.6519111.00.786264
20080.7810570.6814340.7773370.8330740.8010050.8046260.7092641.00.828303
20090.7356420.7071030.7130860.6845130.6031460.6549020.5414741.00.797921
20100.7457000.7101050.8222850.7836380.6898960.7301180.6266551.00.839057
20110.8820450.6919310.8645950.8027300.7523790.8009960.5920291.00.859975
#苹果和微软的年度相关系数
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))
1990    0.408271
1991    0.266807
1992    0.450592
1993    0.236917
1994    0.361638
1995    0.258642
1996    0.147539
1997    0.196144
1998    0.364106
1999    0.329484
2000    0.275298
2001    0.563156
2002    0.571435
2003    0.486262
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

示例:面向分组的线性回归

import statsmodels.api as sm
def regress(data,yvar,xvars):
    Y = data[yvar]
    X = data[xvars]
    X['intercept'] = 1.
    result = sm.OLS(Y,X).fit()
    return result.params
by_year.apply(regress,'AAPL',['SPX'])
E:\python_study_files\python_pip\.venvs\lpthw\lib\site-packages\statsmodels\compat\pandas.py:65: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
  from pandas import Int64Index as NumericIndex
SPXintercept
19901.5127720.001395
19911.1873510.000396
19921.8324270.000164
19931.390470-0.002657
19941.1902770.001617
19950.858818-0.001423
19960.829389-0.001791
19970.749928-0.001901
19981.1645820.004075
19991.3849890.003273
20001.733802-0.002523
20011.6761280.003122
20021.080795-0.000219
20031.1877700.000690
20041.3634630.004201
20051.7664150.003246
20061.6454960.000080
20071.1987610.003438
20080.968016-0.001110
20090.8791030.002954
20101.0526080.001261
20110.8066050.001514

透视表和交叉表

tips.pivot_table(index=['day','smoker'])
sizetiptip_pcttotal_bill
daysmoker
FriNo2.2500002.8125000.15165018.420000
Yes2.0666672.7140000.17478316.813333
SatNo2.5555563.1028890.15804819.661778
Yes2.4761902.8754760.14790621.276667
SunNo2.9298253.1678950.16011320.506667
Yes2.5789473.5168420.18725024.120000
ThurNo2.4888892.6737780.16029817.113111
Yes2.3529413.0300000.16386319.190588
tips.pivot_table(['tip_pct','size'],index=['time','day'],columns='smoker')
sizetip_pct
smokerNoYesNoYes
timeday
DinnerFri2.0000002.2222220.1396220.165347
Sat2.5555562.4761900.1580480.147906
Sun2.9298252.5789470.1601130.187250
Thur2.000000NaN0.159744NaN
LunchFri3.0000001.8333330.1877350.188937
Thur2.5000002.3529410.1603110.163863
tips.pivot_table(['tip_pct','size'],index=['time','day'],
                columns='smoker',margins=True)
sizetip_pct
smokerNoYesAllNoYesAll
timeday
DinnerFri2.0000002.2222222.1666670.1396220.1653470.158916
Sat2.5555562.4761902.5172410.1580480.1479060.153152
Sun2.9298252.5789472.8421050.1601130.1872500.166897
Thur2.000000NaN2.0000000.159744NaN0.159744
LunchFri3.0000001.8333332.0000000.1877350.1889370.188765
Thur2.5000002.3529412.4590160.1603110.1638630.161301
All2.6688742.4086022.5696720.1593280.1631960.160803
tips.pivot_table('tip_pct',index=['time','smoker'],columns='day',
                aggfunc=len,margins=True)
dayFriSatSunThurAll
timesmoker
DinnerNo3.045.057.01.0106
Yes9.042.019.0NaN70
LunchNo1.0NaNNaN44.045
Yes6.0NaNNaN17.023
All19.087.076.062.0244
tips.pivot_table('size',index=['time','smoker'],
                columns='day',aggfunc='sum',fill_value=0)
dayFriSatSunThur
timesmoker
DinnerNo61151672
Yes20104490
LunchNo300110
Yes110040
#有个知识点
import matplotlib.pyplot as plt
from pylab import *
img = plt.imread('pivot_table的参数.png')
imshow(img)

在这里插入图片描述

rows改为index,cols改为columns

交叉表:crosstab

pd.crosstab([tips.time,tips.day],tips.smoker,margins=True)
smokerNoYesAll
timeday
DinnerFri3912
Sat454287
Sun571976
Thur101
LunchFri167
Thur441761
All15193244

示例:2012联邦选举委员会数据库

fec =pd.read_csv("E:\\python_study_files\\python\\pydata-book-2nd-edition\\datasets\\fec\\P00000001-ALL.csv")
fec
C:\windowsDtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.
  fec =pd.read_csv("E:\\python_study_files\\python\\pydata-book-2nd-edition\\datasets\\fec\\P00000001-ALL.csv")
cmte_idcand_idcand_nmcontbr_nmcontbr_citycontbr_stcontbr_zipcontbr_employercontbr_occupationcontb_receipt_amtcontb_receipt_dtreceipt_descmemo_cdmemo_textform_tpfile_num
0C00410118P20002978Bachmann, MichelleHARVEY, WILLIAMMOBILEAL366010290.0RETIREDRETIRED250.020-JUN-11NaNNaNNaNSA17A736166
1C00410118P20002978Bachmann, MichelleHARVEY, WILLIAMMOBILEAL366010290.0RETIREDRETIRED50.023-JUN-11NaNNaNNaNSA17A736166
2C00410118P20002978Bachmann, MichelleSMITH, LANIERLANETTAL368633403.0INFORMATION REQUESTEDINFORMATION REQUESTED250.005-JUL-11NaNNaNNaNSA17A749073
3C00410118P20002978Bachmann, MichelleBLEVINS, DARONDAPIGGOTTAR724548253.0NONERETIRED250.001-AUG-11NaNNaNNaNSA17A749073
4C00410118P20002978Bachmann, MichelleWARDENBURG, HAROLDHOT SPRINGS NATIONAR719016467.0NONERETIRED300.020-JUN-11NaNNaNNaNSA17A736166
...................................................
1001726C00500587P20003281Perry, RickGORMAN, CHRIS D. MR.INFO REQUESTEDXX99999INFORMATION REQUESTED PER BEST EFFORTSINFORMATION REQUESTED PER BEST EFFORTS5000.029-SEP-11REATTRIBUTION / REDESIGNATION REQUESTED (AUTOM...NaNREATTRIBUTION / REDESIGNATION REQUESTED (AUTOM...SA17A751678
1001727C00500587P20003281Perry, RickDUFFY, DAVID A. MR.INFO REQUESTEDXX99999DUFFY EQUIPMENT COMPANY INC.BUSINESS OWNER2500.030-SEP-11NaNNaNNaNSA17A751678
1001728C00500587P20003281Perry, RickGRANE, BRYAN F. MR.INFO REQUESTEDXX99999INFORMATION REQUESTED PER BEST EFFORTSINFORMATION REQUESTED PER BEST EFFORTS500.029-SEP-11NaNNaNNaNSA17A751678
1001729C00500587P20003281Perry, RickTOLBERT, DARYL MR.INFO REQUESTEDXX99999T.A.C.C.LONGWALL MAINTENANCE FOREMAN500.030-SEP-11NaNNaNNaNSA17A751678
1001730C00500587P20003281Perry, RickANDERSON, MARILEE MRS.INFO REQUESTEDXX99999INFORMATION REQUESTED PER BEST EFFORTSINFORMATION REQUESTED PER BEST EFFORTS2500.031-AUG-11NaNNaNNaNSA17A751678

1001731 rows × 16 columns

fec.loc[123456]
cmte_id                             C00431445
cand_id                             P80003338
cand_nm                         Obama, Barack
contbr_nm                         ELLMAN, IRA
contbr_city                             TEMPE
contbr_st                                  AZ
contbr_zip                          852816719
contbr_employer      ARIZONA STATE UNIVERSITY
contbr_occupation                   PROFESSOR
contb_receipt_amt                        50.0
contb_receipt_dt                    01-DEC-11
receipt_desc                              NaN
memo_cd                                   NaN
memo_text                                 NaN
form_tp                                 SA17A
file_num                               772372
Name: 123456, dtype: object
unique_cands = fec.cand_nm.unique()
unique_cands
array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',
       "Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy',
       'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick',
       'Cain, Herman', 'Gingrich, Newt', 'McCotter, Thaddeus G',
       'Huntsman, Jon', 'Perry, Rick'], dtype=object)
unique_cands[2]
'Obama, Barack'
parties = {'Bachmann, Michelle':'Republican',
          'Cain, Herman':'Republican',
          'Gingrich, Newt':'Republican',
          'Huntsman, Jon':'Republican',
          'Johnson, Gary Earl':'Republican',
          'McCotter, Thaddeus G':'Republican',
          'Obama, Barack':'Democrat',
          'Paul, Ron':'Republican',
          'Pawlenty, Timothy':'Republican',
          'Perry, Rick':'Republican',
          "Roemer, Charles E. 'Buddy' Ⅲ":'Republican',
          'Romney, Mitt':'Republican',
          'Santorum, Rick':'Republican'}
fec.cand_nm[123456:123461]
125611    Obama, Barack
125612    Obama, Barack
125613    Obama, Barack
125614    Obama, Barack
125615    Obama, Barack
Name: cand_nm, dtype: object
fec.cand_nm[123456:123461].map(parties)
125611    Democrat
125612    Democrat
125613    Democrat
125614    Democrat
125615    Democrat
Name: cand_nm, dtype: object
#添加一个新列
fec['party'] = fec.cand_nm.map(parties)
fec['party'].value_counts()
Democrat      589127
Republican    396504
Name: party, dtype: int64
(fec.contb_receipt_amt>0).value_counts()
True    991475
Name: contb_receipt_amt, dtype: int64
fec = fec[fec.contb_receipt_amt>0]
fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack','Romney, Mitt'])]

雇主职业和雇主统计赞助信息

fec.contbr_occupation.value_counts()[:10]
RETIRED         233990
NOT PROVIDED     56245
ATTORNEY         34286
HOMEMAKER        29931
PHYSICIAN        23432
ENGINEER         14334
TEACHER          13990
CONSULTANT       13273
PROFESSOR        12555
NOT EMPLOYED      9828
Name: contbr_occupation, dtype: int64
occ_mapping = {'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
              'INFORMATION REQUESTED':'NOT PROVIDED',
              'INFORMATIO REQUESTED (BEST EFFORTS)':'NOT PROVIDED',
              'C.E.O':'CEO'}
#如果没有提供相关映射,则返回X
f = lambda x:occ_mapping.get(x,x)
fec.contbr_occupation = fec.contbr_occupation.map(f)
emp_mapping = {'INFORMATION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
              'INFORMATION REQUESTED':'NOT PROVIDED',
               'SELF':'SELF-EMPLOYED',
               'SELF EMPLOYED':'SELF-EMPLOTED'
}
f = lambda x:emp_mapping.get(x,x)
fec.contbr_employer = fec.contbr_employer.map(f)
by_occupation = fec.pivot_table('contb_receipt_amt',index='contbr_occupation',columns='party',aggfunc='sum')
over_2mm = by_occupation[by_occupation.sum(1)>2000000]
over_2mm
partyDemocratRepublican
contbr_occupation
ATTORNEY11141982.977462058.31
C.E.O.1690.002592983.11
CEO2074284.791638668.41
CONSULTANT2459912.712538990.45
ENGINEER951525.551811937.30
EXECUTIVE1355161.054136400.09
HOMEMAKER4248875.8013625600.78
INVESTOR884133.002431258.92
LAWYER3160478.87391124.32
MANAGER762883.221441092.37
NOT PROVIDED4866973.9620216287.01
OWNER1001567.362406081.92
PHYSICIAN3735124.943587195.24
PRESIDENT1878509.954717413.76
PROFESSOR2165071.08294032.73
REAL ESTATE528902.091624507.25
RETIRED25305116.3823481023.18
SELF-EMPLOYED672393.401636774.54
over_2mm.plot(kind='barh')

在这里插入图片描述

def get_top_amounts(group,key,n=5):
    totals = group.groupby(key)['contb_receipt_amt'].sum()
    #根据key对totals进行降序排列
    return totals.sort_values(ascending=False)[n:]
grouped = fec_mrbo.groupby('cand_nm')
grouped.apply(get_top_amounts,'contbr_occupation',n=7)
cand_nm        contbr_occupation                   
Obama, Barack  PROFESSOR                               2165071.08
               CEO                                     2074284.79
               PRESIDENT                               1878509.95
               NOT EMPLOYED                            1709188.20
               EXECUTIVE                               1355161.05
                                                          ...    
Romney, Mitt   INDEPENDENT PROFESSIONAL                      3.00
               IFC CONTRACTING SOLUTIONS                     3.00
               REMODELER & SEMI RETIRED                      3.00
               AFFORDABLE REAL ESTATE DEVELOPER              3.00
               3RD GENERATION FAMILY BUSINESS OWNER          3.00
Name: contb_receipt_amt, Length: 35973, dtype: float64
grouped.apply(get_top_amounts,'contbr_employer',n=10)
cand_nm        contbr_employer         
Obama, Barack  REFUSED                     149516.07
               DLA PIPER                   148235.00
               HARVARD UNIVERSITY          131368.94
               IBM                         128490.93
               GOOGLE                      125302.88
                                             ...    
Romney, Mitt   UN                               3.00
               UPTOWN CHEAPSKATE                3.00
               WILL MERRIFIELD                  3.00
               INDEPENDENT PROFESSIONAL         3.00
               HONOLD COMMUNICTAIONS            3.00
Name: contb_receipt_amt, Length: 95890, dtype: float64

对出资额分组

bins = np.array([0,1,10,100,1000,10000,100000,1000000,10000000])
labels = pd.cut(fec_mrbo.contb_receipt_amt,bins)
labels
411         (10, 100]
412       (100, 1000]
413       (100, 1000]
414         (10, 100]
415         (10, 100]
             ...     
701381      (10, 100]
701382    (100, 1000]
701383        (1, 10]
701384      (10, 100]
701385    (100, 1000]
Name: contb_receipt_amt, Length: 694282, dtype: category
Categories (8, interval[int64, right]): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]]
grouped = fec_mrbo.groupby(['cand_nm',labels])
grouped.size().unstack(0)
cand_nmObama, BarackRomney, Mitt
contb_receipt_amt
(0, 1]49377
(1, 10]400703681
(10, 100]37228031853
(100, 1000]15399143357
(1000, 10000]2228426186
(10000, 100000]21
(100000, 1000000]30
(1000000, 10000000]40
bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)
bucket_sums
cand_nmObama, BarackRomney, Mitt
contb_receipt_amt
(0, 1]318.2477.00
(1, 10]337267.6229819.66
(10, 100]20288981.411987783.76
(100, 1000]54798531.4622363381.69
(1000, 10000]51753705.6763942145.42
(10000, 100000]59100.0012700.00
(100000, 1000000]1490683.080.00
(1000000, 10000000]7148839.760.00
normed_sums = bucket_sums.div(bucket_sums.sum(axis=1),axis=0)
normed_sums
cand_nmObama, BarackRomney, Mitt
contb_receipt_amt
(0, 1]0.8051820.194818
(1, 10]0.9187670.081233
(10, 100]0.9107690.089231
(100, 1000]0.7101760.289824
(1000, 10000]0.4473260.552674
(10000, 100000]0.8231200.176880
(100000, 1000000]1.0000000.000000
(1000000, 10000000]1.0000000.000000
normed_sums[:2].plot(kind='barh',stacked=True)

在这里插入图片描述

根据州统计赞助信息

grouped = fec_mrbo.groupby(['cand_nm','contbr_st'])
totals = grouped.contb_receipt_amt.sum().unstack(0).fillna(0)
totals = totals[totals.sum(1)>100000]
totals[:10]
cand_nmObama, BarackRomney, Mitt
contbr_st
AK281840.1586204.24
AL543123.48527303.51
AR359247.28105556.00
AZ1506476.981888436.23
CA23824984.2411237636.60
CO2132429.491506714.12
CT2068291.263499475.45
DC4373538.801025137.50
DE336669.1482712.00
FL7318178.588338458.81
percent = totals.div(totals.sum(1),axis=0)
percent[:10]
cand_nmObama, BarackRomney, Mitt
contbr_st
AK0.7657780.234222
AL0.5073900.492610
AR0.7729020.227098
AZ0.4437450.556255
CA0.6794980.320502
CO0.5859700.414030
CT0.3714760.628524
DC0.8101130.189887
DE0.8027760.197224
FL0.4674170.532583
from mpl_toolkits.basemap import Basemap, cm
import numpy as np
from matplotlib import rcParams
from matplotlib.collections import LineCollection
import matplotlib.pyplot as plt
#from shapelib import ShapeFile
import pyshp
import dbflib
---------------------------------------------------------------------------

ModuleNotFoundError                       Traceback (most recent call last)
      5 import matplotlib.pyplot as plt
      6 #from shapelib import ShapeFile
----> 7 import pyshp
      8 import dbflib


ModuleNotFoundError: No module named 'pyshp'
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值