Pandas 综合练习

import pandas as pd
import numpy as np
df = pd.read_csv('Data/Game_of_Thrones_Script.csv')
df.head()
Release DateSeasonEpisodeEpisode TitleNameSentence
02011/4/17Season 1Episode 1Winter is Comingwaymar royceWhat do you expect? They're savages. One lot s...
12011/4/17Season 1Episode 1Winter is ComingwillI've never seen wildlings do a thing like this...
22011/4/17Season 1Episode 1Winter is Comingwaymar royceHow close did you get?
32011/4/17Season 1Episode 1Winter is ComingwillClose as any man would.
42011/4/17Season 1Episode 1Winter is CominggaredWe should head back to the wall.
df['Name'].nunique()
564
df['Name'].value_counts().index[0]
'tyrion lannister'
df_words = df.assign(Words=df['Sentence'].apply(lambda x:len(x.split()))).sort_values(by='Name')
df_words.head()
Release DateSeasonEpisodeEpisode TitleNameSentenceWords
2762011/4/17Season 1Episode 1Winter is Cominga voiceIt's Maester Luwin, my lord.5
30122011/6/19Season 1Episode 10Fire and Bloodaddam marbrandls it true about Stannis and Renly?7
30172011/6/19Season 1Episode 10Fire and Bloodaddam marbrandKevan Lannister2
136102014/6/8Season 4Episode 9The Watchers on the WallaemonAnd what is it that couldn't wait until mornin...10
136142014/6/8Season 4Episode 9The Watchers on the WallaemonOh, no need. I know my way around this library...48
L_count = []
N_words = list(zip(df_words['Name'],df_words['Words']))
for i in N_words:
    if i == N_words[0]:
        L_count.append(i[1])
        last = i[0]
    else:
        L_count.append(L_count[-1]+i[1] if i[0]==last else i[1])
        last = i[0]
df_words['Count']=L_count
df_words['Name'][df_words['Count'].idxmax()]
'tyrion lannister'
df = pd.read_csv('data/Kobe_data.csv',index_col='shot_id')
df.head()
action_typecombined_shot_typegame_event_idgame_idlatloc_xloc_ylonminutes_remainingperiod...shot_made_flagshot_typeshot_zone_areashot_zone_basicshot_zone_rangeteam_idteam_namegame_datematchupopponent
shot_id
1Jump ShotJump Shot102000001233.972316772-118.1028101...NaN2PT Field GoalRight Side(R)Mid-Range16-24 ft.1610612747Los Angeles Lakers2000/10/31LAL @ PORPOR
2Jump ShotJump Shot122000001234.0443-1570-118.4268101...0.02PT Field GoalLeft Side(L)Mid-Range8-16 ft.1610612747Los Angeles Lakers2000/10/31LAL @ PORPOR
3Jump ShotJump Shot352000001233.9093-101135-118.370871...1.02PT Field GoalLeft Side Center(LC)Mid-Range16-24 ft.1610612747Los Angeles Lakers2000/10/31LAL @ PORPOR
4Jump ShotJump Shot432000001233.8693138175-118.131861...0.02PT Field GoalRight Side Center(RC)Mid-Range16-24 ft.1610612747Los Angeles Lakers2000/10/31LAL @ PORPOR
5Driving Dunk ShotDunk1552000001234.044300-118.269862...1.02PT Field GoalCenter(C)Restricted AreaLess Than 8 ft.1610612747Los Angeles Lakers2000/10/31LAL @ PORPOR

5 rows × 24 columns

pd.Series(list(zip(df['action_type'],df['combined_shot_type']))).value_counts().index[0]
('Jump Shot', 'Jump Shot')
pd.Series(list(list(zip(*(pd.Series(list(zip(df['game_id'],df['opponent'])))
                          .unique()).tolist()))[1])).value_counts().index[0]
'SAS'
df = pd.read_csv('data/UFO.csv')
df.rename(columns={'duration (seconds)':'duration'},inplace=True)
df['duration'].astype('float')
df.head()
datetimeshapedurationlatitudelongitude
010/10/1949 20:30cylinder2700.029.883056-97.941111
110/10/1949 21:00light7200.029.384210-98.581082
210/10/1955 17:00circle20.053.200000-2.916667
310/10/1956 21:00circle20.028.978333-96.645833
410/10/1960 20:00light900.021.418056-157.803611
df.query('duration > 60')['shape'].value_counts().index[0]
'light'
bins_long = np.linspace(-180,180,13).tolist()
bins_la = np.linspace(-90,90,11).tolist()
cuts_long = pd.cut(df['longitude'],bins=bins_long)
df['cuts_long'] = cuts_long
cuts_la = pd.cut(df['latitude'],bins=bins_la)
df['cuts_la'] = cuts_la
df.head()
datetimeshapedurationlatitudelongitudecuts_longcuts_la
010/10/1949 20:30cylinder2700.029.883056-97.941111(-120.0, -90.0](18.0, 36.0]
110/10/1949 21:00light7200.029.384210-98.581082(-120.0, -90.0](18.0, 36.0]
210/10/1955 17:00circle20.053.200000-2.916667(-30.0, 0.0](36.0, 54.0]
310/10/1956 21:00circle20.028.978333-96.645833(-120.0, -90.0](18.0, 36.0]
410/10/1960 20:00light900.021.418056-157.803611(-180.0, -150.0](18.0, 36.0]
df = pd.read_csv('data/Pokemon.csv')
df.head()
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
01BulbasaurGrassPoison3184549496565451False
12IvysaurGrassPoison4056062638080601False
23VenusaurGrassPoison525808283100100801False
33VenusaurMega VenusaurGrassPoison62580100123122120801False
44CharmanderFireNaN3093952436050651False
df['Type 2'].count()/df.shape[0]
0.5175
df.query('Total >= 580')['Legendary'].value_counts(normalize=True)
True     0.575221
False    0.424779
Name: Legendary, dtype: float64
df[df['Type 1']=='Fighting'].sort_values(by='Attack',ascending=False).iloc[:3]
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
498448LucarioMega LucarioFightingSteel6257014588140701124False
594534ConkeldurrFightingNaN505105140955565455False
7468MachampFightingNaN50590130806585551False
df['range'] = df.iloc[:,5:11].max(axis=1)-df.iloc[:,5:11].min(axis=1)
attribute = df[['Type 1','range']].set_index('Type 1')
max_range = 0
result = ''
for i in attribute.index.unique():
    temp = attribute.loc[i,:].mean()
    if temp.values[0] > max_range:
        max_range = temp.values[0]
        result = i
result
'Steel'
df.query('Legendary == True')['Type 1'].value_counts(normalize=True).index[0]
'Psychic'
attribute = df.query('Legendary == True')[['Type 1','Total']].set_index('Type 1')
max_value = 0
result = ''
for i in attribute.index.unique():
    temp = float(attribute.loc[i,:].mean())
    if temp > max_value:
        max_value = temp
        result = i
result
'Normal'
df = pd.read_csv('data/Diamonds.csv')
df.head()
caratcolordepthprice
00.23E61.5326
10.21E59.8326
20.23E56.9327
30.29I62.4334
40.31J63.3335
df_r = df.query('carat>1')['price']
df_r.max()-df_r.min()
17561
bins = df['depth'].quantile(np.linspace(0,1,6)).tolist()
cuts = pd.cut(df['depth'],bins=bins) #可选label添加自定义标签
df['cuts'] = cuts
df.head()
caratcolordepthpricecuts
00.23E61.5326(60.8, 61.6]
10.21E59.8326(43.0, 60.8]
20.23E56.9327(43.0, 60.8]
30.29I62.4334(62.1, 62.7]
40.31J63.3335(62.7, 79.0]
color_result = df.groupby('cuts')['color'].describe()
color_result
countuniquetopfreq
cuts
(43.0, 60.8]112947E2259
(60.8, 61.6]118317G2593
(61.6, 62.1]104037G2247
(62.1, 62.7]101377G2193
(62.7, 79.0]102737G2000
df['均重价格']=df['price']/df['carat']
color_result['top'] == [i[1] for i in df.groupby(['cuts'
                                ,'color'])['均重价格'].mean().groupby(['cuts']).idxmax().values]
cuts
(43.0, 60.8]    False
(60.8, 61.6]    False
(61.6, 62.1]    False
(62.1, 62.7]     True
(62.7, 79.0]     True
Name: top, dtype: bool
df = df.drop(columns='均重价格')
cuts = pd.cut(df['carat'],bins=[0,0.5,1,1.5,2,np.inf]) #可选label添加自定义标签
df['cuts'] = cuts
df.head()
caratcolordepthpricecuts
00.23E61.5326(0.0, 0.5]
10.21E59.8326(0.0, 0.5]
20.23E56.9327(0.0, 0.5]
30.29I62.4334(0.0, 0.5]
40.31J63.3335(0.0, 0.5]
def f(nums):
    if not nums:        
        return 0
    res = 1                            
    cur_len = 1                        
    for i in range(1, len(nums)):      
        if nums[i-1] < nums[i]:        
            cur_len += 1                
            res = max(cur_len, res)     
        else:                       
            cur_len = 1                 
    return res
for name,group in df.groupby('cuts'):
    group = group.sort_values(by='depth')
    s = group['price']
    print(name,f(s.tolist()))
(0.0, 0.5] 8
(0.5, 1.0] 8
(1.0, 1.5] 7
(1.5, 2.0] 11
(2.0, inf] 7
for name,group in df[['carat','price','color']].groupby('color'):
    L1 = np.array([np.ones(group.shape[0]),group['carat']]).reshape(2,group.shape[0])
    L2 = group['price']
    result = (np.linalg.inv(L1.dot(L1.T)).dot(L1)).dot(L2).reshape(2,1)
    print('当颜色为%s时,截距项为:%f,回归系数为:%f'%(name,result[0],result[1]))
当颜色为D时,截距项为:-2361.017152,回归系数为:8408.353126
当颜色为E时,截距项为:-2381.049600,回归系数为:8296.212783
当颜色为F时,截距项为:-2665.806191,回归系数为:8676.658344
当颜色为G时,截距项为:-2575.527643,回归系数为:8525.345779
当颜色为H时,截距项为:-2460.418046,回归系数为:7619.098320
当颜色为I时,截距项为:-2878.150356,回归系数为:7761.041169
当颜色为J时,截距项为:-2920.603337,回归系数为:7094.192092
df = pd.read_csv('data/Drugs.csv')
df.head()
YYYYStateCOUNTYSubstanceNameDrugReports
02010VAACCOMACKPropoxyphene1
12010OHADAMSMorphine9
22010PAADAMSMethadone2
32010VAALEXANDRIA CITYHeroin5
42010PAALLEGHENYHydromorphone5
idx=pd.IndexSlice
for i in range(2010,2018):
    county = (df.groupby(['COUNTY','YYYY']).sum().loc[idx[:,i],:].idxmax()[0][0])
    state = df.query('COUNTY == "%s"'%county)['State'].iloc[0]
    state_true = df.groupby(['State','YYYY']).sum().loc[idx[:,i],:].idxmax()[0][0]
    if state==state_true:
        print('在%d年,%s县的报告数最多,它所属的州%s也是报告数最多的'%(i,county,state))
    else:
        print('在%d年,%s县的报告数最多,但它所属的州%s不是报告数最多的,%s州报告数最多'%(i,county,state,state_true))
在2010年,PHILADELPHIA县的报告数最多,它所属的州PA也是报告数最多的
在2011年,PHILADELPHIA县的报告数最多,但它所属的州PA不是报告数最多的,OH州报告数最多
在2012年,PHILADELPHIA县的报告数最多,但它所属的州PA不是报告数最多的,OH州报告数最多
在2013年,PHILADELPHIA县的报告数最多,但它所属的州PA不是报告数最多的,OH州报告数最多
在2014年,PHILADELPHIA县的报告数最多,但它所属的州PA不是报告数最多的,OH州报告数最多
在2015年,PHILADELPHIA县的报告数最多,但它所属的州PA不是报告数最多的,OH州报告数最多
在2016年,HAMILTON县的报告数最多,它所属的州OH也是报告数最多的
在2017年,HAMILTON县的报告数最多,它所属的州OH也是报告数最多的
df_b = df[(df['YYYY'].isin([2014,2015]))&(df['SubstanceName']=='Heroin')]
df_add = df_b.groupby(['YYYY','State']).sum()
(df_add.loc[2015]-df_add.loc[2014]).idxmax()
DrugReports    OH
dtype: object
df_b = df[(df['YYYY'].isin([2014,2015]))&(df['State']=='OH')]
df_add = df_b.groupby(['YYYY','SubstanceName']).sum()
display((df_add.loc[2015]-df_add.loc[2014]).idxmax()) #这里利用了索引对齐的特点
display((df_add.loc[2015]/df_add.loc[2014]).idxmax())
DrugReports    Heroin
dtype: object



DrugReports    Acetyl fentanyl
dtype: object
df = pd.read_csv('data/Drugs.csv',index_col=['State','COUNTY']).sort_index()
df.head()
YYYYSubstanceNameDrugReports
StateCOUNTY
KYADAIR2010Methadone1
ADAIR2010Hydrocodone6
ADAIR2011Oxycodone4
ADAIR2011Buprenorphine3
ADAIR2011Morphine2
result = pd.pivot_table(df,index=['State','COUNTY','SubstanceName']
                 ,columns='YYYY'
                 ,values='DrugReports',fill_value='-').reset_index().rename_axis(columns={'YYYY':''})
result.head()
StateCOUNTYSubstanceName20102011201220132014201520162017
0KYADAIRBuprenorphine-354275710
1KYADAIRCodeine--1----1
2KYADAIRFentanyl--1-----
3KYADAIRHeroin--12-1-2
4KYADAIRHydrocodone69101097113
result_melted = result.melt(id_vars=result.columns[:3],value_vars=result.columns[-8:]
                ,var_name='YYYY',value_name='DrugReports').query('DrugReports != "-"')
result2 = result_melted.sort_values(by=['State','COUNTY','YYYY'
                                    ,'SubstanceName']).reset_index().drop(columns='index')
#下面其实无关紧要,只是交换两个列再改一下类型(因为‘-’所以type变成object了)
cols = list(result2.columns)
a, b = cols.index('SubstanceName'), cols.index('YYYY')
cols[b], cols[a] = cols[a], cols[b]
result2 = result2[cols].astype({'DrugReports':'int','YYYY':'int'})
result2.head()
StateCOUNTYYYYYSubstanceNameDrugReports
0KYADAIR2010Hydrocodone6
1KYADAIR2010Methadone1
2KYADAIR2011Buprenorphine3
3KYADAIR2011Hydrocodone9
4KYADAIR2011Morphine2
df_tidy = df.reset_index().sort_values(by=result2.columns[:4].tolist()).reset_index().drop(columns='index')
df_tidy.head()
StateCOUNTYYYYYSubstanceNameDrugReports
0KYADAIR2010Hydrocodone6
1KYADAIR2010Methadone1
2KYADAIR2011Buprenorphine3
3KYADAIR2011Hydrocodone9
4KYADAIR2011Morphine2
df_tidy.equals(result2)
True
df = pd.read_csv('data/Earthquake.csv')
df = df.sort_values(by=df.columns.tolist()[:3]).sort_index(axis=1).reset_index().drop(columns='index')
df.head()
方向日期时间深度烈度经度维度距离
0south_east1912.08.0912:29:00 AM16.06.727.240.64.3
1south_west1912.08.1012:23:00 AM15.06.027.140.62.0
2south_west1912.08.1012:30:00 AM15.05.227.140.62.0
3south_east1912.08.1112:19:04 AM30.04.927.240.64.3
4south_west1912.08.1112:20:00 AM15.04.527.140.62.0
result = pd.pivot_table(df,index=['日期','时间','维度','经度']
            ,columns='方向'
            ,values=['烈度','深度','距离'],fill_value='-').stack(level=0).rename_axis(index={None:'地震参数'})
result.head(6)
方向eastnorthnorth_eastnorth_westsouthsouth_eastsouth_westwest
日期时间维度经度地震参数
1912.08.0912:29:00 AM40.627.2深度-----16--
烈度-----6.7--
距离-----4.3--
1912.08.1012:23:00 AM40.627.1深度------15-
烈度------6-
距离------2-
df_result = result.unstack().stack(0)[(~(result.unstack().stack(0)=='-')).any(1)].reset_index()
df_result.columns.name=None
df_result = df_result.sort_index(axis=1).astype({'深度':'float64','烈度':'float64','距离':'float64'})
df_result.head()
方向日期时间深度烈度经度维度距离
0south_east1912.08.0912:29:00 AM16.06.727.240.64.3
1south_west1912.08.1012:23:00 AM15.06.027.140.62.0
2south_west1912.08.1012:30:00 AM15.05.227.140.62.0
3south_east1912.08.1112:19:04 AM30.04.927.240.64.3
4south_west1912.08.1112:20:00 AM15.04.527.140.62.0
df_result.astype({'深度':'float64','烈度':'float64','距离':'float64'},copy=False).dtypes
方向     object
日期     object
时间     object
深度    float64
烈度    float64
经度    float64
维度    float64
距离    float64
dtype: object
df.equals(df_result)
True
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值