import pandas as pd
import numpy as np
df = pd.read_csv('Data/Game_of_Thrones_Script.csv')
df.head()
  Release Date    Season    Episode     Episode Title          Name                                           Sentence
0    2011/4/17  Season 1  Episode 1  Winter is Coming  waymar royce  What do you expect? They're savages. One lot s...
1    2011/4/17  Season 1  Episode 1  Winter is Coming          will  I've never seen wildlings do a thing like this...
2    2011/4/17  Season 1  Episode 1  Winter is Coming  waymar royce                            How close did you get?
3    2011/4/17  Season 1  Episode 1  Winter is Coming          will                           Close as any man would.
4    2011/4/17  Season 1  Episode 1  Winter is Coming         gared                  We should head back to the wall.
df['Name'].nunique()
564
df['Name'].value_counts().index[0]
'tyrion lannister'
df_words = df.assign(Words=df['Sentence'].apply(lambda x: len(x.split()))).sort_values(by='Name')
df_words.head()
      Release Date    Season     Episode             Episode Title            Name                                           Sentence  Words
276      2011/4/17  Season 1   Episode 1          Winter is Coming         a voice                       It's Maester Luwin, my lord.      5
3012     2011/6/19  Season 1  Episode 10            Fire and Blood  addam marbrand                ls it true about Stannis and Renly?      7
3017     2011/6/19  Season 1  Episode 10            Fire and Blood  addam marbrand                                    Kevan Lannister      2
13610     2014/6/8  Season 4   Episode 9  The Watchers on the Wall           aemon  And what is it that couldn't wait until mornin...     10
13614     2014/6/8  Season 4   Episode 9  The Watchers on the Wall           aemon  Oh, no need. I know my way around this library...     48
L_count = []
N_words = list(zip(df_words['Name'], df_words['Words']))
for i in N_words:
    if i == N_words[0]:
        # first row: start the running count
        L_count.append(i[1])
        last = i[0]
    else:
        # same speaker as the previous row: extend the running total, otherwise restart
        L_count.append(L_count[-1] + i[1] if i[0] == last else i[1])
        last = i[0]
df_words['Count'] = L_count
df_words['Name'][df_words['Count'].idxmax()]
'tyrion lannister'
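The running-count loop gives the right answer, but the same result falls out of a plain grouped sum; a minimal sketch over the df_words frame built above:

# total words per character; idxmax() returns the name with the largest sum
df_words.groupby('Name')['Words'].sum().idxmax()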
df = pd.read_csv('data/Kobe_data.csv', index_col='shot_id')
df.head()
                action_type combined_shot_type  game_event_id   game_id      lat  loc_x  loc_y       lon  minutes_remaining  period  ...  shot_made_flag       shot_type         shot_zone_area  shot_zone_basic   shot_zone_range     team_id           team_name   game_date    matchup opponent
shot_id
1                 Jump Shot          Jump Shot             10  20000012  33.9723    167     72 -118.1028                 10       1  ...             NaN  2PT Field Goal          Right Side(R)        Mid-Range         16-24 ft.  1610612747  Los Angeles Lakers  2000/10/31  LAL @ POR      POR
2                 Jump Shot          Jump Shot             12  20000012  34.0443   -157      0 -118.4268                 10       1  ...             0.0  2PT Field Goal           Left Side(L)        Mid-Range          8-16 ft.  1610612747  Los Angeles Lakers  2000/10/31  LAL @ POR      POR
3                 Jump Shot          Jump Shot             35  20000012  33.9093   -101    135 -118.3708                  7       1  ...             1.0  2PT Field Goal   Left Side Center(LC)        Mid-Range         16-24 ft.  1610612747  Los Angeles Lakers  2000/10/31  LAL @ POR      POR
4                 Jump Shot          Jump Shot             43  20000012  33.8693    138    175 -118.1318                  6       1  ...             0.0  2PT Field Goal  Right Side Center(RC)        Mid-Range         16-24 ft.  1610612747  Los Angeles Lakers  2000/10/31  LAL @ POR      POR
5         Driving Dunk Shot               Dunk            155  20000012  34.0443      0      0 -118.2698                  6       2  ...             1.0  2PT Field Goal              Center(C)  Restricted Area  Less Than 8 ft.  1610612747  Los Angeles Lakers  2000/10/31  LAL @ POR      POR

[5 rows x 24 columns]
pd.Series(list(zip(df['action_type'], df['combined_shot_type']))).value_counts().index[0]
('Jump Shot', 'Jump Shot')
pd.Series(list(list(zip(*(pd.Series(list(zip(df['game_id'], df['opponent'])))
                          .unique()).tolist()))[1])).value_counts().index[0]
'SAS'
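The nested zip/unzip can be sidestepped: deduplicating on game_id leaves one row per game, and the opponents can then be counted directly; a minimal sketch, assuming each game_id maps to a single opponent:

# one row per game, then the most frequent opponent across games
df.drop_duplicates(subset='game_id')['opponent'].value_counts().index[0]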
df = pd.read_csv('data/UFO.csv')
df.rename(columns={'duration (seconds)': 'duration'}, inplace=True)
df['duration'] = df['duration'].astype('float')  # assign back, otherwise the cast is discarded
df.head()
           datetime     shape  duration   latitude   longitude
0  10/10/1949 20:30  cylinder    2700.0  29.883056  -97.941111
1  10/10/1949 21:00     light    7200.0  29.384210  -98.581082
2  10/10/1955 17:00    circle      20.0  53.200000   -2.916667
3  10/10/1956 21:00    circle      20.0  28.978333  -96.645833
4  10/10/1960 20:00     light     900.0  21.418056 -157.803611
df.query('duration > 60')['shape'].value_counts().index[0]
'light'
bins_long = np.linspace(-180, 180, 13).tolist()  # twelve 30-degree longitude bands
bins_la = np.linspace(-90, 90, 11).tolist()      # ten 18-degree latitude bands
cuts_long = pd.cut(df['longitude'], bins=bins_long)
df['cuts_long'] = cuts_long
cuts_la = pd.cut(df['latitude'], bins=bins_la)
df['cuts_la'] = cuts_la
df.head()
           datetime     shape  duration   latitude   longitude         cuts_long       cuts_la
0  10/10/1949 20:30  cylinder    2700.0  29.883056  -97.941111   (-120.0, -90.0]  (18.0, 36.0]
1  10/10/1949 21:00     light    7200.0  29.384210  -98.581082   (-120.0, -90.0]  (18.0, 36.0]
2  10/10/1955 17:00    circle      20.0  53.200000   -2.916667      (-30.0, 0.0]  (36.0, 54.0]
3  10/10/1956 21:00    circle      20.0  28.978333  -96.645833   (-120.0, -90.0]  (18.0, 36.0]
4  10/10/1960 20:00     light     900.0  21.418056 -157.803611  (-180.0, -150.0]  (18.0, 36.0]
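With both interval columns in place, the most-sighted cell of the longitude/latitude grid can be read off a two-key group; a minimal sketch:

# sightings per (latitude band, longitude band) cell; idxmax() picks the busiest
df.groupby(['cuts_la', 'cuts_long']).size().idxmax()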
df = pd.read_csv('data/Pokemon.csv')
df.head()
   #                   Name Type 1  Type 2  Total  HP  Attack  Defense  Sp. Atk  Sp. Def  Speed  Generation  Legendary
0  1              Bulbasaur  Grass  Poison    318  45      49       49       65       65     45           1      False
1  2                Ivysaur  Grass  Poison    405  60      62       63       80       80     60           1      False
2  3               Venusaur  Grass  Poison    525  80      82       83      100      100     80           1      False
3  3  VenusaurMega Venusaur  Grass  Poison    625  80     100      123      122      120     80           1      False
4  4             Charmander   Fire     NaN    309  39      52       43       60       50     65           1      False
df['Type 2'].count() / df.shape[0]
0.5175
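The same proportion can be computed from a boolean mask, since the mean of a mask is the fraction of True values; a minimal sketch:

# fraction of rows with a non-missing second type
df['Type 2'].notna().mean()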
df.query('Total >= 580')['Legendary'].value_counts(normalize=True)
True 0.575221
False 0.424779
Name: Legendary, dtype: float64
df[df['Type 1'] == 'Fighting'].sort_values(by='Attack', ascending=False).iloc[:3]
       #                 Name    Type 1 Type 2  Total   HP  Attack  Defense  Sp. Atk  Sp. Def  Speed  Generation  Legendary
498  448  LucarioMega Lucario  Fighting  Steel    625   70     145       88      140       70    112           4      False
594  534           Conkeldurr  Fighting    NaN    505  105     140       95       55       65     45           5      False
74    68              Machamp  Fighting    NaN    505   90     130       80       65       85     55           1      False
# stat range = max minus min of the six base stats (columns 5 through 10)
df['range'] = df.iloc[:, 5:11].max(axis=1) - df.iloc[:, 5:11].min(axis=1)
attribute = df[['Type 1', 'range']].set_index('Type 1')
max_range = 0
result = ''
for i in attribute.index.unique():
    temp = attribute.loc[i, :].mean()  # mean range for this primary type
    if temp.values[0] > max_range:
        max_range = temp.values[0]
        result = i
result
'Steel'
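The explicit loop is equivalent to a grouped mean followed by idxmax; a minimal sketch over the range column built above:

# mean stat range per primary type, then the type with the largest mean
df.groupby('Type 1')['range'].mean().idxmax()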
df.query('Legendary == True')['Type 1'].value_counts(normalize=True).index[0]
'Psychic'
attribute = df.query('Legendary == True')[['Type 1', 'Total']].set_index('Type 1')
max_value = 0
result = ''
for i in attribute.index.unique():
    temp = float(attribute.loc[i, :].mean())  # mean Total for this primary type
    if temp > max_value:
        max_value = temp
        result = i
result
'Normal'
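Again the loop collapses to one grouped expression, this time restricted to Legendary rows; a minimal sketch:

# mean Total per primary type among Legendary Pokemon
df.query('Legendary == True').groupby('Type 1')['Total'].mean().idxmax()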
df = pd.read_csv('data/Diamonds.csv')
df.head()
   carat color  depth  price
0   0.23     E   61.5    326
1   0.21     E   59.8    326
2   0.23     E   56.9    327
3   0.29     I   62.4    334
4   0.31     J   63.3    335
df_r = df.query('carat > 1')['price']
df_r.max() - df_r.min()
17561
bins = df['depth'].quantile(np.linspace(0, 1, 6)).tolist()
cuts = pd.cut(df['depth'], bins=bins)
df['cuts'] = cuts
df.head()
   carat color  depth  price          cuts
0   0.23     E   61.5    326  (60.8, 61.6]
1   0.21     E   59.8    326  (43.0, 60.8]
2   0.23     E   56.9    327  (43.0, 60.8]
3   0.29     I   62.4    334  (62.1, 62.7]
4   0.31     J   63.3    335  (62.7, 79.0]
color_result = df.groupby('cuts')['color'].describe()
color_result
              count unique top  freq
cuts
(43.0, 60.8]  11294      7   E  2259
(60.8, 61.6]  11831      7   G  2593
(61.6, 62.1]  10403      7   G  2247
(62.1, 62.7]  10137      7   G  2193
(62.7, 79.0]  10273      7   G  2000
df['均重价格'] = df['price'] / df['carat']  # 均重价格 = price per carat
color_result['top'] == [i[1] for i in df.groupby(['cuts', 'color'])['均重价格']
                        .mean().groupby(['cuts']).idxmax().values]
cuts
(43.0, 60.8] False
(60.8, 61.6] False
(61.6, 62.1] False
(62.1, 62.7] True
(62.7, 79.0] True
Name: top, dtype: bool
df = df.drop(columns='均重价格')
cuts = pd.cut(df['carat'], bins=[0, 0.5, 1, 1.5, 2, np.inf])
df['cuts'] = cuts
df.head()
   carat color  depth  price        cuts
0   0.23     E   61.5    326  (0.0, 0.5]
1   0.21     E   59.8    326  (0.0, 0.5]
2   0.23     E   56.9    327  (0.0, 0.5]
3   0.29     I   62.4    334  (0.0, 0.5]
4   0.31     J   63.3    335  (0.0, 0.5]
def f(nums):
    # length of the longest strictly increasing run of consecutive elements
    if not nums:
        return 0
    res = 1
    cur_len = 1
    for i in range(1, len(nums)):
        if nums[i-1] < nums[i]:
            cur_len += 1
            res = max(cur_len, res)
        else:
            cur_len = 1
    return res

for name, group in df.groupby('cuts'):
    group = group.sort_values(by='depth')
    s = group['price']
    print(name, f(s.tolist()))
(0.0, 0.5] 8
(0.5, 1.0] 8
(1.0, 1.5] 7
(1.5, 2.0] 11
(2.0, inf] 7
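The helper f can also be expressed in pandas alone: every position where the price fails to increase starts a new run, so a cumulative sum of that break mask labels the runs, and the longest run is the largest label group. A minimal sketch for a price Series s already sorted by depth (for instance the s inside the loop above):

# True marks the start of a new run (including the first element, whose diff is NaN)
runs = (~(s.diff() > 0)).cumsum()
s.groupby(runs).size().max()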
for name, group in df[['carat', 'price', 'color']].groupby('color'):
    # design matrix: a row of ones for the intercept plus the carat values
    L1 = np.array([np.ones(group.shape[0]), group['carat']]).reshape(2, group.shape[0])
    L2 = group['price']
    # normal equations: beta = (X X^T)^{-1} X y
    result = (np.linalg.inv(L1.dot(L1.T)).dot(L1)).dot(L2).reshape(2, 1)
    print('For color %s, the intercept is %f and the slope is %f' % (name, result[0], result[1]))
For color D, the intercept is -2361.017152 and the slope is 8408.353126
For color E, the intercept is -2381.049600 and the slope is 8296.212783
For color F, the intercept is -2665.806191 and the slope is 8676.658344
For color G, the intercept is -2575.527643 and the slope is 8525.345779
For color H, the intercept is -2460.418046 and the slope is 7619.098320
For color I, the intercept is -2878.150356 and the slope is 7761.041169
For color J, the intercept is -2920.603337 and the slope is 7094.192092
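The hand-rolled normal equations agree with np.polyfit, which fits a degree-1 polynomial and returns its coefficients highest power first; a minimal sketch for one color group (for instance the group variable left by the loop above):

# slope comes first, intercept second, for a straight-line fit
slope, intercept = np.polyfit(group['carat'], group['price'], 1)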
df = pd.read_csv('data/Drugs.csv')
df.head()
   YYYY State           COUNTY  SubstanceName  DrugReports
0  2010    VA         ACCOMACK   Propoxyphene            1
1  2010    OH            ADAMS       Morphine            9
2  2010    PA            ADAMS      Methadone            2
3  2010    VA  ALEXANDRIA CITY         Heroin            5
4  2010    PA        ALLEGHENY  Hydromorphone            5
idx = pd.IndexSlice
for i in range(2010, 2018):
    # county with the most reports in year i; idxmax()[0] is a (COUNTY, YYYY) tuple
    county = df.groupby(['COUNTY', 'YYYY']).sum().loc[idx[:, i], :].idxmax()[0][0]
    state = df.query('COUNTY == "%s"' % county)['State'].iloc[0]
    state_true = df.groupby(['State', 'YYYY']).sum().loc[idx[:, i], :].idxmax()[0][0]
    if state == state_true:
        print('In %d, %s county had the most reports, and its state %s also had the most' % (i, county, state))
    else:
        print('In %d, %s county had the most reports, but its state %s did not; %s had the most' % (i, county, state, state_true))
In 2010, PHILADELPHIA county had the most reports, and its state PA also had the most
In 2011, PHILADELPHIA county had the most reports, but its state PA did not; OH had the most
In 2012, PHILADELPHIA county had the most reports, but its state PA did not; OH had the most
In 2013, PHILADELPHIA county had the most reports, but its state PA did not; OH had the most
In 2014, PHILADELPHIA county had the most reports, but its state PA did not; OH had the most
In 2015, PHILADELPHIA county had the most reports, but its state PA did not; OH had the most
In 2016, HAMILTON county had the most reports, and its state OH also had the most
In 2017, HAMILTON county had the most reports, and its state OH also had the most
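One caveat: groupby(...).sum() over a frame that still contains string columns relies on older pandas behavior; recent versions want the aggregation restricted to the numeric column. A minimal sketch of the county lookup for one year i, assuming pandas 1.5 or later:

# sum only the numeric column, slice out year i, and take the top county
county = df.groupby(['COUNTY', 'YYYY'])['DrugReports'].sum().xs(i, level='YYYY').idxmax()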
df_b = df[(df['YYYY'].isin([2014, 2015])) & (df['SubstanceName'] == 'Heroin')]
df_add = df_b.groupby(['YYYY', 'State']).sum()
(df_add.loc[2015] - df_add.loc[2014]).idxmax()
DrugReports OH
dtype: object
df_b = df[(df['YYYY'].isin([2014, 2015])) & (df['State'] == 'OH')]
df_add = df_b.groupby(['YYYY', 'SubstanceName']).sum()
display((df_add.loc[2015] - df_add.loc[2014]).idxmax())
display((df_add.loc[2015] / df_add.loc[2014]).idxmax())
DrugReports Heroin
dtype: object
DrugReports Acetyl fentanyl
dtype: object
df = pd.read_csv('data/Drugs.csv', index_col=['State', 'COUNTY']).sort_index()
df.head()
              YYYY  SubstanceName  DrugReports
State COUNTY
KY    ADAIR   2010      Methadone            1
      ADAIR   2010    Hydrocodone            6
      ADAIR   2011      Oxycodone            4
      ADAIR   2011  Buprenorphine            3
      ADAIR   2011       Morphine            2
result = pd.pivot_table(df, index=['State', 'COUNTY', 'SubstanceName'],
                        columns='YYYY',
                        values='DrugReports', fill_value='-').reset_index().rename_axis(columns={'YYYY': ''})
result.head()
  State COUNTY  SubstanceName 2010 2011 2012 2013 2014 2015 2016 2017
0    KY  ADAIR  Buprenorphine    -    3    5    4   27    5    7   10
1    KY  ADAIR        Codeine    -    -    1    -    -    -    -    1
2    KY  ADAIR       Fentanyl    -    -    1    -    -    -    -    -
3    KY  ADAIR         Heroin    -    -    1    2    -    1    -    2
4    KY  ADAIR    Hydrocodone    6    9   10   10    9    7   11    3
result_melted = result.melt(id_vars=result.columns[:3], value_vars=result.columns[-8:],
                            var_name='YYYY', value_name='DrugReports').query('DrugReports != "-"')
result2 = result_melted.sort_values(by=['State', 'COUNTY', 'YYYY',
                                        'SubstanceName']).reset_index().drop(columns='index')
# swap the YYYY and SubstanceName columns to match the original column order
cols = list(result2.columns)
a, b = cols.index('SubstanceName'), cols.index('YYYY')
cols[b], cols[a] = cols[a], cols[b]
result2 = result2[cols].astype({'DrugReports': 'int', 'YYYY': 'int'})
result2.head()
  State COUNTY  YYYY  SubstanceName  DrugReports
0    KY  ADAIR  2010    Hydrocodone            6
1    KY  ADAIR  2010      Methadone            1
2    KY  ADAIR  2011  Buprenorphine            3
3    KY  ADAIR  2011    Hydrocodone            9
4    KY  ADAIR  2011       Morphine            2
df_tidy = df.reset_index().sort_values(by=result2.columns[:4].tolist()).reset_index().drop(columns='index')
df_tidy.head()
  State COUNTY  YYYY  SubstanceName  DrugReports
0    KY  ADAIR  2010    Hydrocodone            6
1    KY  ADAIR  2010      Methadone            1
2    KY  ADAIR  2011  Buprenorphine            3
3    KY  ADAIR  2011    Hydrocodone            9
4    KY  ADAIR  2011       Morphine            2
df_tidy.equals(result2)
True
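equals checks dtypes as well as values; when a round trip like this drifts, pd.testing.assert_frame_equal reports exactly where; a minimal sketch:

# raises with a precise diff if the reshaped frame and the tidy original diverge
pd.testing.assert_frame_equal(df_tidy, result2)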
df = pd.read_csv('data/Earthquake.csv')
# column names are Chinese: 方向 = direction, 日期 = date, 时间 = time,
# 深度 = depth, 烈度 = intensity, 经度 = longitude, 维度 = latitude, 距离 = distance
df = df.sort_values(by=df.columns.tolist()[:3]).sort_index(axis=1).reset_index().drop(columns='index')
df.head()
           方向          日期           时间    深度   烈度    经度    维度   距离
0  south_east  1912.08.09  12:29:00 AM  16.0  6.7  27.2  40.6  4.3
1  south_west  1912.08.10  12:23:00 AM  15.0  6.0  27.1  40.6  2.0
2  south_west  1912.08.10  12:30:00 AM  15.0  5.2  27.1  40.6  2.0
3  south_east  1912.08.11  12:19:04 AM  30.0  4.9  27.2  40.6  4.3
4  south_west  1912.08.11  12:20:00 AM  15.0  4.5  27.1  40.6  2.0
result = pd.pivot_table(df, index=['日期', '时间', '维度', '经度'],
                        columns='方向',
                        values=['烈度', '深度', '距离'], fill_value='-').stack(level=0).rename_axis(index={None: '地震参数'})
result.head(6)
方向                                      east north north_east north_west south south_east south_west west
日期        时间          维度    经度   地震参数
1912.08.09  12:29:00 AM  40.6  27.2  深度     -     -          -          -     -         16          -    -
                                     烈度     -     -          -          -     -        6.7          -    -
                                     距离     -     -          -          -     -        4.3          -    -
1912.08.10  12:23:00 AM  40.6  27.1  深度     -     -          -          -     -          -         15    -
                                     烈度     -     -          -          -     -          -          6    -
                                     距离     -     -          -          -     -          -          2    -
df_result = result.unstack().stack(0)[(~(result.unstack().stack(0) == '-')).any(axis=1)].reset_index()
df_result.columns.name = None
df_result = df_result.sort_index(axis=1).astype({'深度': 'float64', '烈度': 'float64', '距离': 'float64'})
df_result.head()
           方向          日期           时间    深度   烈度    经度    维度   距离
0  south_east  1912.08.09  12:29:00 AM  16.0  6.7  27.2  40.6  4.3
1  south_west  1912.08.10  12:23:00 AM  15.0  6.0  27.1  40.6  2.0
2  south_west  1912.08.10  12:30:00 AM  15.0  5.2  27.1  40.6  2.0
3  south_east  1912.08.11  12:19:04 AM  30.0  4.9  27.2  40.6  4.3
4  south_west  1912.08.11  12:20:00 AM  15.0  4.5  27.1  40.6  2.0
df_result.astype({'深度': 'float64', '烈度': 'float64', '距离': 'float64'}, copy=False).dtypes
方向 object
日期 object
时间 object
深度 float64
烈度 float64
经度 float64
维度 float64
距离 float64
dtype: object
df.equals(df_result)
True