pandas 第十二期组队-pandas基础

最新推荐文章于 2024-07-21 19:37:09 发布

lx12633036

最新推荐文章于 2024-07-21 19:37:09 发布

阅读量708

点赞数

文章标签： python 数据分析

本文链接：https://blog.csdn.net/lx12633036/article/details/105625991

版权

import pandas as pd 
import numpy as np

pd.__version__

'1.0.3'

文件的读写#

csv格式

data=pd.read_csv('E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\\table.csv')
data.head(20)

	School	Class	ID	Gender	Address	Height	Weight	Math	Physics
0	S_1	C_1	1101	M	street_1	173	63	34.0	A+
1	S_1	C_1	1102	F	street_2	192	73	32.5	B+
2	S_1	C_1	1103	M	street_2	186	82	87.2	B+
3	S_1	C_1	1104	F	street_2	167	81	80.4	B-
4	S_1	C_1	1105	F	street_4	159	64	84.8	B+
5	S_1	C_2	1201	M	street_5	188	68	97.0	A-
6	S_1	C_2	1202	F	street_4	176	94	63.5	B-
7	S_1	C_2	1203	M	street_6	160	53	58.8	A+
8	S_1	C_2	1204	F	street_5	162	63	33.8	B
9	S_1	C_2	1205	F	street_6	167	63	68.4	B-
10	S_1	C_3	1301	M	street_4	161	68	31.5	B+
11	S_1	C_3	1302	F	street_1	175	57	87.7	A-
12	S_1	C_3	1303	M	street_7	188	82	49.7	B
13	S_1	C_3	1304	M	street_2	195	70	85.2	A
14	S_1	C_3	1305	F	street_5	187	69	61.7	B-
15	S_2	C_1	2101	M	street_7	174	84	83.3	C
16	S_2	C_1	2102	F	street_6	161	61	50.6	B+
17	S_2	C_1	2103	M	street_4	157	61	52.5	B-
18	S_2	C_1	2104	F	street_5	159	97	72.2	B+
19	S_2	C_1	2105	M	street_4	170	81	34.2	A

txt格式

data_txt=pd.read_table('E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\\table.txt')
#读取文件的时候，默认是以，作为分隔符，可以指定其他分隔符
data_txt.head(20)

	col1	col2	col3	col4
0	2	a	1.4	apple
1	3	b	3.4	banana
2	6	c	2.5	orange
3	5	d	3.2	lemon

xls或者xlsx格式

data_excel=pd.read_excel('E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\\table.xlsx')
data_excel.head(20)

	School	Class	ID	Gender	Address	Height	Weight	Math	Physics
0	S_1	C_1	1101	M	street_1	173	63	34.0	A+
1	S_1	C_1	1102	F	street_2	192	73	32.5	B+
2	S_1	C_1	1103	M	street_2	186	82	87.2	B+
3	S_1	C_1	1104	F	street_2	167	81	80.4	B-
4	S_1	C_1	1105	F	street_4	159	64	84.8	B+
5	S_1	C_2	1201	M	street_5	188	68	97.0	A-
6	S_1	C_2	1202	F	street_4	176	94	63.5	B-
7	S_1	C_2	1203	M	street_6	160	53	58.8	A+
8	S_1	C_2	1204	F	street_5	162	63	33.8	B
9	S_1	C_2	1205	F	street_6	167	63	68.4	B-
10	S_1	C_3	1301	M	street_4	161	68	31.5	B+
11	S_1	C_3	1302	F	street_1	175	57	87.7	A-
12	S_1	C_3	1303	M	street_7	188	82	49.7	B
13	S_1	C_3	1304	M	street_2	195	70	85.2	A
14	S_1	C_3	1305	F	street_5	187	69	61.7	B-
15	S_2	C_1	2101	M	street_7	174	84	83.3	C
16	S_2	C_1	2102	F	street_6	161	61	50.6	B+
17	S_2	C_1	2103	M	street_4	157	61	52.5	B-
18	S_2	C_1	2104	F	street_5	159	97	72.2	B+
19	S_2	C_1	2105	M	street_4	170	81	34.2	A

写入

csv格式

data.to_csv('E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\\table.csv')

其他的不过多展示了

数据的基本结构

Series

创建一个Series

对于一个Series，其中最常用的属性为值（values）,索引（index），名字（name）,类型（type）

s=pd.Series(np.random.randn(5),index=['a','b','c','d','e'],name='this is Series',dtype='float64')
s

a   -1.576202
b    1.070319
c    0.668490
d    1.760701
e    0.942655
Name: this is Series, dtype: float64

访问Series属性

s.values

array([-1.57620183,  1.07031892,  0.66849049,  1.76070095,  0.94265474])

都会就不过多敲代码了

取出元素

#根据索引来取出
s['a']

-1.5762018254215895

调用方法

print([attr for attr in dir(s) if not attr.startswith('_')])
#字段名称打印？

['T', 'a', 'abs', 'add', 'add_prefix', 'add_suffix', 'agg', 'aggregate', 'align', 'all', 'any', 'append', 'apply', 'argmax', 'argmin', 'argsort', 'array', 'asfreq', 'asof', 'astype', 'at', 'at_time', 'attrs', 'autocorr', 'axes', 'b', 'between', 'between_time', 'bfill', 'bool', 'c', 'clip', 'combine', 'combine_first', 'convert_dtypes', 'copy', 'corr', 'count', 'cov', 'cummax', 'cummin', 'cumprod', 'cumsum', 'd', 'describe', 'diff', 'div', 'divide', 'divmod', 'dot', 'drop', 'drop_duplicates', 'droplevel', 'dropna', 'dtype', 'dtypes', 'duplicated', 'e', 'empty', 'eq', 'equals', 'ewm', 'expanding', 'explode', 'factorize', 'ffill', 'fillna', 'filter', 'first', 'first_valid_index', 'floordiv', 'ge', 'get', 'groupby', 'gt', 'hasnans', 'head', 'hist', 'iat', 'idxmax', 'idxmin', 'iloc', 'index', 'infer_objects', 'interpolate', 'is_monotonic', 'is_monotonic_decreasing', 'is_monotonic_increasing', 'is_unique', 'isin', 'isna', 'isnull', 'item', 'items', 'iteritems', 'keys', 'kurt', 'kurtosis', 'last', 'last_valid_index', 'le', 'loc', 'lt', 'mad', 'map', 'mask', 'max', 'mean', 'median', 'memory_usage', 'min', 'mod', 'mode', 'mul', 'multiply', 'name', 'nbytes', 'ndim', 'ne', 'nlargest', 'notna', 'notnull', 'nsmallest', 'nunique', 'pct_change', 'pipe', 'plot', 'pop', 'pow', 'prod', 'product', 'quantile', 'radd', 'rank', 'ravel', 'rdiv', 'rdivmod', 'reindex', 'reindex_like', 'rename', 'rename_axis', 'reorder_levels', 'repeat', 'replace', 'resample', 'reset_index', 'rfloordiv', 'rmod', 'rmul', 'rolling', 'round', 'rpow', 'rsub', 'rtruediv', 'sample', 'searchsorted', 'sem', 'set_axis', 'shape', 'shift', 'size', 'skew', 'slice_shift', 'sort_index', 'sort_values', 'squeeze', 'std', 'sub', 'subtract', 'sum', 'swapaxes', 'swaplevel', 'tail', 'take', 'to_clipboard', 'to_csv', 'to_dict', 'to_excel', 'to_frame', 'to_hdf', 'to_json', 'to_latex', 'to_list', 'to_markdown', 'to_numpy', 'to_period', 'to_pickle', 'to_sql', 'to_string', 'to_timestamp', 'to_xarray', 'transform', 'transpose', 'truediv', 'truncate', 'tshift', 'tz_convert', 'tz_localize', 'unique', 'unstack', 'update', 'value_counts', 'values', 'var', 'view', 'where', 'xs']

DataFrame

创建一个DF

df=pd.DataFrame({'col1':list('abcde'),'col2':range(5,10),'col3':[1.3,2.5,3.6,4.6,5.8]},index=list('12345'))
df

	col1	col2	col3
1	a	5	1.3
2	b	6	2.5
3	c	7	3.6
4	d	8	4.6
5	e	9	5.8

从DF中取出一列为Series

df['col1']

1    a
2    b
3    c
4    d
5    e
Name: col1, dtype: object

type(df)

pandas.core.frame.DataFrame

type(df['col1'])

pandas.core.series.Series

索引对齐

df1=pd.DataFrame({"A":[1,2,3]},index=[1,2,3])
df2=pd.DataFrame({'A':[1,2,3]},index=[3,1,2])
df1-df2
#因为存在索引对齐，所以结果不是0

	A
1	-1
2	-1
3	2

列的删除与添加

可以使用drop函数，del函数与pop函数

df.drop(index='5',columns='col1')

	col2	col3
1	5	1.3
2	6	2.5
3	7	3.6
4	8	4.6

#pop方法直接在DF上操作，且返回被删除的列，与Python中的pop函数类似
df['col1']=[1,2,3,4,5]
df.pop('col1')

1    1
2    2
3    3
4    4
5    5
Name: col1, dtype: int64

df

	col2	col3
1	5	1.3
2	6	2.5
3	7	3.6
4	8	4.6
5	9	5.8

#可以直接增加，也可通过assign可以增加新列
df1['B']=list('abc')
df1

	A	B
1	1	a
2	2	b
3	3	c

df1.assign(C=pd.Series(list('def')))
#assign方法不会对DF进行修改，所以要重新给予新的变量

	A	B	C
1	1	a	e
2	2	b	f
3	3	c	NaN

根据不同类型选择列

df.select_dtypes(include=['number']).head()

	col2	col3
1	5	1.3
2	6	2.5
3	7	3.6
4	8	4.6
5	9	5.8

将series转换成DF

s=df.mean()
s.name='to_DF'
s

col2    7.00
col3    3.56
Name: to_DF, dtype: float64

s.to_frame()

	to_DF
col2	7.00
col3	3.56

#使用T进行转置
s.to_frame().T

	col2	col3
to_DF	7.0	3.56

常用的基本函数

df=pd.read_csv('E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\\table.csv')
df=df.drop(columns='Unnamed: 0')
df

	School	Class	ID	Gender	Address	Height	Weight	Math	Physics
0	S_1	C_1	1101	M	street_1	173	63	34.0	A+
1	S_1	C_1	1102	F	street_2	192	73	32.5	B+
2	S_1	C_1	1103	M	street_2	186	82	87.2	B+
3	S_1	C_1	1104	F	street_2	167	81	80.4	B-
4	S_1	C_1	1105	F	street_4	159	64	84.8	B+
5	S_1	C_2	1201	M	street_5	188	68	97.0	A-
6	S_1	C_2	1202	F	street_4	176	94	63.5	B-
7	S_1	C_2	1203	M	street_6	160	53	58.8	A+
8	S_1	C_2	1204	F	street_5	162	63	33.8	B
9	S_1	C_2	1205	F	street_6	167	63	68.4	B-
10	S_1	C_3	1301	M	street_4	161	68	31.5	B+
11	S_1	C_3	1302	F	street_1	175	57	87.7	A-
12	S_1	C_3	1303	M	street_7	188	82	49.7	B
13	S_1	C_3	1304	M	street_2	195	70	85.2	A
14	S_1	C_3	1305	F	street_5	187	69	61.7	B-
15	S_2	C_1	2101	M	street_7	174	84	83.3	C
16	S_2	C_1	2102	F	street_6	161	61	50.6	B+
17	S_2	C_1	2103	M	street_4	157	61	52.5	B-
18	S_2	C_1	2104	F	street_5	159	97	72.2	B+
19	S_2	C_1	2105	M	street_4	170	81	34.2	A
20	S_2	C_2	2201	M	street_5	193	100	39.1	B
21	S_2	C_2	2202	F	street_7	194	77	68.5	B+
22	S_2	C_2	2203	M	street_4	155	91	73.8	A+
23	S_2	C_2	2204	M	street_1	175	74	47.2	B-
24	S_2	C_2	2205	F	street_7	183	76	85.4	B
25	S_2	C_3	2301	F	street_4	157	78	72.3	B+
26	S_2	C_3	2302	M	street_5	171	88	32.7	A
27	S_2	C_3	2303	F	street_7	190	99	65.9	C
28	S_2	C_3	2304	F	street_6	164	81	95.5	A-
29	S_2	C_3	2305	M	street_4	187	73	48.9	B
30	S_2	C_4	2401	F	street_2	192	62	45.3	A
31	S_2	C_4	2402	M	street_7	166	82	48.7	B
32	S_2	C_4	2403	F	street_6	158	60	59.7	B+
33	S_2	C_4	2404	F	street_2	160	84	67.7	B
34	S_2	C_4	2405	F	street_6	193	54	47.6	B

unique和nunique

nunique显示有多少个唯一值 —数量

df['Physics'].nunique()

unique显示所有的唯一值 —具体值

df['Physics'].unique()

array(['A+', 'B+', 'B-', 'A-', 'B', 'A', 'C'], dtype=object)

count and value_counts

df['Physics'].count()
#注意，返回的是非缺失值的元素个数！

#value_counts返回每个元素有多少个
df['Physics'].value_counts()

B+    9
B     8
B-    6
A     4
A+    3
A-    3
C     2
Name: Physics, dtype: int64

describe and info

#info函数范围有哪些列，有多少非缺失值，每列的类型
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   School   35 non-null     object 
 1   Class    35 non-null     object 
 2   ID       35 non-null     int64  
 3   Gender   35 non-null     object 
 4   Address  35 non-null     object 
 5   Height   35 non-null     int64  
 6   Weight   35 non-null     int64  
 7   Math     35 non-null     float64
 8   Physics  35 non-null     object 
dtypes: float64(1), int64(3), object(5)
memory usage: 2.6+ KB

#describe默认统计数值型数据的各个统计量
df.describe()

	ID	Height	Weight	Math
count	35.00000	35.000000	35.000000	35.000000
mean	1803.00000	174.142857	74.657143	61.351429
std	536.87741	13.541098	12.895377	19.915164
min	1101.00000	155.000000	53.000000	31.500000
25%	1204.50000	161.000000	63.000000	47.400000
50%	2103.00000	173.000000	74.000000	61.700000
75%	2301.50000	187.500000	82.000000	77.100000
max	2405.00000	195.000000	100.000000	97.000000

#describe函数可以调整不同的分位数
df.describe(percentiles=[.05,.25,.75,.95])#注意是[]

	ID	Height	Weight	Math
count	35.00000	35.000000	35.000000	35.000000
mean	1803.00000	174.142857	74.657143	61.351429
std	536.87741	13.541098	12.895377	19.915164
min	1101.00000	155.000000	53.000000	31.500000
5%	1102.70000	157.000000	56.100000	32.640000
25%	1204.50000	161.000000	63.000000	47.400000
50%	2103.00000	173.000000	74.000000	61.700000
75%	2301.50000	187.500000	82.000000	77.100000
95%	2403.30000	193.300000	97.600000	90.040000
max	2405.00000	195.000000	100.000000	97.000000

#对于非数值型也可以使用describe函数
df['Physics'].describe()

count     35
unique     7
top       B+
freq       9
Name: Physics, dtype: object

idxmax and nlargest

# idxmax函数返回最大值，在某些情况下使用，同理类比idxmin,
df['Math'].idxmax()
df['Math'].idxmin()

#nlargest函数范围前几个大的元素值，nsmallest功能类似
df['Math'].nlargest()#（）可以填写数量

5     97.0
28    95.5
11    87.7
2     87.2
24    85.4
Name: Math, dtype: float64

clip and repalce

#clip是对超过或低于某些数进行截断
df['Math'].clip(33,80).head(30)

0     34.0
1     33.0
2     80.0
3     80.0
4     80.0
5     80.0
6     63.5
7     58.8
8     33.8
9     68.4
10    33.0
11    80.0
12    49.7
13    80.0
14    61.7
15    80.0
16    50.6
17    52.5
18    72.2
19    34.2
20    39.1
21    68.5
22    73.8
23    47.2
24    80.0
25    72.3
26    33.0
27    65.9
28    80.0
29    48.9
Name: Math, dtype: float64

#repalce是对某些值进行替换
df['Address'].head()

0    street_1
1    street_2
2    street_2
3    street_2
4    street_4
Name: Address, dtype: object

df['Address'].repalce(['street_1','street_2'],['one','two']).head()

---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

<ipython-input-97-4b99be6262e7> in <module>
----> 1 df['Address'].repalce(['street_1','street_2'],['one','two']).head()


C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
   5272             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   5273                 return self[name]
-> 5274             return object.__getattribute__(self, name)
   5275 
   5276     def __setattr__(self, name: str, value) -> None:


AttributeError: 'Series' object has no attribute 'repalce'

apply函数

#对于series,它可以迭代每一列值操作
df['Math'].apply(lambda x:str(x)+'!').head()

0    34.0!
1    32.5!
2    87.2!
3    80.4!
4    84.8!
Name: Math, dtype: object

#对于DF，迭代每一列操作
df.apply(lambda x:x.apply(lambda x:str(x)+'!')).head()

	School	Class	ID	Gender	Address	Height	Weight	Math	Physics
0	S_1!	C_1!	1101!	M!	street_1!	173!	63!	34.0!	A+!
1	S_1!	C_1!	1102!	F!	street_2!	192!	73!	32.5!	B+!
2	S_1!	C_1!	1103!	M!	street_2!	186!	82!	87.2!	B+!
3	S_1!	C_1!	1104!	F!	street_2!	167!	81!	80.4!	B-!
4	S_1!	C_1!	1105!	F!	street_4!	159!	64!	84.8!	B+!

##对于apply的扩展-可以选择某些列单独加
df['Math'].apply(lambda x:str(x)+"!")

0     34.0!
1     32.5!
2     87.2!
3     80.4!
4     84.8!
5     97.0!
6     63.5!
7     58.8!
8     33.8!
9     68.4!
10    31.5!
11    87.7!
12    49.7!
13    85.2!
14    61.7!
15    83.3!
16    50.6!
17    52.5!
18    72.2!
19    34.2!
20    39.1!
21    68.5!
22    73.8!
23    47.2!
24    85.4!
25    72.3!
26    32.7!
27    65.9!
28    95.5!
29    48.9!
30    45.3!
31    48.7!
32    59.7!
33    67.7!
34    47.6!
Name: Math, dtype: object

排序

索引排序

df.set_index('Math').head(20)
#将Math设为索引

	School	Class	ID	Gender	Address	Height	Weight	Physics
Math
34.0	S_1	C_1	1101	M	street_1	173	63	A+
32.5	S_1	C_1	1102	F	street_2	192	73	B+
87.2	S_1	C_1	1103	M	street_2	186	82	B+
80.4	S_1	C_1	1104	F	street_2	167	81	B-
84.8	S_1	C_1	1105	F	street_4	159	64	B+
97.0	S_1	C_2	1201	M	street_5	188	68	A-
63.5	S_1	C_2	1202	F	street_4	176	94	B-
58.8	S_1	C_2	1203	M	street_6	160	53	A+
33.8	S_1	C_2	1204	F	street_5	162	63	B
68.4	S_1	C_2	1205	F	street_6	167	63	B-
31.5	S_1	C_3	1301	M	street_4	161	68	B+
87.7	S_1	C_3	1302	F	street_1	175	57	A-
49.7	S_1	C_3	1303	M	street_7	188	82	B
85.2	S_1	C_3	1304	M	street_2	195	70	A
61.7	S_1	C_3	1305	F	street_5	187	69	B-
83.3	S_2	C_1	2101	M	street_7	174	84	C
50.6	S_2	C_1	2102	F	street_6	161	61	B+
52.5	S_2	C_1	2103	M	street_4	157	61	B-
72.2	S_2	C_1	2104	F	street_5	159	97	B+
34.2	S_2	C_1	2105	M	street_4	170	81	A

df.set_index('Math').sort_index().head()

	School	Class	ID	Gender	Address	Height	Weight	Physics
Math
31.5	S_1	C_3	1301	M	street_4	161	68	B+
32.5	S_1	C_1	1102	F	street_2	192	73	B+
32.7	S_2	C_3	2302	M	street_5	171	88	A
33.8	S_1	C_2	1204	F	street_5	162	63	B
34.0	S_1	C_1	1101	M	street_1	173	63	A+

值排序

df.sort_values(by='Class').head(20)

	School	Class	ID	Gender	Address	Height	Weight	Math	Physics
0	S_1	C_1	1101	M	street_1	173	63	34.0	A+
19	S_2	C_1	2105	M	street_4	170	81	34.2	A
18	S_2	C_1	2104	F	street_5	159	97	72.2	B+
16	S_2	C_1	2102	F	street_6	161	61	50.6	B+
15	S_2	C_1	2101	M	street_7	174	84	83.3	C
17	S_2	C_1	2103	M	street_4	157	61	52.5	B-
1	S_1	C_1	1102	F	street_2	192	73	32.5	B+
2	S_1	C_1	1103	M	street_2	186	82	87.2	B+
3	S_1	C_1	1104	F	street_2	167	81	80.4	B-
4	S_1	C_1	1105	F	street_4	159	64	84.8	B+
6	S_1	C_2	1202	F	street_4	176	94	63.5	B-
24	S_2	C_2	2205	F	street_7	183	76	85.4	B
23	S_2	C_2	2204	M	street_1	175	74	47.2	B-
22	S_2	C_2	2203	M	street_4	155	91	73.8	A+
21	S_2	C_2	2202	F	street_7	194	77	68.5	B+
5	S_1	C_2	1201	M	street_5	188	68	97.0	A-
20	S_2	C_2	2201	M	street_5	193	100	39.1	B
9	S_1	C_2	1205	F	street_6	167	63	68.4	B-
8	S_1	C_2	1204	F	street_5	162	63	33.8	B
7	S_1	C_2	1203	M	street_6	160	53	58.8	A+

多个值排序，即先对第一层排，在第一层相同的情况下，对第二层排序

df.sort_values(by=['Math','Physics']).head(20)

	School	Class	ID	Gender	Address	Height	Weight	Math	Physics
10	S_1	C_3	1301	M	street_4	161	68	31.5	B+
1	S_1	C_1	1102	F	street_2	192	73	32.5	B+
26	S_2	C_3	2302	M	street_5	171	88	32.7	A
8	S_1	C_2	1204	F	street_5	162	63	33.8	B
0	S_1	C_1	1101	M	street_1	173	63	34.0	A+
19	S_2	C_1	2105	M	street_4	170	81	34.2	A
20	S_2	C_2	2201	M	street_5	193	100	39.1	B
30	S_2	C_4	2401	F	street_2	192	62	45.3	A
23	S_2	C_2	2204	M	street_1	175	74	47.2	B-
34	S_2	C_4	2405	F	street_6	193	54	47.6	B
31	S_2	C_4	2402	M	street_7	166	82	48.7	B
29	S_2	C_3	2305	M	street_4	187	73	48.9	B
12	S_1	C_3	1303	M	street_7	188	82	49.7	B
16	S_2	C_1	2102	F	street_6	161	61	50.6	B+
17	S_2	C_1	2103	M	street_4	157	61	52.5	B-
7	S_1	C_2	1203	M	street_6	160	53	58.8	A+
32	S_2	C_4	2403	F	street_6	158	60	59.7	B+
14	S_1	C_3	1305	F	street_5	187	69	61.7	B-
6	S_1	C_2	1202	F	street_4	176	94	63.5	B-
27	S_2	C_3	2303	F	street_7	190	99	65.9	C

问题与练习

问题

Series和DataFrame有哪些常见属性和方法？

for Series have values,name,type and index
for DF index and values

value_counts会统计缺失值吗？

与idxmax和nlargest功能相反的是哪两组函数？

idxmin and nsmallest

df.mean(axis=1)是什么意思？它与df.mean()的结果一样吗？第一问提到的函数也有axis参数吗？怎么使用？

针对DF进行行均值计算不一样，前者是行，后者是列

练习

现有一份关于美剧《权力的游戏》剧本的数据集，请解决以下问题：

在所有的数据中，一共出现了多少人物？
以单元格计数（即简单把一个单元格视作一句），谁说了最多的话？
以单词计数，谁说了最多的单词？

data1=pd.read_csv('E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\\Game_of_Thrones_Script.csv')
data1.head(20)

	Release Date	Season	Episode	Episode Title	Name	Sentence
0	2011/4/17	Season 1	Episode 1	Winter is Coming	waymar royce	What do you expect? They're savages. One lot s...
1	2011/4/17	Season 1	Episode 1	Winter is Coming	will	I've never seen wildlings do a thing like this...
2	2011/4/17	Season 1	Episode 1	Winter is Coming	waymar royce	How close did you get?
3	2011/4/17	Season 1	Episode 1	Winter is Coming	will	Close as any man would.
4	2011/4/17	Season 1	Episode 1	Winter is Coming	gared	We should head back to the wall.
5	2011/4/17	Season 1	Episode 1	Winter is Coming	royce	Do the dead frighten you?
6	2011/4/17	Season 1	Episode 1	Winter is Coming	gared	Our orders were to track the wildlings. We tra...
7	2011/4/17	Season 1	Episode 1	Winter is Coming	royce	You don't think he'll ask us how they died? Ge...
8	2011/4/17	Season 1	Episode 1	Winter is Coming	will	Whatever did it to them could do it to us. The...
9	2011/4/17	Season 1	Episode 1	Winter is Coming	royce	It's a good thing we're not children. You want...
10	2011/4/17	Season 1	Episode 1	Winter is Coming	royce	Your dead men seem to have moved camp.
11	2011/4/17	Season 1	Episode 1	Winter is Coming	will	They were here.
12	2011/4/17	Season 1	Episode 1	Winter is Coming	gared	See where they went.
13	2011/4/17	Season 1	Episode 1	Winter is Coming	royce	What is it?
14	2011/4/17	Season 1	Episode 1	Winter is Coming	gared	It's …
15	2011/4/17	Season 1	Episode 1	Winter is Coming	jon snow	Go on. Father's watching.
16	2011/4/17	Season 1	Episode 1	Winter is Coming	jon snow	And your mother.
17	2011/4/17	Season 1	Episode 1	Winter is Coming	septa mordane	Fine work, as always. Well done.
18	2011/4/17	Season 1	Episode 1	Winter is Coming	sansa stark	Thank you.
19	2011/4/17	Season 1	Episode 1	Winter is Coming	septa mordane	I love the detail that you've managed to get i...

data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23911 entries, 0 to 23910
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Release Date   23911 non-null  object
 1   Season         23911 non-null  object
 2   Episode        23911 non-null  object
 3   Episode Title  23911 non-null  object
 4   Name           23911 non-null  object
 5   Sentence       23911 non-null  object
dtypes: object(6)
memory usage: 1.1+ MB

data1['Name'].nunique()
#一共有564个人物

#针对单元格，使用describe，结果是tyrion lannister，次数是1760
data1['Name'].describe()

count                23911
unique                 564
top       tyrion lannister
freq                  1760
Name: Name, dtype: object

第三个题目不会

现有一份关于科比的投篮数据集，请解决如下问题：

哪种action_type和combined_shot_type的组合是最多的？
在所有被记录的game_id中，遭遇到最多的opponent是一个支？

data2=pd.read_csv('E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\\Kobe_data.csv')
data2

	shot_id	action_type	combined_shot_type	game_event_id	game_id	lat	loc_x	loc_y	lon	minutes_remaining	...	shot_made_flag	shot_type	shot_zone_area	shot_zone_basic	shot_zone_range	team_id	team_name	game_date	matchup	opponent
0	1	Jump Shot	Jump Shot	10	20000012	33.9723	167	72	-118.1028	10	...	NaN	2PT Field Goal	Right Side(R)	Mid-Range	16-24 ft.	1610612747	Los Angeles Lakers	2000/10/31	LAL @ POR	POR
1	2	Jump Shot	Jump Shot	12	20000012	34.0443	-157	0	-118.4268	10	...	0.0	2PT Field Goal	Left Side(L)	Mid-Range	8-16 ft.	1610612747	Los Angeles Lakers	2000/10/31	LAL @ POR	POR
2	3	Jump Shot	Jump Shot	35	20000012	33.9093	-101	135	-118.3708	7	...	1.0	2PT Field Goal	Left Side Center(LC)	Mid-Range	16-24 ft.	1610612747	Los Angeles Lakers	2000/10/31	LAL @ POR	POR
3	4	Jump Shot	Jump Shot	43	20000012	33.8693	138	175	-118.1318	6	...	0.0	2PT Field Goal	Right Side Center(RC)	Mid-Range	16-24 ft.	1610612747	Los Angeles Lakers	2000/10/31	LAL @ POR	POR
4	5	Driving Dunk Shot	Dunk	155	20000012	34.0443	0	0	-118.2698	6	...	1.0	2PT Field Goal	Center(C)	Restricted Area	Less Than 8 ft.	1610612747	Los Angeles Lakers	2000/10/31	LAL @ POR	POR
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
30692	30693	Jump Shot	Jump Shot	397	49900088	33.9963	1	48	-118.2688	6	...	0.0	2PT Field Goal	Center(C)	In The Paint (Non-RA)	Less Than 8 ft.	1610612747	Los Angeles Lakers	2000/6/19	LAL vs. IND	IND
30693	30694	Tip Shot	Tip Shot	398	49900088	34.0443	0	0	-118.2698	6	...	NaN	2PT Field Goal	Center(C)	Restricted Area	Less Than 8 ft.	1610612747	Los Angeles Lakers	2000/6/19	LAL vs. IND	IND
30694	30695	Running Jump Shot	Jump Shot	426	49900088	33.8783	-134	166	-118.4038	3	...	1.0	2PT Field Goal	Left Side Center(LC)	Mid-Range	16-24 ft.	1610612747	Los Angeles Lakers	2000/6/19	LAL vs. IND	IND
30695	30696	Jump Shot	Jump Shot	448	49900088	33.7773	31	267	-118.2388	2	...	0.0	3PT Field Goal	Center(C)	Above the Break 3	24+ ft.	1610612747	Los Angeles Lakers	2000/6/19	LAL vs. IND	IND
30696	30697	Jump Shot	Jump Shot	471	49900088	33.9723	1	72	-118.2688	0	...	0.0	2PT Field Goal	Center(C)	In The Paint (Non-RA)	Less Than 8 ft.	1610612747	Los Angeles Lakers	2000/6/19	LAL vs. IND	IND

30697 rows × 25 columns

data2=data2.set_index('shot_id')

---------------------------------------------------------------------------

KeyError                                  Traceback (most recent call last)

<ipython-input-135-7f5596703652> in <module>
----> 1 data2=data2.set_index('shot_id')
      2 data2.head(20)


C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in set_index(self, keys, drop, append, inplace, verify_integrity)
   4301 
   4302         if missing:
-> 4303             raise KeyError(f"None of {missing} are in the columns")
   4304 
   4305         if inplace:


KeyError: "None of ['shot_id'] are in the columns"

data2.head(20)

	action_type	combined_shot_type	game_event_id	game_id	lat	loc_x	loc_y	lon	minutes_remaining	period	...	shot_made_flag	shot_type	shot_zone_area	shot_zone_basic	shot_zone_range	team_id	team_name	game_date	matchup	opponent
shot_id
1	Jump Shot	Jump Shot	10	20000012	33.9723	167	72	-118.1028	10	1	...	NaN	2PT Field Goal	Right Side(R)	Mid-Range	16-24 ft.	1610612747	Los Angeles Lakers	2000/10/31	LAL @ POR	POR
2	Jump Shot	Jump Shot	12	20000012	34.0443	-157	0	-118.4268	10	1	...	0.0	2PT Field Goal	Left Side(L)	Mid-Range	8-16 ft.	1610612747	Los Angeles Lakers	2000/10/31	LAL @ POR	POR
3	Jump Shot	Jump Shot	35	20000012	33.9093	-101	135	-118.3708	7	1	...	1.0	2PT Field Goal	Left Side Center(LC)	Mid-Range	16-24 ft.	1610612747	Los Angeles Lakers	2000/10/31	LAL @ POR	POR
4	Jump Shot	Jump Shot	43	20000012	33.8693	138	175	-118.1318	6	1	...	0.0	2PT Field Goal	Right Side Center(RC)	Mid-Range	16-24 ft.	1610612747	Los Angeles Lakers	2000/10/31	LAL @ POR	POR
5	Driving Dunk Shot	Dunk	155	20000012	34.0443	0	0	-118.2698	6	2	...	1.0	2PT Field Goal	Center(C)	Restricted Area	Less Than 8 ft.	1610612747	Los Angeles Lakers	2000/10/31	LAL @ POR	POR
6	Jump Shot	Jump Shot	244	20000012	34.0553	-145	-11	-118.4148	9	3	...	0.0	2PT Field Goal	Left Side(L)	Mid-Range	8-16 ft.	1610612747	Los Angeles Lakers	2000/10/31	LAL @ POR	POR
7	Layup Shot	Layup	251	20000012	34.0443	0	0	-118.2698	8	3	...	1.0	2PT Field Goal	Center(C)	Restricted Area	Less Than 8 ft.	1610612747	Los Angeles Lakers	2000/10/31	LAL @ POR	POR
8	Jump Shot	Jump Shot	254	20000012	34.0163	1	28	-118.2688	8	3	...	NaN	2PT Field Goal	Center(C)	Restricted Area	Less Than 8 ft.	1610612747	Los Angeles Lakers	2000/10/31	LAL @ POR	POR
9	Jump Shot	Jump Shot	265	20000012	33.9363	-65	108	-118.3348	6	3	...	1.0	2PT Field Goal	Left Side(L)	In The Paint (Non-RA)	8-16 ft.	1610612747	Los Angeles Lakers	2000/10/31	LAL @ POR	POR
10	Running Jump Shot	Jump Shot	294	20000012	33.9193	-33	125	-118.3028	3	3	...	0.0	2PT Field Goal	Center(C)	In The Paint (Non-RA)	8-16 ft.	1610612747	Los Angeles Lakers	2000/10/31	LAL @ POR	POR
11	Jump Shot	Jump Shot	309	20000012	33.8063	-94	238	-118.3638	1	3	...	0.0	3PT Field Goal	Left Side Center(LC)	Above the Break 3	24+ ft.	1610612747	Los Angeles Lakers	2000/10/31	LAL @ POR	POR
12	Jump Shot	Jump Shot	4	20000019	33.9173	121	127	-118.1488	11	1	...	1.0	2PT Field Goal	Right Side Center(RC)	Mid-Range	16-24 ft.	1610612747	Los Angeles Lakers	2000/11/1	LAL vs. UTA	UTA
13	Running Jump Shot	Jump Shot	27	20000019	33.9343	-67	110	-118.3368	7	1	...	1.0	2PT Field Goal	Left Side(L)	In The Paint (Non-RA)	8-16 ft.	1610612747	Los Angeles Lakers	2000/11/1	LAL vs. UTA	UTA
14	Jump Shot	Jump Shot	66	20000019	34.0403	-94	4	-118.3638	2	1	...	0.0	2PT Field Goal	Left Side(L)	Mid-Range	8-16 ft.	1610612747	Los Angeles Lakers	2000/11/1	LAL vs. UTA	UTA
15	Jump Shot	Jump Shot	80	20000019	33.9973	-23	47	-118.2928	1	1	...	0.0	2PT Field Goal	Center(C)	In The Paint (Non-RA)	Less Than 8 ft.	1610612747	Los Angeles Lakers	2000/11/1	LAL vs. UTA	UTA
16	Jump Shot	Jump Shot	86	20000019	33.8523	62	192	-118.2078	0	1	...	0.0	2PT Field Goal	Center(C)	Mid-Range	16-24 ft.	1610612747	Los Angeles Lakers	2000/11/1	LAL vs. UTA	UTA
17	Driving Layup Shot	Layup	100	20000019	34.0443	0	0	-118.2698	0	1	...	NaN	2PT Field Goal	Center(C)	Restricted Area	Less Than 8 ft.	1610612747	Los Angeles Lakers	2000/11/1	LAL vs. UTA	UTA
18	Jump Shot	Jump Shot	138	20000019	33.8183	-117	226	-118.3868	8	2	...	1.0	3PT Field Goal	Left Side Center(LC)	Above the Break 3	24+ ft.	1610612747	Los Angeles Lakers	2000/11/1	LAL vs. UTA	UTA
19	Jump Shot	Jump Shot	244	20000019	33.9473	-132	97	-118.4018	11	3	...	0.0	2PT Field Goal	Left Side Center(LC)	Mid-Range	16-24 ft.	1610612747	Los Angeles Lakers	2000/11/1	LAL vs. UTA	UTA
20	Driving Layup Shot	Layup	249	20000019	34.0443	0	0	-118.2698	10	3	...	NaN	2PT Field Goal	Center(C)	Restricted Area	Less Than 8 ft.	1610612747	Los Angeles Lakers	2000/11/1	LAL vs. UTA	UTA

20 rows × 24 columns

data2.groupby(by=('action_type','combined_shot_type'))

---------------------------------------------------------------------------

KeyError                                  Traceback (most recent call last)

<ipython-input-138-e68698f5f785> in <module>
----> 1 data2.groupby(by=('action_type','combined_shot_type'))


C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in groupby(self, by, axis, level, as_index, sort, group_keys, squeeze, observed)
   5808             group_keys=group_keys,
   5809             squeeze=squeeze,
-> 5810             observed=observed,
   5811         )
   5812 


C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in __init__(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, squeeze, observed, mutated)
    407                 sort=sort,
    408                 observed=observed,
--> 409                 mutated=self.mutated,
    410             )
    411 


C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\groupby\grouper.py in get_grouper(obj, key, axis, level, sort, observed, mutated, validate)
    596                 in_axis, name, level, gpr = False, None, gpr, None
    597             else:
--> 598                 raise KeyError(gpr)
    599         elif isinstance(gpr, Grouper) and gpr.key is not None:
    600             # Add key to exclusions


KeyError: ('action_type', 'combined_shot_type')

data2_join=data2.assign(join=data2['combined_shot_type']+data2['action_type'])
data2_join.head()

	action_type	combined_shot_type	game_event_id	game_id	lat	loc_x	loc_y	lon	minutes_remaining	period	...	shot_type	shot_zone_area	shot_zone_basic	shot_zone_range	team_id	team_name	game_date	matchup	opponent	join
shot_id
1	Jump Shot	Jump Shot	10	20000012	33.9723	167	72	-118.1028	10	1	...	2PT Field Goal	Right Side(R)	Mid-Range	16-24 ft.	1610612747	Los Angeles Lakers	2000/10/31	LAL @ POR	POR	Jump ShotJump Shot
2	Jump Shot	Jump Shot	12	20000012	34.0443	-157	0	-118.4268	10	1	...	2PT Field Goal	Left Side(L)	Mid-Range	8-16 ft.	1610612747	Los Angeles Lakers	2000/10/31	LAL @ POR	POR	Jump ShotJump Shot
3	Jump Shot	Jump Shot	35	20000012	33.9093	-101	135	-118.3708	7	1	...	2PT Field Goal	Left Side Center(LC)	Mid-Range	16-24 ft.	1610612747	Los Angeles Lakers	2000/10/31	LAL @ POR	POR	Jump ShotJump Shot
4	Jump Shot	Jump Shot	43	20000012	33.8693	138	175	-118.1318	6	1	...	2PT Field Goal	Right Side Center(RC)	Mid-Range	16-24 ft.	1610612747	Los Angeles Lakers	2000/10/31	LAL @ POR	POR	Jump ShotJump Shot
5	Driving Dunk Shot	Dunk	155	20000012	34.0443	0	0	-118.2698	6	2	...	2PT Field Goal	Center(C)	Restricted Area	Less Than 8 ft.	1610612747	Los Angeles Lakers	2000/10/31	LAL @ POR	POR	DunkDriving Dunk Shot

5 rows × 25 columns

data2_join1=data2['combined_shot_type']+'+'+data2['action_type']

data2_join1.head(100)

shot_id
1         Jump Shot+Jump Shot
2         Jump Shot+Jump Shot
3         Jump Shot+Jump Shot
4         Jump Shot+Jump Shot
5      Dunk+Driving Dunk Shot
                ...          
96        Jump Shot+Jump Shot
97        Jump Shot+Jump Shot
98        Jump Shot+Jump Shot
99        Jump Shot+Jump Shot
100       Jump Shot+Jump Shot
Length: 100, dtype: object

data2_join1.nunique()

data2_join1.value_counts()
#最多的是Jump Shot+Jump Shot

Jump Shot+Jump Shot                             18880
Layup+Layup Shot                                 2567
Layup+Driving Layup Shot                         1978
Jump Shot+Turnaround Jump Shot                   1057
Jump Shot+Fadeaway Jump Shot                     1048
Jump Shot+Running Jump Shot                       926
Jump Shot+Pullup Jump shot                        476
Jump Shot+Turnaround Fadeaway shot                439
Dunk+Slam Dunk Shot                               411
Layup+Reverse Layup Shot                          395
Jump Shot+Jump Bank Shot                          333
Dunk+Driving Dunk Shot                            310
Dunk+Dunk Shot                                    262
Tip Shot+Tip Shot                                 182
Dunk+Alley Oop Dunk Shot                          122
Jump Shot+Step Back Jump shot                     118
Jump Shot+Floating Jump shot                      114
Layup+Driving Reverse Layup Shot                   97
Hook Shot+Hook Shot                                84
Layup+Driving Finger Roll Shot                     82
Layup+Alley Oop Layup shot                         80
Dunk+Reverse Dunk Shot                             75
Layup+Running Layup Shot                           72
Bank Shot+Turnaround Bank shot                     71
Layup+Driving Finger Roll Layup Shot               69
Bank Shot+Running Bank shot                        48
Dunk+Driving Slam Dunk Shot                        48
Hook Shot+Running Hook Shot                        41
Layup+Finger Roll Layup Shot                       33
Jump Shot+Fadeaway Bank shot                       31
Layup+Finger Roll Shot                             28
Jump Shot+Driving Jump shot                        28
Jump Shot+Jump Hook Shot                           24
Dunk+Running Dunk Shot                             19
Dunk+Reverse Slam Dunk Shot                        16
Layup+Putback Layup Shot                           15
Dunk+Follow Up Dunk Shot                           15
Hook Shot+Turnaround Hook Shot                     14
Hook Shot+Driving Hook Shot                        14
Bank Shot+Pullup Bank shot                         12
Layup+Running Reverse Layup Shot                   11
Layup+Running Finger Roll Layup Shot                6
Layup+Cutting Layup Shot                            6
Bank Shot+Driving Bank shot                         5
Bank Shot+Hook Bank Shot                            5
Dunk+Putback Dunk Shot                              5
Jump Shot+Driving Floating Jump Shot                5
Layup+Running Finger Roll Shot                      4
Jump Shot+Running Pull-Up Jump Shot                 4
Layup+Turnaround Finger Roll Shot                   2
Tip Shot+Running Tip Shot                           2
Layup+Tip Layup Shot                                2
Dunk+Putback Slam Dunk Shot                         2
Dunk+Running Slam Dunk Shot                         1
Jump Shot+Turnaround Fadeaway Bank Jump Shot        1
Jump Shot+Driving Floating Bank Jump Shot           1
Layup+Cutting Finger Roll Layup Shot                1
dtype: int64

data2['opponent'].value_counts()
#遭遇最多的是SAS

SAS    1978
PHX    1781
HOU    1666
SAC    1643
DEN    1642
POR    1539
UTA    1490
MIN    1474
GSW    1356
LAC    1285
DAL    1199
MEM    1030
BOS     946
SEA     828
IND     761
PHI     720
ORL     719
DET     715
OKC     677
TOR     664
NYK     657
MIA     627
CHA     620
CLE     619
CHI     610
WAS     600
MIL     586
NOH     581
NJN     520
ATL     519
NOP     344
VAN     246
BKN      55
Name: opponent, dtype: int64