import pandas as pd
import numpy as np
pd. __version__
'1.0.3'
文件的读写#
csv格式
data= pd. read_csv( 'E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\\table.csv' )
data. head( 20 )
School Class ID Gender Address Height Weight Math Physics 0 S_1 C_1 1101 M street_1 173 63 34.0 A+ 1 S_1 C_1 1102 F street_2 192 73 32.5 B+ 2 S_1 C_1 1103 M street_2 186 82 87.2 B+ 3 S_1 C_1 1104 F street_2 167 81 80.4 B- 4 S_1 C_1 1105 F street_4 159 64 84.8 B+ 5 S_1 C_2 1201 M street_5 188 68 97.0 A- 6 S_1 C_2 1202 F street_4 176 94 63.5 B- 7 S_1 C_2 1203 M street_6 160 53 58.8 A+ 8 S_1 C_2 1204 F street_5 162 63 33.8 B 9 S_1 C_2 1205 F street_6 167 63 68.4 B- 10 S_1 C_3 1301 M street_4 161 68 31.5 B+ 11 S_1 C_3 1302 F street_1 175 57 87.7 A- 12 S_1 C_3 1303 M street_7 188 82 49.7 B 13 S_1 C_3 1304 M street_2 195 70 85.2 A 14 S_1 C_3 1305 F street_5 187 69 61.7 B- 15 S_2 C_1 2101 M street_7 174 84 83.3 C 16 S_2 C_1 2102 F street_6 161 61 50.6 B+ 17 S_2 C_1 2103 M street_4 157 61 52.5 B- 18 S_2 C_1 2104 F street_5 159 97 72.2 B+ 19 S_2 C_1 2105 M street_4 170 81 34.2 A
txt格式
data_txt= pd. read_table( 'E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\\table.txt' )
data_txt. head( 20 )
col1 col2 col3 col4 0 2 a 1.4 apple 1 3 b 3.4 banana 2 6 c 2.5 orange 3 5 d 3.2 lemon
xls或者xlsx格式
data_excel= pd. read_excel( 'E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\\table.xlsx' )
data_excel. head( 20 )
School Class ID Gender Address Height Weight Math Physics 0 S_1 C_1 1101 M street_1 173 63 34.0 A+ 1 S_1 C_1 1102 F street_2 192 73 32.5 B+ 2 S_1 C_1 1103 M street_2 186 82 87.2 B+ 3 S_1 C_1 1104 F street_2 167 81 80.4 B- 4 S_1 C_1 1105 F street_4 159 64 84.8 B+ 5 S_1 C_2 1201 M street_5 188 68 97.0 A- 6 S_1 C_2 1202 F street_4 176 94 63.5 B- 7 S_1 C_2 1203 M street_6 160 53 58.8 A+ 8 S_1 C_2 1204 F street_5 162 63 33.8 B 9 S_1 C_2 1205 F street_6 167 63 68.4 B- 10 S_1 C_3 1301 M street_4 161 68 31.5 B+ 11 S_1 C_3 1302 F street_1 175 57 87.7 A- 12 S_1 C_3 1303 M street_7 188 82 49.7 B 13 S_1 C_3 1304 M street_2 195 70 85.2 A 14 S_1 C_3 1305 F street_5 187 69 61.7 B- 15 S_2 C_1 2101 M street_7 174 84 83.3 C 16 S_2 C_1 2102 F street_6 161 61 50.6 B+ 17 S_2 C_1 2103 M street_4 157 61 52.5 B- 18 S_2 C_1 2104 F street_5 159 97 72.2 B+ 19 S_2 C_1 2105 M street_4 170 81 34.2 A
写入
csv格式
data. to_csv( 'E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\\table.csv' )
其他的不过多展示了
数据的基本结构
Series
创建一个Series
对于一个Series,其中最常用的属性为值(values),索引(index),名字(name),类型(type)
s= pd. Series( np. random. randn( 5 ) , index= [ 'a' , 'b' , 'c' , 'd' , 'e' ] , name= 'this is Series' , dtype= 'float64' )
s
a -1.576202
b 1.070319
c 0.668490
d 1.760701
e 0.942655
Name: this is Series, dtype: float64
访问Series属性
s. values
array([-1.57620183, 1.07031892, 0.66849049, 1.76070095, 0.94265474])
都会就不过多敲代码了
取出元素
s[ 'a' ]
-1.5762018254215895
调用方法
print ( [ attr for attr in dir ( s) if not attr. startswith( '_' ) ] )
['T', 'a', 'abs', 'add', 'add_prefix', 'add_suffix', 'agg', 'aggregate', 'align', 'all', 'any', 'append', 'apply', 'argmax', 'argmin', 'argsort', 'array', 'asfreq', 'asof', 'astype', 'at', 'at_time', 'attrs', 'autocorr', 'axes', 'b', 'between', 'between_time', 'bfill', 'bool', 'c', 'clip', 'combine', 'combine_first', 'convert_dtypes', 'copy', 'corr', 'count', 'cov', 'cummax', 'cummin', 'cumprod', 'cumsum', 'd', 'describe', 'diff', 'div', 'divide', 'divmod', 'dot', 'drop', 'drop_duplicates', 'droplevel', 'dropna', 'dtype', 'dtypes', 'duplicated', 'e', 'empty', 'eq', 'equals', 'ewm', 'expanding', 'explode', 'factorize', 'ffill', 'fillna', 'filter', 'first', 'first_valid_index', 'floordiv', 'ge', 'get', 'groupby', 'gt', 'hasnans', 'head', 'hist', 'iat', 'idxmax', 'idxmin', 'iloc', 'index', 'infer_objects', 'interpolate', 'is_monotonic', 'is_monotonic_decreasing', 'is_monotonic_increasing', 'is_unique', 'isin', 'isna', 'isnull', 'item', 'items', 'iteritems', 'keys', 'kurt', 'kurtosis', 'last', 'last_valid_index', 'le', 'loc', 'lt', 'mad', 'map', 'mask', 'max', 'mean', 'median', 'memory_usage', 'min', 'mod', 'mode', 'mul', 'multiply', 'name', 'nbytes', 'ndim', 'ne', 'nlargest', 'notna', 'notnull', 'nsmallest', 'nunique', 'pct_change', 'pipe', 'plot', 'pop', 'pow', 'prod', 'product', 'quantile', 'radd', 'rank', 'ravel', 'rdiv', 'rdivmod', 'reindex', 'reindex_like', 'rename', 'rename_axis', 'reorder_levels', 'repeat', 'replace', 'resample', 'reset_index', 'rfloordiv', 'rmod', 'rmul', 'rolling', 'round', 'rpow', 'rsub', 'rtruediv', 'sample', 'searchsorted', 'sem', 'set_axis', 'shape', 'shift', 'size', 'skew', 'slice_shift', 'sort_index', 'sort_values', 'squeeze', 'std', 'sub', 'subtract', 'sum', 'swapaxes', 'swaplevel', 'tail', 'take', 'to_clipboard', 'to_csv', 'to_dict', 'to_excel', 'to_frame', 'to_hdf', 'to_json', 'to_latex', 'to_list', 'to_markdown', 'to_numpy', 'to_period', 'to_pickle', 'to_sql', 'to_string', 'to_timestamp', 'to_xarray', 'transform', 'transpose', 'truediv', 'truncate', 'tshift', 'tz_convert', 'tz_localize', 'unique', 'unstack', 'update', 'value_counts', 'values', 'var', 'view', 'where', 'xs']
DataFrame
创建一个DF
df= pd. DataFrame( { 'col1' : list ( 'abcde' ) , 'col2' : range ( 5 , 10 ) , 'col3' : [ 1.3 , 2.5 , 3.6 , 4.6 , 5.8 ] } , index= list ( '12345' ) )
df
col1 col2 col3 1 a 5 1.3 2 b 6 2.5 3 c 7 3.6 4 d 8 4.6 5 e 9 5.8
从DF中取出一列为Series
df[ 'col1' ]
1 a
2 b
3 c
4 d
5 e
Name: col1, dtype: object
type ( df)
pandas.core.frame.DataFrame
type ( df[ 'col1' ] )
pandas.core.series.Series
索引对齐
df1= pd. DataFrame( { "A" : [ 1 , 2 , 3 ] } , index= [ 1 , 2 , 3 ] )
df2= pd. DataFrame( { 'A' : [ 1 , 2 , 3 ] } , index= [ 3 , 1 , 2 ] )
df1- df2
列的删除与添加
df. drop( index= '5' , columns= 'col1' )
col2 col3 1 5 1.3 2 6 2.5 3 7 3.6 4 8 4.6
df[ 'col1' ] = [ 1 , 2 , 3 , 4 , 5 ]
df. pop( 'col1' )
1 1
2 2
3 3
4 4
5 5
Name: col1, dtype: int64
df
col2 col3 1 5 1.3 2 6 2.5 3 7 3.6 4 8 4.6 5 9 5.8
df1[ 'B' ] = list ( 'abc' )
df1
df1. assign( C= pd. Series( list ( 'def' ) ) )
根据不同类型选择列
df. select_dtypes( include= [ 'number' ] ) . head( )
col2 col3 1 5 1.3 2 6 2.5 3 7 3.6 4 8 4.6 5 9 5.8
将series转换成DF
s= df. mean( )
s. name= 'to_DF'
s
col2 7.00
col3 3.56
Name: to_DF, dtype: float64
s. to_frame( )
s. to_frame( ) . T
常用的基本函数
df= pd. read_csv( 'E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\\table.csv' )
df= df. drop( columns= 'Unnamed: 0' )
df
School Class ID Gender Address Height Weight Math Physics 0 S_1 C_1 1101 M street_1 173 63 34.0 A+ 1 S_1 C_1 1102 F street_2 192 73 32.5 B+ 2 S_1 C_1 1103 M street_2 186 82 87.2 B+ 3 S_1 C_1 1104 F street_2 167 81 80.4 B- 4 S_1 C_1 1105 F street_4 159 64 84.8 B+ 5 S_1 C_2 1201 M street_5 188 68 97.0 A- 6 S_1 C_2 1202 F street_4 176 94 63.5 B- 7 S_1 C_2 1203 M street_6 160 53 58.8 A+ 8 S_1 C_2 1204 F street_5 162 63 33.8 B 9 S_1 C_2 1205 F street_6 167 63 68.4 B- 10 S_1 C_3 1301 M street_4 161 68 31.5 B+ 11 S_1 C_3 1302 F street_1 175 57 87.7 A- 12 S_1 C_3 1303 M street_7 188 82 49.7 B 13 S_1 C_3 1304 M street_2 195 70 85.2 A 14 S_1 C_3 1305 F street_5 187 69 61.7 B- 15 S_2 C_1 2101 M street_7 174 84 83.3 C 16 S_2 C_1 2102 F street_6 161 61 50.6 B+ 17 S_2 C_1 2103 M street_4 157 61 52.5 B- 18 S_2 C_1 2104 F street_5 159 97 72.2 B+ 19 S_2 C_1 2105 M street_4 170 81 34.2 A 20 S_2 C_2 2201 M street_5 193 100 39.1 B 21 S_2 C_2 2202 F street_7 194 77 68.5 B+ 22 S_2 C_2 2203 M street_4 155 91 73.8 A+ 23 S_2 C_2 2204 M street_1 175 74 47.2 B- 24 S_2 C_2 2205 F street_7 183 76 85.4 B 25 S_2 C_3 2301 F street_4 157 78 72.3 B+ 26 S_2 C_3 2302 M street_5 171 88 32.7 A 27 S_2 C_3 2303 F street_7 190 99 65.9 C 28 S_2 C_3 2304 F street_6 164 81 95.5 A- 29 S_2 C_3 2305 M street_4 187 73 48.9 B 30 S_2 C_4 2401 F street_2 192 62 45.3 A 31 S_2 C_4 2402 M street_7 166 82 48.7 B 32 S_2 C_4 2403 F street_6 158 60 59.7 B+ 33 S_2 C_4 2404 F street_2 160 84 67.7 B 34 S_2 C_4 2405 F street_6 193 54 47.6 B
unique和nunique
nunique显示有多少个唯一值 —数量
df[ 'Physics' ] . nunique( )
7
unique显示所有的唯一值 —具体值
df[ 'Physics' ] . unique( )
array(['A+', 'B+', 'B-', 'A-', 'B', 'A', 'C'], dtype=object)
count and value_counts
df[ 'Physics' ] . count( )
35
df[ 'Physics' ] . value_counts( )
B+ 9
B 8
B- 6
A 4
A+ 3
A- 3
C 2
Name: Physics, dtype: int64
describe and info
df. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 School 35 non-null object
1 Class 35 non-null object
2 ID 35 non-null int64
3 Gender 35 non-null object
4 Address 35 non-null object
5 Height 35 non-null int64
6 Weight 35 non-null int64
7 Math 35 non-null float64
8 Physics 35 non-null object
dtypes: float64(1), int64(3), object(5)
memory usage: 2.6+ KB
df. describe( )
ID Height Weight Math count 35.00000 35.000000 35.000000 35.000000 mean 1803.00000 174.142857 74.657143 61.351429 std 536.87741 13.541098 12.895377 19.915164 min 1101.00000 155.000000 53.000000 31.500000 25% 1204.50000 161.000000 63.000000 47.400000 50% 2103.00000 173.000000 74.000000 61.700000 75% 2301.50000 187.500000 82.000000 77.100000 max 2405.00000 195.000000 100.000000 97.000000
df. describe( percentiles= [ .05 , .25 , .75 , .95 ] )
ID Height Weight Math count 35.00000 35.000000 35.000000 35.000000 mean 1803.00000 174.142857 74.657143 61.351429 std 536.87741 13.541098 12.895377 19.915164 min 1101.00000 155.000000 53.000000 31.500000 5% 1102.70000 157.000000 56.100000 32.640000 25% 1204.50000 161.000000 63.000000 47.400000 50% 2103.00000 173.000000 74.000000 61.700000 75% 2301.50000 187.500000 82.000000 77.100000 95% 2403.30000 193.300000 97.600000 90.040000 max 2405.00000 195.000000 100.000000 97.000000
df[ 'Physics' ] . describe( )
count 35
unique 7
top B+
freq 9
Name: Physics, dtype: object
idxmax and nlargest
df[ 'Math' ] . idxmax( )
df[ 'Math' ] . idxmin( )
10
df[ 'Math' ] . nlargest( )
5 97.0
28 95.5
11 87.7
2 87.2
24 85.4
Name: Math, dtype: float64
clip and repalce
df[ 'Math' ] . clip( 33 , 80 ) . head( 30 )
0 34.0
1 33.0
2 80.0
3 80.0
4 80.0
5 80.0
6 63.5
7 58.8
8 33.8
9 68.4
10 33.0
11 80.0
12 49.7
13 80.0
14 61.7
15 80.0
16 50.6
17 52.5
18 72.2
19 34.2
20 39.1
21 68.5
22 73.8
23 47.2
24 80.0
25 72.3
26 33.0
27 65.9
28 80.0
29 48.9
Name: Math, dtype: float64
df[ 'Address' ] . head( )
0 street_1
1 street_2
2 street_2
3 street_2
4 street_4
Name: Address, dtype: object
df[ 'Address' ] . repalce( [ 'street_1' , 'street_2' ] , [ 'one' , 'two' ] ) . head( )
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-97-4b99be6262e7> in <module>
----> 1 df['Address'].repalce(['street_1','street_2'],['one','two']).head()
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
5272 if self._info_axis._can_hold_identifiers_and_holds_name(name):
5273 return self[name]
-> 5274 return object.__getattribute__(self, name)
5275
5276 def __setattr__(self, name: str, value) -> None:
AttributeError: 'Series' object has no attribute 'repalce'
apply函数
df[ 'Math' ] . apply ( lambda x: str ( x) + '!' ) . head( )
0 34.0!
1 32.5!
2 87.2!
3 80.4!
4 84.8!
Name: Math, dtype: object
df. apply ( lambda x: x. apply ( lambda x: str ( x) + '!' ) ) . head( )
School Class ID Gender Address Height Weight Math Physics 0 S_1! C_1! 1101! M! street_1! 173! 63! 34.0! A+! 1 S_1! C_1! 1102! F! street_2! 192! 73! 32.5! B+! 2 S_1! C_1! 1103! M! street_2! 186! 82! 87.2! B+! 3 S_1! C_1! 1104! F! street_2! 167! 81! 80.4! B-! 4 S_1! C_1! 1105! F! street_4! 159! 64! 84.8! B+!
df[ 'Math' ] . apply ( lambda x: str ( x) + "!" )
0 34.0!
1 32.5!
2 87.2!
3 80.4!
4 84.8!
5 97.0!
6 63.5!
7 58.8!
8 33.8!
9 68.4!
10 31.5!
11 87.7!
12 49.7!
13 85.2!
14 61.7!
15 83.3!
16 50.6!
17 52.5!
18 72.2!
19 34.2!
20 39.1!
21 68.5!
22 73.8!
23 47.2!
24 85.4!
25 72.3!
26 32.7!
27 65.9!
28 95.5!
29 48.9!
30 45.3!
31 48.7!
32 59.7!
33 67.7!
34 47.6!
Name: Math, dtype: object
排序
索引排序
df. set_index( 'Math' ) . head( 20 )
School Class ID Gender Address Height Weight Physics Math 34.0 S_1 C_1 1101 M street_1 173 63 A+ 32.5 S_1 C_1 1102 F street_2 192 73 B+ 87.2 S_1 C_1 1103 M street_2 186 82 B+ 80.4 S_1 C_1 1104 F street_2 167 81 B- 84.8 S_1 C_1 1105 F street_4 159 64 B+ 97.0 S_1 C_2 1201 M street_5 188 68 A- 63.5 S_1 C_2 1202 F street_4 176 94 B- 58.8 S_1 C_2 1203 M street_6 160 53 A+ 33.8 S_1 C_2 1204 F street_5 162 63 B 68.4 S_1 C_2 1205 F street_6 167 63 B- 31.5 S_1 C_3 1301 M street_4 161 68 B+ 87.7 S_1 C_3 1302 F street_1 175 57 A- 49.7 S_1 C_3 1303 M street_7 188 82 B 85.2 S_1 C_3 1304 M street_2 195 70 A 61.7 S_1 C_3 1305 F street_5 187 69 B- 83.3 S_2 C_1 2101 M street_7 174 84 C 50.6 S_2 C_1 2102 F street_6 161 61 B+ 52.5 S_2 C_1 2103 M street_4 157 61 B- 72.2 S_2 C_1 2104 F street_5 159 97 B+ 34.2 S_2 C_1 2105 M street_4 170 81 A
df. set_index( 'Math' ) . sort_index( ) . head( )
School Class ID Gender Address Height Weight Physics Math 31.5 S_1 C_3 1301 M street_4 161 68 B+ 32.5 S_1 C_1 1102 F street_2 192 73 B+ 32.7 S_2 C_3 2302 M street_5 171 88 A 33.8 S_1 C_2 1204 F street_5 162 63 B 34.0 S_1 C_1 1101 M street_1 173 63 A+
值排序
df. sort_values( by= 'Class' ) . head( 20 )
School Class ID Gender Address Height Weight Math Physics 0 S_1 C_1 1101 M street_1 173 63 34.0 A+ 19 S_2 C_1 2105 M street_4 170 81 34.2 A 18 S_2 C_1 2104 F street_5 159 97 72.2 B+ 16 S_2 C_1 2102 F street_6 161 61 50.6 B+ 15 S_2 C_1 2101 M street_7 174 84 83.3 C 17 S_2 C_1 2103 M street_4 157 61 52.5 B- 1 S_1 C_1 1102 F street_2 192 73 32.5 B+ 2 S_1 C_1 1103 M street_2 186 82 87.2 B+ 3 S_1 C_1 1104 F street_2 167 81 80.4 B- 4 S_1 C_1 1105 F street_4 159 64 84.8 B+ 6 S_1 C_2 1202 F street_4 176 94 63.5 B- 24 S_2 C_2 2205 F street_7 183 76 85.4 B 23 S_2 C_2 2204 M street_1 175 74 47.2 B- 22 S_2 C_2 2203 M street_4 155 91 73.8 A+ 21 S_2 C_2 2202 F street_7 194 77 68.5 B+ 5 S_1 C_2 1201 M street_5 188 68 97.0 A- 20 S_2 C_2 2201 M street_5 193 100 39.1 B 9 S_1 C_2 1205 F street_6 167 63 68.4 B- 8 S_1 C_2 1204 F street_5 162 63 33.8 B 7 S_1 C_2 1203 M street_6 160 53 58.8 A+
多个值排序,即先对第一层排,在第一层相同的情况下,对第二层排序
df. sort_values( by= [ 'Math' , 'Physics' ] ) . head( 20 )
School Class ID Gender Address Height Weight Math Physics 10 S_1 C_3 1301 M street_4 161 68 31.5 B+ 1 S_1 C_1 1102 F street_2 192 73 32.5 B+ 26 S_2 C_3 2302 M street_5 171 88 32.7 A 8 S_1 C_2 1204 F street_5 162 63 33.8 B 0 S_1 C_1 1101 M street_1 173 63 34.0 A+ 19 S_2 C_1 2105 M street_4 170 81 34.2 A 20 S_2 C_2 2201 M street_5 193 100 39.1 B 30 S_2 C_4 2401 F street_2 192 62 45.3 A 23 S_2 C_2 2204 M street_1 175 74 47.2 B- 34 S_2 C_4 2405 F street_6 193 54 47.6 B 31 S_2 C_4 2402 M street_7 166 82 48.7 B 29 S_2 C_3 2305 M street_4 187 73 48.9 B 12 S_1 C_3 1303 M street_7 188 82 49.7 B 16 S_2 C_1 2102 F street_6 161 61 50.6 B+ 17 S_2 C_1 2103 M street_4 157 61 52.5 B- 7 S_1 C_2 1203 M street_6 160 53 58.8 A+ 32 S_2 C_4 2403 F street_6 158 60 59.7 B+ 14 S_1 C_3 1305 F street_5 187 69 61.7 B- 6 S_1 C_2 1202 F street_4 176 94 63.5 B- 27 S_2 C_3 2303 F street_7 190 99 65.9 C
问题与练习
问题
Series和DataFrame有哪些常见属性和方法?
for Series have values,name,type and index for DF index and values
value_counts会统计缺失值吗?
与idxmax和nlargest功能相反的是哪两组函数?
df.mean(axis=1)是什么意思?它与df.mean()的结果一样吗?第一问提到的函数也有axis参数吗?怎么使用?
针对DF进行行均值计算 不一样,前者是行,后者是列
练习
现有一份关于美剧《权力的游戏》剧本的数据集,请解决以下问题:
在所有的数据中,一共出现了多少人物? 以单元格计数(即简单把一个单元格视作一句),谁说了最多的话? 以单词计数,谁说了最多的单词?
data1= pd. read_csv( 'E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\\Game_of_Thrones_Script.csv' )
data1. head( 20 )
Release Date Season Episode Episode Title Name Sentence 0 2011/4/17 Season 1 Episode 1 Winter is Coming waymar royce What do you expect? They're savages. One lot s... 1 2011/4/17 Season 1 Episode 1 Winter is Coming will I've never seen wildlings do a thing like this... 2 2011/4/17 Season 1 Episode 1 Winter is Coming waymar royce How close did you get? 3 2011/4/17 Season 1 Episode 1 Winter is Coming will Close as any man would. 4 2011/4/17 Season 1 Episode 1 Winter is Coming gared We should head back to the wall. 5 2011/4/17 Season 1 Episode 1 Winter is Coming royce Do the dead frighten you? 6 2011/4/17 Season 1 Episode 1 Winter is Coming gared Our orders were to track the wildlings. We tra... 7 2011/4/17 Season 1 Episode 1 Winter is Coming royce You don't think he'll ask us how they died? Ge... 8 2011/4/17 Season 1 Episode 1 Winter is Coming will Whatever did it to them could do it to us. The... 9 2011/4/17 Season 1 Episode 1 Winter is Coming royce It's a good thing we're not children. You want... 10 2011/4/17 Season 1 Episode 1 Winter is Coming royce Your dead men seem to have moved camp. 11 2011/4/17 Season 1 Episode 1 Winter is Coming will They were here. 12 2011/4/17 Season 1 Episode 1 Winter is Coming gared See where they went. 13 2011/4/17 Season 1 Episode 1 Winter is Coming royce What is it? 14 2011/4/17 Season 1 Episode 1 Winter is Coming gared It's … 15 2011/4/17 Season 1 Episode 1 Winter is Coming jon snow Go on. Father's watching. 16 2011/4/17 Season 1 Episode 1 Winter is Coming jon snow And your mother. 17 2011/4/17 Season 1 Episode 1 Winter is Coming septa mordane Fine work, as always. Well done. 18 2011/4/17 Season 1 Episode 1 Winter is Coming sansa stark Thank you. 19 2011/4/17 Season 1 Episode 1 Winter is Coming septa mordane I love the detail that you've managed to get i...
data1. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23911 entries, 0 to 23910
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Release Date 23911 non-null object
1 Season 23911 non-null object
2 Episode 23911 non-null object
3 Episode Title 23911 non-null object
4 Name 23911 non-null object
5 Sentence 23911 non-null object
dtypes: object(6)
memory usage: 1.1+ MB
data1[ 'Name' ] . nunique( )
564
data1[ 'Name' ] . describe( )
count 23911
unique 564
top tyrion lannister
freq 1760
Name: Name, dtype: object
第三个题目不会
现有一份关于科比的投篮数据集,请解决如下问题:
哪种action_type和combined_shot_type的组合是最多的? 在所有被记录的game_id中,遭遇到最多的opponent是一个支?
data2= pd. read_csv( 'E:\jupyter Notebook\天池比赛\joyful-pandas-master\data\\Kobe_data.csv' )
data2
shot_id action_type combined_shot_type game_event_id game_id lat loc_x loc_y lon minutes_remaining ... shot_made_flag shot_type shot_zone_area shot_zone_basic shot_zone_range team_id team_name game_date matchup opponent 0 1 Jump Shot Jump Shot 10 20000012 33.9723 167 72 -118.1028 10 ... NaN 2PT Field Goal Right Side(R) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000/10/31 LAL @ POR POR 1 2 Jump Shot Jump Shot 12 20000012 34.0443 -157 0 -118.4268 10 ... 0.0 2PT Field Goal Left Side(L) Mid-Range 8-16 ft. 1610612747 Los Angeles Lakers 2000/10/31 LAL @ POR POR 2 3 Jump Shot Jump Shot 35 20000012 33.9093 -101 135 -118.3708 7 ... 1.0 2PT Field Goal Left Side Center(LC) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000/10/31 LAL @ POR POR 3 4 Jump Shot Jump Shot 43 20000012 33.8693 138 175 -118.1318 6 ... 0.0 2PT Field Goal Right Side Center(RC) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000/10/31 LAL @ POR POR 4 5 Driving Dunk Shot Dunk 155 20000012 34.0443 0 0 -118.2698 6 ... 1.0 2PT Field Goal Center(C) Restricted Area Less Than 8 ft. 1610612747 Los Angeles Lakers 2000/10/31 LAL @ POR POR ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 30692 30693 Jump Shot Jump Shot 397 49900088 33.9963 1 48 -118.2688 6 ... 0.0 2PT Field Goal Center(C) In The Paint (Non-RA) Less Than 8 ft. 1610612747 Los Angeles Lakers 2000/6/19 LAL vs. IND IND 30693 30694 Tip Shot Tip Shot 398 49900088 34.0443 0 0 -118.2698 6 ... NaN 2PT Field Goal Center(C) Restricted Area Less Than 8 ft. 1610612747 Los Angeles Lakers 2000/6/19 LAL vs. IND IND 30694 30695 Running Jump Shot Jump Shot 426 49900088 33.8783 -134 166 -118.4038 3 ... 1.0 2PT Field Goal Left Side Center(LC) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000/6/19 LAL vs. IND IND 30695 30696 Jump Shot Jump Shot 448 49900088 33.7773 31 267 -118.2388 2 ... 0.0 3PT Field Goal Center(C) Above the Break 3 24+ ft. 1610612747 Los Angeles Lakers 2000/6/19 LAL vs. IND IND 30696 30697 Jump Shot Jump Shot 471 49900088 33.9723 1 72 -118.2688 0 ... 0.0 2PT Field Goal Center(C) In The Paint (Non-RA) Less Than 8 ft. 1610612747 Los Angeles Lakers 2000/6/19 LAL vs. IND IND
30697 rows × 25 columns
data2= data2. set_index( 'shot_id' )
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-135-7f5596703652> in <module>
----> 1 data2=data2.set_index('shot_id')
2 data2.head(20)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in set_index(self, keys, drop, append, inplace, verify_integrity)
4301
4302 if missing:
-> 4303 raise KeyError(f"None of {missing} are in the columns")
4304
4305 if inplace:
KeyError: "None of ['shot_id'] are in the columns"
data2. head( 20 )
action_type combined_shot_type game_event_id game_id lat loc_x loc_y lon minutes_remaining period ... shot_made_flag shot_type shot_zone_area shot_zone_basic shot_zone_range team_id team_name game_date matchup opponent shot_id 1 Jump Shot Jump Shot 10 20000012 33.9723 167 72 -118.1028 10 1 ... NaN 2PT Field Goal Right Side(R) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000/10/31 LAL @ POR POR 2 Jump Shot Jump Shot 12 20000012 34.0443 -157 0 -118.4268 10 1 ... 0.0 2PT Field Goal Left Side(L) Mid-Range 8-16 ft. 1610612747 Los Angeles Lakers 2000/10/31 LAL @ POR POR 3 Jump Shot Jump Shot 35 20000012 33.9093 -101 135 -118.3708 7 1 ... 1.0 2PT Field Goal Left Side Center(LC) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000/10/31 LAL @ POR POR 4 Jump Shot Jump Shot 43 20000012 33.8693 138 175 -118.1318 6 1 ... 0.0 2PT Field Goal Right Side Center(RC) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000/10/31 LAL @ POR POR 5 Driving Dunk Shot Dunk 155 20000012 34.0443 0 0 -118.2698 6 2 ... 1.0 2PT Field Goal Center(C) Restricted Area Less Than 8 ft. 1610612747 Los Angeles Lakers 2000/10/31 LAL @ POR POR 6 Jump Shot Jump Shot 244 20000012 34.0553 -145 -11 -118.4148 9 3 ... 0.0 2PT Field Goal Left Side(L) Mid-Range 8-16 ft. 1610612747 Los Angeles Lakers 2000/10/31 LAL @ POR POR 7 Layup Shot Layup 251 20000012 34.0443 0 0 -118.2698 8 3 ... 1.0 2PT Field Goal Center(C) Restricted Area Less Than 8 ft. 1610612747 Los Angeles Lakers 2000/10/31 LAL @ POR POR 8 Jump Shot Jump Shot 254 20000012 34.0163 1 28 -118.2688 8 3 ... NaN 2PT Field Goal Center(C) Restricted Area Less Than 8 ft. 1610612747 Los Angeles Lakers 2000/10/31 LAL @ POR POR 9 Jump Shot Jump Shot 265 20000012 33.9363 -65 108 -118.3348 6 3 ... 1.0 2PT Field Goal Left Side(L) In The Paint (Non-RA) 8-16 ft. 1610612747 Los Angeles Lakers 2000/10/31 LAL @ POR POR 10 Running Jump Shot Jump Shot 294 20000012 33.9193 -33 125 -118.3028 3 3 ... 0.0 2PT Field Goal Center(C) In The Paint (Non-RA) 8-16 ft. 1610612747 Los Angeles Lakers 2000/10/31 LAL @ POR POR 11 Jump Shot Jump Shot 309 20000012 33.8063 -94 238 -118.3638 1 3 ... 0.0 3PT Field Goal Left Side Center(LC) Above the Break 3 24+ ft. 1610612747 Los Angeles Lakers 2000/10/31 LAL @ POR POR 12 Jump Shot Jump Shot 4 20000019 33.9173 121 127 -118.1488 11 1 ... 1.0 2PT Field Goal Right Side Center(RC) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000/11/1 LAL vs. UTA UTA 13 Running Jump Shot Jump Shot 27 20000019 33.9343 -67 110 -118.3368 7 1 ... 1.0 2PT Field Goal Left Side(L) In The Paint (Non-RA) 8-16 ft. 1610612747 Los Angeles Lakers 2000/11/1 LAL vs. UTA UTA 14 Jump Shot Jump Shot 66 20000019 34.0403 -94 4 -118.3638 2 1 ... 0.0 2PT Field Goal Left Side(L) Mid-Range 8-16 ft. 1610612747 Los Angeles Lakers 2000/11/1 LAL vs. UTA UTA 15 Jump Shot Jump Shot 80 20000019 33.9973 -23 47 -118.2928 1 1 ... 0.0 2PT Field Goal Center(C) In The Paint (Non-RA) Less Than 8 ft. 1610612747 Los Angeles Lakers 2000/11/1 LAL vs. UTA UTA 16 Jump Shot Jump Shot 86 20000019 33.8523 62 192 -118.2078 0 1 ... 0.0 2PT Field Goal Center(C) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000/11/1 LAL vs. UTA UTA 17 Driving Layup Shot Layup 100 20000019 34.0443 0 0 -118.2698 0 1 ... NaN 2PT Field Goal Center(C) Restricted Area Less Than 8 ft. 1610612747 Los Angeles Lakers 2000/11/1 LAL vs. UTA UTA 18 Jump Shot Jump Shot 138 20000019 33.8183 -117 226 -118.3868 8 2 ... 1.0 3PT Field Goal Left Side Center(LC) Above the Break 3 24+ ft. 1610612747 Los Angeles Lakers 2000/11/1 LAL vs. UTA UTA 19 Jump Shot Jump Shot 244 20000019 33.9473 -132 97 -118.4018 11 3 ... 0.0 2PT Field Goal Left Side Center(LC) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000/11/1 LAL vs. UTA UTA 20 Driving Layup Shot Layup 249 20000019 34.0443 0 0 -118.2698 10 3 ... NaN 2PT Field Goal Center(C) Restricted Area Less Than 8 ft. 1610612747 Los Angeles Lakers 2000/11/1 LAL vs. UTA UTA
20 rows × 24 columns
data2. groupby( by= ( 'action_type' , 'combined_shot_type' ) )
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-138-e68698f5f785> in <module>
----> 1 data2.groupby(by=('action_type','combined_shot_type'))
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in groupby(self, by, axis, level, as_index, sort, group_keys, squeeze, observed)
5808 group_keys=group_keys,
5809 squeeze=squeeze,
-> 5810 observed=observed,
5811 )
5812
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in __init__(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, squeeze, observed, mutated)
407 sort=sort,
408 observed=observed,
--> 409 mutated=self.mutated,
410 )
411
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\groupby\grouper.py in get_grouper(obj, key, axis, level, sort, observed, mutated, validate)
596 in_axis, name, level, gpr = False, None, gpr, None
597 else:
--> 598 raise KeyError(gpr)
599 elif isinstance(gpr, Grouper) and gpr.key is not None:
600 # Add key to exclusions
KeyError: ('action_type', 'combined_shot_type')
data2_join= data2. assign( join= data2[ 'combined_shot_type' ] + data2[ 'action_type' ] )
data2_join. head( )
action_type combined_shot_type game_event_id game_id lat loc_x loc_y lon minutes_remaining period ... shot_type shot_zone_area shot_zone_basic shot_zone_range team_id team_name game_date matchup opponent join shot_id 1 Jump Shot Jump Shot 10 20000012 33.9723 167 72 -118.1028 10 1 ... 2PT Field Goal Right Side(R) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000/10/31 LAL @ POR POR Jump ShotJump Shot 2 Jump Shot Jump Shot 12 20000012 34.0443 -157 0 -118.4268 10 1 ... 2PT Field Goal Left Side(L) Mid-Range 8-16 ft. 1610612747 Los Angeles Lakers 2000/10/31 LAL @ POR POR Jump ShotJump Shot 3 Jump Shot Jump Shot 35 20000012 33.9093 -101 135 -118.3708 7 1 ... 2PT Field Goal Left Side Center(LC) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000/10/31 LAL @ POR POR Jump ShotJump Shot 4 Jump Shot Jump Shot 43 20000012 33.8693 138 175 -118.1318 6 1 ... 2PT Field Goal Right Side Center(RC) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000/10/31 LAL @ POR POR Jump ShotJump Shot 5 Driving Dunk Shot Dunk 155 20000012 34.0443 0 0 -118.2698 6 2 ... 2PT Field Goal Center(C) Restricted Area Less Than 8 ft. 1610612747 Los Angeles Lakers 2000/10/31 LAL @ POR POR DunkDriving Dunk Shot
5 rows × 25 columns
data2_join1= data2[ 'combined_shot_type' ] + '+' + data2[ 'action_type' ]
data2_join1. head( 100 )
shot_id
1 Jump Shot+Jump Shot
2 Jump Shot+Jump Shot
3 Jump Shot+Jump Shot
4 Jump Shot+Jump Shot
5 Dunk+Driving Dunk Shot
...
96 Jump Shot+Jump Shot
97 Jump Shot+Jump Shot
98 Jump Shot+Jump Shot
99 Jump Shot+Jump Shot
100 Jump Shot+Jump Shot
Length: 100, dtype: object
data2_join1. nunique( )
57
data2_join1. value_counts( )
Jump Shot+Jump Shot 18880
Layup+Layup Shot 2567
Layup+Driving Layup Shot 1978
Jump Shot+Turnaround Jump Shot 1057
Jump Shot+Fadeaway Jump Shot 1048
Jump Shot+Running Jump Shot 926
Jump Shot+Pullup Jump shot 476
Jump Shot+Turnaround Fadeaway shot 439
Dunk+Slam Dunk Shot 411
Layup+Reverse Layup Shot 395
Jump Shot+Jump Bank Shot 333
Dunk+Driving Dunk Shot 310
Dunk+Dunk Shot 262
Tip Shot+Tip Shot 182
Dunk+Alley Oop Dunk Shot 122
Jump Shot+Step Back Jump shot 118
Jump Shot+Floating Jump shot 114
Layup+Driving Reverse Layup Shot 97
Hook Shot+Hook Shot 84
Layup+Driving Finger Roll Shot 82
Layup+Alley Oop Layup shot 80
Dunk+Reverse Dunk Shot 75
Layup+Running Layup Shot 72
Bank Shot+Turnaround Bank shot 71
Layup+Driving Finger Roll Layup Shot 69
Bank Shot+Running Bank shot 48
Dunk+Driving Slam Dunk Shot 48
Hook Shot+Running Hook Shot 41
Layup+Finger Roll Layup Shot 33
Jump Shot+Fadeaway Bank shot 31
Layup+Finger Roll Shot 28
Jump Shot+Driving Jump shot 28
Jump Shot+Jump Hook Shot 24
Dunk+Running Dunk Shot 19
Dunk+Reverse Slam Dunk Shot 16
Layup+Putback Layup Shot 15
Dunk+Follow Up Dunk Shot 15
Hook Shot+Turnaround Hook Shot 14
Hook Shot+Driving Hook Shot 14
Bank Shot+Pullup Bank shot 12
Layup+Running Reverse Layup Shot 11
Layup+Running Finger Roll Layup Shot 6
Layup+Cutting Layup Shot 6
Bank Shot+Driving Bank shot 5
Bank Shot+Hook Bank Shot 5
Dunk+Putback Dunk Shot 5
Jump Shot+Driving Floating Jump Shot 5
Layup+Running Finger Roll Shot 4
Jump Shot+Running Pull-Up Jump Shot 4
Layup+Turnaround Finger Roll Shot 2
Tip Shot+Running Tip Shot 2
Layup+Tip Layup Shot 2
Dunk+Putback Slam Dunk Shot 2
Dunk+Running Slam Dunk Shot 1
Jump Shot+Turnaround Fadeaway Bank Jump Shot 1
Jump Shot+Driving Floating Bank Jump Shot 1
Layup+Cutting Finger Roll Layup Shot 1
dtype: int64
data2[ 'opponent' ] . value_counts( )
SAS 1978
PHX 1781
HOU 1666
SAC 1643
DEN 1642
POR 1539
UTA 1490
MIN 1474
GSW 1356
LAC 1285
DAL 1199
MEM 1030
BOS 946
SEA 828
IND 761
PHI 720
ORL 719
DET 715
OKC 677
TOR 664
NYK 657
MIA 627
CHA 620
CLE 619
CHI 610
WAS 600
MIL 586
NOH 581
NJN 520
ATL 519
NOP 344
VAN 246
BKN 55
Name: opponent, dtype: int64
经验总结
本次是第二次参加组队学习,这次是针对pandas的学习 在这个练习中暂时没有遇到什么问题,除了在课后习题中第一题的第三问做不出来,之后等学习到那个时候在解决。其次是自己的解决方案和参考答案方法不同,但结果相同