001
1
import numpy as np
# create a Python list
list_1 = [1,2,3,4]
list_1
1
[1, 2, 3, 4]
3
array_1 = np.array(list_1)
array_1
3
array([1, 2, 3, 4])
5
list_2 = [5,6,7,8]
# for two lists, wrap them in one outer list (note the extra brackets) to get a 2-D array
array_2 = np.array([list_1,list_2])
array_2
5
array([[1, 2, 3, 4],
[5, 6, 7, 8]])
6
array_2.size
6
8
7
array_2.dtype
7
dtype('int32')
11
array_3 = np.array([1.0,2.0])
array_3
array_3.dtype
11
dtype('float64')
13
array_4 = np.arange(1,10)
array_4
13
array([1, 2, 3, 4, 5, 6, 7, 8, 9])
14
array_5 = np.arange(1,10,2)
array_5
14
array([1, 3, 5, 7, 9])
15
np.zeros(5)
15
array([0., 0., 0., 0., 0.])
16
np.zeros([5,5])
16
array([[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.]])
17
np.eye(5)
17
array([[1., 0., 0., 0., 0.],
[0., 1., 0., 0., 0.],
[0., 0., 1., 0., 0.],
[0., 0., 0., 1., 0.],
[0., 0., 0., 0., 1.]])
18
np.eye(5).dtype
18
dtype('float64')
访问元素
19
a = np.arange(1,10)
a[1]
19
2
20
a[1:5]
20
array([2, 3, 4, 5])
32
b = np.array([[1,2,3],[4,5,6]])
b[1,0]
b[:1,1:2]
32
array([[2]])
002
1
import numpy as np
np.random.randn(10)
1
array([-0.79660703, -0.78265001, -1.34747067, 0.4507426 , -0.18268078,
-1.47208304, 1.66663589, 2.44104508, -0.5616724 , 0.20463538])
5
np.random.randint(10,size=(2,3))
5
array([[2, 6, 1],
[4, 7, 5]])
7
np.random.randint(10,size = 20).reshape(4,5)
7
array([[3, 3, 5, 7, 4],
[3, 7, 7, 0, 2],
[2, 8, 6, 8, 6],
[6, 0, 5, 5, 1]])
数组运算
22
# Element-wise arithmetic on two random 4x5 integer arrays.
a = np.random.randint(10,size = 20).reshape(4,5)
b = np.random.randint(10,size=20).reshape(4,5)
print('a = ', a)
print('b = ', b)
print('a+b=',a + b)
print('a-b=',a - b)
print('a*b',a*b)  # multiplies matching positions directly (element-wise, not a matrix product)
print('a/b',a/b)  # a zero in b yields inf and a divide-by-zero RuntimeWarning, as in the output below
a = [[2 0 4 9 3]
[9 3 3 2 6]
[7 4 3 1 5]
[3 8 4 9 5]]
b = [[9 2 5 8 5]
[8 0 8 3 1]
[1 3 0 7 5]
[9 1 3 1 3]]
a+b= [[11 2 9 17 8]
[17 3 11 5 7]
[ 8 7 3 8 10]
[12 9 7 10 8]]
a-b= [[-7 -2 -1 1 -2]
[ 1 3 -5 -1 5]
[ 6 1 3 -6 0]
[-6 7 1 8 2]]
a*b [[18 0 20 72 15]
[72 0 24 6 6]
[ 7 12 0 7 25]
[27 8 12 9 15]]
a/b [[0.22222222 0. 0.8 1.125 0.6 ]
[1.125 inf 0.375 0.66666667 6. ]
[7. 1.33333333 inf 0.14285714 1. ]
[0.33333333 8. 1.33333333 9. 1.66666667]]
<ipython-input-22-0209bc988bb6>:8: RuntimeWarning: divide by zero encountered in true_divide
print('a/b',a/b)
23
# Interpret the array as a 2-D matrix. np.mat was only an alias of
# np.asmatrix and was removed in NumPy 2.0, so call np.asmatrix directly.
np.asmatrix(a)
23
matrix([[2, 0, 4, 9, 3],
[9, 3, 3, 2, 6],
[7, 4, 3, 1, 5],
[3, 8, 4, 9, 5]])
矩阵的运算
26
a = np.matrix(a)
b = np.matrix(b)
print('a+b=',a + b)
print('a-b=',a - b)
a+b= [[11 2 9 17 8]
[17 3 11 5 7]
[ 8 7 3 8 10]
[12 9 7 10 8]]
a-b= [[-7 -2 -1 1 -2]
[ 1 3 -5 -1 5]
[ 6 1 3 -6 0]
[-6 7 1 8 2]]
Array常用函数
45
# Common ndarray helpers. Array methods are used instead of the Python
# builtins sum()/max(), which iterate over the array in Python.
a = np.random.randint(10, size=20).reshape(4, 5)
print(a)
print(np.unique(a))      # sorted distinct values
print(a.sum(axis=0))     # per-column sums (the same result builtin sum(a) gave)
print(a[0].sum())        # sum of the first row
print(a[:, 0].sum())     # sum of the first column
print(a.max())           # largest value in the whole array
print(a[0].max())        # largest value in the first row
print(a[:, 0].max())     # largest value in the first column
[[5 5 6 6 7]
[1 1 0 6 5]
[4 9 3 1 8]
[0 8 1 4 2]]
[0 1 2 3 4 5 6 7 8 9]
[10 23 10 17 22]
29
10
9
7
5
003
使用pickle序列化numpy array
16
import pickle
import numpy as np

# Round-trip a NumPy array through pickle. The with-blocks guarantee both
# file handles are closed (the original never closed the read handle).
x = np.arange(10)
with open('x.pkl', 'wb') as f:
    pickle.dump(x, f)
# NOTE: only unpickle files you created or trust.
with open('x.pkl', 'rb') as f:
    restored = pickle.load(f)
restored
16
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
更方便的方法
18
np.save('one_array',x)
np.load('one_array.npy')
18
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
27
y = np.arange(20)
# np.savez appends ".npz" when the name lacks it, but np.load needs the full
# file name, so the extension is spelled out in both calls.
np.savez('two_array.npz', a=x, b=y)
# The object returned by np.load for an .npz archive keeps the file open;
# the with-block closes it once both arrays have been printed.
with np.load('two_array.npz') as c:
    print(c['a'])
    print(c['b'])
[0 1 2 3 4 5 6 7 8 9]
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19]
004
pandas的两大主要数据结构是 Series 和 DataFrame,其中 Series 是带标签的一维数组,可存储整数、浮点数、字符串、Python 对象等类型的数据。
3
import pandas as pd
import numpy as np
s1 = pd.Series([1,2,3,4])
print(s1)
print(s1.values)
0 1
1 2
2 3
3 4
dtype: int64
[1 2 3 4]
4
s1.index
4
RangeIndex(start=0, stop=4, step=1)
6
# create a Series from a NumPy array
s2 = pd.Series(np.arange(10))
s2
6
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
dtype: int32
通过字典创建
9
s3 = pd.Series({'我':1,'是':2})
print(s3)
print(s3.values)
我 1
是 2
dtype: int64
[1 2]
16
# A Series with explicit string labels, followed by the usual ways to inspect it.
s4 = pd.Series(data=[1, 2, 3, 4], index=list('ABCD'))
print(s4)
print(s4.values)       # underlying NumPy array
print(s4.index)        # the labels
print(s4['A'])         # look a value up by its label
print(s4[s4 > 2])      # boolean mask keeps the entries greater than 2
print(s4.to_dict())    # label -> value mapping
print(s4.to_csv())     # CSV text, returned as a string because no path is given
A 1
B 2
C 3
D 4
dtype: int64
[1 2 3 4]
Index(['A', 'B', 'C', 'D'], dtype='object')
1
C 3
D 4
dtype: int64
{'A': 1, 'B': 2, 'C': 3, 'D': 4}
,0
A,1
B,2
C,3
D,4
18
index_1 = ['A','B','C','D','E','F']
s6 = pd.Series(s4,index=index_1)
print(s6)
A 1.0
B 2.0
C 3.0
D 4.0
E NaN
F NaN
dtype: float64
19
pd.isnull(s6)
19
A False
B False
C False
D False
E True
F True
dtype: bool
20
pd.notnull(s6)
20
A True
B True
C True
D True
E False
F False
dtype: bool
32
s6.name = 'demo1'
print(s6)
s6.index.name = 'demo index1'
print(s6)
print(s6.name)
print(s6.index)
print(s6.values)
demo index1
A 1.0
B 2.0
C 3.0
D 4.0
E NaN
F NaN
Name: demo1, dtype: float64
demo index1
A 1.0
B 2.0
C 3.0
D 4.0
E NaN
F NaN
Name: demo1, dtype: float64
demo1
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object', name='demo index1')
[ 1. 2. 3. 4. nan nan]
005
9
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import webbrowser
link = 'https://www.tiobe.com/tiobe-index/'
webbrowser.open(link)
9
True
21
df = pd.read_clipboard()
print(df)
print(type(df))
print(df.columns)
print(df.Ratings)
print('-'*1000)
df_new = DataFrame(df,columns=['Programming Language','Nov 2020'])
print(df_new)
print('-'*100)
print(df['Nov 2019'])
#print(type(df['Nov 2019']))
Nov 2020 Nov 2019 Change Programming Language Ratings Change.1
0 1 2 change C 16.21% +0.17%
1 2 3 change Python 12.12% +2.27%
2 3 1 change Java 11.68% -4.57%
3 4 4 NaN C++ 7.60% +1.99%
4 5 5 NaN C# 4.67% +0.36%
5 6 6 NaN Visual Basic 4.01% -0.22%
6 7 7 NaN JavaScript 2.03% +0.10%
7 8 8 NaN PHP 1.79% +0.07%
8 9 16 change R 1.64% +0.66%
9 10 9 change SQL 1.54% -0.15%
10 11 14 change Groovy 1.51% +0.41%
11 12 21 change Perl 1.51% +0.68%
12 13 20 change Go 1.36% +0.51%
13 14 10 change Swift 1.35% -0.31%
14 15 11 change Ruby 1.22% -0.04%
15 16 15 change Assembly language 1.17% +0.14%
16 17 19 change MATLAB 1.10% +0.21%
17 18 13 change Delphi/Object Pascal 0.86% -0.28%
18 19 12 change Objective-C 0.84% -0.35%
19 20 32 change NaN NaN NaN
<class 'pandas.core.frame.DataFrame'>
Index(['Nov 2020', 'Nov 2019', 'Change', 'Programming Language', 'Ratings',
'Change.1'],
dtype='object')
0 16.21%
1 12.12%
2 11.68%
3 7.60%
4 4.67%
5 4.01%
6 2.03%
7 1.79%
8 1.64%
9 1.54%
10 1.51%
11 1.51%
12 1.36%
13 1.35%
14 1.22%
15 1.17%
16 1.10%
17 0.86%
18 0.84%
19 NaN
Name: Ratings, dtype: object
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Programming Language Nov 2020
0 C 1
1 Python 2
2 Java 3
3 C++ 4
4 C# 5
5 Visual Basic 6
6 JavaScript 7
7 PHP 8
8 R 9
9 SQL 10
10 Groovy 11
11 Perl 12
12 Go 13
13 Swift 14
14 Ruby 15
15 Assembly language 16
16 MATLAB 17
17 Delphi/Object Pascal 18
18 Objective-C 19
19 NaN 20
----------------------------------------------------------------------------------------------------
0 2
1 3
2 1
3 4
4 5
5 6
6 7
7 8
8 16
9 9
10 14
11 21
12 20
13 10
14 11
15 15
16 19
17 13
18 12
19 32
Name: Nov 2019, dtype: int64
23
print(type(df['Nov 2019']))
# add a new column
df_new = DataFrame(df,columns=['Programming Language','Nov 2020','Sep 2020'])
print(df_new)
<class 'pandas.core.series.Series'>
Programming Language Nov 2020 Sep 2020
0 C 1 NaN
1 Python 2 NaN
2 Java 3 NaN
3 C++ 4 NaN
4 C# 5 NaN
5 Visual Basic 6 NaN
6 JavaScript 7 NaN
7 PHP 8 NaN
8 R 9 NaN
9 SQL 10 NaN
10 Groovy 11 NaN
11 Perl 12 NaN
12 Go 13 NaN
13 Swift 14 NaN
14 Ruby 15 NaN
15 Assembly language 16 NaN
16 MATLAB 17 NaN
17 Delphi/Object Pascal 18 NaN
18 Objective-C 19 NaN
19 NaN 20 NaN
24
# assign values to the newly added column
df_new['Sep 2020'] = np.arange(20)
print(df_new)
Programming Language Nov 2020 Sep 2020
0 C 1 0
1 Python 2 1
2 Java 3 2
3 C++ 4 3
4 C# 5 4
5 Visual Basic 6 5
6 JavaScript 7 6
7 PHP 8 7
8 R 9 8
9 SQL 10 9
10 Groovy 11 10
11 Perl 12 11
12 Go 13 12
13 Swift 14 13
14 Ruby 15 14
15 Assembly language 16 15
16 MATLAB 17 16
17 Delphi/Object Pascal 18 17
18 Objective-C 19 18
19 NaN 20 19
29
df_new['Sep 2020'] = pd.Series(np.arange(20))
print(df_new)
Programming Language Nov 2020 Sep 2020
0 C 1 0
1 Python 2 1
2 Java 3 2
3 C++ 4 3
4 C# 5 4
5 Visual Basic 6 5
6 JavaScript 7 6
7 PHP 8 7
8 R 9 8
9 SQL 10 9
10 Groovy 11 10
11 Perl 12 11
12 Go 13 12
13 Swift 14 13
14 Ruby 15 14
15 Assembly language 16 15
16 MATLAB 17 16
17 Delphi/Object Pascal 18 17
18 Objective-C 19 18
19 NaN 20 19
31
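# overwrite the column with a shorter Series: values land on index labels 1 and 2, every other row becomes NaN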
df_new['Sep 2020'] = pd.Series([100,200],index=[1,2])
print(df_new)
Programming Language Nov 2020 Sep 2020
0 C 1 NaN
1 Python 2 100.0
2 Java 3 200.0
3 C++ 4 NaN
4 C# 5 NaN
5 Visual Basic 6 NaN
6 JavaScript 7 NaN
7 PHP 8 NaN
8 R 9 NaN
9 SQL 10 NaN
10 Groovy 11 NaN
11 Perl 12 NaN
12 Go 13 NaN
13 Swift 14 NaN
14 Ruby 15 NaN
15 Assembly language 16 NaN
16 MATLAB 17 NaN
17 Delphi/Object Pascal 18 NaN
18 Objective-C 19 NaN
19 NaN 20 NaN
006
深入理解series和dataframe Series
4
import pandas as pd
import numpy as np
# Column-oriented sample data: each key maps to a list with one entry per country.
data = dict(
    Country=['China', 'Japan', 'French'],
    Capital=['Beijing', 'Tokyo', 'Paris'],
    Population=['15151561', '9879446', '448465'],
)
# Series of the country names, labelled A-C instead of the default 0-2.
country_names = data['Country']
s1 = pd.Series(country_names, index=list('ABC'))
# Show the Series itself, then its raw values, then its labels.
for view in (s1, s1.values, s1.index):
    print(view)
A China
B Japan
C French
dtype: object
['China' 'Japan' 'French']
Index(['A', 'B', 'C'], dtype='object')
DataFrame
7
df1 = pd.DataFrame(data)
print(df1)
cou = df1['Country']
print(type(cou))
Country Capital Population
0 China Beijing 15151561
1 Japan Tokyo 9879446
2 French Paris 448465
<class 'pandas.core.series.Series'>
11
# iterrows() is lazy: printing it only shows the generator object.
print(df1.iterrows())
# Each item it yields is a 2-tuple of (index label, row as a Series).
for row in df1.iterrows():
    print(row)
    print(type(row))
    print(len(row))
<generator object DataFrame.iterrows at 0x000002421ABA0900>
(0, Country China
Capital Beijing
Population 15151561
Name: 0, dtype: object)
<class 'tuple'>
2
(1, Country Japan
Capital Tokyo
Population 9879446
Name: 1, dtype: object)
<class 'tuple'>
2
(2, Country French
Capital Paris
Population 448465
Name: 2, dtype: object)
<class 'tuple'>
2
38
# Rebuild the table from three separate Series. Each Series becomes one ROW of
# df_new, so df_new is the transpose of df1 until .T flips it back.
data = {'Country':['China','Japan','French'],
'Capital':['Beijing','Tokyo','Paris'],
'Population':['15151561','9879446','448465']}
print(data)
s1 = pd.Series(data['Country'])
s2 = pd.Series(data['Capital'])
s3 = pd.Series(data['Population'])
# Very odd: changing `columns` turns all of the data into NaN.
# NOTE(review): presumably because each Series is a row whose labels are 0, 1, 2,
# so asking for differently named columns matches nothing — confirm in the pandas docs.
df_new = pd.DataFrame([s1,s2,s3],index=['Country','Capital','Population'])
print('-'*100)
print(df_new)
print(df1)
print(df_new)
print(df_new.T)  # transposed view has the same layout as df1
# df_new = df_new.T
# print(df_new)
{'Country': ['China', 'Japan', 'French'], 'Capital': ['Beijing', 'Tokyo', 'Paris'], 'Population': ['15151561', '9879446', '448465']}
----------------------------------------------------------------------------------------------------
0 1 2
Country China Japan French
Capital Beijing Tokyo Paris
Population 15151561 9879446 448465
Country Capital Population
0 China Beijing 15151561
1 Japan Tokyo 9879446
2 French Paris 448465
0 1 2
Country China Japan French
Capital Beijing Tokyo Paris
Population 15151561 9879446 448465
Country Capital Population
0 China Beijing 15151561
1 Japan Tokyo 9879446
2 French Paris 448465
007
Dataframe IO
10
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import lxml
import webbrowser
link = 'https://pandas.pydata.org/pandas-docs/version/0.20/io.html'
webbrowser.open(link)
Empty DataFrame
Columns: [print(df1)]
Index: []
19
df1 = pd.read_clipboard()
print(df1)
df1.to_clipboard()
# leave the index column out of the CSV
df1.to_csv('df1.csv',index=False)
Format Type Data Description Reader Writer
0 text CSV read_csv to_csv
1 text JSON read_json to_json
2 text HTML read_html to_html
3 text Local clipboard read_clipboard to_clipboard
4 binary MS Excel read_excel to_excel
5 binary HDF5 Format read_hdf to_hdf
6 binary Feather Format read_feather to_feather
7 binary Msgpack read_msgpack to_msgpack
8 binary Stata read_stata to_stata
9 binary SAS read_sas
10 binary Python Pickle Format read_pickle to_pickle
11 SQL SQL read_sql to_sql
12 SQL Google Big Query read_gbq to_gbq
20
# read the CSV back
df2 = pd.read_csv('df1.csv')
print(df2)
Format Type Data Description Reader Writer
0 text CSV read_csv to_csv
1 text JSON read_json to_json
2 text HTML read_html to_html
3 text Local clipboard read_clipboard to_clipboard
4 binary MS Excel read_excel to_excel
5 binary HDF5 Format read_hdf to_hdf
6 binary Feather Format read_feather to_feather
7 binary Msgpack read_msgpack to_msgpack
8 binary Stata read_stata to_stata
9 binary SAS read_sas
10 binary Python Pickle Format read_pickle to_pickle
11 SQL SQL read_sql to_sql
12 SQL Google Big Query read_gbq to_gbq
50
print('json')
pd.read_json(df1.to_json())
json
50
Format Type Data Description Reader Writer
0 text CSV read_csv to_csv
1 text JSON read_json to_json
2 text HTML read_html to_html
3 text Local clipboard read_clipboard to_clipboard
4 binary MS Excel read_excel to_excel
5 binary HDF5 Format read_hdf to_hdf
6 binary Feather Format read_feather to_feather
7 binary Msgpack read_msgpack to_msgpack
8 binary Stata read_stata to_stata
9 binary SAS read_sas
10 binary Python Pickle Format read_pickle to_pickle
11 SQL SQL read_sql to_sql
12 SQL Google Big Query read_gbq to_gbq
55
print(df1)
df1.to_html('df1.html')
Format Type Data Description Reader Writer
0 text CSV read_csv to_csv
1 text JSON read_json to_json
2 text HTML read_html to_html
3 text Local clipboard read_clipboard to_clipboard
4 binary MS Excel read_excel to_excel
5 binary HDF5 Format read_hdf to_hdf
6 binary Feather Format read_feather to_feather
7 binary Msgpack read_msgpack to_msgpack
8 binary Stata read_stata to_stata
9 binary SAS read_sas
10 binary Python Pickle Format read_pickle to_pickle
11 SQL SQL read_sql to_sql
12 SQL Google Big Query read_gbq to_gbq
59
# Read back the table that df1.to_html('df1.html') wrote. A plain relative path
# is enough; the earlier 'file://F://...' string was not a well-formed file URL.
# read_html needs an HTML parser installed (lxml, or bs4 + html5lib); without one
# it raises "ImportError: lxml not found, please install it".
pd.read_html('df1.html')
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
<ipython-input-59-0383a773205d> in <module>
----> 1 pd.read_html('file://F://learn//datescience_learn//df1.html')
d:\python38\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
294 )
295 warnings.warn(msg, FutureWarning, stacklevel=stacklevel)
--> 296 return func(*args, **kwargs)
297
298 return wrapper
d:\python38\lib\site-packages\pandas\io\html.py in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only)
1084 )
1085 validate_header_arg(header)
-> 1086 return _parse(
1087 flavor=flavor,
1088 io=io,
d:\python38\lib\site-packages\pandas\io\html.py in _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs)
892 retained = None
893 for flav in flavor:
--> 894 parser = _parser_dispatch(flav)
895 p = parser(io, compiled_match, attrs, encoding, displayed_only)
896
d:\python38\lib\site-packages\pandas\io\html.py in _parser_dispatch(flavor)
849 else:
850 if not _HAS_LXML:
--> 851 raise ImportError("lxml not found, please install it")
852 return _valid_parsers[flavor]
853
ImportError: lxml not found, please install it
73
df1
# still fails — TODO: confirm an Excel writer engine (e.g. openpyxl) is installed
#df1.to_excel('df1.xlsx')
73
Format Type Data Description Reader Writer
0 text CSV read_csv to_csv
1 text JSON read_json to_json
2 text HTML read_html to_html
3 text Local clipboard read_clipboard to_clipboard
4 binary MS Excel read_excel to_excel
5 binary HDF5 Format read_hdf to_hdf
6 binary Feather Format read_feather to_feather
7 binary Msgpack read_msgpack to_msgpack
8 binary Stata read_stata to_stata
9 binary SAS read_sas
10 binary Python Pickle Format read_pickle to_pickle
11 SQL SQL read_sql to_sql
12 SQL Google Big Query read_gbq to_gbq
008
loc和iloc
2
import numpy as np
import pandas as pd
imdb = pd.read_csv('movie_metadata.csv')
print(imdb)
color director_name num_critic_for_reviews duration \
0 Color James Cameron 723.0 178.0
1 Color Gore Verbinski 302.0 169.0
2 Color Sam Mendes 602.0 148.0
3 Color Christopher Nolan 813.0 164.0
4 NaN Doug Walker NaN NaN
... ... ... ... ...
5038 Color Scott Smith 1.0 87.0
5039 Color NaN 43.0 43.0
5040 Color Benjamin Roberds 13.0 76.0
5041 Color Daniel Hsia 14.0 100.0
5042 Color Jon Gunn 43.0 90.0
director_facebook_likes actor_3_facebook_likes actor_2_name \
0 0.0 855.0 Joel David Moore
1 563.0 1000.0 Orlando Bloom
2 0.0 161.0 Rory Kinnear
3 22000.0 23000.0 Christian Bale
4 131.0 NaN Rob Walker
... ... ... ...
5038 2.0 318.0 Daphne Zuniga
5039 NaN 319.0 Valorie Curry
5040 0.0 0.0 Maxwell Moody
5041 0.0 489.0 Daniel Henney
5042 16.0 16.0 Brian Herzlinger
actor_1_facebook_likes gross genres \
0 1000.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi
1 40000.0 309404152.0 Action|Adventure|Fantasy
2 11000.0 200074175.0 Action|Adventure|Thriller
3 27000.0 448130642.0 Action|Thriller
4 131.0 NaN Documentary
... ... ... ...
5038 637.0 NaN Comedy|Drama
5039 841.0 NaN Crime|Drama|Mystery|Thriller
5040 0.0 NaN Drama|Horror|Thriller
5041 946.0 10443.0 Comedy|Drama|Romance
5042 86.0 85222.0 Documentary
... num_user_for_reviews language country content_rating budget \
0 ... 3054.0 English USA PG-13 237000000.0
1 ... 1238.0 English USA PG-13 300000000.0
2 ... 994.0 English UK PG-13 245000000.0
3 ... 2701.0 English USA PG-13 250000000.0
4 ... NaN NaN NaN NaN NaN
... ... ... ... ... ... ...
5038 ... 6.0 English Canada NaN NaN
5039 ... 359.0 English USA TV-14 NaN
5040 ... 3.0 English USA NaN 1400.0
5041 ... 9.0 English USA PG-13 NaN
5042 ... 84.0 English USA PG 1100.0
title_year actor_2_facebook_likes imdb_score aspect_ratio \
0 2009.0 936.0 7.9 1.78
1 2007.0 5000.0 7.1 2.35
2 2015.0 393.0 6.8 2.35
3 2012.0 23000.0 8.5 2.35
4 NaN 12.0 7.1 NaN
... ... ... ... ...
5038 2013.0 470.0 7.7 NaN
5039 NaN 593.0 7.5 16.00
5040 2013.0 0.0 6.3 NaN
5041 2012.0 719.0 6.3 2.35
5042 2004.0 23.0 6.6 1.85
movie_facebook_likes
0 33000
1 0
2 85000
3 164000
4 0
... ...
5038 84
5039 32000
5040 16
5041 660
5042 456
[5043 rows x 28 columns]
4
# Quick look at the dataset: its (rows, columns) shape, then the last and first rows.
print(imdb.shape)
print(imdb.tail())
print(imdb.head())  # a row count can be passed; with no argument 5 rows are shown
(5043, 28)
color director_name num_critic_for_reviews duration \
5038 Color Scott Smith 1.0 87.0
5039 Color NaN 43.0 43.0
5040 Color Benjamin Roberds 13.0 76.0
5041 Color Daniel Hsia 14.0 100.0
5042 Color Jon Gunn 43.0 90.0
director_facebook_likes actor_3_facebook_likes actor_2_name \
5038 2.0 318.0 Daphne Zuniga
5039 NaN 319.0 Valorie Curry
5040 0.0 0.0 Maxwell Moody
5041 0.0 489.0 Daniel Henney
5042 16.0 16.0 Brian Herzlinger
actor_1_facebook_likes gross genres ... \
5038 637.0 NaN Comedy|Drama ...
5039 841.0 NaN Crime|Drama|Mystery|Thriller ...
5040 0.0 NaN Drama|Horror|Thriller ...
5041 946.0 10443.0 Comedy|Drama|Romance ...
5042 86.0 85222.0 Documentary ...
num_user_for_reviews language country content_rating budget \
5038 6.0 English Canada NaN NaN
5039 359.0 English USA TV-14 NaN
5040 3.0 English USA NaN 1400.0
5041 9.0 English USA PG-13 NaN
5042 84.0 English USA PG 1100.0
title_year actor_2_facebook_likes imdb_score aspect_ratio \
5038 2013.0 470.0 7.7 NaN
5039 NaN 593.0 7.5 16.00
5040 2013.0 0.0 6.3 NaN
5041 2012.0 719.0 6.3 2.35
5042 2004.0 23.0 6.6 1.85
movie_facebook_likes
5038 84
5039 32000
5040 16
5041 660
5042 456
[5 rows x 28 columns]
color director_name num_critic_for_reviews duration \
0 Color James Cameron 723.0 178.0
1 Color Gore Verbinski 302.0 169.0
2 Color Sam Mendes 602.0 148.0
3 Color Christopher Nolan 813.0 164.0
4 NaN Doug Walker NaN NaN
director_facebook_likes actor_3_facebook_likes actor_2_name \
0 0.0 855.0 Joel David Moore
1 563.0 1000.0 Orlando Bloom
2 0.0 161.0 Rory Kinnear
3 22000.0 23000.0 Christian Bale
4 131.0 NaN Rob Walker
actor_1_facebook_likes gross genres ... \
0 1000.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi ...
1 40000.0 309404152.0 Action|Adventure|Fantasy ...
2 11000.0 200074175.0 Action|Adventure|Thriller ...
3 27000.0 448130642.0 Action|Thriller ...
4 131.0 NaN Documentary ...
num_user_for_reviews language country content_rating budget \
0 3054.0 English USA PG-13 237000000.0
1 1238.0 English USA PG-13 300000000.0
2 994.0 English UK PG-13 245000000.0
3 2701.0 English USA PG-13 250000000.0
4 NaN NaN NaN NaN NaN
title_year actor_2_facebook_likes imdb_score aspect_ratio \
0 2009.0 936.0 7.9 1.78
1 2007.0 5000.0 7.1 2.35
2 2015.0 393.0 6.8 2.35
3 2012.0 23000.0 8.5 2.35
4 NaN 12.0 7.1 NaN
movie_facebook_likes
0 33000
1 0
2 85000
3 164000
4 0
[5 rows x 28 columns]
7
print(imdb[['color','director_name']])  # double brackets: a list of column names selects several columns
color director_name
0 Color James Cameron
1 Color Gore Verbinski
2 Color Sam Mendes
3 Color Christopher Nolan
4 NaN Doug Walker
... ... ...
5038 Color Scott Smith
5039 Color NaN
5040 Color Benjamin Roberds
5041 Color Daniel Hsia
5042 Color Jon Gunn
[5043 rows x 2 columns]
11
sub_df = imdb[['director_name','movie_title','imdb_score']]
print(sub_df)
print(sub_df.head(5))
director_name movie_title \
0 James Cameron Avatar
1 Gore Verbinski Pirates of the Caribbean: At World's End
2 Sam Mendes Spectre
3 Christopher Nolan The Dark Knight Rises
4 Doug Walker Star Wars: Episode VII - The Force Awakens ...
... ... ...
5038 Scott Smith Signed Sealed Delivered
5039 NaN The Following
5040 Benjamin Roberds A Plague So Pleasant
5041 Daniel Hsia Shanghai Calling
5042 Jon Gunn My Date with Drew
imdb_score
0 7.9
1 7.1
2 6.8
3 8.5
4 7.1
... ...
5038 7.7
5039 7.5
5040 6.3
5041 6.3
5042 6.6
[5043 rows x 3 columns]
director_name movie_title \
0 James Cameron Avatar
1 Gore Verbinski Pirates of the Caribbean: At World's End
2 Sam Mendes Spectre
3 Christopher Nolan The Dark Knight Rises
4 Doug Walker Star Wars: Episode VII - The Force Awakens ...
imdb_score
0 7.9
1 7.1
2 6.8
3 8.5
4 7.1
21
# Filter rows by index.
tem_df = sub_df.iloc[10:20,:]  # iloc is positional and end-exclusive: rows 10..19
print(tem_df)
print('-'*100)
print(sub_df.iloc[10:20,0:2])  # same rows, first two columns only
print(tem_df.loc[15:17,:])  # loc is label-based and end-inclusive: labels 15, 16 and 17
print(tem_df.loc[15:17,'movie_title'])
director_name movie_title imdb_score
10 Zack Snyder Batman v Superman: Dawn of Justice 6.9
11 Bryan Singer Superman Returns 6.1
12 Marc Forster Quantum of Solace 6.7
13 Gore Verbinski Pirates of the Caribbean: Dead Man's Chest 7.3
14 Gore Verbinski The Lone Ranger 6.5
15 Zack Snyder Man of Steel 7.2
16 Andrew Adamson The Chronicles of Narnia: Prince Caspian 6.6
17 Joss Whedon The Avengers 8.1
18 Rob Marshall Pirates of the Caribbean: On Stranger Tides 6.7
19 Barry Sonnenfeld Men in Black 3 6.8
----------------------------------------------------------------------------------------------------
director_name movie_title
10 Zack Snyder Batman v Superman: Dawn of Justice
11 Bryan Singer Superman Returns
12 Marc Forster Quantum of Solace
13 Gore Verbinski Pirates of the Caribbean: Dead Man's Chest
14 Gore Verbinski The Lone Ranger
15 Zack Snyder Man of Steel
16 Andrew Adamson The Chronicles of Narnia: Prince Caspian
17 Joss Whedon The Avengers
18 Rob Marshall Pirates of the Caribbean: On Stranger Tides
19 Barry Sonnenfeld Men in Black 3
director_name movie_title imdb_score
15 Zack Snyder Man of Steel 7.2
16 Andrew Adamson The Chronicles of Narnia: Prince Caspian 6.6
17 Joss Whedon The Avengers 8.1
director_name movie_title imdb_score
15 Zack Snyder Man of Steel 7.2
16 Andrew Adamson The Chronicles of Narnia: Prince Caspian 6.6
17 Joss Whedon The Avengers 8.1
009
Reindexing Series and DataFrame
2
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
s1 = Series([1,2,3,4],index=['A','B','C','D'])
print(s1)
s1.reindex(index=['A','B','C','D','E'])
A 1
B 2
C 3
D 4
dtype: int64
2
A 1.0
B 2.0
C 3.0
D 4.0
E NaN
dtype: float64
3
s1.reindex(index=['A','B','C','D','E'],fill_value=10)
3
A 1
B 2
C 3
D 4
E 10
dtype: int64
8
# Sparse integer labels (1, 5, 10). Reindexing onto 0..14 with forward fill copies
# the previous known value into each gap; label 0 has nothing before it and stays NaN.
s2 = Series(list('ABC'), index=[1, 5, 10])
print(s2)
dense_labels = np.arange(15)
print(s2.reindex(index=dense_labels, method='ffill'))
1 A
5 B
10 C
dtype: object
0 NaN
1 A
2 A
3 A
4 A
5 B
6 B
7 B
8 B
9 B
10 C
11 C
12 C
13 C
14 C
dtype: object
reindex dataframe
11
df1 = DataFrame(np.random.rand(25).reshape(5,5),index=['A','B','D','E','F'],columns=['c1','c2','c3','c4','c5'])
print(df1)
print(df1.reindex(index=['A','B','C','D','E','F']))
c1 c2 c3 c4 c5
A 0.196816 0.756110 0.409648 0.212659 0.532618
B 0.422182 0.162479 0.530976 0.234258 0.969964
D 0.826811 0.560312 0.192798 0.424592 0.926423
E 0.504550 0.725850 0.911693 0.595041 0.179368
F 0.608460 0.392263 0.203229 0.910608 0.295107
c1 c2 c3 c4 c5
A 0.196816 0.756110 0.409648 0.212659 0.532618
B 0.422182 0.162479 0.530976 0.234258 0.969964
C NaN NaN NaN NaN NaN
D 0.826811 0.560312 0.192798 0.424592 0.926423
E 0.504550 0.725850 0.911693 0.595041 0.179368
F 0.608460 0.392263 0.203229 0.910608 0.295107
12
print(df1.reindex(columns=['c1','c2','c3','c4','c5','c6']))
c1 c2 c3 c4 c5 c6
A 0.196816 0.756110 0.409648 0.212659 0.532618 NaN
B 0.422182 0.162479 0.530976 0.234258 0.969964 NaN
D 0.826811 0.560312 0.192798 0.424592 0.926423 NaN
E 0.504550 0.725850 0.911693 0.595041 0.179368 NaN
F 0.608460 0.392263 0.203229 0.910608 0.295107 NaN
17
# Reindex rows and columns in one call; labels missing from df1 ('C', 'c6') come back as NaN.
print(df1.reindex(index=['A','B','C','D','E','F'],columns=['c1','c2','c3','c4','c5','c6']))
# reindex can also shrink the frame: just pass fewer labels in the list
# s1.drop()
# df1.drop() can remove rows or columns
print(df1.drop('c1',axis=1))  # axis=1 drops a column
print(df1.drop('A',axis=0))  # axis=0 drops a row
c1 c2 c3 c4 c5 c6
A 0.196816 0.756110 0.409648 0.212659 0.532618 NaN
B 0.422182 0.162479 0.530976 0.234258 0.969964 NaN
C NaN NaN NaN NaN NaN NaN
D 0.826811 0.560312 0.192798 0.424592 0.926423 NaN
E 0.504550 0.725850 0.911693 0.595041 0.179368 NaN
F 0.608460 0.392263 0.203229 0.910608 0.295107 NaN
c2 c3 c4 c5
A 0.756110 0.409648 0.212659 0.532618
B 0.162479 0.530976 0.234258 0.969964
D 0.560312 0.192798 0.424592 0.926423
E 0.725850 0.911693 0.595041 0.179368
F 0.392263 0.203229 0.910608 0.295107
c1 c2 c3 c4 c5
B 0.422182 0.162479 0.530976 0.234258 0.969964
D 0.826811 0.560312 0.192798 0.424592 0.926423
E 0.504550 0.725850 0.911693 0.595041 0.179368
F 0.608460 0.392263 0.203229 0.910608 0.295107
010
2
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
# NaN: not a number
n = np.nan
type(n)
2
float
3
m = 1
print(m+n)
nan
NAN in series
5
# A Series with one missing value, at label 'B'.
s1 = Series([1, np.nan, 3, 4], index=list('ABCD'))
missing_mask = s1.isnull()      # True where the value is NaN
print(missing_mask)
without_missing = s1.dropna()   # new Series with the NaN entries removed
print(without_missing)
# TODO: the author's note for this chapter was left unfinished.
A False
B True
C False
D False
dtype: bool
A 1.0
C 3.0
D 4.0
dtype: float64
011
5
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
s1 = Series(np.random.rand(6),index=[['1','1','1','2','2','2'],['a','b','c','a','b','c']])
print(s1)
1 a 0.344513
b 0.525910
c 0.517284
2 a 0.519497
b 0.592509
c 0.185066
dtype: float64
6
type(s1['1'])
print(s1['1']['a'])
0.3445134511367778
7
df1 = s1.unstack()
print(df1)
a b c
1 0.344513 0.525910 0.517284
2 0.519497 0.592509 0.185066
8
df2 = DataFrame([s1['1'],s1['2']])
print(df2)
a b c
0 0.344513 0.525910 0.517284
1 0.519497 0.592509 0.185066
9
s2 = df1.unstack()
print(s2)
#s2 = df1.T.unstack()
a 1 0.344513
2 0.519497
b 1 0.525910
2 0.592509
c 1 0.517284
2 0.185066
dtype: float64
15
# DataFrame with two-level labels on both axes: a list of two label lists per axis
# gives (outer, inner) pairs such as ('a', 1) for rows and ('BJ', 8) for columns.
row_labels = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]]
col_labels = [['BJ', 'BJ', 'SH', 'GZ'], [8, 9, 88, 9]]
values = np.arange(16).reshape(4, 4)
df = DataFrame(values, index=row_labels, columns=col_labels)
print(df)
# Select the outer column label first, then the inner one.
bj_block = df['BJ']
print(bj_block[8])
BJ SH GZ
8 9 88 9
a 1 0 1 2 3
2 4 5 6 7
b 1 8 9 10 11
2 12 13 14 15
a 1 0
2 4
b 1 8
2 12
Name: 8, dtype: int32