数据科学入门与实战：numpy&pandas入门

最新推荐文章于 2024-04-10 07:56:51 发布

范德彪陕西分彪

最新推荐文章于 2024-04-10 07:56:51 发布

阅读量349

点赞数

分类专栏：数据科学入门与实战

本文链接：https://blog.csdn.net/weixin_46815330/article/details/109736146

版权

数据科学入门与实战专栏收录该内容

20 篇文章 0 订阅

订阅专栏

001

1
import numpy as np
#create a Python list
list_1 = [1,2,3,4]
list_1
1
[1, 2, 3, 4]
3
array_1 = np.array(list_1)
array_1
3
array([1, 2, 3, 4])
5
list_2 = [5,6,7,8]
#传入多个列表时，要先用一层方括号把它们包成一个列表，这样得到的是二维数组
array_2 = np.array([list_1,list_2])
array_2
5
array([[1, 2, 3, 4],
       [5, 6, 7, 8]])
6
array_2.size
6
8
7
array_2.dtype
7
dtype('int32')
11
array_3 = np.array([1.0,2.0])
array_3
array_3.dtype
11
dtype('float64')
13
array_4 = np.arange(1,10)
array_4

13
array([1, 2, 3, 4, 5, 6, 7, 8, 9])
14
array_5 = np.arange(1,10,2)
array_5

14
array([1, 3, 5, 7, 9])
15
np.zeros(5)

15
array([0., 0., 0., 0., 0.])
16
np.zeros([5,5])

16
array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])
17
np.eye(5)

17
array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])
18
np.eye(5).dtype

18
dtype('float64')
访问元素

19
a = np.arange(1,10)
a[1]

19
2
20
a[1:5]

20
array([2, 3, 4, 5])
32
b = np.array([[1,2,3],[4,5,6]])
b[1,0]
b[:1,1:2]
32
array([[2]])

002

1
import numpy as np
np.random.randn(10)

1
array([-0.79660703, -0.78265001, -1.34747067,  0.4507426 , -0.18268078,
       -1.47208304,  1.66663589,  2.44104508, -0.5616724 ,  0.20463538])
5
np.random.randint(10,size=(2,3))

5
array([[2, 6, 1],
       [4, 7, 5]])
7
np.random.randint(10,size = 20).reshape(4,5)

7
array([[3, 3, 5, 7, 4],
       [3, 7, 7, 0, 2],
       [2, 8, 6, 8, 6],
       [6, 0, 5, 5, 1]])
数组运算

22
a = np.random.randint(10,size = 20).reshape(4,5)
b = np.random.randint(10,size=20).reshape(4,5)
print('a = ', a)
print('b = ', b)
print('a+b=',a + b)
print('a-b=',a - b)
print('a*b',a*b)#对应位置直接做乘法
print('a/b',a/b)
a =  [[2 0 4 9 3]
 [9 3 3 2 6]
 [7 4 3 1 5]
 [3 8 4 9 5]]
b =  [[9 2 5 8 5]
 [8 0 8 3 1]
 [1 3 0 7 5]
 [9 1 3 1 3]]
a+b= [[11  2  9 17  8]
 [17  3 11  5  7]
 [ 8  7  3  8 10]
 [12  9  7 10  8]]
a-b= [[-7 -2 -1  1 -2]
 [ 1  3 -5 -1  5]
 [ 6  1  3 -6  0]
 [-6  7  1  8  2]]
a*b [[18  0 20 72 15]
 [72  0 24  6  6]
 [ 7 12  0  7 25]
 [27  8 12  9 15]]
a/b [[0.22222222 0.         0.8        1.125      0.6       ]
 [1.125             inf 0.375      0.66666667 6.        ]
 [7.         1.33333333        inf 0.14285714 1.        ]
 [0.33333333 8.         1.33333333 9.         1.66666667]]
<ipython-input-22-0209bc988bb6>:8: RuntimeWarning: divide by zero encountered in true_divide
  print('a/b',a/b)
23
np.mat(a)

23
matrix([[2, 0, 4, 9, 3],
        [9, 3, 3, 2, 6],
        [7, 4, 3, 1, 5],
        [3, 8, 4, 9, 5]])
矩阵的运算

26
a = np.matrix(a)
b = np.matrix(b)
print('a+b=',a + b)
print('a-b=',a - b)

a+b= [[11  2  9 17  8]
 [17  3 11  5  7]
 [ 8  7  3  8 10]
 [12  9  7 10  8]]
a-b= [[-7 -2 -1  1 -2]
 [ 1  3 -5 -1  5]
 [ 6  1  3 -6  0]
 [-6  7  1  8  2]]
Array常用函数

45
a = np.random.randint(10,size = 20).reshape(4,5)
print(a)
print(np.unique(a))
print(sum(a))
print(sum(a[0]))#求第一行的和
print(sum(a[:,0]))#求第一列的和
print(a.max())#求a中的最大值
print(max(a[0]))#第一行的最大值
print(max(a[:,0]))#第一列的最大值
[[5 5 6 6 7]
 [1 1 0 6 5]
 [4 9 3 1 8]
 [0 8 1 4 2]]
[0 1 2 3 4 5 6 7 8 9]
[10 23 10 17 22]
29
10
9
7
5

003


使用pickle序列化numpy array

16
import pickle
import numpy as np
x = np.arange(10)
#x
f = open('x.pkl','wb')
pickle.dump(x,f)
f.close()
f = open('x.pkl','rb')
pickle.load(f)

16
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
更方便的方法

18
np.save('one_array',x)
np.load('one_array.npy')
18
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
27
y = np.arange(20)
y
np.savez('two_array.npz',a=x,b=y)#文件名不带.npz时savez会自动补上后缀；np.load时必须写带.npz的完整文件名
c = np.load('two_array.npz')
print(c['a'])
print(c['b'])
[0 1 2 3 4 5 6 7 8 9]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]

004

pandas 的两大主要数据结构是 Series 和 DataFrame。其中 Series 是带标签的一维数组，可存储整数、浮点数、字符串、Python 对象等类型的数据。

3
import pandas as pd
import numpy as np
s1 = pd.Series([1,2,3,4])
print(s1)
print(s1.values)
0    1
1    2
2    3
3    4
dtype: int64
[1 2 3 4]
4
s1.index

4
RangeIndex(start=0, stop=4, step=1)
6
#通过numpy创建Series
s2 = pd.Series(np.arange(10))
s2
6
0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32
通过字典创建

9
s3 = pd.Series({'我':1,'是':2})
print(s3)
print(s3.values)
我    1
是    2
dtype: int64
[1 2]
16
s4 = pd.Series([1,2,3,4],index=['A','B','C','D'])
print(s4)
print(s4.values)
print(s4.index)
print(s4['A'])
print(s4[s4>2])
print(s4.to_dict())
print(s4.to_csv())
A    1
B    2
C    3
D    4
dtype: int64
[1 2 3 4]
Index(['A', 'B', 'C', 'D'], dtype='object')
1
C    3
D    4
dtype: int64
{'A': 1, 'B': 2, 'C': 3, 'D': 4}
,0
A,1
B,2
C,3
D,4

18
index_1 = ['A','B','C','D','E','F']
s6 = pd.Series(s4,index=index_1)
print(s6)

A    1.0
B    2.0
C    3.0
D    4.0
E    NaN
F    NaN
dtype: float64
19
pd.isnull(s6)

19
A    False
B    False
C    False
D    False
E     True
F     True
dtype: bool
20
pd.notnull(s6)


20
A     True
B     True
C     True
D     True
E    False
F    False
dtype: bool
32
s6.name = 'demo1'
print(s6)
s6.index.name = 'demo index1'
print(s6)
print(s6.name)
print(s6.index)
print(s6.values)
demo index1
A    1.0
B    2.0
C    3.0
D    4.0
E    NaN
F    NaN
Name: demo1, dtype: float64
demo index1
A    1.0
B    2.0
C    3.0
D    4.0
E    NaN
F    NaN
Name: demo1, dtype: float64
demo1
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object', name='demo index1')
[ 1.  2.  3.  4. nan nan]

005

9
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

import webbrowser
link = 'https://www.tiobe.com/tiobe-index/'
webbrowser.open(link)
9
True
21
df = pd.read_clipboard()
print(df)
print(type(df))
print(df.columns)
print(df.Ratings)
print('-'*1000)
df_new = DataFrame(df,columns=['Programming Language','Nov 2020'])
print(df_new)
print('-'*100)
print(df['Nov 2019'])
#print(type(df['Nov 2019']))
    Nov 2020  Nov 2019  Change  Programming Language Ratings Change.1
0          1         2  change                     C  16.21%   +0.17%
1          2         3  change                Python  12.12%   +2.27%
2          3         1  change                  Java  11.68%   -4.57%
3          4         4     NaN                   C++   7.60%   +1.99%
4          5         5     NaN                    C#   4.67%   +0.36%
5          6         6     NaN          Visual Basic   4.01%   -0.22%
6          7         7     NaN            JavaScript   2.03%   +0.10%
7          8         8     NaN                   PHP   1.79%   +0.07%
8          9        16  change                     R   1.64%   +0.66%
9         10         9  change                   SQL   1.54%   -0.15%
10        11        14  change                Groovy   1.51%   +0.41%
11        12        21  change                  Perl   1.51%   +0.68%
12        13        20  change                    Go   1.36%   +0.51%
13        14        10  change                 Swift   1.35%   -0.31%
14        15        11  change                  Ruby   1.22%   -0.04%
15        16        15  change     Assembly language   1.17%   +0.14%
16        17        19  change                MATLAB   1.10%   +0.21%
17        18        13  change  Delphi/Object Pascal   0.86%   -0.28%
18        19        12  change           Objective-C   0.84%   -0.35%
19        20        32  change                   NaN     NaN      NaN
<class 'pandas.core.frame.DataFrame'>
Index(['Nov 2020', 'Nov 2019', 'Change', 'Programming Language', 'Ratings',
       'Change.1'],
      dtype='object')
0     16.21%
1     12.12%
2     11.68%
3      7.60%
4      4.67%
5      4.01%
6      2.03%
7      1.79%
8      1.64%
9      1.54%
10     1.51%
11     1.51%
12     1.36%
13     1.35%
14     1.22%
15     1.17%
16     1.10%
17     0.86%
18     0.84%
19       NaN
Name: Ratings, dtype: object
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    Programming Language  Nov 2020
0                      C         1
1                 Python         2
2                   Java         3
3                    C++         4
4                     C#         5
5           Visual Basic         6
6             JavaScript         7
7                    PHP         8
8                      R         9
9                    SQL        10
10                Groovy        11
11                  Perl        12
12                    Go        13
13                 Swift        14
14                  Ruby        15
15     Assembly language        16
16                MATLAB        17
17  Delphi/Object Pascal        18
18           Objective-C        19
19                   NaN        20
----------------------------------------------------------------------------------------------------
0      2
1      3
2      1
3      4
4      5
5      6
6      7
7      8
8     16
9      9
10    14
11    21
12    20
13    10
14    11
15    15
16    19
17    13
18    12
19    32
Name: Nov 2019, dtype: int64
23
print(type(df['Nov 2019']))
#添加一个列
df_new = DataFrame(df,columns=['Programming Language','Nov 2020','Sep 2020'])
print(df_new)
<class 'pandas.core.series.Series'>
    Programming Language  Nov 2020  Sep 2020
0                      C         1       NaN
1                 Python         2       NaN
2                   Java         3       NaN
3                    C++         4       NaN
4                     C#         5       NaN
5           Visual Basic         6       NaN
6             JavaScript         7       NaN
7                    PHP         8       NaN
8                      R         9       NaN
9                    SQL        10       NaN
10                Groovy        11       NaN
11                  Perl        12       NaN
12                    Go        13       NaN
13                 Swift        14       NaN
14                  Ruby        15       NaN
15     Assembly language        16       NaN
16                MATLAB        17       NaN
17  Delphi/Object Pascal        18       NaN
18           Objective-C        19       NaN
19                   NaN        20       NaN
24
#给新加的列赋值
df_new['Sep 2020'] = np.arange(20)
print(df_new)

    Programming Language  Nov 2020  Sep 2020
0                      C         1         0
1                 Python         2         1
2                   Java         3         2
3                    C++         4         3
4                     C#         5         4
5           Visual Basic         6         5
6             JavaScript         7         6
7                    PHP         8         7
8                      R         9         8
9                    SQL        10         9
10                Groovy        11        10
11                  Perl        12        11
12                    Go        13        12
13                 Swift        14        13
14                  Ruby        15        14
15     Assembly language        16        15
16                MATLAB        17        16
17  Delphi/Object Pascal        18        17
18           Objective-C        19        18
19                   NaN        20        19
29
df_new['Sep 2020'] = pd.Series(np.arange(20))
print(df_new)
    Programming Language  Nov 2020  Sep 2020
0                      C         1         0
1                 Python         2         1
2                   Java         3         2
3                    C++         4         3
4                     C#         5         4
5           Visual Basic         6         5
6             JavaScript         7         6
7                    PHP         8         7
8                      R         9         8
9                    SQL        10         9
10                Groovy        11        10
11                  Perl        12        11
12                    Go        13        12
13                 Swift        14        13
14                  Ruby        15        14
15     Assembly language        16        15
16                MATLAB        17        16
17  Delphi/Object Pascal        18        17
18           Objective-C        19        18
19                   NaN        20        19
31
#修改列里面的值：用带索引的Series赋值时会按索引对齐，没有对应索引的行变为NaN
df_new['Sep 2020'] = pd.Series([100,200],index=[1,2])
print(df_new)
    Programming Language  Nov 2020  Sep 2020
0                      C         1       NaN
1                 Python         2     100.0
2                   Java         3     200.0
3                    C++         4       NaN
4                     C#         5       NaN
5           Visual Basic         6       NaN
6             JavaScript         7       NaN
7                    PHP         8       NaN
8                      R         9       NaN
9                    SQL        10       NaN
10                Groovy        11       NaN
11                  Perl        12       NaN
12                    Go        13       NaN
13                 Swift        14       NaN
14                  Ruby        15       NaN
15     Assembly language        16       NaN
16                MATLAB        17       NaN
17  Delphi/Object Pascal        18       NaN
18           Objective-C        19       NaN
19                   NaN        20       NaN

006


深入理解Series和DataFrame

Series

4
import pandas as pd
import numpy as np
data = {'Country':['China','Japan','French'],
        'Capital':['Beijing','Tokyo','Paris'],
        'Population':['15151561','9879446','448465']}

s1 = pd.Series(data['Country'],index=['A','B','C'])
print(s1)
print(s1.values)
print(s1.index)
A     China
B     Japan
C    French
dtype: object
['China' 'Japan' 'French']
Index(['A', 'B', 'C'], dtype='object')
DataFrame

7
df1 = pd.DataFrame(data)
print(df1)
cou = df1['Country']
print(type(cou))
  Country  Capital Population
0   China  Beijing   15151561
1   Japan    Tokyo    9879446
2  French    Paris     448465
<class 'pandas.core.series.Series'>
11
print(df1.iterrows())
for row in df1.iterrows():
    print(row),print(type(row)),print(len(row))

<generator object DataFrame.iterrows at 0x000002421ABA0900>
(0, Country          China
Capital        Beijing
Population    15151561
Name: 0, dtype: object)
<class 'tuple'>
2
(1, Country         Japan
Capital         Tokyo
Population    9879446
Name: 1, dtype: object)
<class 'tuple'>
2
(2, Country       French
Capital        Paris
Population    448465
Name: 2, dtype: object)
<class 'tuple'>
2
38
data = {'Country':['China','Japan','French'],
        'Capital':['Beijing','Tokyo','Paris'],
        'Population':['15151561','9879446','448465']}
print(data)
s1 = pd.Series(data['Country'])
s2 = pd.Series(data['Capital'])
s3 = pd.Series(data['Population'])
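#注意：用Series列表构造DataFrame时，每个Series是一行，列标签是0、1、2。
#构造时传columns=只会按标签挑选已有的列而不是重命名，标签对不上时数据就全部变成NaN；
#想改列名应在构造之后给df_new.columns赋值，或者像下面这样用index=命名各行再转置。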
df_new = pd.DataFrame([s1,s2,s3],index=['Country','Capital','Population'])
print('-'*100)
print(df_new)
print(df1)
print(df_new)
print(df_new.T)


# df_new = df_new.T
# print(df_new)
{'Country': ['China', 'Japan', 'French'], 'Capital': ['Beijing', 'Tokyo', 'Paris'], 'Population': ['15151561', '9879446', '448465']}
----------------------------------------------------------------------------------------------------
                   0        1       2
Country        China    Japan  French
Capital      Beijing    Tokyo   Paris
Population  15151561  9879446  448465
  Country  Capital Population
0   China  Beijing   15151561
1   Japan    Tokyo    9879446
2  French    Paris     448465
                   0        1       2
Country        China    Japan  French
Capital      Beijing    Tokyo   Paris
Population  15151561  9879446  448465
  Country  Capital Population
0   China  Beijing   15151561
1   Japan    Tokyo    9879446
2  French    Paris     448465

007


Dataframe IO

10
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import lxml
import webbrowser
link = 'https://pandas.pydata.org/pandas-docs/version/0.20/io.html'
webbrowser.open(link)

Empty DataFrame
Columns: [print(df1)]
Index: []
19
df1 = pd.read_clipboard()
print(df1)
df1.to_clipboard()
#不想有index
df1.to_csv('df1.csv',index=False)
   Format Type      Data Description          Reader        Writer
0         text                   CSV        read_csv        to_csv
1         text                  JSON       read_json       to_json
2         text                  HTML       read_html       to_html
3         text       Local clipboard  read_clipboard  to_clipboard
4       binary              MS Excel      read_excel      to_excel
5       binary           HDF5 Format        read_hdf        to_hdf
6       binary        Feather Format    read_feather    to_feather
7       binary               Msgpack    read_msgpack    to_msgpack
8       binary                 Stata      read_stata      to_stata
9       binary                   SAS        read_sas              
10      binary  Python Pickle Format     read_pickle     to_pickle
11         SQL                   SQL        read_sql        to_sql
12         SQL      Google Big Query        read_gbq        to_gbq
20
#csv读取
df2 = pd.read_csv('df1.csv')
print(df2)

   Format Type      Data Description          Reader        Writer
0         text                   CSV        read_csv        to_csv
1         text                  JSON       read_json       to_json
2         text                  HTML       read_html       to_html
3         text       Local clipboard  read_clipboard  to_clipboard
4       binary              MS Excel      read_excel      to_excel
5       binary           HDF5 Format        read_hdf        to_hdf
6       binary        Feather Format    read_feather    to_feather
7       binary               Msgpack    read_msgpack    to_msgpack
8       binary                 Stata      read_stata      to_stata
9       binary                   SAS        read_sas              
10      binary  Python Pickle Format     read_pickle     to_pickle
11         SQL                   SQL        read_sql        to_sql
12         SQL      Google Big Query        read_gbq        to_gbq
50
print('json')
pd.read_json(df1.to_json())
json
50
Format Type	Data Description	Reader	Writer
0	text	CSV	read_csv	to_csv
1	text	JSON	read_json	to_json
2	text	HTML	read_html	to_html
3	text	Local clipboard	read_clipboard	to_clipboard
4	binary	MS Excel	read_excel	to_excel
5	binary	HDF5 Format	read_hdf	to_hdf
6	binary	Feather Format	read_feather	to_feather
7	binary	Msgpack	read_msgpack	to_msgpack
8	binary	Stata	read_stata	to_stata
9	binary	SAS	read_sas	
10	binary	Python Pickle Format	read_pickle	to_pickle
11	SQL	SQL	read_sql	to_sql
12	SQL	Google Big Query	read_gbq	to_gbq
55
print(df1)
df1.to_html('df1.html')
   Format Type      Data Description          Reader        Writer
0         text                   CSV        read_csv        to_csv
1         text                  JSON       read_json       to_json
2         text                  HTML       read_html       to_html
3         text       Local clipboard  read_clipboard  to_clipboard
4       binary              MS Excel      read_excel      to_excel
5       binary           HDF5 Format        read_hdf        to_hdf
6       binary        Feather Format    read_feather    to_feather
7       binary               Msgpack    read_msgpack    to_msgpack
8       binary                 Stata      read_stata      to_stata
9       binary                   SAS        read_sas              
10      binary  Python Pickle Format     read_pickle     to_pickle
11         SQL                   SQL        read_sql        to_sql
12         SQL      Google Big Query        read_gbq        to_gbq
59
#这块有问题：原因见下面的报错信息——pandas没有找到lxml，需要先安装（pip install lxml），安装后如仍报错可重启内核再试
pd.read_html('file://F://learn//datescience_learn//df1.html')
---------------------------------------------------------------------------

ImportError                               Traceback (most recent call last)

<ipython-input-59-0383a773205d> in <module>
----> 1 pd.read_html('file://F://learn//datescience_learn//df1.html')

d:\python38\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
    294                 )
    295                 warnings.warn(msg, FutureWarning, stacklevel=stacklevel)
--> 296             return func(*args, **kwargs)
    297 
    298         return wrapper

d:\python38\lib\site-packages\pandas\io\html.py in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only)
   1084         )
   1085     validate_header_arg(header)
-> 1086     return _parse(
   1087         flavor=flavor,
   1088         io=io,

d:\python38\lib\site-packages\pandas\io\html.py in _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs)
    892     retained = None
    893     for flav in flavor:
--> 894         parser = _parser_dispatch(flav)
    895         p = parser(io, compiled_match, attrs, encoding, displayed_only)
    896 

d:\python38\lib\site-packages\pandas\io\html.py in _parser_dispatch(flavor)
    849     else:
    850         if not _HAS_LXML:
--> 851             raise ImportError("lxml not found, please install it")
    852     return _valid_parsers[flavor]
    853 

ImportError: lxml not found, please install it

73
df1
#依然错误（这里没有记录报错信息；写.xlsx文件通常需要先安装openpyxl，待确认）
#df1.to_excel('df1.xlsx')
73
Format Type	Data Description	Reader	Writer
0	text	CSV	read_csv	to_csv
1	text	JSON	read_json	to_json
2	text	HTML	read_html	to_html
3	text	Local clipboard	read_clipboard	to_clipboard
4	binary	MS Excel	read_excel	to_excel
5	binary	HDF5 Format	read_hdf	to_hdf
6	binary	Feather Format	read_feather	to_feather
7	binary	Msgpack	read_msgpack	to_msgpack
8	binary	Stata	read_stata	to_stata
9	binary	SAS	read_sas	
10	binary	Python Pickle Format	read_pickle	to_pickle
11	SQL	SQL	read_sql	to_sql
12	SQL	Google Big Query	read_gbq	to_gbq

008

loc和iloc

2
import numpy as np
import pandas as pd

imdb = pd.read_csv('movie_metadata.csv')
print(imdb)
      color      director_name  num_critic_for_reviews  duration  \
0     Color      James Cameron                   723.0     178.0   
1     Color     Gore Verbinski                   302.0     169.0   
2     Color         Sam Mendes                   602.0     148.0   
3     Color  Christopher Nolan                   813.0     164.0   
4       NaN        Doug Walker                     NaN       NaN   
...     ...                ...                     ...       ...   
5038  Color        Scott Smith                     1.0      87.0   
5039  Color                NaN                    43.0      43.0   
5040  Color   Benjamin Roberds                    13.0      76.0   
5041  Color        Daniel Hsia                    14.0     100.0   
5042  Color           Jon Gunn                    43.0      90.0   

      director_facebook_likes  actor_3_facebook_likes      actor_2_name  \
0                         0.0                   855.0  Joel David Moore   
1                       563.0                  1000.0     Orlando Bloom   
2                         0.0                   161.0      Rory Kinnear   
3                     22000.0                 23000.0    Christian Bale   
4                       131.0                     NaN        Rob Walker   
...                       ...                     ...               ...   
5038                      2.0                   318.0     Daphne Zuniga   
5039                      NaN                   319.0     Valorie Curry   
5040                      0.0                     0.0     Maxwell Moody   
5041                      0.0                   489.0     Daniel Henney   
5042                     16.0                    16.0  Brian Herzlinger   

      actor_1_facebook_likes        gross                           genres  \
0                     1000.0  760505847.0  Action|Adventure|Fantasy|Sci-Fi   
1                    40000.0  309404152.0         Action|Adventure|Fantasy   
2                    11000.0  200074175.0        Action|Adventure|Thriller   
3                    27000.0  448130642.0                  Action|Thriller   
4                      131.0          NaN                      Documentary   
...                      ...          ...                              ...   
5038                   637.0          NaN                     Comedy|Drama   
5039                   841.0          NaN     Crime|Drama|Mystery|Thriller   
5040                     0.0          NaN            Drama|Horror|Thriller   
5041                   946.0      10443.0             Comedy|Drama|Romance   
5042                    86.0      85222.0                      Documentary   

      ... num_user_for_reviews language  country  content_rating       budget  \
0     ...               3054.0  English      USA           PG-13  237000000.0   
1     ...               1238.0  English      USA           PG-13  300000000.0   
2     ...                994.0  English       UK           PG-13  245000000.0   
3     ...               2701.0  English      USA           PG-13  250000000.0   
4     ...                  NaN      NaN      NaN             NaN          NaN   
...   ...                  ...      ...      ...             ...          ...   
5038  ...                  6.0  English   Canada             NaN          NaN   
5039  ...                359.0  English      USA           TV-14          NaN   
5040  ...                  3.0  English      USA             NaN       1400.0   
5041  ...                  9.0  English      USA           PG-13          NaN   
5042  ...                 84.0  English      USA              PG       1100.0   

      title_year actor_2_facebook_likes imdb_score  aspect_ratio  \
0         2009.0                  936.0        7.9          1.78   
1         2007.0                 5000.0        7.1          2.35   
2         2015.0                  393.0        6.8          2.35   
3         2012.0                23000.0        8.5          2.35   
4            NaN                   12.0        7.1           NaN   
...          ...                    ...        ...           ...   
5038      2013.0                  470.0        7.7           NaN   
5039         NaN                  593.0        7.5         16.00   
5040      2013.0                    0.0        6.3           NaN   
5041      2012.0                  719.0        6.3          2.35   
5042      2004.0                   23.0        6.6          1.85   

     movie_facebook_likes  
0                   33000  
1                       0  
2                   85000  
3                  164000  
4                       0  
...                   ...  
5038                   84  
5039                32000  
5040                   16  
5041                  660  
5042                  456  

[5043 rows x 28 columns]
4
print(imdb.shape)
print(imdb.tail())
print(imdb.head())#head()和tail()都可以传入要显示的行数，默认是5

(5043, 28)
      color     director_name  num_critic_for_reviews  duration  \
5038  Color       Scott Smith                     1.0      87.0   
5039  Color               NaN                    43.0      43.0   
5040  Color  Benjamin Roberds                    13.0      76.0   
5041  Color       Daniel Hsia                    14.0     100.0   
5042  Color          Jon Gunn                    43.0      90.0   

      director_facebook_likes  actor_3_facebook_likes      actor_2_name  \
5038                      2.0                   318.0     Daphne Zuniga   
5039                      NaN                   319.0     Valorie Curry   
5040                      0.0                     0.0     Maxwell Moody   
5041                      0.0                   489.0     Daniel Henney   
5042                     16.0                    16.0  Brian Herzlinger   

      actor_1_facebook_likes    gross                        genres  ...  \
5038                   637.0      NaN                  Comedy|Drama  ...   
5039                   841.0      NaN  Crime|Drama|Mystery|Thriller  ...   
5040                     0.0      NaN         Drama|Horror|Thriller  ...   
5041                   946.0  10443.0          Comedy|Drama|Romance  ...   
5042                    86.0  85222.0                   Documentary  ...   

     num_user_for_reviews language  country  content_rating  budget  \
5038                  6.0  English   Canada             NaN     NaN   
5039                359.0  English      USA           TV-14     NaN   
5040                  3.0  English      USA             NaN  1400.0   
5041                  9.0  English      USA           PG-13     NaN   
5042                 84.0  English      USA              PG  1100.0   

      title_year actor_2_facebook_likes imdb_score  aspect_ratio  \
5038      2013.0                  470.0        7.7           NaN   
5039         NaN                  593.0        7.5         16.00   
5040      2013.0                    0.0        6.3           NaN   
5041      2012.0                  719.0        6.3          2.35   
5042      2004.0                   23.0        6.6          1.85   

     movie_facebook_likes  
5038                   84  
5039                32000  
5040                   16  
5041                  660  
5042                  456  

[5 rows x 28 columns]
   color      director_name  num_critic_for_reviews  duration  \
0  Color      James Cameron                   723.0     178.0   
1  Color     Gore Verbinski                   302.0     169.0   
2  Color         Sam Mendes                   602.0     148.0   
3  Color  Christopher Nolan                   813.0     164.0   
4    NaN        Doug Walker                     NaN       NaN   

   director_facebook_likes  actor_3_facebook_likes      actor_2_name  \
0                      0.0                   855.0  Joel David Moore   
1                    563.0                  1000.0     Orlando Bloom   
2                      0.0                   161.0      Rory Kinnear   
3                  22000.0                 23000.0    Christian Bale   
4                    131.0                     NaN        Rob Walker   

   actor_1_facebook_likes        gross                           genres  ...  \
0                  1000.0  760505847.0  Action|Adventure|Fantasy|Sci-Fi  ...   
1                 40000.0  309404152.0         Action|Adventure|Fantasy  ...   
2                 11000.0  200074175.0        Action|Adventure|Thriller  ...   
3                 27000.0  448130642.0                  Action|Thriller  ...   
4                   131.0          NaN                      Documentary  ...   

  num_user_for_reviews language  country  content_rating       budget  \
0               3054.0  English      USA           PG-13  237000000.0   
1               1238.0  English      USA           PG-13  300000000.0   
2                994.0  English       UK           PG-13  245000000.0   
3               2701.0  English      USA           PG-13  250000000.0   
4                  NaN      NaN      NaN             NaN          NaN   

   title_year actor_2_facebook_likes imdb_score  aspect_ratio  \
0      2009.0                  936.0        7.9          1.78   
1      2007.0                 5000.0        7.1          2.35   
2      2015.0                  393.0        6.8          2.35   
3      2012.0                23000.0        8.5          2.35   
4         NaN                   12.0        7.1           NaN   

  movie_facebook_likes  
0                33000  
1                    0  
2                85000  
3               164000  
4                    0  

[5 rows x 28 columns]
7
print(imdb[['color','director_name']])#两个中括号

      color      director_name
0     Color      James Cameron
1     Color     Gore Verbinski
2     Color         Sam Mendes
3     Color  Christopher Nolan
4       NaN        Doug Walker
...     ...                ...
5038  Color        Scott Smith
5039  Color                NaN
5040  Color   Benjamin Roberds
5041  Color        Daniel Hsia
5042  Color           Jon Gunn

[5043 rows x 2 columns]
11
sub_df = imdb[['director_name','movie_title','imdb_score']]
print(sub_df)
print(sub_df.head(5))
          director_name                                        movie_title  \
0         James Cameron                                            Avatar    
1        Gore Verbinski          Pirates of the Caribbean: At World's End    
2            Sam Mendes                                           Spectre    
3     Christopher Nolan                             The Dark Knight Rises    
4           Doug Walker  Star Wars: Episode VII - The Force Awakens    ...   
...                 ...                                                ...   
5038        Scott Smith                           Signed Sealed Delivered    
5039                NaN                         The Following                
5040   Benjamin Roberds                              A Plague So Pleasant    
5041        Daniel Hsia                                  Shanghai Calling    
5042           Jon Gunn                                 My Date with Drew    

      imdb_score  
0            7.9  
1            7.1  
2            6.8  
3            8.5  
4            7.1  
...          ...  
5038         7.7  
5039         7.5  
5040         6.3  
5041         6.3  
5042         6.6  

[5043 rows x 3 columns]
       director_name                                        movie_title  \
0      James Cameron                                            Avatar    
1     Gore Verbinski          Pirates of the Caribbean: At World's End    
2         Sam Mendes                                           Spectre    
3  Christopher Nolan                             The Dark Knight Rises    
4        Doug Walker  Star Wars: Episode VII - The Force Awakens    ...   

   imdb_score  
0         7.9  
1         7.1  
2         6.8  
3         8.5  
4         7.1  
21
# Filter rows by index.
# iloc is positional and end-exclusive: this keeps rows 10..19, all columns.
tem_df = sub_df.iloc[10:20]
print(tem_df)
print('-' * 100)
# Same ten rows, restricted to the first two columns by position.
print(sub_df.iloc[10:20, :2])
# loc is label-based and includes both endpoints: labels 15, 16 and 17.
label_range = slice(15, 17)
print(tem_df.loc[label_range, :])
print(tem_df.loc[label_range, 'movie_title'])

       director_name                                   movie_title  imdb_score
10       Zack Snyder           Batman v Superman: Dawn of Justice          6.9
11      Bryan Singer                             Superman Returns          6.1
12      Marc Forster                            Quantum of Solace          6.7
13    Gore Verbinski   Pirates of the Caribbean: Dead Man's Chest          7.3
14    Gore Verbinski                              The Lone Ranger          6.5
15       Zack Snyder                                 Man of Steel          7.2
16    Andrew Adamson     The Chronicles of Narnia: Prince Caspian          6.6
17       Joss Whedon                                 The Avengers          8.1
18      Rob Marshall  Pirates of the Caribbean: On Stranger Tides          6.7
19  Barry Sonnenfeld                               Men in Black 3          6.8
----------------------------------------------------------------------------------------------------
       director_name                                   movie_title
10       Zack Snyder           Batman v Superman: Dawn of Justice 
11      Bryan Singer                             Superman Returns 
12      Marc Forster                            Quantum of Solace 
13    Gore Verbinski   Pirates of the Caribbean: Dead Man's Chest 
14    Gore Verbinski                              The Lone Ranger 
15       Zack Snyder                                 Man of Steel 
16    Andrew Adamson     The Chronicles of Narnia: Prince Caspian 
17       Joss Whedon                                 The Avengers 
18      Rob Marshall  Pirates of the Caribbean: On Stranger Tides 
19  Barry Sonnenfeld                               Men in Black 3 
     director_name                                movie_title  imdb_score
15     Zack Snyder                              Man of Steel          7.2
16  Andrew Adamson  The Chronicles of Narnia: Prince Caspian          6.6
17     Joss Whedon                              The Avengers          8.1
15                                Man of Steel
16    The Chronicles of Narnia: Prince Caspian
17                                The Avengers
Name: movie_title, dtype: object

009



Reindexing Series and DataFrame

2
import numpy as np
import pandas as pd
from pandas import Series,DataFrame


# A small integer Series labelled 'A'..'D'.
s1 = Series([1, 2, 3, 4], index=list('ABCD'))
print(s1)
# Reindexing with an extra label 'E' adds a NaN entry for it, which also
# turns the dtype from int64 into float64.
s1.reindex(index=list('ABCDE'))
A    1
B    2
C    3
D    4
dtype: int64
2
A    1.0
B    2.0
C    3.0
D    4.0
E    NaN
dtype: float64
3
# Supplying fill_value puts 10 in the new 'E' slot instead of NaN, so the
# result keeps its integer dtype.
s1.reindex(index=list('ABCDE'), fill_value=10)

3
A     1
B     2
C     3
D     4
E    10
dtype: int64
8
# A sparse Series that only has values at positions 1, 5 and 10.
s2 = Series(['A', 'B', 'C'], index=[1, 5, 10])
print(s2)
# Reindex onto 0..14 with forward fill: each gap takes the last preceding
# value; position 0 has nothing before it and stays NaN.
dense_index = np.arange(15)
print(s2.reindex(index=dense_index, method='ffill'))

1     A
5     B
10    C
dtype: object
0     NaN
1       A
2       A
3       A
4       A
5       B
6       B
7       B
8       B
9       B
10      C
11      C
12      C
13      C
14      C
dtype: object
Reindexing a DataFrame

11
# 5x5 frame of random values; note that row label 'C' is deliberately absent.
row_labels = ['A', 'B', 'D', 'E', 'F']
col_labels = ['c1', 'c2', 'c3', 'c4', 'c5']
df1 = DataFrame(
    np.random.rand(25).reshape(5, 5),
    index=row_labels,
    columns=col_labels,
)
print(df1)
# Reindexing the rows with 'C' included inserts an all-NaN row for it.
print(df1.reindex(index=list('ABCDEF')))
         c1        c2        c3        c4        c5
A  0.196816  0.756110  0.409648  0.212659  0.532618
B  0.422182  0.162479  0.530976  0.234258  0.969964
D  0.826811  0.560312  0.192798  0.424592  0.926423
E  0.504550  0.725850  0.911693  0.595041  0.179368
F  0.608460  0.392263  0.203229  0.910608  0.295107
         c1        c2        c3        c4        c5
A  0.196816  0.756110  0.409648  0.212659  0.532618
B  0.422182  0.162479  0.530976  0.234258  0.969964
C       NaN       NaN       NaN       NaN       NaN
D  0.826811  0.560312  0.192798  0.424592  0.926423
E  0.504550  0.725850  0.911693  0.595041  0.179368
F  0.608460  0.392263  0.203229  0.910608  0.295107
12
# Reindexing the columns with an unknown label 'c6' appends an all-NaN column.
extended_columns = ['c1', 'c2', 'c3', 'c4', 'c5', 'c6']
print(df1.reindex(columns=extended_columns))
         c1        c2        c3        c4        c5  c6
A  0.196816  0.756110  0.409648  0.212659  0.532618 NaN
B  0.422182  0.162479  0.530976  0.234258  0.969964 NaN
D  0.826811  0.560312  0.192798  0.424592  0.926423 NaN
E  0.504550  0.725850  0.911693  0.595041  0.179368 NaN
F  0.608460  0.392263  0.203229  0.910608  0.295107 NaN
17
# Reindex rows and columns in a single call: the new row 'C' and the new
# column 'c6' are filled with NaN.
all_rows = list('ABCDEF')
all_columns = ['c1', 'c2', 'c3', 'c4', 'c5', 'c6']
print(df1.reindex(index=all_rows, columns=all_columns))
# Reindexing can also shrink the data: pass a label list with fewer entries.
# Series.drop() / DataFrame.drop() remove entries by label instead;
# on a DataFrame they can delete either rows or columns.
print(df1.drop(columns='c1'))
print(df1.drop(index='A'))
         c1        c2        c3        c4        c5  c6
A  0.196816  0.756110  0.409648  0.212659  0.532618 NaN
B  0.422182  0.162479  0.530976  0.234258  0.969964 NaN
C       NaN       NaN       NaN       NaN       NaN NaN
D  0.826811  0.560312  0.192798  0.424592  0.926423 NaN
E  0.504550  0.725850  0.911693  0.595041  0.179368 NaN
F  0.608460  0.392263  0.203229  0.910608  0.295107 NaN
         c2        c3        c4        c5
A  0.756110  0.409648  0.212659  0.532618
B  0.162479  0.530976  0.234258  0.969964
D  0.560312  0.192798  0.424592  0.926423
E  0.725850  0.911693  0.595041  0.179368
F  0.392263  0.203229  0.910608  0.295107
         c1        c2        c3        c4        c5
B  0.422182  0.162479  0.530976  0.234258  0.969964
D  0.826811  0.560312  0.192798  0.424592  0.926423
E  0.504550  0.725850  0.911693  0.595041  0.179368
F  0.608460  0.392263  0.203229  0.910608  0.295107

010

2
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
# NaN stands for "not a number"; NumPy exposes it as np.nan.
n = np.nan
# The notebook displays the result of this expression: NaN is a plain float.
type(n)
2
float
3
# NaN propagates through arithmetic: adding 1 to it still prints nan.
m = 1
total = m + n
print(total)

nan
NaN in Series

5
# A Series with one missing value at label 'B'.
s1 = Series([1, np.nan, 3, 4], index=list('ABCD'))
# isnull() gives a boolean mask that is True where the value is NaN.
missing_mask = s1.isnull()
print(missing_mask)
# dropna() returns a copy without the NaN entries.
print(s1.dropna())
# NOTE: the original author's remark on this chapter was left unfinished.
A    False
B     True
C    False
D    False
dtype: bool
A    1.0
C    3.0
D    4.0
dtype: float64

011

5
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

# Passing a list of two label lists as the index builds a two-level index:
# outer labels '1'/'2', inner labels 'a'/'b'/'c'.
outer_labels = ['1', '1', '1', '2', '2', '2']
inner_labels = ['a', 'b', 'c', 'a', 'b', 'c']
s1 = Series(np.random.rand(6), index=[outer_labels, inner_labels])
print(s1)
1  a    0.344513
   b    0.525910
   c    0.517284
2  a    0.519497
   b    0.592509
   c    0.185066
dtype: float64
6
# Indexing with an outer label returns the Series of that group.
group_one = s1['1']
# This expression is not the last line of the cell, so its value is not shown.
type(group_one)
# Index again with an inner label to reach a single scalar.
print(group_one['a'])
0.3445134511367778
7
# unstack() pivots the innermost index level (the default, level=-1) into
# columns, turning the two-level Series into a DataFrame.
df1 = s1.unstack(level=-1)
print(df1)
          a         b         c
1  0.344513  0.525910  0.517284
2  0.519497  0.592509  0.185066
8
# Build a similar table by hand: one row per outer label. The rows get the
# default positional labels 0 and 1 instead of '1' and '2'.
df2 = DataFrame([s1[outer] for outer in ('1', '2')])
print(df2)
          a         b         c
0  0.344513  0.525910  0.517284
1  0.519497  0.592509  0.185066
9
# Unstacking a DataFrame goes back to a two-level Series; the column labels
# become the outer level and the row labels the inner level.
s2 = df1.unstack()
print(s2)
# Alternative: transpose first, i.e. s2 = df1.T.unstack()
a  1    0.344513
   2    0.519497
b  1    0.525910
   2    0.592509
c  1    0.517284
   2    0.185066
dtype: float64
15
# A frame with two-level labels on both axes: rows are (letter, number)
# pairs and columns are (city, number) pairs.
row_levels = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]]
column_levels = [['BJ', 'BJ', 'SH', 'GZ'], [8, 9, 88, 9]]
cell_values = np.arange(16).reshape(4, 4)
df = DataFrame(cell_values, index=row_levels, columns=column_levels)
print(df)
# Pick the outer column 'BJ' first, then its inner column 8.
bj_columns = df['BJ']
print(bj_columns[8])

     BJ      SH  GZ
     8   9   88  9 
a 1   0   1   2   3
  2   4   5   6   7
b 1   8   9  10  11
  2  12  13  14  15
a  1     0
   2     4
b  1     8
   2    12
Name: 8, dtype: int32