第四节 Pandas 入门
慕课网python数科学入门课程学习笔记
一. Series 序列
import numpy as np
import pandas as pd
s1 = pd.Series([1,2,3,4])
s1
0 1 1 2 2 3 3 4 dtype: int64
s1.values
array([1, 2, 3, 4], dtype=int64)
s1.index
RangeIndex(start=0, stop=4, step=1)
s2 = pd.Series(np.arange(10))
s2
0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 dtype: int32
s3 = pd.Series({'1':1, '2':2, '3':3})
s3
1 1 2 2 3 3 dtype: int64
s4 = pd.Series([1,2,3,4],index=['A','B','C','D'])
s4
A 1 B 2 C 3 D 4 dtype: int64
序列的访问、转换、属性
s4['A']
1
s4[s4>2]
C 3 D 4 dtype: int64
s4.to_dict()
{‘A’: 1, ‘B’: 2, ‘C’: 3, ‘D’: 4}
s4
A 1 B 2 C 3 D 4 dtype: int64
index_1 = ['A', 'B', 'C', 'D', 'E']
s5 = pd.Series(s4,index=index_1)
s5
A 1.0 B 2.0 C 3.0 D 4.0 E NaN dtype: float64
pd.isnull(s5)
A False B False C False D False E True dtype: bool
s5.name = 'demo'
s5.index.name = 'key value'
s5
key value A 1.0 B 2.0 C 3.0 D 4.0 E NaN Name: demo, dtype: float64
s5.index
Index([‘A’, ‘B’, ‘C’, ‘D’, ‘E’], dtype=’object’, name=’key value’)
s5.values
array([ 1., 2., 3., 4., nan]) ——-
二.Dataframe 入门
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import webbrowser
link = 'https://www.tiobe.com/tiobe-index/'
webbrowser.open(link)
df = pd.read_clipboard()
df
| May | 2018 | May.1 | 2017 | Change | Programming | Language | Ratings | Change.1 |
---|
0 | 1 | 1 | Java | 16.380% | +1.74% | NaN | NaN | NaN | NaN |
---|
1 | 2 | 2 | C | 14.000% | +7.00% | NaN | NaN | NaN | NaN |
---|
2 | 3 | 3 | C++ | 7.668% | +2.92% | NaN | NaN | NaN | NaN |
---|
3 | 4 | 4 | Python | 5.192% | +1.64% | NaN | NaN | NaN | NaN |
---|
4 | 5 | 5 | C# | 4.402% | +0.95% | NaN | NaN | NaN | NaN |
---|
5 | 6 | 6 | Visual | Basic | .NET | 4.124% | +0.73% | NaN | NaN |
---|
6 | 7 | 9 | change | PHP | 3.321% | +0.63% | NaN | NaN | NaN |
---|
type(df)
pandas.core.frame.DataFrame
df.columns
Index([‘May’, ‘2018’, ‘May.1’, ‘2017’, ‘Change’, ‘Programming’, ‘Language’, ‘Ratings’, ‘Change.1’], dtype=’object’)
df.Change
0 +1.74% 1 +7.00% 2 +2.92% 3 +1.64% 4 +0.95% 5 .NET 6 3.321% Name: Change, dtype: object
df_filter = DataFrame(df,columns=['May','2018','Change'])
df_filter
| May | 2018 | Change |
---|
0 | 1 | 1 | +1.74% |
---|
1 | 2 | 2 | +7.00% |
---|
2 | 3 | 3 | +2.92% |
---|
3 | 4 | 4 | +1.64% |
---|
4 | 5 | 5 | +0.95% |
---|
5 | 6 | 6 | .NET |
---|
6 | 7 | 9 | 3.321% |
---|
df_filter['Change']
0 +1.74% 1 +7.00% 2 +2.92% 3 +1.64% 4 +0.95% 5 .NET 6 3.321% Name: Change, dtype: object
type(df_filter['Change'])
pandas.core.series.Series
添加列 和 值
df_new = DataFrame(df,columns=['Change','Sep 2019'])
df_new
| Change | Sep 2019 |
---|
0 | +1.74% | NaN |
---|
1 | +7.00% | NaN |
---|
2 | +2.92% | NaN |
---|
3 | +1.64% | NaN |
---|
4 | +0.95% | NaN |
---|
5 | .NET | NaN |
---|
6 | 3.321% | NaN |
---|
df_new['Sep 2019'] = range(0,7)
df_new
| Change | Sep 2019 |
---|
0 | +1.74% | 0 |
---|
1 | +7.00% | 1 |
---|
2 | +2.92% | 2 |
---|
3 | +1.64% | 3 |
---|
4 | +0.95% | 4 |
---|
5 | .NET | 5 |
---|
6 | 3.321% | 6 |
---|
df_new['Sep 2019'] = np.arange(0,7)
df_new
| Change | Sep 2019 |
---|
0 | +1.74% | 0 |
---|
1 | +7.00% | 1 |
---|
2 | +2.92% | 2 |
---|
3 | +1.64% | 3 |
---|
4 | +0.95% | 4 |
---|
5 | .NET | 5 |
---|
6 | 3.321% | 6 |
---|
df_new['Sep 2019'] = pd.Series(np.arange(0,7))
df_new
| Change | Sep 2019 |
---|
0 | +1.74% | 0 |
---|
1 | +7.00% | 1 |
---|
2 | +2.92% | 2 |
---|
3 | +1.64% | 3 |
---|
4 | +0.95% | 4 |
---|
5 | .NET | 5 |
---|
6 | 3.321% | 6 |
---|
df_new['Sep 2019'] = pd.Series([100,200],index=[1,2])
df_new
| Change | Sep 2019 |
---|
0 | +1.74% | NaN |
---|
1 | +7.00% | 100.0 |
---|
2 | +2.92% | 200.0 |
---|
3 | +1.64% | NaN |
---|
4 | +0.95% | NaN |
---|
5 | .NET | NaN |
---|
6 | 3.321% | NaN |
---|
三.深入理解Series和DataFrame
1.Series 和 DataFrame 对比
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
data = {'Country':['A', 'B', 'C'],
'Capital':['AA', 'BB','CC'],
'Population':[11111,22222,33333]}
Series
s1 = pd.Series(data['Country'])
s1
0 A 1 B 2 C dtype: object
Dataframe
df = pd.DataFrame(data)
df
| Country | Capital | Population |
---|
0 | A | AA | 11111 |
---|
1 | B | BB | 22222 |
---|
2 | C | CC | 33333 |
---|
df_torows = df.iterrows()
for row in df_torows:
print(row)
print(type(row))
(0, Country A Capital AA Population 11111 Name: 0, dtype: object)
for row in df.iterrows():
print(type(row[0]),type(row[1]))
break
通过 几个 Series 创建 Dataframe
s1 = pd.Series(data['Country'])
s2 = pd.Series(data['Capital'])
s3 = pd.Series(data['Population'])
df_new = pd.DataFrame([s1,s2,s3],index=['Country', 'Capital', 'Capital'])
df_new
| 0 | 1 | 2 |
---|
Country | A | B | C |
---|
Capital | AA | BB | CC |
---|
Capital | 11111 | 22222 | 33333 |
---|
df
| Country | Capital | Population |
---|
0 | A | AA | 11111 |
---|
1 | B | BB | 22222 |
---|
2 | C | CC | 33333 |
---|
df_new = df_new.T
df_new
| Country | Capital | Capital |
---|
0 | A | AA | 11111 |
---|
1 | B | BB | 22222 |
---|
2 | C | CC | 33333 |
---|
2.Dataframe IO
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import webbrowser
link = 'http://pandas.pydata.org/pandas-docs/version/0.20/io.html'
webbrowser.open(link)
True
df = pd.read_clipboard()
df
| Format Type | Data Description | Reader | Writer |
---|
0 | text | CSV | read_csv | to_csv |
---|
1 | text | JSON | read_json | to_json |
---|
2 | text | HTML | read_html | to_html |
---|
3 | text | Local clipboard | read_clipboard | to_clipboard |
---|
4 | binary | MS Excel | read_excel | to_excel |
---|
5 | binary | HDF5 Format | read_hdf | to_hdf |
---|
6 | binary | Feather Format | read_feather | to_feather |
---|
7 | binary | Msgpack | read_msgpack | to_msgpack |
---|
8 | binary | Stata | read_stata | to_stata |
---|
9 | binary | SAS | read_sas | None |
---|
10 | binary | Python Pickle Format | read_pickle | to_pickle |
---|
11 | SQL | SQL | read_sql | to_sql |
---|
12 | SQL | Google Big Query | read_gbq | to_gbq |
---|
df.to_csv('df1.csv',index=False)
!ls
Dataframe IO.ipynb Dataframe.ipynb Series.ipynb df1.csv 娣卞叆鐞嗚ВSeries鍜孌ataFrame.ipynb
!more df1.csv
Format Type,Data Description,Reader,Writer text,CSV,read_csv,to_csv text,JSON,read_json,to_json text,HTML,read_html,to_html text,Local clipboard,read_clipboard,to_clipboard binary,MS Excel,read_excel,to_excel binary,HDF5 Format,read_hdf,to_hdf binary,Feather Format,read_feather,to_feather binary,Msgpack,read_msgpack,to_msgpack binary,Stata,read_stata,to_stata binary,SAS,read_sas, binary,Python Pickle Format,read_pickle,to_pickle SQL,SQL,read_sql,to_sql SQL,Google Big Query,read_gbq,to_gbq
df2 = pd.read_csv('df1.csv')
df2
| Format Type | Data Description | Reader | Writer |
---|
0 | text | CSV | read_csv | to_csv |
---|
1 | text | JSON | read_json | to_json |
---|
2 | text | HTML | read_html | to_html |
---|
3 | text | Local clipboard | read_clipboard | to_clipboard |
---|
4 | binary | MS Excel | read_excel | to_excel |
---|
5 | binary | HDF5 Format | read_hdf | to_hdf |
---|
6 | binary | Feather Format | read_feather | to_feather |
---|
7 | binary | Msgpack | read_msgpack | to_msgpack |
---|
8 | binary | Stata | read_stata | to_stata |
---|
9 | binary | SAS | read_sas | NaN |
---|
10 | binary | Python Pickle Format | read_pickle | to_pickle |
---|
11 | SQL | SQL | read_sql | to_sql |
---|
12 | SQL | Google Big Query | read_gbq | to_gbq |
---|
df.to_json()
'{"Format Type":{"0":"text","1":"text","2":"text","3":"text","4":"binary","5":"binary","6":"binary","7":"binary","8":"binary","9":"binary","10":"binary","11":"SQL","12":"SQL"},"Data Description":。。。
3.Selecting and Index
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
!ls
Dataframe IO.ipynb Dataframe.ipynb Selecting and Indexing.ipynb Series.ipynb data.csv df1.csv 娣卞叆鐞嗚ВSeries鍜孌ataFrame.ipynb
read_data = pd.read_csv('data.csv')
read_data.shape
(20, 8)
read_data.head(4)
| A | B | C | D | E | F | G | H |
---|
0 | 1 | 1 | Java | 16.38% | 1.74% | 21 | Apex | 0.90% |
---|
1 | 2 | 2 | C | 14.00% | 7.00% | 22 | PL/SQL | 0.90% |
---|
2 | 3 | 3 | C++ | 7.67% | 2.92% | 23 | Transact-SQL | 0.88% |
---|
3 | 4 | 4 | Python | 5.19% | 1.64% | 24 | Ada | 0.87% |
---|
read_data.tail(3)
| A | B | C | D | E | F | G | H |
---|
17 | 18 | 10 | Perl | 0.91% | -1.69% | 38 | Alice | 0.47% |
---|
18 | 19 | 13 | Swift | 0.91% | -1.37% | 39 | Lua | 0.42% |
---|
19 | 20 | 31 | Scala | 0.90% | 0.18% | 40 | Fortran | 0.42% |
---|
sub_data = read_data[['A', 'B','C']]
sub_data.head(3)
sub_data.iloc[3:6, :]
| A | B | C |
---|
3 | 4 | 4 | Python |
---|
4 | 5 | 5 | C# |
---|
5 | 6 | 6 | Visual Basic .NET |
---|
read_data.loc[10:13, : 'D']
| A | B | C | D |
---|
10 | 11 | 14 | R | 1.18% |
---|
11 | 12 | 18 | Delphi/Object Pascal | 1.01% |
---|
12 | 13 | 8 | Assembly language | 1.00% |
---|
13 | 14 | 16 | Go | 0.97% |
---|
四.Reindexing Series and DataFrame
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
1.series reindex
shift + Tab 键可查看函数说明
s1 = Series([1,2,3,4],index=['A','B','C','D'])
s1
A 1 B 2 C 3 D 4 dtype: int64
s1.reindex(index=['A','B','C','D','E'],fill_value=10)
A 1 B 2 C 3 D 4 E 10 dtype: int64
s2 = Series(['A','B','C'],index=[1,3,6])
s2
1 A 3 B 6 C dtype: object
s2.reindex(index=range(8))
0 NaN 1 A 2 NaN 3 B 4 NaN 5 NaN 6 C 7 NaN dtype: object
s2.reindex(index=range(8),method='ffill')
0 NaN 1 A 2 A 3 B 4 B 5 B 6 C 7 C dtype: object
2.reindex dataframe
df1 = DataFrame(np.random.rand(25).reshape([5,5]),index=['A','B','D','E','F'], columns=['c1', 'c2', 'c3', 'c4', 'c5'])
df1
| c1 | c2 | c3 | c4 | c5 |
---|
A | 0.123618 | 0.348567 | 0.119156 | 0.380952 | 0.379118 |
---|
B | 0.476492 | 0.254976 | 0.629318 | 0.728708 | 0.747153 |
---|
D | 0.965314 | 0.424126 | 0.913850 | 0.092063 | 0.196096 |
---|
E | 0.960760 | 0.866313 | 0.226766 | 0.865781 | 0.465341 |
---|
F | 0.982832 | 0.340850 | 0.725084 | 0.519617 | 0.889651 |
---|
df1.reindex(index=['A','B','C','D','E','F'])
| c1 | c2 | c3 | c4 | c5 |
---|
A | 0.123618 | 0.348567 | 0.119156 | 0.380952 | 0.379118 |
---|
B | 0.476492 | 0.254976 | 0.629318 | 0.728708 | 0.747153 |
---|
C | NaN | NaN | NaN | NaN | NaN |
---|
D | 0.965314 | 0.424126 | 0.913850 | 0.092063 | 0.196096 |
---|
E | 0.960760 | 0.866313 | 0.226766 | 0.865781 | 0.465341 |
---|
F | 0.982832 | 0.340850 | 0.725084 | 0.519617 | 0.889651 |
---|
3.利用 reindex() 选取某些值
s1
A 1 B 2 C 3 D 4 dtype: int64
s1.reindex(index=['A','B'])
A 1 B 2 dtype: int64
df1.reindex(index=['A','B'])
| c1 | c2 | c3 | c4 | c5 |
---|
A | 0.123618 | 0.348567 | 0.119156 | 0.380952 | 0.379118 |
---|
B | 0.476492 | 0.254976 | 0.629318 | 0.728708 | 0.747153 |
---|
4.删除
s1.drop('A')
B 2 C 3 D 4 dtype: int64
df1.drop('A',axis=0)
| c1 | c2 | c3 | c4 | c5 |
---|
B | 0.476492 | 0.254976 | 0.629318 | 0.728708 | 0.747153 |
---|
D | 0.965314 | 0.424126 | 0.913850 | 0.092063 | 0.196096 |
---|
E | 0.960760 | 0.866313 | 0.226766 | 0.865781 | 0.465341 |
---|
F | 0.982832 | 0.340850 | 0.725084 | 0.519617 | 0.889651 |
---|
五.NaN —— Not a Number
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
Nan in Numpy
n = np.nan
type(n)
float
m = 1
m + n
nan
Nan in Series
s1 = Series([1,2,np.nan,3,4],index=['A', 'B', 'C', 'D', 'E'])
s1
A 1.0 B 2.0 C NaN D 3.0 E 4.0 dtype: float64
s1.isnull()
A False B False C True D False E False dtype: bool
s1.dropna()
A 1.0 B 2.0 D 3.0 E 4.0 dtype: float64
NaN in DataFrame
dframe = DataFrame([[1,2,3],[np.nan,5,6],[7,np.nan,9],[np.nan, np.nan, np.nan]])
dframe
| 0 | 1 | 2 |
---|
0 | 1.0 | 2.0 | 3.0 |
---|
1 | NaN | 5.0 | 6.0 |
---|
2 | 7.0 | NaN | 9.0 |
---|
3 | NaN | NaN | NaN |
---|
df1 = dframe.dropna(axis=0, how='any')
df1
df2 = dframe.dropna(thresh=2)
df2
| 0 | 1 | 2 |
---|
0 | 1.0 | 2.0 | 3.0 |
---|
1 | NaN | 5.0 | 6.0 |
---|
2 | 7.0 | NaN | 9.0 |
---|
df3 = dframe.fillna(value=1)
df3
| 0 | 1 | 2 |
---|
0 | 1.0 | 2.0 | 3.0 |
---|
1 | 1.0 | 5.0 | 6.0 |
---|
2 | 7.0 | 1.0 | 9.0 |
---|
3 | 1.0 | 1.0 | 1.0 |
---|
df3 = dframe.fillna(value={0:0, 1:1,2:2,3:3})
df3
| 0 | 1 | 2 |
---|
0 | 1.0 | 2.0 | 3.0 |
---|
1 | 0.0 | 5.0 | 6.0 |
---|
2 | 7.0 | 1.0 | 9.0 |
---|
3 | 0.0 | 1.0 | 2.0 |
---|
六、多级Index
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
s1 = Series(np.random.randn(6),index=[['1','1','1','2','2','2'],
['a','b','c','a','b','c']])
s1
1 a 0.301681 b -1.596626 c -0.261337 2 a 0.739900 b 0.299108 c 0.074713 dtype: float64
s1['1']
a 0.301681 b -1.596626 c -0.261337 dtype: float64
s1['1']['a']
0.3016807350048885
s1[:,'a']
1 0.301681 2 0.739900 dtype: float64
二级 index Series 与 DataFrame互相转换
df1 = s1.unstack()
df1
| a | b | c |
---|
1 | 0.301681 | -1.596626 | -0.261337 |
---|
2 | 0.739900 | 0.299108 | 0.074713 |
---|
df2 = DataFrame([s1['1'],s1['2']])
df2
| a | b | c |
---|
0 | 0.301681 | -1.596626 | -0.261337 |
---|
1 | 0.739900 | 0.299108 | 0.074713 |
---|
s2 = df1.unstack()
s2
a 1 0.301681 2 0.739900 b 1 -1.596626 2 0.299108 c 1 -0.261337 2 0.074713 dtype: float64
s2 = df1.T.unstack()
s2
1 a 0.301681 b -1.596626 c -0.261337 2 a 0.739900 b 0.299108 c 0.074713 dtype: float64
多级index DataFrame
df = DataFrame(np.arange(16).reshape(4,4),
index=[['a','a','b','b'],[1,2,1,2]],
columns=[['c','c','d','d'],[5,5,6,7]])
df
| | c | d |
---|
| | 5 | 5 | 6 | 7 |
---|
a | 1 | 0 | 1 | 2 | 3 |
---|
2 | 4 | 5 | 6 | 7 |
---|
b | 1 | 8 | 9 | 10 | 11 |
---|
2 | 12 | 13 | 14 | 15 |
---|
df['d']
df['d'][6]
a 1 2 2 6 b 1 10 2 14 Name: 6, dtype: int32 —–
七、Mapping 和 Replace
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
df1 = DataFrame({"城市":["北京","上海","广州"],"人口":[1000,2000,3000]})
df1
| 城市 | 人口 |
---|
0 | 北京 | 1000 |
---|
1 | 上海 | 2000 |
---|
2 | 广州 | 3000 |
---|
df1['GDP'] = Series([1000,2000,1500])
df1
| 城市 | 人口 | GDP |
---|
0 | 北京 | 1000 | 1000 |
---|
1 | 上海 | 2000 | 2000 |
---|
2 | 广州 | 3000 | 1500 |
---|
map() 方法给DataFrame添加列
gdp_map = {"北京":100,"广州":300,"上海":200}
df1['GDP'] = df1['城市'].map(gdp_map)
df1
| 城市 | 人口 | GDP |
---|
0 | 北京 | 1000 | 100 |
---|
1 | 上海 | 2000 | 200 |
---|
2 | 广州 | 3000 | 300 |
---|
df2 = DataFrame({"城市":["北京","上海","广州"],"人口":[1000,2000,3000]},
index=['A','B','C'])
df2
| 城市 | 人口 |
---|
A | 北京 | 1000 |
---|
B | 上海 | 2000 |
---|
C | 广州 | 3000 |
---|
df2['GDP'] = Series([1000,2000,1500])
df2
| 城市 | 人口 | GDP |
---|
A | 北京 | 1000 | NaN |
---|
B | 上海 | 2000 | NaN |
---|
C | 广州 | 3000 | NaN |
---|
df2['GDP'] = Series([1000,2000,1500],index=['A','B','C'])
df2
| 城市 | 人口 | GDP |
---|
A | 北京 | 1000 | 1000 |
---|
B | 上海 | 2000 | 2000 |
---|
C | 广州 | 3000 | 1500 |
---|
replace in series
s1 = Series(np.arange(6))
s1
0 0
1 1
2 2
3 3
4 4
5 5
dtype: int32
s1.replace(1,np.nan)
0 0.0
1 NaN
2 2.0
3 3.0
4 4.0
5 5.0
dtype: float64
s1.replace([1,2,3],[10,20,30])
0 0
1 10
2 20
3 30
4 4
5 5
dtype: int64