Python数据科学入门-(Pandas)笔记02

第四节 Pandas 入门

慕课网 Python 数据科学入门课程学习笔记

一. Series 序列

import numpy as np
import pandas as pd
s1 = pd.Series([1,2,3,4])  # 创建序列
s1
0 1 1 2 2 3 3 4 dtype: int64
s1.values
array([1, 2, 3, 4], dtype=int64)
s1.index
RangeIndex(start=0, stop=4, step=1)
s2 = pd.Series(np.arange(10))  # 使用numpy的数组方法创建
s2
0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 dtype: int32
s3 = pd.Series({'1':1, '2':2, '3':3})  # 使用字典创建序列
s3
1 1 2 2 3 3 dtype: int64
s4 = pd.Series([1,2,3,4],index=['A','B','C','D'])
s4
A 1 B 2 C 3 D 4 dtype: int64

序列的访问、转换、属性

s4['A']
1
s4[s4>2]
C 3 D 4 dtype: int64
s4.to_dict()  # 序列转换为字典
{'A': 1, 'B': 2, 'C': 3, 'D': 4}
s4
A 1 B 2 C 3 D 4 dtype: int64
index_1 = ['A', 'B', 'C', 'D', 'E']
s5 = pd.Series(s4,index=index_1)
s5
A 1.0 B 2.0 C 3.0 D 4.0 E NaN dtype: float64
pd.isnull(s5)   # 判空   notnull()  判非空
A False B False C False D False E True dtype: bool
s5.name = 'demo'   # 给序列取名
s5.index.name = 'key   value'   # 给序列 index 取名
s5
key value A 1.0 B 2.0 C 3.0 D 4.0 E NaN Name: demo, dtype: float64
s5.index
Index(['A', 'B', 'C', 'D', 'E'], dtype='object', name='key value')
s5.values
array([ 1., 2., 3., 4., nan])

二.Dataframe 入门

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import webbrowser
link = 'https://www.tiobe.com/tiobe-index/'
webbrowser.open(link)   # 打开链接
# 读取粘贴板
df = pd.read_clipboard()  # 先复制刚才打开网页的 表格的一部分
df
May2018May.12017ChangeProgrammingLanguageRatingsChange.1
011Java16.380%+1.74%NaNNaNNaNNaN
122C14.000%+7.00%NaNNaNNaNNaN
233C++7.668%+2.92%NaNNaNNaNNaN
344Python5.192%+1.64%NaNNaNNaNNaN
455C#4.402%+0.95%NaNNaNNaNNaN
566VisualBasic.NET4.124%+0.73%NaNNaN
679changePHP3.321%+0.63%NaNNaNNaN
type(df)
pandas.core.frame.DataFrame
df.columns  # 返回 表格列名
Index(['May', '2018', 'May.1', '2017', 'Change', 'Programming', 'Language', 'Ratings', 'Change.1'], dtype='object')
df.Change  # 返回 Change 列的数据
0 +1.74% 1 +7.00% 2 +2.92% 3 +1.64% 4 +0.95% 5 .NET 6 3.321% Name: Change, dtype: object
df_filter = DataFrame(df,columns=['May','2018','Change'])  # 过滤
df_filter
May2018Change
011+1.74%
122+7.00%
233+2.92%
344+1.64%
455+0.95%
566.NET
6793.321%
df_filter['Change']   # 访问某一列
0 +1.74% 1 +7.00% 2 +2.92% 3 +1.64% 4 +0.95% 5 .NET 6 3.321% Name: Change, dtype: object
type(df_filter['Change'])
pandas.core.series.Series

添加列 和 值

df_new = DataFrame(df,columns=['Change','Sep 2019'])  # 添加不存在的 列
df_new
ChangeSep 2019
0+1.74%NaN
1+7.00%NaN
2+2.92%NaN
3+1.64%NaN
4+0.95%NaN
5.NETNaN
63.321%NaN
df_new['Sep 2019'] = range(0,7)  # 给值为空(NaN)的列赋值
df_new
ChangeSep 2019
0+1.74%0
1+7.00%1
2+2.92%2
3+1.64%3
4+0.95%4
5.NET5
63.321%6
df_new['Sep 2019'] = np.arange(0,7)
df_new
ChangeSep 2019
0+1.74%0
1+7.00%1
2+2.92%2
3+1.64%3
4+0.95%4
5.NET5
63.321%6
df_new['Sep 2019'] = pd.Series(np.arange(0,7))
df_new
ChangeSep 2019
0+1.74%0
1+7.00%1
2+2.92%2
3+1.64%3
4+0.95%4
5.NET5
63.321%6
df_new['Sep 2019'] = pd.Series([100,200],index=[1,2])  # 按 index 对齐赋值:整列被替换,只有 index 为 1、2 的行有值,其余行为 NaN
df_new
ChangeSep 2019
0+1.74%NaN
1+7.00%100.0
2+2.92%200.0
3+1.64%NaN
4+0.95%NaN
5.NETNaN
63.321%NaN

三.深入理解Series和DataFrame

1.Series 和 DataFrame 对比
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
data = {'Country':['A', 'B', 'C'], 
        'Capital':['AA', 'BB','CC'], 
        'Population':[11111,22222,33333]}

Series

s1 = pd.Series(data['Country'])
s1
0 A 1 B 2 C dtype: object

Dataframe

df = pd.DataFrame(data)
df
CountryCapitalPopulation
0AAA11111
1BBB22222
2CCC33333
df_torows = df.iterrows()

for row in df_torows:
    print(row)
    print(type(row))  # 类型: tuple
(0, Country A Capital AA Population 11111 Name: 0, dtype: object)
for row in df.iterrows():
    print(type(row[0]),type(row[1]))
    break

通过 几个 Series 创建 Dataframe

s1 = pd.Series(data['Country'])
s2 = pd.Series(data['Capital'])
s3 = pd.Series(data['Population'])
df_new = pd.DataFrame([s1,s2,s3],index=['Country', 'Capital', 'Capital'])
df_new    # 可以看到 输出的 Dataframe 倒过来了
012
CountryABC
CapitalAABBCC
Capital111112222233333
df
CountryCapitalPopulation
0AAA11111
1BBB22222
2CCC33333
# 转置操作
df_new = df_new.T
df_new
CountryCapitalCapital
0AAA11111
1BBB22222
2CCC33333

2.Dataframe IO

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import webbrowser
link = 'http://pandas.pydata.org/pandas-docs/version/0.20/io.html'
webbrowser.open(link)
True
df = pd.read_clipboard()   # 读取粘贴板内容
df
Format TypeData DescriptionReaderWriter
0textCSVread_csvto_csv
1textJSONread_jsonto_json
2textHTMLread_htmlto_html
3textLocal clipboardread_clipboardto_clipboard
4binaryMS Excelread_excelto_excel
5binaryHDF5 Formatread_hdfto_hdf
6binaryFeather Formatread_featherto_feather
7binaryMsgpackread_msgpackto_msgpack
8binaryStataread_statato_stata
9binarySASread_sasNone
10binaryPython Pickle Formatread_pickleto_pickle
11SQLSQLread_sqlto_sql
12SQLGoogle Big Queryread_gbqto_gbq
df.to_csv('df1.csv',index=False)   # 写文件  index=False 去掉index 默认不去
!ls
Dataframe IO.ipynb Dataframe.ipynb Series.ipynb df1.csv 深入理解Series和DataFrame.ipynb
!more df1.csv
Format Type,Data Description,Reader,Writer text,CSV,read_csv,to_csv text,JSON,read_json,to_json text,HTML,read_html,to_html text,Local clipboard,read_clipboard,to_clipboard binary,MS Excel,read_excel,to_excel binary,HDF5 Format,read_hdf,to_hdf binary,Feather Format,read_feather,to_feather binary,Msgpack,read_msgpack,to_msgpack binary,Stata,read_stata,to_stata binary,SAS,read_sas, binary,Python Pickle Format,read_pickle,to_pickle SQL,SQL,read_sql,to_sql SQL,Google Big Query,read_gbq,to_gbq
df2 = pd.read_csv('df1.csv')
df2
Format TypeData DescriptionReaderWriter
0textCSVread_csvto_csv
1textJSONread_jsonto_json
2textHTMLread_htmlto_html
3textLocal clipboardread_clipboardto_clipboard
4binaryMS Excelread_excelto_excel
5binaryHDF5 Formatread_hdfto_hdf
6binaryFeather Formatread_featherto_feather
7binaryMsgpackread_msgpackto_msgpack
8binaryStataread_statato_stata
9binarySASread_sasNaN
10binaryPython Pickle Formatread_pickleto_pickle
11SQLSQLread_sqlto_sql
12SQLGoogle Big Queryread_gbqto_gbq
df.to_json()  # 转换为 json 结构  反过来也行
'{"Format Type":{"0":"text","1":"text","2":"text","3":"text","4":"binary","5":"binary","6":"binary","7":"binary","8":"binary","9":"binary","10":"binary","11":"SQL","12":"SQL"},"Data Description":。。。

3.Selecting and Index

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
!ls
Dataframe IO.ipynb Dataframe.ipynb Selecting and Indexing.ipynb Series.ipynb data.csv df1.csv 深入理解Series和DataFrame.ipynb
read_data = pd.read_csv('data.csv')
read_data.shape   # 行 列数
(20, 8)
read_data.head(4)    # 返回前4行
ABCDEFGH
011Java16.38%1.74%21Apex0.90%
122C14.00%7.00%22PL/SQL0.90%
233C++7.67%2.92%23Transact-SQL0.88%
344Python5.19%1.64%24Ada0.87%
read_data.tail(3)  # 返回后3行
ABCDEFGH
171810Perl0.91%-1.69%38Alice0.47%
181913Swift0.91%-1.37%39Lua0.42%
192031Scala0.90%0.18%40Fortran0.42%
sub_data = read_data[['A', 'B','C']]   # 返回某些列
sub_data.head(3)
ABC
011Java
122C
233C++
sub_data.iloc[3:6, :]  # 切片(iloc 基于整数位置,与行名、列名无关):取位置 3 到 5 的行(不包含 6),对列不过滤
ABC
344Python
455C#
566Visual Basic .NET
read_data.loc[10:13, : 'D']   # loc 基于label 过滤
ABCD
101114R1.18%
111218Delphi/Object Pascal1.01%
12138Assembly language1.00%
131416Go0.97%

四.Reindexing Series and DataFrame

import numpy as np
import pandas as pd
from pandas import Series,DataFrame

1.series reindex

shift + Tab 键可查看函数说明

s1 = Series([1,2,3,4],index=['A','B','C','D'])
s1
A 1 B 2 C 3 D 4 dtype: int64
s1.reindex(index=['A','B','C','D','E'],fill_value=10) # 重新设置 index
A 1 B 2 C 3 D 4 E 10 dtype: int64
s2 = Series(['A','B','C'],index=[1,3,6])
s2
1 A 3 B 6 C dtype: object
s2.reindex(index=range(8))
0 NaN 1 A 2 NaN 3 B 4 NaN 5 NaN 6 C 7 NaN dtype: object
s2.reindex(index=range(8),method='ffill')  # 自动填充
0 NaN 1 A 2 A 3 B 4 B 5 B 6 C 7 C dtype: object

2.reindex dataframe

# index 中故意 漏下 C
df1 = DataFrame(np.random.rand(25).reshape([5,5]),index=['A','B','D','E','F'], columns=['c1', 'c2', 'c3', 'c4', 'c5'])
df1
c1c2c3c4c5
A0.1236180.3485670.1191560.3809520.379118
B0.4764920.2549760.6293180.7287080.747153
D0.9653140.4241260.9138500.0920630.196096
E0.9607600.8663130.2267660.8657810.465341
F0.9828320.3408500.7250840.5196170.889651
# C 行填充了 NaN  若改 列 也一样
df1.reindex(index=['A','B','C','D','E','F'])
c1c2c3c4c5
A0.1236180.3485670.1191560.3809520.379118
B0.4764920.2549760.6293180.7287080.747153
CNaNNaNNaNNaNNaN
D0.9653140.4241260.9138500.0920630.196096
E0.9607600.8663130.2267660.8657810.465341
F0.9828320.3408500.7250840.5196170.889651

3.利用 reindex() 选取某些值

s1
A 1 B 2 C 3 D 4 dtype: int64
s1.reindex(index=['A','B'])   # Series 的reindex()
A 1 B 2 dtype: int64
df1.reindex(index=['A','B'])  # Dataframe 的reindex()
c1c2c3c4c5
A0.1236180.3485670.1191560.3809520.379118
B0.4764920.2549760.6293180.7287080.747153

4.删除

s1.drop('A')  
B 2 C 3 D 4 dtype: int64
# axis=0 表示 A 代表 表格的行的index,axis=1 表示指定的是列的index
df1.drop('A',axis=0)   
c1c2c3c4c5
B0.4764920.2549760.6293180.7287080.747153
D0.9653140.4241260.9138500.0920630.196096
E0.9607600.8663130.2267660.8657810.465341
F0.9828320.3408500.7250840.5196170.889651

五.NaN —— Not a Number

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
NaN in NumPy
n = np.nan   
type(n)
float
m = 1
m + n   # nan 
nan
NaN in Series
s1 = Series([1,2,np.nan,3,4],index=['A', 'B', 'C', 'D', 'E'])
s1
A 1.0 B 2.0 C NaN D 3.0 E 4.0 dtype: float64
s1.isnull()
A False B False C True D False E False dtype: bool
s1.dropna()   # 删除 NaN
A 1.0 B 2.0 D 3.0 E 4.0 dtype: float64
NaN in DataFrame
dframe = DataFrame([[1,2,3],[np.nan,5,6],[7,np.nan,9],[np.nan, np.nan, np.nan]])
dframe
012
01.02.03.0
1NaN5.06.0
27.0NaN9.0
3NaNNaNNaN
# 不写 默认0 表示对行操作  只要该行有nan就删
# how='any/all' 该行 部分/全部 是 NaN时删除
df1 = dframe.dropna(axis=0, how='any') 
df1
012
01.02.03.0
# thresh=2 表示:该行/列至少要有 2 个非 NaN 的值才会保留,否则删除
df2 = dframe.dropna(thresh=2)
df2
012
01.02.03.0
1NaN5.06.0
27.0NaN9.0
# 值为 NaN的改为 1
df3 = dframe.fillna(value=1)
df3
012
01.02.03.0
11.05.06.0
27.01.09.0
31.01.01.0
# 用字典按列指定填充值:第 0 列的 NaN 填 0,第 1 列填 1,第 2 列填 2(字典中多出的键 3 没有对应的列,不起作用)
df3 = dframe.fillna(value={0:0, 1:1,2:2,3:3})
df3
012
01.02.03.0
10.05.06.0
27.01.09.0
30.01.02.0

六、多级Index

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
#二级 index 
s1 = Series(np.random.randn(6),index=[['1','1','1','2','2','2'],
                                      ['a','b','c','a','b','c']])
s1
1 a 0.301681 b -1.596626 c -0.261337 2 a 0.739900 b 0.299108 c 0.074713 dtype: float64
s1['1']
a 0.301681 b -1.596626 c -0.261337 dtype: float64
s1['1']['a']
0.3016807350048885
s1[:,'a']   # 二级的 a 也返回了
1 0.301681 2 0.739900 dtype: float64

二级 index Series 与 DataFrame互相转换

# 二级的Series 转置 为dataframe
df1 = s1.unstack()
df1
abc
10.301681-1.596626-0.261337
20.7399000.2991080.074713
# 二级 Series 转换为 DataFrame
df2 = DataFrame([s1['1'],s1['2']])
df2
abc
00.301681-1.596626-0.261337
10.7399000.2991080.074713
# DataFrame 转换为多级 index 的Series
s2 = df1.unstack()
s2
a 1 0.301681 2 0.739900 b 1 -1.596626 2 0.299108 c 1 -0.261337 2 0.074713 dtype: float64
s2 = df1.T.unstack()
s2
1 a 0.301681 b -1.596626 c -0.261337 2 a 0.739900 b 0.299108 c 0.074713 dtype: float64

多级index DataFrame

df = DataFrame(np.arange(16).reshape(4,4),
              index=[['a','a','b','b'],[1,2,1,2]],
               columns=[['c','c','d','d'],[5,5,6,7]])
df
cd
5567
a10123
24567
b1891011
212131415
df['d']
67
a123
267
b11011
21415
df['d'][6]
a 1 2 2 6 b 1 10 2 14 Name: 6, dtype: int32

七、Mapping 和 Replace

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
df1 = DataFrame({"城市":["北京","上海","广州"],"人口":[1000,2000,3000]})
df1              
城市人口
0北京1000
1上海2000
2广州3000
# 普通方法 给DataFrame 添加列
# 如果 df1 的index 不是顺序的 0,1,2...,将不能正确赋值,需要知道 index
df1['GDP'] = Series([1000,2000,1500])
df1
城市人口GDP
0北京10001000
1上海20002000
2广州30001500

map() 方法给DataFrame添加列

# map方法(字典) 添加新列,
#这样就无需关心 按顺序对应,以及index匹配问题
gdp_map = {"北京":100,"广州":300,"上海":200}
df1['GDP'] = df1['城市'].map(gdp_map)
df1
城市人口GDP
0北京1000100
1上海2000200
2广州3000300

df2 = DataFrame({"城市":["北京","上海","广州"],"人口":[1000,2000,3000]},
                index=['A','B','C'])
df2
城市人口
A北京1000
B上海2000
C广州3000
# index 不是默认的 需要指定index,否则为 NaN
df2['GDP'] = Series([1000,2000,1500])
df2
城市人口GDP
A北京1000NaN
B上海2000NaN
C广州3000NaN
# 指定 index
df2['GDP'] = Series([1000,2000,1500],index=['A','B','C'])
df2
城市人口GDP
A北京10001000
B上海20002000
C广州30001500

replace in series

s1 = Series(np.arange(6))
s1
0    0
1    1
2    2
3    3
4    4
5    5
dtype: int32
s1.replace(1,np.nan)
0    0.0
1    NaN
2    2.0
3    3.0
4    4.0
5    5.0
dtype: float64
s1.replace([1,2,3],[10,20,30])
0     0
1    10
2    20
3    30
4     4
5     5
dtype: int64
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值