链接:https://pan.baidu.com/s/1Hy_4a_UAJ6hboPPoTGOpvw
提取码:zlys
#%%
print("hello jupyterNotebook")
#%% md
# 一、Series操作
#%% md
导入pandas模块,并约定模块简称为pd
#%%
import pandas as pd
print(pd.__version__)
#%% md
Pandas 的数据结构:Pandas 主要有 Series(一维数组),
DataFrame(二维数组),Panel(三维数组),Panel4D(四维数组),
PanelND(更多维数组)等数据结构。其中 Series 和 DataFrame 应用的最为广泛。
#%%
#创建 Series 语法:s = pd.Series(data, index=index),可以通过多种方式进行创建,以下介绍了 3 个常用方法
#%%
arr = [0,1,2,3,4]
s1 = pd.Series(arr)#如果不指定索引,则默认从0开始
s1
#%%
import numpy as np
n = np.random.random(5) #创建一个随机Ndarray数组
index = ['a','b','c','d','e']
s2 = pd.Series(n,index=index)
s2
#%%
d = {'a':1,'b':2,'c':3,'d':4,'e':5} #定义示例字典
s3 = pd.Series(d)
s3
#%%
print(s1)
s1.index = ['A','B','C','D','E']
s1
#%%
s4 = s3.append(s1)
s4
#%%
print(s4)
s4 = s4.drop('e')
s4
#%%
s4['A'] = 6
print(s4)
s4['B']
#%%
#series加减乘除法
#都是按照索引对应计算,如果索引不同则填充为NAN(空值).
print(s4.add(s3))
print(s4.sub(s3))
print(s4.mul(s3))
print(s4.div(s3))
#%%
#中位数,求和,最大值,最小值
print(s4.median())
print(s4.sum())
print(s4.max())
print(s4.min())
#%% md
## 二、Dataframe 操作
#%%
dates = pd.date_range('today', periods=6)#定义时间序列为index
num_arr = np.random.randn(6,4) #传入numpy随机数组
columns = ['A','B','C','D']#将列表作为列名
df1 = pd.DataFrame(num_arr,index=dates,columns=columns)
df1
#%%
data = {'animal':['cat','cat','snake','dog','dog','cat','snake','cat','dog','dog'],
'age':[2.5,3,0.5,np.nan,5,2,4.5,np.nan,7,3],
'visits':[1,3,2,3,2,3,1,2,1,1],
'priority':['yes','yes','no','yes','no','no','yes','yes','no','no']}
labels = ['a','b','c','d','e','f','g','h','i','j']
df2 = pd.DataFrame(data,index=labels)
df2
#%%
df2.dtypes
#%%
df2.head()
#%%
df2.index
#%%
df2.tail(3)
#%%
df2.columns
#%%
df2.values
#%%
df2.describe()
#%%
df2.T
#%%
df2[1:3]
#%%
df2.sort_values(by='age')
#%%
df2['age']
#%%
df2.age
#%%
df2[['age','animal']]
#%%
print(df2)
df2.iloc[1:3]#查询2,3行
#%%
df3 = df2.copy()
df3
#%%
df3.isnull()
##%
num = pd.Series([0,1,2,3,4,5,6,7,8,9],index=df3.index)
df3['No.'] = num
df3
#%%
df3.iat[1,1] =2
df3
#%%
df3.loc['f','age'] = 1.5
df3
#%%
#求平均值
df3.mean()
#%%
df3['visits'].sum()
#%%
#将字符串转化为小写字母
string = pd.Series(['A','B','C','Aaba','Baca',
np.nan,'CABA','dog','cat'])
print(string)
string.str.lower()
#%%
string.str.upper()
#%%
df4 = df3.copy()
print(df4)
df4.fillna(value=3)
#%%
df5 = df3.copy()
print(df5)
df5.dropna(how='any')#删除任何存在NAN的值
#%%
left = pd.DataFrame({'key':['fool','foo2'],'one':[1,2]})
right = pd.DataFrame({'key': ['foo2', 'foo3'], 'two': [4, 5]})
print(left)
print(right)
#按照key列对齐连接,只存在foo2相同,所以最后变成一行
pd.merge(left,right,on='key',how='left')
#%%
df3.to_csv('animal.csv')
print("写入成功!")
#%%
df3.to_excel('animal.xlsx',sheet_name='Sheet1')
print("写入成功!")
#%%
pd.read_excel('animal.xlsx','Sheet1',index_col=None,na_values=['NA'])
#%%
dti = pd.date_range(start='2018-01-01',end='2018-12-31',freq='D')
s = pd.Series(np.random.rand(len(dti)),index=dti)
s
#%%
s[s.index.weekday==2].sum()
#%%
s.resample('M').mean()
#%%
s = pd.date_range('today',periods=100,freq='S')
ts = pd.Series(np.random.randint(0,500,len(s)),index=s)
ts.resample('Min').sum()
#%%
#UTC世界时间标准
s = pd.date_range('today',periods=1,freq='D')
ts = pd.Series(np.random.randn(len(s)),s)
ts_utc = ts.tz_localize('UTC')
ts_utc
#%%
ts_utc.tz_convert('Asia/Shanghai')
#%%
rng = pd.date_range('1/1/2018',periods=5,freq='M')
ts = pd.Series(np.random.randn(len(rng)),index=rng)
print(ts)
ps = ts.to_period()
print(ps)
ps.to_timestamp()
#%% md
# 三、Series多重索引
#%%
import pandas as pd
import numpy as np
letters = ['A','B','C']
numbers = list(range(10))
mi = pd.MultiIndex.from_product([letters,numbers])
s = pd.Series(np.random.rand(30),index=mi)
s
#%%
s.loc[:,[1,3,6]]
#%%
s.loc[pd.IndexSlice[:'B',5:]]
#%% md
# 四、DataFrame 多重索引
#%%
frmae = pd.DataFrame(np.arange(12).reshape(6,2),
index=[list('AAABBB'),list('123123')],
columns=['hello','shiyanlou']
)
frmae
#%%
frmae.index.names = ['first','second']
frmae
#%%
frmae.groupby('first').sum()
#%%
print(frmae)
frmae.stack()
#%%
print(frmae)
frmae.unstack()
#%%
data = {'animal':['cat','cat','snake','dog','dog','cat','snake','cat','dog','dog'],
'age':[2.5,3,0.5,np.nan,5,2,4.5,np.nan,7,3],
'visits':[1,3,2,3,2,3,1,2,1,1],
'priority':['yes','yes','no','yes','no','no','yes','yes','no','no']}
labels = ['a','b','c','d','e','f','g','h','i','j']
df = pd.DataFrame(data,index=labels)
df[df['age'] > 3]
#%%
df.iloc[2:4,1:3]
#%%
df = pd.DataFrame(data,index=labels)
df[(df['animal'] == 'cat') & (df['age'] <3)]
#%%
df[df['animal'].isin(['cat','dog'])]
df2 =df.copy()
#%%
df.loc[df2.index[[3,4,8]],['animal','age']]
#%%
#false 为降序,true 为升序
df.sort_values(by=['age','visits'],ascending=[False,True])
#%%
df['priority'].map({'yes':True,'no':False})
#%%
df.groupby('animal').sum()
#%%
import pandas as pd
import numpy as np
temp_df1 = pd.DataFrame(np.random.randn(5,4))
temp_df2 = pd.DataFrame(np.random.randn(5,4))
temp_df3 = pd.DataFrame(np.random.randn(5,4))
print(temp_df1)
print(temp_df2)
print(temp_df3)
pieces = [temp_df1,temp_df2,temp_df3]
pd.concat(pieces)
#%%
df = pd.DataFrame(np.random.random(size=(5,10)),columns=list('abcdefghij'))
print(df)
df.sum().idxmin()
#idxmax(),idmin()为Series 函数返回最大值最小值的索引值
#%%
df = pd.DataFrame(np.random.random(size=(5,3)))
print(df)
df.sub(df.me)
#%%
df = pd.DataFrame({'A':list('aaabbcaabcccbbc'),
'B':[12,345,3,1,45,14,4,52,54,23,235,21,22,3,87]})
df.groupby('A')['B'].nlargest(3).sum(level=0)
#%% md
# 透视表
#%%
df = pd.DataFrame({'A':['one','one','two','three']*3,
'B':['A','B','C']*4,
'C':['foo','foo','foo','bar','bar','bar']*2,
'D':np.random.randn(12),
'E':np.random.randn(12)})
print(df)
pd.pivot_table(df,index=['A','B'])
#%%
pd.pivot_table(df,values=['D'],index=['A','B'])
#%%
pd.pivot_table(df,values=['D'],index=['A','B'],aggfunc=[np.sum,len])
#%%
pd.pivot_table(df,values=['D'],index=['A','B'],
columns=['C'],aggfunc=np.sum)
#%%
pd.pivot_table(df,values=['D'],index=['A','B'],
columns=['C'],aggfunc=np.sum,fill_value=0)
#%% md
# 绝对类型
#%% md
在数据的形式上主要包括数量型和性质型,数据型表示着数据可数范围可变,而性质型表示
范围已经确定不可改变,绝对型数据就是性质型数据中的一种。
#%%
df = pd.DataFrame({"id":[1,2,3,4,5,6],
"raw_grade":['a','b','b','a','a','e']})
df["grade"] = df["raw_grade"].astype("category")
df["grade"].cat.categories = ["very good","good","very bad"]
df
#%%
df.sort_values(by="grade")
#%%
df.groupby("grade").size()
#%% md
# 数据清洗
#%%
df = pd.DataFrame({'From_To': ['LoNDon_paris', 'MAdrid_miLAN', 'londON_StockhOlm',
'Budapest_PaRis', 'Brussels_londOn'],
'FlightNumber': [10045, np.nan, 10065, np.nan, 10085],
'RecentDelays': [[23, 47], [], [24, 43, 87], [13], [67, 32]],
'Airline': ['KLM(!)', '<Air France> (12)', '(British Airways. )',
'12. Air France', '"Swiss Air"']})
df['FlightNumber'] = df['FlightNumber'].interpolate().astype(int)
df
#%%
temp = df.From_To.str.split('_',expand=True)
temp.columns = ['From','To']
temp
#%%
temp['From'] = temp['From'].str.capitalize()
temp['To'] = temp['To'].str.capitalize()
#%%
df['Airline'] = df['Airline'].str.extract('([a-zA-Z\s]+)',expand=False).str.strip()
df
#%%
delays = df['RecentDelays'].apply(pd.Series)
delays.columns = ['delay_{}'.format(n)
for n in range(1,len(delays.columns)+1)]
df = df.drop('RecentDelays',axis=1).join(delays)
df
#%%
df = pd.DataFrame({'name': ['Alice', 'Bob', 'Candy', 'Dany', 'Ella',
'Frank', 'Grace', 'Jenny'],
'grades': [58, 83, 79, 65, 93, 45, 61, 88]})
def choice(x):
if x > 60:
return 1
else:
return 0
df.grades = pd.Series(map(lambda x: choice(x),df.grades))
df
#%%
df = pd.DataFrame({'A': [1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7]})
df.loc[df['A'].shift() !=df['A']]
#%% md
有时候,DataFrame 中不同列之间的数据差距太大,需要对其进行归一化处理。
其中,Max-Min 归一化是简单而常见的一种方式,公式如下:
𝑌=𝑋−𝑋𝑚𝑖𝑛/𝑋𝑚𝑎𝑥−𝑋𝑚𝑖𝑛
#%%
def normalization(df):
numerator = df.sub(df.min())
denominator = (df.max()).sub(df.min())
Y = numerator.div(denominator)
return Y
df = pd.DataFrame(np.random.random(size=(5,3)))
print(df)
normalization(df)
#%% md
# Pandas 绘图操作
#%%
%matplotlib inline
ts = pd.Series(np.random.randn(100),index=pd.date_range('today',periods=100))
ts = ts.cumsum()
ts
#%%
df = pd.DataFrame(np.random.randn(100,4),index=ts.index,columns=['A','B','C','D'])
df = df.cumsum()
df.plot()
#%%
df = pd.DataFrame({"xs": [1, 5, 2, 8, 1], "ys": [4, 2, 1, 9, 6]})
df = df.cumsum()
df.plot("xs","ys",color='red',marker="*")
#%%
df = pd.DataFrame({"revenue": [57, 68, 63, 71, 72, 90, 80, 62, 59, 51, 47, 52],
"advertising": [2.1, 1.9, 2.7, 3.0, 3.6, 3.2, 2.7, 2.4, 1.8, 1.6, 1.3, 1.9],
"month": range(12)
})
ax = df.plot.bar("month","revenue",color = "yellow")
df.plot("month","advertising",secondary_y=True,ax =ax )