DataFrame和Series的基本操作

链接:https://pan.baidu.com/s/1Hy_4a_UAJ6hboPPoTGOpvw
提取码:zlys

#%%

print("hello jupyterNotebook")

#%% md

# 一、Series操作

#%% md

导入pandas模块,并约定模块简称为pd

#%%

import pandas as pd
print(pd.__version__)

#%% md

Pandas 的数据结构:Pandas 主要有 Series(一维数组),
DataFrame(二维数组),Panel(三维数组),Panel4D(四维数组),
PanelND(更多维数组)等数据结构。其中 Series 和 DataFrame 应用的最为广泛。

#%%

#创建 Series 语法:s = pd.Series(data, index=index),可以通过多种方式进行创建,以下介绍了 3 个常用方法

#%%

arr = [0,1,2,3,4]
s1 = pd.Series(arr)#如果不指定索引,则默认从0开始
s1

#%%

import numpy as np
n = np.random.random(5) #创建一个随机Ndarray数组
index = ['a','b','c','d','e']
s2 = pd.Series(n,index=index)
s2

#%%

d = {'a':1,'b':2,'c':3,'d':4,'e':5} #定义示例字典
s3 = pd.Series(d)
s3

#%%

print(s1)
s1.index = ['A','B','C','D','E']
s1

#%%

s4 = s3.append(s1)
s4

#%%

print(s4)
s4 = s4.drop('e')
s4

#%%

s4['A'] = 6
print(s4)
s4['B']

#%%

#series加减乘除法
#都是按照索引对应计算,如果索引不同则填充为NAN(空值).
print(s4.add(s3))
print(s4.sub(s3))
print(s4.mul(s3))
print(s4.div(s3))

#%%

#中位数,求和,最大值,最小值
print(s4.median())
print(s4.sum())
print(s4.max())
print(s4.min())

#%% md

## 二、Dataframe 操作

#%%

dates = pd.date_range('today', periods=6)#定义时间序列为index
num_arr = np.random.randn(6,4) #传入numpy随机数组
columns = ['A','B','C','D']#将列表作为列名
df1 = pd.DataFrame(num_arr,index=dates,columns=columns)
df1

#%%

data = {'animal':['cat','cat','snake','dog','dog','cat','snake','cat','dog','dog'],
        'age':[2.5,3,0.5,np.nan,5,2,4.5,np.nan,7,3],
        'visits':[1,3,2,3,2,3,1,2,1,1],
        'priority':['yes','yes','no','yes','no','no','yes','yes','no','no']}
labels = ['a','b','c','d','e','f','g','h','i','j']
df2 = pd.DataFrame(data,index=labels)
df2

#%%

df2.dtypes

#%%

df2.head()

#%%

df2.index

#%%

df2.tail(3)

#%%

df2.columns

#%%

df2.values

#%%

df2.describe()

#%%

df2.T

#%%

df2[1:3]

#%%

df2.sort_values(by='age')

#%%

df2['age']

#%%

df2.age

#%%

df2[['age','animal']]

#%%

print(df2)
df2.iloc[1:3]#查询2,3行

#%%

df3 = df2.copy()
df3

#%%

df3.isnull()
##%
num = pd.Series([0,1,2,3,4,5,6,7,8,9],index=df3.index)
df3['No.'] = num
df3

#%%

df3.iat[1,1] =2
df3

#%%

df3.loc['f','age'] = 1.5
df3

#%%

#求平均值
df3.mean()

#%%

df3['visits'].sum()

#%%

#将字符串转化为小写字母
string = pd.Series(['A','B','C','Aaba','Baca',
                    np.nan,'CABA','dog','cat'])
print(string)
string.str.lower()

#%%

string.str.upper()

#%%

df4 = df3.copy()
print(df4)
df4.fillna(value=3)

#%%

df5 = df3.copy()
print(df5)
df5.dropna(how='any')#删除任何存在NAN的值

#%%

left = pd.DataFrame({'key':['fool','foo2'],'one':[1,2]})
right = pd.DataFrame({'key': ['foo2', 'foo3'], 'two': [4, 5]})
print(left)
print(right)
#按照key列对齐连接,只存在foo2相同,所以最后变成一行
pd.merge(left,right,on='key',how='left')

#%%

df3.to_csv('animal.csv')
print("写入成功!")

#%%

df3.to_excel('animal.xlsx',sheet_name='Sheet1')
print("写入成功!")

#%%

pd.read_excel('animal.xlsx','Sheet1',index_col=None,na_values=['NA'])

#%%

dti = pd.date_range(start='2018-01-01',end='2018-12-31',freq='D')
s = pd.Series(np.random.rand(len(dti)),index=dti)
s

#%%

s[s.index.weekday==2].sum()

#%%

s.resample('M').mean()

#%%

s = pd.date_range('today',periods=100,freq='S')
ts = pd.Series(np.random.randint(0,500,len(s)),index=s)
ts.resample('Min').sum()

#%%

#UTC世界时间标准
s = pd.date_range('today',periods=1,freq='D')
ts = pd.Series(np.random.randn(len(s)),s)
ts_utc = ts.tz_localize('UTC')
ts_utc

#%%

ts_utc.tz_convert('Asia/Shanghai')

#%%

rng = pd.date_range('1/1/2018',periods=5,freq='M')
ts = pd.Series(np.random.randn(len(rng)),index=rng)
print(ts)
ps = ts.to_period()
print(ps)
ps.to_timestamp()

#%% md

# 三、Series多重索引

#%%

import pandas as pd
import numpy as np
letters = ['A','B','C']
numbers = list(range(10))
mi = pd.MultiIndex.from_product([letters,numbers])
s = pd.Series(np.random.rand(30),index=mi)
s

#%%

s.loc[:,[1,3,6]]

#%%

s.loc[pd.IndexSlice[:'B',5:]]

#%% md

# 四、DataFrame 多重索引

#%%

frmae = pd.DataFrame(np.arange(12).reshape(6,2),
                     index=[list('AAABBB'),list('123123')],
                     columns=['hello','shiyanlou']
                     )
frmae

#%%

frmae.index.names = ['first','second']
frmae

#%%

frmae.groupby('first').sum()

#%%

print(frmae)
frmae.stack()

#%%

print(frmae)
frmae.unstack()

#%%

data = {'animal':['cat','cat','snake','dog','dog','cat','snake','cat','dog','dog'],
        'age':[2.5,3,0.5,np.nan,5,2,4.5,np.nan,7,3],
        'visits':[1,3,2,3,2,3,1,2,1,1],
        'priority':['yes','yes','no','yes','no','no','yes','yes','no','no']}
labels = ['a','b','c','d','e','f','g','h','i','j']
df = pd.DataFrame(data,index=labels)
df[df['age'] > 3]

#%%

df.iloc[2:4,1:3]

#%%

df = pd.DataFrame(data,index=labels)
df[(df['animal'] == 'cat') & (df['age'] <3)]

#%%

df[df['animal'].isin(['cat','dog'])]
df2 =df.copy()

#%%

df.loc[df2.index[[3,4,8]],['animal','age']]

#%%

#false 为降序,true 为升序
df.sort_values(by=['age','visits'],ascending=[False,True])

#%%

df['priority'].map({'yes':True,'no':False})

#%%


df.groupby('animal').sum()

#%%

import pandas as pd
import numpy as np
temp_df1 = pd.DataFrame(np.random.randn(5,4))
temp_df2 = pd.DataFrame(np.random.randn(5,4))
temp_df3 = pd.DataFrame(np.random.randn(5,4))
print(temp_df1)
print(temp_df2)
print(temp_df3)
pieces = [temp_df1,temp_df2,temp_df3]
pd.concat(pieces)

#%%

df = pd.DataFrame(np.random.random(size=(5,10)),columns=list('abcdefghij'))
print(df)
df.sum().idxmin()
#idxmax(),idmin()为Series 函数返回最大值最小值的索引值

#%%

df = pd.DataFrame(np.random.random(size=(5,3)))
print(df)
df.sub(df.me)

#%%

df = pd.DataFrame({'A':list('aaabbcaabcccbbc'),
                   'B':[12,345,3,1,45,14,4,52,54,23,235,21,22,3,87]})
df.groupby('A')['B'].nlargest(3).sum(level=0)

#%% md

# 透视表

#%%

df = pd.DataFrame({'A':['one','one','two','three']*3,
                   'B':['A','B','C']*4,
                   'C':['foo','foo','foo','bar','bar','bar']*2,
                   'D':np.random.randn(12),
                   'E':np.random.randn(12)})
print(df)
pd.pivot_table(df,index=['A','B'])

#%%

pd.pivot_table(df,values=['D'],index=['A','B'])

#%%

pd.pivot_table(df,values=['D'],index=['A','B'],aggfunc=[np.sum,len])

#%%

pd.pivot_table(df,values=['D'],index=['A','B'],
               columns=['C'],aggfunc=np.sum)

#%%

pd.pivot_table(df,values=['D'],index=['A','B'],
               columns=['C'],aggfunc=np.sum,fill_value=0)

#%% md

# 绝对类型

#%% md

在数据的形式上主要包括数量型和性质型,数据型表示着数据可数范围可变,而性质型表示
范围已经确定不可改变,绝对型数据就是性质型数据中的一种。

#%%

df =  pd.DataFrame({"id":[1,2,3,4,5,6],
                 "raw_grade":['a','b','b','a','a','e']})
df["grade"] = df["raw_grade"].astype("category")

df["grade"].cat.categories = ["very good","good","very bad"]
df

#%%

df.sort_values(by="grade")

#%%

df.groupby("grade").size()

#%% md

# 数据清洗

#%%

df = pd.DataFrame({'From_To': ['LoNDon_paris', 'MAdrid_miLAN', 'londON_StockhOlm',
                               'Budapest_PaRis', 'Brussels_londOn'],
                   'FlightNumber': [10045, np.nan, 10065, np.nan, 10085],
                   'RecentDelays': [[23, 47], [], [24, 43, 87], [13], [67, 32]],
                   'Airline': ['KLM(!)', '<Air France> (12)', '(British Airways. )',
                               '12. Air France', '"Swiss Air"']})
df['FlightNumber'] = df['FlightNumber'].interpolate().astype(int)
df

#%%

temp = df.From_To.str.split('_',expand=True)
temp.columns = ['From','To']
temp

#%%

temp['From'] = temp['From'].str.capitalize()
temp['To'] = temp['To'].str.capitalize()

#%%

df['Airline'] = df['Airline'].str.extract('([a-zA-Z\s]+)',expand=False).str.strip()
df

#%%

delays = df['RecentDelays'].apply(pd.Series)
delays.columns = ['delay_{}'.format(n)
                  for n in range(1,len(delays.columns)+1)]
df = df.drop('RecentDelays',axis=1).join(delays)
df

#%%

df = pd.DataFrame({'name': ['Alice', 'Bob', 'Candy', 'Dany', 'Ella',
                            'Frank', 'Grace', 'Jenny'],
                   'grades': [58, 83, 79, 65, 93, 45, 61, 88]})
def choice(x):
    if x > 60:
        return 1
    else:
        return 0
df.grades = pd.Series(map(lambda x: choice(x),df.grades))
df

#%%

df = pd.DataFrame({'A': [1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7]})
df.loc[df['A'].shift() !=df['A']]

#%% md

有时候,DataFrame 中不同列之间的数据差距太大,需要对其进行归一化处理。

其中,Max-Min 归一化是简单而常见的一种方式,公式如下:

𝑌=𝑋−𝑋𝑚𝑖𝑛/𝑋𝑚𝑎𝑥−𝑋𝑚𝑖𝑛

#%%

def normalization(df):
    numerator = df.sub(df.min())
    denominator = (df.max()).sub(df.min())
    Y = numerator.div(denominator)
    return Y
df = pd.DataFrame(np.random.random(size=(5,3)))
print(df)
normalization(df)

#%% md

# Pandas 绘图操作

#%%

%matplotlib inline
ts = pd.Series(np.random.randn(100),index=pd.date_range('today',periods=100))
ts = ts.cumsum()
ts

#%%

df = pd.DataFrame(np.random.randn(100,4),index=ts.index,columns=['A','B','C','D'])
df = df.cumsum()
df.plot()

#%%

df = pd.DataFrame({"xs": [1, 5, 2, 8, 1], "ys": [4, 2, 1, 9, 6]})
df = df.cumsum()
df.plot("xs","ys",color='red',marker="*")

#%%

df = pd.DataFrame({"revenue": [57, 68, 63, 71, 72, 90, 80, 62, 59, 51, 47, 52],
                   "advertising": [2.1, 1.9, 2.7, 3.0, 3.6, 3.2, 2.7, 2.4, 1.8, 1.6, 1.3, 1.9],
                   "month": range(12)
                   })
ax = df.plot.bar("month","revenue",color = "yellow")
df.plot("month","advertising",secondary_y=True,ax =ax )

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值