1. pandas介绍
- Series:一维数据结构,由 index 和 value 组成。
- DataFrame:二维数据结构,除了 index 和 value 之外,还拥有 columns。
- date_range():pandas 中常用的函数,用于生成一个固定频率的 DatetimeIndex 时间索引。
原型:date_range(start=None, end=None, periods=None, freq=None, tz=None, normalize=False, name=None, closed=None, **kwargs)
常用参数为start、end、periods、freq。
start:指定生成时间序列的开始时间
end:指定生成时间序列的结束时间
periods:指定生成时间序列的数量
freq:生成频率,默认 'D',可以是 'H'、'D'、'M'、'5H'、'10D'、…
函数调用时至少要指定参数start、end、periods中的两个
# -*- coding: utf-8 -*-
"""
Created on Sun May 23 08:30:05 2021
@author: yangyue
"""
'''
pandas基本介绍:
Series():序列
DataFrame()
'''
import pandas as pd
import numpy as np
# Series: a one-dimensional labelled array; NaN marks a missing entry.
s = pd.Series(data=[1, 3, 6, np.nan, 44, 1])
print('s =\n', s)

# Six consecutive timestamps starting at 2016-01-01.
datas = pd.date_range(start='20160101', periods=6)
print('datas = ', datas)

# DataFrame: rows are labelled by the dates, columns by a, b, c, d.
# (A DataFrame can also be built from a dict.)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=datas,
    columns=list('abcd'),
)
print('df = ', df)

print(df.dtypes)      # dtype of each column
print(df.index)       # row labels
print(df.columns)     # column labels
print(df.values)      # underlying values only
print(df.describe())  # summary statistics such as mean and std
print(df.T)           # transpose
print(df.shape)       # (rows, columns)

# Order the columns by label: descending first, then ascending.
for ascending in (False, True):
    print('排序后:\n', df.sort_index(axis=1, ascending=ascending).values)
2. pandas选择数据
import pandas as pd
import numpy as np
'''
pandas:选择数据
'''
# Sample frame: six daily rows (from 2016-01-01) by four random columns.
datas = pd.date_range(start='20160101', periods=6)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=datas,
    columns=list('abcd'),
)
print(df)

# Plain indexing: a column by name, or a run of rows by a label slice.
print(df['a'])
print(df['2016-01-01':'2016-01-04'])

# ---- Label-based selection (.loc) ----
print(df.loc['20160101'])               # one row
print(df.loc[:, ['a', 'b']])            # every row, two columns
print(df.loc['20160101', ['a', 'b']])   # one row, two columns

# ---- Position-based selection (.iloc) ----
print(df.iloc[3, 1])           # single cell
print(df.iloc[3:5, 1:3])       # rectangular slice
print(df.iloc[[1, 3, 5], :])   # chosen rows, every column

# ---- Mixed label/position selection ----
# The `.ix` example that belonged here was left disabled:
#     df.ix[:3, ['a', 'b']]

print("###################")
# Boolean mask: True where column 'a' is greater than 8.
print(df['a'] > 8)
3. pandas设置数据
import pandas as pd
import numpy as np
'''
pandas:设置数据
'''
# Sample frame: six daily rows (from 2016-01-01) by four random columns.
datas = pd.date_range(start='20160101', periods=6)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=datas,
    columns=list('abcd'),
)
print('df = \n', df)

# Overwrite one cell by integer position (row 2, column 2).
df.iloc[2, 2] = 111
print(df)

# Overwrite one cell by label.
df.loc['20160101', 'a'] = 2222
print(df)

# Zero out every column of the rows whose 'a' value exceeds 4.
df[df['a'] > 4] = 0
print(df)

# Add a constant column.
df['f'] = 0
print(df)

# Add a column from a Series carrying the same six dates as its index.
df['e'] = pd.Series(
    data=[1, 2, 3, 4, 5, 6],
    index=pd.date_range(start='20160101', periods=6),
)
print(df)
# Integer demo frame: 0..23 laid out as six daily rows by four columns.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)

# NaN cannot be stored in an integer column. Cast the two target columns to
# float explicitly instead of relying on implicit upcasting during assignment,
# which recent pandas releases warn about.
df = df.astype({'B': float, 'C': float})
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)

# how='any' drops a row that contains at least one NaN;
# how='all' would only drop rows that are entirely NaN.
print(df.dropna(axis=0, how='any'))
print(df.fillna(value=0))   # replace every NaN with 0
print(df.isnull())          # element-wise missing-value mask
# True when at least one cell in the frame is missing.
print(df.isnull().values.any())
4. pandas读取保存文件
import pandas as pd
# Source CSV and destination pickle, both given as relative paths.
csv_path = 'student.csv'
pickle_path = 'student.pickle'

# Load the table and show it.
data = pd.read_csv(csv_path)
print(data)

# Store the same table in pickle format.
data.to_pickle(pickle_path)
5. pandas concatenating
import pandas as pd
import numpy as np
# ---- Concatenation ----
# Stack three frames vertically; ignore_index=True renumbers the rows 0..8.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)

# ---- join: 'outer' keeps the union of row labels, 'inner' the intersection ----
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
res = pd.concat([df1, df2], axis=1, join='outer')
res = pd.concat([df1, df2], axis=1, join='inner')

# ---- Keep exactly df1's rows ----
# The `join_axes` argument is no longer accepted by pd.concat, so align df2 to
# df1's index first and then concatenate side by side.
res = pd.concat([df1, df2.reindex(df1.index)], axis=1)

# ---- Appending rows ----
# DataFrame.append is not available in current pandas; pd.concat covers the
# same cases.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
res = pd.concat([df1, df2], ignore_index=True)
res = pd.concat([df1, df2, df3])

# Append a Series as one new row: turn it into a one-row frame first.
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)
print(res)