核心数据结构
- Series一维带标签数组
"""Series 创建"""
# Build a Series from five random normal draws, labelled 'a' through 'e'.
series = pd.Series(data=np.random.randn(5), index=["a", "b", "c", "d", "e"])
# Build a Series from a plain Python list carrying the same labels.
series = pd.Series(data=[1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"])
# Build a Series from a mapping; the keys become the index labels.
series = pd.Series({"a": 10, "b": 20, "c": 30, "d": 40, "e": 50})
# Broadcast the scalar 6 to every label of the index.
series = pd.Series(data=6, index=["a", "b", "c", "d", "e"])
"""属性和方法"""
# Inspect the basic attributes of `series`.
print(series.index)  # index labels
print(series.name)  # name attached to the Series
print(series.shape)  # shape tuple
print(series.dtypes)  # dtype of the stored values
print(series.values)  # underlying array of values
# Common methods; each returns a new object and the result is discarded here.
series.head(2)  # first two entries
series.tail(2)  # last two entries
series.describe()  # summary statistics
series.sort_values()  # sorted by value
# NOTE(review): `new_indexs` is not defined in this snippet — presumably a
# list of index labels to conform to; confirm before running.
series.reindex(new_indexs)
"""索引"""
# `series` is labelled with strings, so integer keys in `series[...]` rely on
# pandas' deprecated positional fallback. Positional access is spelled out
# with .iloc instead; label access keeps using plain [].
print("series.iloc[0]:{0}".format(series.iloc[0]))
print("series.iloc[0:4]:{0}".format(series.iloc[0:4]))
print("series['a']:{0}".format(series["a"]))
# Membership tests and .get() operate on the index labels.
print("a in series:{0}".format("a" in series))
print("series.get('a'):{0}".format(series.get('a')))
# Arithmetic aligns on labels: labels present in only one operand give NaN.
series1 = pd.Series(np.random.randn(6), index=list("abcdef"))
series2 = pd.Series(np.random.randn(5), index=list("acfgj"))
series1 + series2
- DataFrame二维带标签数组
"""创建DataFrame对象"""
# 6x4 frame of random normal values with explicit row and column labels.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=["A", "B", "C", "D", "E", "F"],
    columns=["one", "two", "three", "four"],
)
# Frame from a mapping of column name -> list of values.
df = pd.DataFrame(
    {"A": list(range(6)), "B": list(range(1, 7)), "C": list(range(2, 8))},
    index=["a", "b", "c", "d", "e", "f"],
)
# Frame from nested lists; row and column labels default to 0..n-1.
df = pd.DataFrame(data=[[1, 2, 3, 4], [2, 3, 4, 5], [3, 4, 5, 6]])
# Frame from a list of row tuples with explicit labels.
df = pd.DataFrame(
    data=[(1, 2, 3, 4), (5, 6, 7, 8)],
    index=["A", "B"],
    columns=["one", "two", "three", "four"],
)
# Frame from a list of record dictionaries; the keys become the columns.
df = pd.DataFrame(data=[{"A": 1, "B": 2}, {"A": 2, "B": 4}])
# Build a single-column frame from `series`, using the labels a-e as the index.
df = pd.DataFrame(series,index=list("abcde"))
"""属性和方法"""
# Quick reference of DataFrame attributes and column/row manipulation methods.
# NOTE(review): the method calls below are written without arguments and
# `col` is not defined here — they appear to list names rather than runnable
# calls; supply real arguments before executing.
df.index  # row labels
df.columns  # column labels
df.pop()  # remove a column and return it
del df[col]  # delete a column in place
df.insert()  # insert a column at a given position
df.assign()  # return a new frame with added columns
df.reindex()  # conform the frame to new labels
df.drop()  # return a frame without the given labels
"""索引"""
# Quick reference of DataFrame selection idioms. `col`, `label`,
# `index_label`, `bool_vector`, `df1` and `df2` are placeholders.
df[col]  # select a column by name
df.loc[label]  # select rows by label (placeholder spelling fixed)
df.iloc[index_label]  # select rows by integer position
df[bool_vector]  # keep the rows where the boolean vector is True
df.swaplevel()  # swap two levels of a MultiIndex
# DataFrame.sortlevel() was removed from pandas; sort_index(level=0) is the
# replacement for its default behaviour.
df.sort_index(level=0)
df1 + df2  # element-wise addition aligned on both axes
- Panel三维带标签数组(使用较少,且已在 pandas 0.25 中移除;高维数据请改用带 MultiIndex 的 DataFrame)
索引
- 层次化索引
层次化索引可以使数据在一个轴上有多个索引级别。即以二维方式表达高维数据,使数据组织方式更清晰。
"""Series多层索引"""
# Two parallel label arrays: the outer level first, the inner level second.
a = [list("aaabbcc"), [1, 2, 3, 1, 2, 2, 3]]
# Pair the two arrays element-wise into (outer, inner) tuples.
tuples = list(zip(a[0], a[1]))
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])
# Seven random values addressed by the two-level index.
s = pd.Series(data=np.random.randn(7), index=index)
# Distinct labels of the outermost level.
s.index.levels[0]
"""DataFrame多层索引"""
# 4x3 frame of random integers in [1, 10) with two-level labels on both axes:
# rows are (letter, number) pairs, columns are (name, colour) pairs.
df = pd.DataFrame(
    data=np.random.randint(1, 10, (4, 3)),
    index=[list("aabb"), [1, 2, 1, 2]],
    columns=[["one", "one", "two"], ["blue", "red", "blue"]],
)
"""索引交换及排序"""
# `name1` and `name2` are placeholders for the two index levels to exchange.
df.swaplevel(name1, name2)
# DataFrame.sortlevel() was removed from pandas; sort on a level with
# sort_index(level=...).
df.sort_index(level=0)
# The `level` keyword of reductions such as sum() was removed as well; group
# on the index level and aggregate instead.
df.groupby(level=0).sum()
"""索引与列的转换"""
# Move columns into the index and back again.
# `cols` is a placeholder — presumably a column name or a list of names.
df.set_index(cols)  # returns a new frame indexed by `cols`
df.reset_index()  # returns a new frame with the index turned back into columns
基础运算
# Quick reference of element-wise and ordering operations.
# NOTE(review): the calls are written without arguments — they appear to list
# method names rather than runnable calls.
df.apply()  # apply a function along an axis of the frame
# NOTE(review): newer pandas releases appear to favour DataFrame.map over
# applymap — confirm against the installed version.
df.applymap()  # apply a function to every element
df.sort_values()  # sort by the values of one or more columns
df.sort_index()  # sort by the index labels
series.unique()  # distinct values
series.value_counts()  # frequency of each distinct value
series.isin()  # boolean mask of membership in a collection
分组计算
"""对Series分组"""
# `col`, `col1` and `col2` are placeholders for column names.
# Group one column's values by the values of another column.
df[col].groupby(df[col1])
# Group one column's values by the combination of two key columns.
df[col].groupby([df[col1],df[col2]])
"""对DataFrame分组"""
# Group the whole frame by a single column, or by several columns.
df.groupby(col)
df.groupby([col1,col2])
"""获取分组元素个数"""
# size() reports the number of rows in each group.
df.groupby([col1,col2]).size()
"""对分组进行迭代"""
# Iterating a GroupBy yields (group key, sub-frame) pairs. The loop body must
# be indented; it was flush-left, which is a syntax error.
for name, group in df.groupby(col):
    print(name)
    print(group)
"""将分组转化为字典"""
# Materialise the groups as a {group key: sub-frame} dictionary.
dict(list(df.groupby(col)))
"""按列分组"""
# groupby(..., axis=1) is deprecated; pandas recommends transposing and
# grouping the rows instead. Each group is then a slice of the transposed
# frame — call .T on a group to restore the original orientation.
df.T.groupby(df.dtypes)
"""通过字典分组"""
# The mapping assigns every column name to a group label.
df.T.groupby(dict(col1="red", col2="red", col3='blue', col4='blue'))
"""按索引级别分组"""
# Two-level column index: country on the outer level, a letter code inside.
columns = pd.MultiIndex.from_arrays(
    [['China', 'USA', 'China', 'USA', 'China'],
     ['A', 'A', 'B', 'C', 'B']],
    names=['country', 'index'])
df = pd.DataFrame(np.random.randint(1, 10, (5, 5)), columns=columns)
# groupby(..., axis=1) is deprecated; transpose so the column levels become
# row levels and group on the 'country' level there.
df.T.groupby(level='country')
"""分组重置索引index"""
# A GroupBy object has no reset_index(); aggregate first (sum() is used as an
# example) and reset the index of the aggregated frame so that the group key
# becomes an ordinary column again.
df.groupby(col).sum().reset_index()
# Alternatively ask groupby to keep the key as a column from the start.
df.groupby(col, as_index=False)
聚合计算
"""内置聚合函数"""
# Built-in aggregations applied per group; `df.col` is attribute access to a
# column named "col", used here as a placeholder grouping key.
df.groupby(df.col).sum()  # per-group sum
df.groupby(df.col).min()  # per-group minimum
df.groupby(df.col).max()  # per-group maximum
df.groupby(df.col).mean()  # per-group mean
df.groupby(df.col).std()  # per-group standard deviation
df.groupby(df.col).describe()  # per-group summary statistics
"""自定义聚合函数"""
def peak(s):
    """Return the spread of ``s``: its maximum minus its minimum.

    Args:
        s: Any object exposing ``max()`` and ``min()`` (for example a pandas
            Series or a NumPy array).

    Returns:
        The difference ``s.max() - s.min()``.
    """
    return s.max() - s.min()
# Aggregate every group with the user-defined `peak` function.
df.groupby(df.col).agg(peak)
"""应用多个聚合函数"""
# A list applies several aggregations at once; built-ins are given by name.
df.groupby(df.col).agg(['mean', 'std', peak])
"""给聚合后的列取名"""
# Each tuple is (output column name, function). The pairs were reversed, which
# made pandas treat the intended names as the functions to call.
df.groupby(df.col).agg([('col_mean', 'mean'), ('col_std', 'std'), ('col_peak', peak)])
"""对不同列应用不同的聚合函数"""
# A mapping of column -> aggregations selects different functions per column.
df.groupby([col1, col2]).agg(dict(col1=['mean', peak], col2=['sum', 'std']))
分组运算和转换
# Sample frame: two string key columns and two random integer data columns.
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randint(1, 10, 5),
                   'data2': np.random.randint(1, 10, 5)})
"""给每行添加一个以key1分组后的平均值"""
# Average only the numeric columns: including the string column `key2` makes
# mean() raise on current pandas instead of silently dropping it.
k1_mean = df.groupby('key1')[['data1', 'data2']].mean().add_prefix('mean_')
# Attach the per-group means to every row by joining on the group key.
pd.merge(df, k1_mean, left_on='key1', right_index=True)
"""transform简化处理"""
# transform returns a result aligned with the original rows, so no merge is
# needed. The aggregation is passed by name rather than as np.mean.
k1_mean = df.groupby('key1')[['data1', 'data2']].transform('mean').add_prefix('mean_')
df[k1_mean.columns] = k1_mean
"""距平化(与平均值的差值)"""
# 5x5 frame of random integers in [1, 10), one row per person.
df = pd.DataFrame(np.random.randint(1, 10, (5, 5)),
                  columns=['a', 'b', 'c', 'd', 'e'],
                  index=['Alice', 'Bob', 'Candy', 'Dark', 'Emily'])


def demean(s):
    """Return ``s`` with its own mean subtracted from every element.

    Args:
        s: Any object supporting ``mean()`` and subtraction (for example a
            pandas Series).

    Returns:
        ``s - s.mean()``.
    """
    return s - s.mean()


# One group label per row of `df`.
key = ['one', 'one', 'two', 'one', 'two']
# Subtract each group's column means from the rows belonging to that group.
demeaned = df.groupby(key).transform(demean)
"""apply函数(逐行或逐列处理数据)"""
# Sample frame: two string key columns and two random integer data columns.
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a', 'a', 'a', 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one', 'one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randint(1, 10, 10),
                   'data2': np.random.randint(1, 10, 10)})


def top(df, n=2, column='data1'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Args:
        df: Frame to select rows from.
        n: Number of leading rows to keep after sorting.
        column: Name of the column to sort by, in descending order.

    Returns:
        The first ``n`` rows of ``df`` sorted by ``column`` descending.
    """
    return df.sort_values(by=column, ascending=False).iloc[:n]


# Keep the three rows with the largest `data2` inside every `key1` group.
df.groupby('key1').apply(top, n=3, column="data2")
数据导入导出
时间日期
数据可视化
创建数据对象
"""创建数据对象"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Series containing a missing value (NaN).
data = pd.Series([1, 2, 3, np.nan, 5, 6])
# Six consecutive daily timestamps starting on 2020-06-08.
date = pd.date_range("20200608", periods=6)
# 6x4 frame of random normal values indexed by those dates.
df = pd.DataFrame(np.random.randn(6, 4), index=date, columns=list("ABCD"))
# Frame built from a mapping: scalars (A, B, E) are broadcast to the length
# of the list-like columns. All list-like columns must share one length, so F
# is given four entries to match C and D (eight entries raised ValueError).
df = pd.DataFrame(dict(A=1,
                       B=pd.Timestamp("20200608"),
                       C=list(range(4)),
                       D=np.arange(5, 9),
                       E="text",
                       F=list("ABCD")))
# Quick reference for inspecting a frame. `n` is a placeholder row count.
df.values  # underlying array of values
df.dtypes  # dtype of every column
df.A  # attribute access to column "A"
df.shape  # (rows, columns)
df.head(n)  # first n rows
df.tail(n)  # last n rows
df.index  # row labels
df.columns  # column labels (was misspelled `colums`, an AttributeError)
df.describe()  # summary statistics
df.T  # transposed frame
df.sort_index(axis=1, ascending=False)  # columns in descending label order
df.sort_values(by="C")  # rows ordered by the values of column "C"
"""数据访问:标签访问"""
# Label-based selection with .loc / .at.
# NOTE(review): the date strings presumably target the date-indexed frame
# built from pd.date_range — confirm which `df` is in scope.
df.loc["20200608"]  # one row by label
df.loc[:, "A"]  # one column, all rows
df.loc["20200608":"20200612", "A":"B"]  # label slices include both ends
df.loc["20200608":"20200610", ["A", "C"]]  # row slice with a column list
df.loc["20200608", "A"]  # single cell
# .at takes a (row label, column label) pair; the closing parenthesis of
# pd.Timestamp was misplaced so "A" was passed to Timestamp instead.
df.at[pd.Timestamp("20200608"), "A"]
"""数据访问:位置访问"""
# Position-based selection with .iloc / .iat (integer positions).
df.iloc[1]  # second row
df.iloc[:,1]  # second column, all rows
df.iloc[1:5,1:3]  # row and column slices; the end position is excluded
df.iloc[1,1]  # single cell by position
df.iat[1,1]  # single cell by position via the scalar accessor
"""布尔索引"""
# Boolean selection; `value` is a placeholder threshold.
df[df > value]  # element-wise mask over the whole frame
df[df.A > value]  # keep the rows where column "A" exceeds the threshold
df[df.F.isin(["A","C"])]  # keep the rows whose "F" value is "A" or "C"
"""处理丢失数据"""
# Drop every row that contains at least one missing value.
df.dropna(axis=0, how="any")
# Replace missing values with 5. The redundant `method=None` is dropped: the
# keyword is deprecated in recent pandas and None was already its default.
df.fillna(value=5)
pd.isnull(df)  # boolean mask of missing values
# Descriptive statistics and element-wise helpers.
df.mean()
df.sum()
# NOTE(review): sub() needs the operand to subtract — placeholder call.
df.sub()
df.cumsum()
df.apply(func)  # `func` is a placeholder callable
df.iloc[0].value_counts()  # frequency of each value in the first row
df.iloc[0].mode()  # most frequent value(s) in the first row
"""数据合并"""
# Concatenate row pieces. Every piece is taken as a slice so that it stays a
# DataFrame; mixing single-row Series with frames does not stack them as rows.
pd.concat([df.iloc[0:1], df.iloc[2:4], df.iloc[6:7]])
# NOTE(review): merge() needs the two frames to join — placeholder call.
pd.merge()
# DataFrame.append was removed in pandas 2.0; append a row with pd.concat.
pd.concat([df, df.iloc[0:1]])
"""分组统计"""
# Sum per group, keyed by one column and then by two columns.
df.groupby("A").sum()
df.groupby(["A","B"]).sum()
# Two-level index built from `tuples`; that name is defined elsewhere in
# these notes (a list of label pairs).
index = pd.MultiIndex.from_tuples(tuples,names=["first","second"])
df.stack()  # pivot column labels into the innermost row-index level
df.unstack()  # pivot the innermost row-index level into column labels
"""数据透视"""
# Pivot table of column "A", with rows keyed by ("D", "E") and one output
# column per distinct value of "C".
pd.pivot_table(df,values="A",index=["D","E"],columns=["C"])
"""时间序列"""
# 600 timestamps, one per second, starting on 2020-06-09.
pd.date_range("20200609", periods=600, freq="s")
# Monthly periods spanning 2017Q1 through 2017Q2.
pd.period_range(start=pd.Period('2017Q1', freq='Q'), end=pd.Period('2017Q2', freq='Q'), freq='M')
# resample() no longer accepts `how=`; call the aggregation on the resampler.
df.resample("2Min").sum()
"""类别数据"""
# Store column "D" as a categorical column named "grade".
df["grade"] = df["D"].astype("category")
# Assigning to `.cat.categories` is no longer supported; rename_categories
# returns the relabelled column. The new list must contain one name per
# existing category. The label "goog" was a typo for "good".
df["grade"] = df["grade"].cat.rename_categories(["good", "very good"])
"""数据读写"""
# Write the frame, including its index, to a CSV file.
df.to_csv("data.csv")
# Read it back, using the first CSV column as the index. `index_col` was an
# undefined bare name passed positionally, i.e. in the slot of the separator
# argument.
df = pd.read_csv("data.csv", index_col=0)