数据离散化
自动分组
pd.qcut(data, q):data是想要分组的数据,q是分组的个数(按分位数划分,使每组的样本数量大致相等)
series.value_counts():统计每个分组(取值)出现的次数
自定义分组
pd.cut(data,bins) #bins自己指定分组区间
one-hot编码矩阵
pandas.get_dummies(data, prefix=None)
prefix:生成的one-hot列名的前缀(列名形如 prefix_类别)
示例:
import pandas as pd

# Load the CSV and pick out the p_change column.
data = pd.read_csv("./stock_day.csv")
data_p = data["p_change"]
# Automatic (quantile-based) binning of p_change into 10 bins.
p_counts = pd.qcut(data_p, q=10)
# Build the one-hot (dummy variable) matrix; column names are prefixed with "rise".
dummies = pd.get_dummies(p_counts, prefix="rise")
数据合并
pd.concat([data1, data2], axis=1)
按照行或列进行合并:axis=0 表示按列索引对齐、纵向(上下)拼接;axis=1 表示按行索引对齐、横向(左右)拼接
pd.merge(left, right, how='inner', on=None)
left:表1
right:表2
how:按照何种方式合并(inner,outer,left,right)
on:指定的共同键
示例(接上面):
# Concatenate side by side (axis=1): rows are aligned on the row index.
pd.concat(objs=[data, dummies], axis=1)
merge示例:
# Two small frames that share the composite key (key1, key2).
left = pd.DataFrame(
    {
        "key1": ["K0", "K0", "K1", "K2"],
        "key2": ["K0", "K1", "K0", "K1"],
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
    }
)
right = pd.DataFrame(
    {
        "key1": ["K0", "K1", "K1", "K2"],
        "key2": ["K0", "K0", "K0", "K0"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    }
)

# Inner join: keep only key pairs present in both frames.
left.merge(right, how="inner", on=["key1", "key2"])
# Left join: keep every row of `left`; unmatched C/D values become NaN.
left.merge(right, how="left", on=["key1", "key2"])
# Right join: keep every row of `right`; unmatched A/B values become NaN.
left.merge(right, how="right", on=["key1", "key2"])
# Outer join: keep the key pairs of both frames.
left.merge(right, how="outer", on=["key1", "key2"])
交叉表与透视表
import numpy as np

# Derive the weekday number (Monday=0 ... Sunday=6) from the index and store
# it as a new 'week' column.
# NOTE(review): assumes data.index holds date-like values — confirm.
date = pd.to_datetime(data.index).weekday
data['week'] = date
# Add a 'posi_neg' flag column: 1 where p_change > 0, otherwise 0.
data['posi_neg'] = np.where(data['p_change'] > 0, 1, 0)
data['posi_neg']  # notebook-style display of the new column
# Cross-tabulate: one row per weekday, one column per posi_neg value.
count = pd.crosstab(data['week'], data['posi_neg'])
# Per-weekday row totals. Named `row_totals` rather than `sum` so the
# builtin sum() is not shadowed for the rest of the script.
row_totals = count.sum(axis=1).astype(np.float32)
# Divide each row by its total to get per-weekday proportions. `pro` holds
# the proportion table itself; plotting is a separate step so the name is
# not bound to the Axes object returned by .plot().
pro = count.div(row_totals, axis=0)
pro.plot(kind='bar', stacked=True)