常规设置
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width',1000)
pd.set_option('max_colwidth',100)
分组与透视
import pandas as pd
import numpy as np
import openpyxl
df = pd.DataFrame(np.arange(100).reshape(-1,4),columns=list('abcd'))
bins = [0,60,80,float('inf')]
df['cut'] = pd.cut(df['d'],bins=bins,right=True)
ap = df.groupby('cut').apply(lambda frame:frame.sort_values('a',ascending=False))
print('\nap\n',ap)
sm = df.groupby('cut')['a'].sum()
print('\nsm\n',sm)
ag1 = df.groupby('cut')['a'].aggregate(lambda se:se.sum())
print('\nag1\n',ag1)
ag2 = df.groupby('cut').aggregate(a=('a' ,np.sum),b=('b','count'))
print('\nag2\n',ag2)
ag3 = df.groupby('cut').aggregate(
{
'a' : [np.sum,np.mean],
'b' : [('s',np.sum),('m',np.mean)],
'c' : {'sm':np.sum,'ma':np.mean},
}
)
print('\nag3\n',ag3)
pvt = pd.pivot_table(df,index='cut',columns=None,aggfunc='count')
print('\npvt\n',pvt)
Excel
pip install xlrd
pip install openpyxl
raise XLRDError(FILE_FORMAT_DESCRIPTIONS[file_format]+‘; not supported‘)
pip uninstall xlrd
pip install xlrd==1.2.0
import openpyxl,xlsxwriter
df = pd.DataFrame()
df['a'] = [1,2,3,4,5]
df['b'] = [4,5,6,7,8]
sale = pd.read_excel('z01_数据统计表.xlsx',sheet_name='销售',header=0)
sale.to_excel('sale.xlsx')
writer = pd.ExcelWriter('new.xlsx')
book = openpyxl.load_workbook(writer.path)
writer.book = book
print(book.sheetnames)
book.remove(book['sheet1'])
book.save(writer.path)
df.to_excel(excel_writer=writer,sheet_name='st1',merge_cells=True,engine='xlsxwriter')
df.to_excel(excel_writer=writer,sheet_name='st2',merge_cells=Trueengine='xlsxwriter')
writer.save()
writer.close()
降维操作
import pandas as pd
import numpy as np
df = pd.DataFrame(np.arange(100).reshape(-1,4),columns=list('qwer'))
bins = [0,60,80,float('inf')]
df['cut'] = pd.cut(df.r,bins=bins,right=True)
gpd = df.groupby('cut').agg({
'e' : [np.sum,np.mean],
'q' : {'sum':np.sum,'mean':np.mean},
})
print(gpd)
gpd1 = gpd.copy(deep=True)
gpd1.columns = gpd1.columns.droplevel(0)
print(gpd1.columns)
gpd2 = gpd.copy(deep=True)
gpd2.columns = ['_'.join(x) for x in gpd2.columns.ravel()]
print(gpd2.columns)
自定义函数
序列的连续性
import numpy as np
import pandas as pd
def calc_conti(ls):
ls = list(ls)
conti_ls = []
pre_v = None
conti_num = 1
for v in ls:
if v == pre_v:
conti_num = conti_num+1
else :
conti_num = 1
pre_v = v
conti_ls.append(conti_num)
return conti_ls
序列的macd
import numpy as np
import pandas as pd
def calc_EMA(df, ema_name, N):
'''
传入df生成ema_name列,返回se
'''
df = df.copy()
if ema_name in df.columns:
clac_idx = np.where(df[ema_name].isna())[0]
else:
df[ema_name] = None
clac_idx = range(len(df))
for i in clac_idx:
if i == 0:
df.ix[i,ema_name]=df.ix[i,'close']
else:
df.ix[i,ema_name]=((N-1)*df.ix[i-1,ema_name]+2*df.ix[i,'close']) / (N+1)
return df
def calc_MACD(df, st=12, lg=26, M=9):
'''
传入df,生成dif,dea,macd列,返回df
index不可以int类型
'''
df = df.copy()
df = calc_EMA(df,'ema_12',st)
df = calc_EMA(df,'ema_26',lg)
if 'dif' in df.columns:
ept_idx = np.where(df.dif.isna())[0]
df.ix[ept_idx,'dif'] = df.ix[ept_idx,'ema_12'] - df.ix[ept_idx,'ema_26']
else:
df['dif'] = df.ema_12 - df.ema_26
if 'dea' in df.columns:
clac_idx = np.where(df.dea.isna())[0]
else:
df['dea'] = None
clac_idx = range(len(df))
for i in clac_idx:
if i == 0:
df.ix[i,'dea'] = df.ix[i,'dif']
else:
df.ix[i,'dea'] = ((M-1)*df.ix[i-1,'dea'] + 2*df.ix[i,'dif']) / (M+1)
if 'macd' in df.columns:
ept_idx = np.where(df.macd.isna())[0]
df.ix[ept_idx,'macd'] =2 * (df.ix[ept_idx,'dif'] - df.ix[ept_idx,'dea'])
else:
df['macd'] = 2 * (df.dif - df.dea)
return df
造数
import pandas as pd
import numpy as np
import random
def makedata(qty,options=None):
cols = options.keys()
df = pd.DataFrame(np.arange(qty*len(cols)).reshape(len(cols),qty),index=cols).T
for k,v in options.items():
df[k] = df[k].map(v)
return df
if __name__=='__main__':
uuid = lambda x:'uuid%06d'%x
name = lambda x:random.choice(['张三','李四','王五'])
clas = lambda x:random.choice(['1班','2班','3班'])
score = lambda x:random.choice(range(1,101))
options = {'uuid':uuid,'name':name,'clas':clas,'score':score}
ret = makedata(2000,options)
print(ret)
爆裂函数
source.join(source['refered_group_ids'].str.split('@',expand=True).stack().reset_index(level=1,drop=True).rename('rely')).reset_index(drop=True)[['group_id','rely']]