python_pandas_常用例子

常规设置

# 显示所有列
pd.set_option('display.max_columns', None)

# 显示所有行
pd.set_option('display.max_rows', None)

# 设置打印宽度
pd.set_option('display.width',1000)

# 设置最大列宽,默认为50
pd.set_option('max_colwidth',100)

分组与透视

# -*- coding:utf-8 -*- 
import pandas as pd 
import numpy as np 
import openpyxl


# -1是自动行数
df = pd.DataFrame(np.arange(100).reshape(-1,4),columns=list('abcd'))

# 数值切片
bins = [0,60,80,float('inf')]
df['cut'] = pd.cut(df['d'],bins=bins,right=True)


# apply
ap = df.groupby('cut').apply(lambda frame:frame.sort_values('a',ascending=False))
print('\nap\n',ap)


# agg
sm = df.groupby('cut')['a'].sum()
print('\nsm\n',sm)

ag1 = df.groupby('cut')['a'].aggregate(lambda se:se.sum())
print('\nag1\n',ag1)

ag2 = df.groupby('cut').aggregate(a=('a' ,np.sum),b=('b','count'))
print('\nag2\n',ag2)

ag3 = df.groupby('cut').aggregate(
    {
    	'a' : [np.sum,np.mean],
    	'b' : [('s',np.sum),('m',np.mean)],
    	'c' : {'sm':np.sum,'ma':np.mean},
    }
)
print('\nag3\n',ag3)


# pivot
pvt = pd.pivot_table(df,index='cut',columns=None,aggfunc='count')
print('\npvt\n',pvt)

Excel

pip install xlrd
pip install openpyxl
raise XLRDError(FILE_FORMAT_DESCRIPTIONS[file_format]+‘; not supported‘)

pip uninstall xlrd
pip install xlrd==1.2.0
import openpyxl,xlsxwriter

df = pd.DataFrame()
df['a'] = [1,2,3,4,5]
df['b'] = [4,5,6,7,8]

# 读取
sale = pd.read_excel('z01_数据统计表.xlsx',sheet_name='销售',header=0)

# 直接保存
sale.to_excel('sale.xlsx')

# 构造写入对象
writer = pd.ExcelWriter('new.xlsx')

# 保留原来的表
book = openpyxl.load_workbook(writer.path)
writer.book = book # 保存原来数据需要

# 删除表
print(book.sheetnames)
book.remove(book['sheet1'])
book.save(writer.path)

# 追加表保存
df.to_excel(excel_writer=writer,sheet_name='st1',merge_cells=True,engine='xlsxwriter')
df.to_excel(excel_writer=writer,sheet_name='st2',merge_cells=Trueengine='xlsxwriter')

# 保存退出
writer.save()
writer.close()

降维操作

# -*- coding:utf-8 -*- 
import pandas as pd 
import numpy as np 

# -1是自动行数
df = pd.DataFrame(np.arange(100).reshape(-1,4),columns=list('qwer'))

# 数值切片
bins = [0,60,80,float('inf')]
df['cut'] = pd.cut(df.r,bins=bins,right=True)

# 多维分组
gpd = df.groupby('cut').agg({
    'e' : [np.sum,np.mean],
    'q' : {'sum':np.sum,'mean':np.mean},
    })
print(gpd)

# 索引直接降维度
gpd1 = gpd.copy(deep=True)
gpd1.columns = gpd1.columns.droplevel(0)
print(gpd1.columns)

# 索引合并到1维
gpd2 = gpd.copy(deep=True)
gpd2.columns = ['_'.join(x) for x in gpd2.columns.ravel()]
print(gpd2.columns)

自定义函数

序列的连续性

# -*- coding:utf-8 -*- 
import numpy as np 
import pandas as pd 


# 计算连续性函数
def calc_conti(ls):
    ls = list(ls)
    conti_ls = []

    pre_v = None
    conti_num = 1
    
    for v in ls:
        if v == pre_v:
            conti_num = conti_num+1
        else :
            conti_num = 1
        pre_v = v
        conti_ls.append(conti_num)

    return conti_ls

序列的macd

# -*- coding:utf-8 -*- 
import numpy as np 
import pandas as pd 


def calc_EMA(df, ema_name, N): 
    '''
    传入df生成ema_name列,返回se
    '''
    df = df.copy()

    if ema_name in df.columns:
        clac_idx = np.where(df[ema_name].isna())[0]
    else:
        df[ema_name] = None
        clac_idx = range(len(df))
        
    for i in clac_idx:
        if i == 0:
            df.ix[i,ema_name]=df.ix[i,'close']
        else:
            df.ix[i,ema_name]=((N-1)*df.ix[i-1,ema_name]+2*df.ix[i,'close']) / (N+1)

    return df



def calc_MACD(df, st=12, lg=26, M=9):
    '''
    传入df,生成dif,dea,macd列,返回df
    index不可以int类型
    '''
    df = df.copy()

    # ema
    df = calc_EMA(df,'ema_12',st)
    df = calc_EMA(df,'ema_26',lg)

    # dif
    if 'dif' in df.columns:
        ept_idx = np.where(df.dif.isna())[0]
        df.ix[ept_idx,'dif'] = df.ix[ept_idx,'ema_12'] - df.ix[ept_idx,'ema_26']
    else:
        df['dif'] = df.ema_12 - df.ema_26

    # dea
    if 'dea' in df.columns:
        clac_idx = np.where(df.dea.isna())[0]
    else:
        df['dea'] = None
        clac_idx = range(len(df))
    for i in clac_idx:
        if i == 0:
            df.ix[i,'dea'] = df.ix[i,'dif']  
        else:
            df.ix[i,'dea'] = ((M-1)*df.ix[i-1,'dea'] + 2*df.ix[i,'dif']) / (M+1)
    
    # macd
    if 'macd' in df.columns:
        ept_idx = np.where(df.macd.isna())[0]
        df.ix[ept_idx,'macd'] =2 * (df.ix[ept_idx,'dif'] - df.ix[ept_idx,'dea'])
    else:
        df['macd'] = 2 * (df.dif - df.dea)

    return df

造数

# coding:utf-8
import pandas as pd 
import numpy as np
import random


def makedata(qty,options=None):
    cols = options.keys()
    df = pd.DataFrame(np.arange(qty*len(cols)).reshape(len(cols),qty),index=cols).T
    for k,v in options.items():
        df[k] = df[k].map(v)
    return df


if __name__=='__main__':
    uuid = lambda x:'uuid%06d'%x
    name = lambda x:random.choice(['张三','李四','王五'])
    clas = lambda x:random.choice(['1班','2班','3班'])
    score = lambda x:random.choice(range(1,101))
    options = {'uuid':uuid,'name':name,'clas':clas,'score':score}

    ret = makedata(2000,options)
    print(ret)

爆裂函数

source.join(source['refered_group_ids'].str.split('@',expand=True).stack().reset_index(level=1,drop=True).rename('rely')).reset_index(drop=True)[['group_id','rely']]
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值