import pandas as pd
import numpy as np
#加载数据
detail=pd.read_excel('./meal_order_detail.xlsx')
# print(detail.columns)
#pandas 数据分析
# print(detail['amounts'].max())
# print(detail['amounts'].min())
# print(detail['amounts'].var())
# print(detail['amounts'].mode())
# print(detail['amounts'].median())
"""
178
1
1354.8693557124277
0 35
dtype: int64
35.0
"""
# df=pd.DataFrame([[1,2,3,4,5],[2,3,5,3,7]])
# print(df)
#统计dishes_name 众数 出现次数 非空值数量
# print(detail['dishes_name'].mode())
# print(detail['dishes_name'].count())
#describe
# print(detail['dishes_name'].describe())
"""
count 2779
unique 154
top 白饭/大碗
freq 92
"""
#对于数值类型需要先转化成类型才可以进行统计众数次数
# detail['amounts']=detail['amounts'].astype('category')
# print(detail['amounts'].describe())
#循环删除空行
# columns=detail.columns
# # print(columns)
# for tmp in columns:
# # print(tmp)
# if detail[tmp].count()==0:
# detail.drop(labels=tmp,axis=1)
# print(detail.shape)
"""
pandas 默认支持的时间点Timestamp
默认支持的时间序列类型 DatetimeIndex
numpy datetime[ns]
"""
# place_order_time=detail['place_order_time']
# print(place_order_time)
#转化成pandas支持的时间
# detail['place_order_time']=pd.to_datetime(place_order_time)
# print(detail.dtypes)
# detail['place_order_time']=pd.DatetimeIndex(place_order_time)
# print(detail.dtypes)
# df=pd.Series(['2019-06-19','2019-06-20'])
# print(df)
# df=pd.to_datetime(df)
# print(df)
#时间序列的操作
# year=[i.year for i in detail['place_order_time']]
# print(year)
# weekday=[i.weekday_name for i in detail['place_order_time']]
# print(weekday)
#时间序列加减
# a=detail['place_order_time']+pd.Timedelta(days=1)
# print(a,detail['place_order_time'])
#计算时间差
# time=pd.to_datetime('2019-6-19')-pd.to_datetime('2019-6-10')
# print(time)
#计算机最早时间和最晚时间
# print(pd.Timestamp.min)
# print(pd.Timestamp.max)
#分组与聚合
#分组 按照amounts 分组 然后统计counts 的平均值
# de=detail.groupby('amounts')['counts'].mean()
# print(de)
#按照order_id 和dishes_id 进行分组 统计单价amounts的均值
# de=detail.groupby(by=['order_id','dishes_id'])['amounts'].mean()
# print(de)
#agg 1可以对多列数据同时进行多个统计分析
# de=detail[['order_id','amounts']].agg([np.max,np.min])
# print(de)
#对order_id求max 对amounts求min
# de=detail.agg({'order_id':np.max,'amounts':np.min})
# print(de)
#对order_id求mean 对amounts求min max
# de=detail.agg({'order_id':np.mean,'amounts':[np.min,np.max]})
# print(de)
#apply transform
# de=detail['order_id'].apply(lambda x:x+1)
# print(de)
# de=detail[['amounts','order_id']].transform(lambda x:x+1)
# print(de)
#数据透视表
#创建数据透视表
# data=detail[['order_id','dishes_id','dishes_name','amounts','counts']]
#按照order_id进行行分组 统计amounts ,counts平均值 默认平均值
# ret=pd.pivot_table(data,index='order_id',values=['amounts','counts'])
# print(ret)
#按照dishes_name进行分组 将其当做列
# ret=pd.pivot_table(data,columns='dishes_name',values=['amounts','counts'])
# print(ret)
#aggfunc 默认均值 可以更改统计指标
# ret=pd.pivot_table(data,columns='dishes_name',values=['amounts','counts'],aggfunc=max)
# print(ret)
#fill_value 空值填充 nan空值
# ret=pd.pivot_table(data,columns='dishes_name',values=['amounts','counts'],aggfunc=max,fill_value=0)
# print(ret)
#margins 是否 求和
# ret=pd.pivot_table(data,index='order_id',columns='dishes_name',values=['amounts','counts'],aggfunc=max,fill_value=0,margins=True)
# print(ret)
"""
类似分组聚合 比其功能更加强大
"""
#单日营业额
#获取数据
# data=detail['place_order_time']
# print(data)
#将时间转化
# detail['place_order_time']=pd.to_datetime(data)
#获取日数据
# day=[i.day for i in detail['place_order_time']]
# print(day)
#将数据保存到detail里
# detail['day']=day
#食品花费
# cost=detail['amounts']*detail['counts']
# detail['cost']=cost
#按照day进行分组
# ret=detail.groupby('day')['cost'].sum()
# print(ret)
#交叉表
# ret=pd.crosstab(index=detail['counts'],columns=detail['amounts'],values=detail['dishes_name'],aggfunc=max,margins=True)
# print(ret)
#单列去重 对原来数据没有改变
# detail=detail['counts'].drop_duplicates()
# print(detail.shape)
#多列去重
# detail.drop_duplicates(subset=['counts','amounts'],inplace=True)
# print(detail.shape)
pandas基础及操作
最新推荐文章于 2021-10-03 10:02:27 发布