Pandas

1 Sql csv excel 文件的读取和存储

from sqlalchemy import create_engine
import pandas as pd
## 创建一个mysql连接器,用户名为root,密码为123456
## 地址为127.0.0.1,数据库名称为chat,编码为utf-8
engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/chat?charset=utf8')
formlist = pd.read_sql_query('show tables', con = engine)## 使用read_sql_query查看tesdb中的数据表数目
detail1 = pd.read_sql_table('meal_order_detail1',con = engine)# 使用read_sql_table读取订单详情表
detail2 = pd.read_sql('select * from meal_order_detail2',con = engine)# 使用read_sql读取订单详情表
detail3 = pd.read_sql('meal_order_detail3',con = engine)
order = pd.read_table('meal_order_info.csv',sep = ',',encoding = 'gbk')## 使用read_table读取订单信息表
order1 = pd.read_csv('meal_order_info.csv',encoding = 'gbk')## 使用read_csv读取订单信息表
order2 = pd.read_table('meal_order_info.csv',sep = ';',encoding = 'gbk')## 使用read_table读取菜品订单信息表,sep = ';'
order3 = pd.read_csv('meal_order_info.csv',sep = ',',header = None,encoding = 'gbk')## 使用read_csv读取菜品订单信息表,header=None
user = pd.read_excel('users.xlsx')## 读取user.xlsx文件
# order4 = pd.read_csv('meal_order_info.csv',sep = ',',encoding = 'utf-8')## 使用gbk解析菜品订单信息表


## 使用to_sql存储orderData
detail1.to_sql('test1',con = engine,index = False,if_exists = 'replace')
order.to_csv('orderInfo.csv',sep = ';',index = False)## 将order以csv格式存储
order.to_excel('orderInfo.xlsx',index = False)## 将order以csv格式存储
import os
print('订单信息表写入文本文件前目录内文件列表为:\n',os.listdir('../'))

2 DataFrom操作

from sqlalchemy import create_engine
import pandas as pd
## 创建数据库连接
engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/chat?charset=utf8')
detail= pd.read_sql_table('meal_order_detail1',con = engine)
print('订单详情表的索引为:', detail.index)
print('订单详情表的所有值为:','\n', detail.values)
print('订单详情表的列名为:','\n', detail.columns)
print('订单详情表的数据类型为:','\n', detail.dtypes)
print('订单详情表的元素个数为:', detail.size)
print('订单详情表的维度数为:', detail.ndim) ## 查看DataFrame的维度数
print('订单详情表的形状为:', detail.shape) ## 查看DataFrame的形状
print('订单详情表转置前形状为:',detail.shape)
print('订单详情表转置后形状为为:',detail.T.shape)
print('订单详情表中的order_id的形状为:','\n',detail['order_id'].shape)## 使用访问字典方式取出orderInfo中的某一列
print('订单详情表中的dishes_name的形状为:',detail.dishes_name.shape)## 使用访问属性方式取出orderInfo中的菜品名称列
print('订单详情表中的dishes_name前5个元素为:','\n',detail['dishes_name'][:5])
print('订单详情表中的order_id和dishes_name前5个元素为:','\n',detail[['order_id','dishes_name']][:5])
print('订单详情表的1-6行元素为:','\n',detail[:][1:6])
print('订单详情表中前五行数据为','\n',detail.head())
print('订单详情表中后五个元素为:','\n',detail.tail())
print('使用loc提取dishes_name列的size为:', detail.loc[:,'dishes_name'].size)
print('使用iloc提取第3列的size为:', detail.iloc[:,3].size)
print('使用loc提取order_id和dishes_name列的size为:',detail.loc[:, ['order_id', 'dishes_name']].size)
print('使用iloc提取第1和第3列的size为:',  detail.iloc[:,[1,3]].size)
print('列名为order_id和dishes_name的行名为3的数据为:\n',detail.loc[3,['order_id','dishes_name']])
print('列名为order_id和dishes_name行名为2,3,4,5,6的数据为:\n',detail.loc[2:6,['order_id','dishes_name']])
print('列位置为1和3行位置为3的数据为:\n',detail.iloc[3,[1,3]])
print('列位置为1和3行位置为2,3,4,5,6的数据为:\n',detail.iloc[2:7,[1,3]])

###################################################################################
###################################################################################
###################################################################################

## loc内部传入表达式
print('detail中order_id为458的dishes_name为:\n',detail.loc[detail['order_id']=='458',['order_id','dishes_name']])
print('detail中order_id为458的第1,5列数据为:\n',detail.iloc[detail['order_id']=='458',[1,5]])
print('detail中order_id为458的第1,5列数据为:\n',detail.iloc[(detail['order_id']=='458').values,[1,5]])
print('列名为dishes_name行名为2,3,4,5,6的数据为:\n',detail.loc[2:6,'dishes_name'])
print('列位置为5,行位置为2至6的数据为:\n',detail.iloc[2:6,5])
print('列位置为5行名为2至6的数据为:', '\n',detail.ix[2:6,5])


#单个赋值
detail.loc[detail['order_id']=='458','order_id'] = '45800'##将order_id为458的,变换为45800
#新增列
detail['payment'] =  detail['counts']*detail['amounts']
detail['pay_way'] = '现金支付'
#删除行列
detail.drop(labels = 'pay_way',axis = 1,inplace = True)#删除pay_way后detail的列索引为
detail.drop(labels = range(1,11),axis = 0,inplace = True)#删除1-10行前detail的长度为


###################################################################################
###################################################################################
###################################################################################

import numpy as np
print('订单详情表中amount(价格)的平均值为:', np.mean(detail['amounts']))
print('订单详情表中amount(价格)的平均值为:', detail['amounts'].mean())
print('订单详情表counts和amounts两列的描述性统计为:\n', detail[['counts','amounts']].describe())
print('订单详情表dishes_name频数统计结果前10为:\n',detail['dishes_name'].value_counts()[0:10])#词频统计
detail['dishes_name'] = detail['dishes_name'].astype('category')
print('订单信息表dishes_name列转变数据类型后为:',detail['dishes_name'].dtypes)

3 转换与处理时间序列

import pandas as pd
order = pd.read_table('meal_order_info.csv',sep = ',',encoding = 'gbk')
order['lock_time'] = pd.to_datetime(order['lock_time'])
print('最小时间为:', pd.Timestamp.min)
print('最大时间为:', pd.Timestamp.max)
dateIndex = pd.DatetimeIndex(order['lock_time'])
print('转换为DatetimeIndex后数据的类型为:\n',type(dateIndex))
periodIndex = pd.PeriodIndex(order['lock_time'],freq = 'S')
print('转换为DatetimeIndex后数据的类型为:\n',type(periodIndex))
print('lock_time中的年份数据前5个为:',[i.year for i in order['lock_time']][:5])
print('lock_time中的月份数据前5个为:',[i.month for i in order['lock_time']][:5])
print('lock_time中的日期数据前5个为:',[i.day for i in order['lock_time']][:5])
print('lock_time中的星期名称数据前5个为:',[i.weekday_name for i in order['lock_time']][:5])
print('dateIndex中的星期名称数据前5个为:\n',dateIndex.weekday_name[:5])
print('periodIndex中的星期标号数据前5个为:',periodIndex.weekday[:5])
print('lock_time在加上一天前前5行数据为:\n',(order['lock_time']+pd.Timedelta(days = 1))[:5])## 将lock_time数据向后平移一天
timeDelta = order['lock_time'] - pd.to_datetime('2017-1-1')
print('lock_time减去2017年1月1日0点0时0分后的数据:\n',timeDelta[:5])
print('lock_time减去time1后的数据类型为:',timeDelta.dtypes)

4 分组 groupby,聚合 agg

import pandas as pd
import numpy as np
from sqlalchemy import create_engine
engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:\
3306/chat?charset=utf8')
detail = pd.read_sql_table('meal_order_detail1',con = engine)
detailGroup = detail[['order_id','counts','amounts']].groupby(by = 'order_id')
print('分组后的订单详情表为:',detailGroup)
print('订单详情表分组后前5组每组的均值为:\n',detailGroup.mean().head())
print('订单详情表分组后前5组每组的标准差为:\n',detailGroup.std().head())
print('订单详情表分组后前5组每组的大小为:','\n',detailGroup.size().head())

print('订单详情表的菜品销量与售价的和与均值为:\n',detail[['counts','amounts']].agg([np.sum,np.mean]))
print('订单详情表的菜品销量总和与售价的均值为:\n',detail.agg({'counts':np.sum,'amounts':np.mean}))
print('菜品订单详情表的菜品销量总和与售价的总和与均值为:\n',detail.agg({'counts':np.sum,'amounts':[np.mean,np.sum]}))

##自定义函数求两倍的和
def DoubleSum(data):
    s = data.sum()*2
    return s
print('菜品订单详情表的菜品销量两倍总和为:','\n',detail.agg({'counts':DoubleSum},axis = 0))

##自定义函数求两倍的和
def DoubleSum1(data):
    s = np.sum(data)*2
    return s
print('订单详情表的菜品销量两倍总和为:\n',detail.agg({'counts':DoubleSum1},axis = 0).head())
print('订单详情表的菜品销量与售价的和的两倍为:\n',detail[['counts','amounts']].agg(DoubleSum1))
print('订单详情表分组后前3组每组的均值为:\n',detailGroup.agg(np.mean).head(3))
print('订单详情表分组后前3组每组的标准差为:\n',detailGroup.agg(np.std).head(3))
print('订单详情分组前3组每组菜品总数和售价均值为:\n',detailGroup.agg({'counts':np.sum,'amounts':np.mean}).head(3))
print('订单详情表的菜品销量与售价的均值为:\n',detail[['counts','amounts']].apply(np.mean))
print('订单详情表分组后前3组每组的均值为:','\n', detailGroup.apply(np.mean).head(3))
print('订单详情表分组后前3组每组的标准差为:','\n', detailGroup.apply(np.std).head(3))
print('订单详情表的菜品销量与售价的两倍为:\n',detail[['counts','amounts']].transform(lambda x:x*2).head(4))
print('订单详情表分组后实现组内离差标准化后前五行为:\n',detailGroup.transform(lambda x:(x.mean()-x.min())/(x.max()-x.min())).head())

5 创建透视表 pivot_table和交叉表


import pandas as pd
import numpy as np
from sqlalchemy import create_engine
engine = create_engine('mysql+pymysql://root:123456@\
127.0.0.1:3306/chat?charset=utf8')
detail = pd.read_sql_table('meal_order_detail1',con = engine)
detailPivot = pd.pivot_table(
	detail[ [ 'order_id','counts','amounts'] ],
           index = 'order_id')
print('以order_id作为分组键创建的订单透视表为:\n',detailPivot.head())
detailPivot1 = pd.pivot_table(
	detail[[ 'order_id','counts','amounts']],
      index = 'order_id',aggfunc = np.sum)
print('以order_id作为分组键创建的订单销量与售价总和透视表为:\n',detailPivot1.head())

detailPivot2 = pd.pivot_table(
	detail[['order_id','dishes_name', 'counts','amounts']],
      index = ['order_id','dishes_name'],
      aggfunc = np.sum)
print('以order_id和dishes_name作为分组键创建的订单\销量与售价总和透视表为:\n',detailPivot2.head())

detailPivot2 = pd.pivot_table(detail[[
      'order_id','dishes_name','counts','amounts']],
      index = 'order_id',
      columns = 'dishes_name',
      aggfunc = np.sum)
print('以order_id和dishes_name作为行列分组键创建的\透视表前5行4列为:\n',detailPivot2.iloc[:5,:4])

detailPivot4 = pd.pivot_table(detail[[
      'order_id','dishes_name','counts','amounts']],
      index = 'order_id',
      values = 'counts',
      aggfunc = np.sum)
print('以order_id作为行分组键counts作为值创建的\透视表前5行为:\n',detailPivot4.head())

detailPivot5 = pd.pivot_table(detail[[
      'order_id','dishes_name','counts','amounts']],
      index = 'order_id',
      columns = 'dishes_name',
      aggfunc = np.sum,fill_value = 0)
print('空值填0后以order_id和dishes_name为行列分组键创建透视表前5行4列为:\n',detailPivot5.iloc[:5,:4])


detailPivot6 = pd.pivot_table(detail[[
      'order_id','dishes_name','counts','amounts']],
      index = 'order_id',columns = 'dishes_name',
      aggfunc = np.sum,fill_value = 0,
      margins = True)
print('添加margins后以order_id和dishes_name为分组键的透视表前5行后4列为:\n',detailPivot6.iloc[:5,-4:])


detailCross = pd.crosstab(
      index=detail['order_id'],
      columns=detail['dishes_name'],
      values = detail['counts'],aggfunc = np.sum)
print('以order_id和dishes_name为分组键\counts为值的透视表前5行5列为:\n',detailCross.iloc[:5,:5])

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值