查看餐饮数据的大小和维度
from sqlalchemy import create_engine
import pandas as pd
# Connect to the `testdb` MySQL database on localhost through the PyMySQL driver.
engine = create_engine('mysql+pymysql://root:981221@localhost/testdb?charset=utf8mb4')

# Load the three datasets: order details from a MySQL table, order info from a
# GBK-encoded comma-separated file, and customer info from an Excel workbook.
detail = pd.read_sql_table('meal_order_detail1', con=engine)
order = pd.read_csv('D:/pandas/meal_order_info.csv', sep=',', encoding='gbk')
user = pd.read_excel('D:/pandas/users.xlsx')

# Report the number of dimensions (ndim) of each table.
for _label, _table in (
    ('订单的详情表的维度:', detail),
    ('订单信息表的维度:', order),
    ('客户信息表的维度:', user),
):
    print(_label, _table.ndim)
订单的详情表的维度: 2
订单信息表的维度: 2
客户信息表的维度: 2
# Print the (rows, columns) shape of each table.
# NOTE(review): '详情标' in the first label looks like a typo for '详情表';
# the string is left unchanged here — confirm before editing it.
for _label, _table in (
    ('详情标的形状:', detail),
    ('信息表的形状:', order),
    ('客户表的形状:', user),
):
    print(_label, _table.shape)
详情标的形状: (2779, 19)
信息表的形状: (945, 21)
客户表的形状: (734, 37)
# Print the total number of cells (rows * columns) in each table.
for _label, _table in (
    ('详情表的元素个数:', detail),
    ('信息表的元素个数:', order),
    ('客户表的元素个数:', user),
):
    print(_label, _table.size)
详情表的元素个数: 52801
信息表的元素个数: 19845
客户表的元素个数: 27158
统计菜品销售情况
数值型特征
# Descriptive statistics for the two numeric columns of the detail table.
# NOTE(review): the label calls `counts` the unit price (单价) and `amounts`
# the quantity (数目); the column names suggest the opposite — confirm and
# correct the label if so.
numeric_summary = detail[['counts', 'amounts']].describe()
print('详情表里单价(counts)和数目(amounts)两列的描述性统计:\n', numeric_summary)
详情表里单价(counts)和数目(amounts)两列的描述性统计:
counts amounts
count 2779.000000 2779.000000
mean 1.111191 45.337172
std 0.625428 36.808550
min 1.000000 1.000000
25% 1.000000 25.000000
50% 1.000000 35.000000
75% 1.000000 56.000000
max 10.000000 178.000000
# Re-type the two identifier-like columns as categoricals so that describe()
# summarises them with count / unique / top / freq.
for _column in ('order_id', 'dishes_name'):
    detail[_column] = detail[_column].astype('category')

# NOTE(review): the label says 信息表, but the statistics are computed on
# `detail` — confirm which table name the label should carry.
category_summary = detail[['order_id', 'dishes_name']].describe()
print('''信息表的订单标号(order_id)和菜品名称(dishes_name)的描述性统计结果:
\n''', category_summary)
信息表的订单标号(order_id)和菜品名称(dishes_name)的描述性统计结果:
order_id dishes_name
count 2779 2779
unique 278 145
top 392 白饭/大碗
freq 24 92
信息总结:
菜品平均售价约为45.3元;出现次数最多的菜品是“白饭/大碗”,共出现92次;订单编号为392的订单包含的菜品记录最多,共24条。
剔除 “全是空值” 或者 “所有元素取值相同” 的列
def dropNullStd(data):
    """Drop, in place, columns that are entirely null or have zero spread.

    Candidate columns are taken from ``data.describe()``: a column is removed
    when its ``count`` is 0 (no non-null values) or, after that first pass,
    when its ``std`` is exactly 0 (every value identical). Only the columns
    that ``describe()`` summarises are considered; with pandas' defaults that
    means the numeric columns whenever the frame has any.

    Args:
        data: pandas DataFrame to prune. It is modified in place.

    Returns:
        None. The original shape, the number of removed columns and the
        resulting shape are printed.
    """
    beforerlen, beforeclen = data.shape

    colisNull = _zero_stat_mask(data, 'count')
    print(type(colisNull))
    print('beforerlen[0]:', beforerlen)
    print('beforeclen[1]:', beforeclen)
    # Select by boolean mask and drop by label: integer keys on a
    # label-indexed Series rely on a deprecated positional fallback.
    data.drop(columns=colisNull[colisNull].index, inplace=True)

    # Recompute the statistics after the first pass so that already-removed
    # columns are not looked at again.
    stdisZero = _zero_stat_mask(data, 'std')
    data.drop(columns=stdisZero[stdisZero].index, inplace=True)

    afterclen = data.shape[1]
    print('剔除的列的数目:', beforeclen - afterclen)
    print('剔除后数据形状为:', data.shape)


def _zero_stat_mask(data, stat):
    """Return a boolean Series marking columns whose describe() row *stat* is 0.

    The mask is empty when *data* has no columns (``describe()`` would raise)
    and all-False when ``describe()`` produces no *stat* row, e.g. no ``std``
    row for a frame made only of non-numeric columns.
    """
    if data.shape[1] == 0:
        return pd.Series(dtype=bool)
    summary = data.describe()
    if stat not in summary.index:
        return pd.Series(False, index=summary.columns)
    return summary.loc[stat] == 0
# Prune all-null and zero-std columns from the order-detail table (in place).
dropNullStd( detail)
<class 'pandas.core.series.Series'>
beforerlen[0]: 2779
beforeclen[1]: 19
剔除的列的数目: 0
剔除后数据形状为: (2779, 19)
# Prune all-null and zero-std columns from the order-info table (in place).
dropNullStd( order)
<class 'pandas.core.series.Series'>
beforerlen[0]: 945
beforeclen[1]: 21
剔除的列的数目: 7
剔除后数据形状为: (945, 14)
# Prune all-null and zero-std columns from the customer table (in place).
dropNullStd( user)
<class 'pandas.core.series.Series'>
beforerlen[0]: 734
beforeclen[1]: 37
剔除的列的数目: 13
剔除后数据形状为: (734, 24)