Python数据分析笔记 4

斯凯利.瑞恩

已于 2024-07-17 10:10:30 修改

阅读量595

点赞数 13

分类专栏： Python&数据分析　文章标签： python matplotlib pandas

于 2024-07-17 09:56:26 首次发布

本文链接：https://blog.csdn.net/m0_47498690/article/details/140486142

版权

Python&数据分析专栏收录该内容

12 篇文章 0 订阅

订阅专栏

Python数据分析笔记 4

"""实现Excel的数据透视表"""
table = pd.pivot_table(
    df,
    values=['value'],
    index=['column1', 'column2'],
    columns=['column3'],
    aggfunc='sum',  # NumPy has no `np.num`; 'sum' is the intended aggregation
    margins=True,
)
# 1. df: the DataFrame to summarise (like selecting the data range in Excel); to use
#    only some fields pass df[['column1', 'column2', 'column3', 'value']]
# 2. values: the numeric field(s) to aggregate -- the "Values" area in Excel
# 3. index: the row labels -- the "Rows" area in Excel
# 4. columns: the column labels -- the "Columns" area in Excel
# 5. aggfunc: the aggregation function
# 6. margins: whether to add row and column totals

import numpy as np
import pandas as pd
# import pymysql
# Show (practically) every row and column instead of truncating wide/long frames.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 9999)
# Render floats with exactly three decimal places.
pd.set_option('display.float_format', lambda value: '%.3f' % value)

# - 1: Store revenue received and total CPC spend for every store, split by platform
# The aggregation is passed as the string 'sum': handing pandas the np.sum callable
# emits a FutureWarning because the callable's meaning changes in a future version.
table = pd.pivot_table(
    cpc,
    values=['门店实收', 'cpc总费用'],
    index=['平台门店名称'],
    columns=['平台i'],
    aggfunc='sum',
).fillna(0)  # store/platform combinations with no data become 0
table

C:\Users\LENOVO\AppData\Local\Temp\ipykernel_16248\2139949438.py:2: FutureWarning: The provided callable <function sum at 0x000001BA644B93A0> is currently using DataFrameGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.
  table = pd.pivot_table(cpc, values = ['门店实收','cpc总费用'], index = ['平台门店名称'], columns = ['平台i'], aggfunc = np.sum).fillna(0)

	cpc总费用		门店实收
平台i	美团	饿了么	美团	饿了么
平台门店名称
利芳·一人食大盘鸡(国定路店)	0.000	16738.510	0.000	51351.750
利芳大盘鸡(国定路店)	0.000	11198.710	0.000	24088.000
拌客·干拌麻辣烫(武宁路店)	0.000	19624.310	0.000	140628.830
拌客干拌麻辣烫(武宁路店)	0.000	510.900	0.000	1597.770
拌客干拌麻辣烫（武宁路店）	6939.660	0.000	36582.480	0.000
蛙小辣·美蛙火锅杯(五角场店)	0.000	423.060	0.000	4220.500
蛙小辣·美蛙火锅杯(大宁店)	0.000	2085.300	0.000	9099.400
蛙小辣·美蛙火锅杯(宝山店)	0.000	9898.120	0.000	101526.140
蛙小辣·美蛙火锅杯(真如店)	0.000	6027.460	0.000	44432.920
蛙小辣·美蛙火锅杯(芳华路店)	0.000	3.600	0.000	152.650
蛙小辣·美蛙火锅杯(虹口足球场店)	0.000	13372.700	0.000	71286.210
蛙小辣·美蛙火锅杯(龙阳路店)	0.000	538.000	0.000	1607.480
蛙小辣·美蛙火锅杯麻辣烫(五角场店)	0.000	312.210	0.000	4866.630
蛙小辣·美蛙火锅杯麻辣烫(宝山店)	0.000	5786.460	0.000	64722.170
蛙小辣·美蛙火锅杯麻辣烫（五角场店）	4.200	0.000	297.900	0.000
蛙小辣·美蛙火锅杯（虹口足球场店）	8682.990	0.000	43977.250	0.000
蛙小辣·美蛙火锅杯（长风大悦城店）	5216.900	0.000	33258.330	0.000
蛙小辣火锅杯	1567.010	0.000	8234.000	0.000
蛙小辣火锅杯(五角场店)	0.000	10190.150	0.000	99578.180
蛙小辣火锅杯(徐汇店)	0.000	74.100	0.000	354.340
蛙小辣火锅杯(龙阳广场店)	0.000	1951.830	0.000	7486.000
蛙小辣火锅杯麻辣烫(五角场店)	0.000	5.800	0.000	263.390
蛙小辣火锅杯（五角场店）	364.240	649.200	3108.130	4642.000
蛙小辣火锅杯（合生汇店）	7307.620	0.000	35655.800	0.000
蛙小辣火锅杯（宝山店）	9903.300	0.000	71009.660	0.000
蛙小辣火锅杯（真如店）	1765.970	298.400	16258.930	939.000
蛙小辣火锅杯（金煌美食城店）	683.240	0.000	2505.000	0.000
蛙小辣美蛙火锅杯(五角场店)	0.000	1156.700	0.000	7362.000
蛙小辣美蛙火锅杯(真如店)	0.000	580.500	0.000	1468.000
蛙小辣美蛙火锅杯（亚龙美食城店）	4745.210	0.000	17460.440	0.000
蛙小辣美蛙火锅杯（大宁国际店）	3909.500	0.000	12421.580	0.000

"""表格的合并"""
#核心思路都是采用循环，逐个拼接表格

#当想要合并不同的Excel表格（注意，要保证每个Excel表格的字段相同哦）
import numpy as np
import pandas as pd
import os

path = './test/'                # 1. Directory that holds the Excel workbooks to merge
output_name = 'excel合并.xlsx'   #    Name of the merged workbook written back into `path`

# 2. Collect the workbook names found in `path`. Sub-folders, non-Excel files,
#    names starting with '~$' (temporary lock files Excel creates while a workbook
#    is open) and the merged output of a previous run are skipped, so re-running
#    the cell does not merge the result into itself.
files = [
    name
    for name in sorted(os.listdir(path))
    if name.lower().endswith(('.xlsx', '.xlsm', '.xls'))
    and not name.startswith('~$')
    and name != output_name
]
print(files)                    # 3. Check the list, e.g. ['test1.xlsx', 'test2.xlsx']

# 4. Read every workbook; each `name` is a file name taken from `files`.
frames = [pd.read_excel(os.path.join(path, name)) for name in files]

# 5. Concatenate once at the end instead of growing a DataFrame inside the loop
#    (which copies the accumulated data on every pass). With no input files the
#    result is an empty DataFrame, as before.
Merge = pd.concat(frames) if frames else pd.DataFrame()

# 6. Write the merged table without the index column.
Merge.to_excel(os.path.join(path, output_name), index=False)

#当想要合并同一张Excel表格内的所有sheet（注意，要保证每个sheet的字段相同哦）
import numpy as np
import pandas as pd  
import os

# 1. Open the workbook; the `with` block closes the file handle once every sheet
#    has been read.
with pd.ExcelFile('./test/学习名单3.xlsx') as name_list:
    # 2. sheet_names lists the worksheet names; parse() reads one worksheet
    #    into a DataFrame.
    sheets = [name_list.parse(sheet_name=sheet) for sheet in name_list.sheet_names]

# 3. Stack all sheets row-wise in a single concat call.
df_list = pd.concat(sheets) if sheets else pd.DataFrame()

# 4. Write the combined rows to a new workbook without the index column.
df_list.to_excel('./test/sheet合并.xlsx', index=False)


#ExcelFile()：传入一个表格的路径，返回一个ExcelFile对象。
#sheet_names：他是ExcelFile对象的一个属性（不是方法，调用时不加括号），将Excel子工作表的名字以列表形式返回
#parse：他是ExcelFile对象的一个方法，用于读取指定子工作表的内容并返回DataFrame
#参数sheet_name：传入子工作表的名称，可以读取里面的内容

""" 数据探索处理 """
# - 0、数据准备：导入pandas库&数据
import numpy as np
import pandas as pd
    #导入我们需要的包，并且给它起别名，方便我们调用

df = pd.read_csv('./cpc.csv',encoding='gbk')
    # df -- the name given to the loaded data
    # pd -- the pandas alias
    # read_csv() -- the pandas function commonly used to read data
    # ('./cpc.csv') -- path of the data file to load
    # encoding='gbk' -- the file is decoded as GBK (presumably because it holds Chinese text)

# - 1. Get to know the data
df.info() # Overview of the frame: number of rows and columns, column dtypes, etc.
    # RangeIndex -- total number of rows
    # Data columns -- the columns in the data
    # int64/object/float64... -- column dtypes; object can be read as "string"
    # xxx non-null -- number of non-null records in that column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1177 entries, 0 to 1176
Data columns (total 24 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   updateTime  1177 non-null   object 
 1   平台i         1177 non-null   object 
 2   门店ID        1177 non-null   int64  
 3   平台门店名称      1177 non-null   object 
 4   日期          1177 non-null   object 
 5   cpc单次点击费用   1165 non-null   float64
 6   cpc总费用      1177 non-null   float64
 7   cpc曝光量      1177 non-null   int64  
 8   cpc访问量      1177 non-null   int64  
 9   gmvroi      1177 non-null   float64
 10  下单转换率       1177 non-null   float64
 11  单均gmv       1177 non-null   float64
 12  单均实收        1177 non-null   float64
 13  实收roi       1177 non-null   float64
 14  无效订单        1168 non-null   float64
 15  有效订单        1168 non-null   float64
 16  自增主键        1177 non-null   int64  
 17  自然曝光量       1165 non-null   float64
 18  自然访问量       1165 non-null   float64
 19  门店下单量       1165 non-null   float64
 20  门店实收        1168 non-null   float64
 21  门店曝光量       1165 non-null   float64
 22  门店营业额       1168 non-null   float64
 23  门店访问量       1165 non-null   float64
dtypes: float64(16), int64(4), object(4)
memory usage: 220.8+ KB

df.head(10) # Look at the actual records
    # shows the concrete values stored in the table
    # head() -- with no argument the default is 5, i.e. the first 5 rows are shown

	updateTime	平台i	门店ID	平台门店名称	日期	cpc单次点击费用	cpc总费用	cpc曝光量	cpc访问量	gmvroi	下单转换率	单均gmv	单均实收	实收roi	无效订单	有效订单	自增主键	自然曝光量	自然访问量	门店下单量	门店实收	门店曝光量	门店营业额	门店访问量
0	2019/12/12 11:54	美团	8184590	蛙小辣火锅杯（合生汇店）	2019-12-10	1.300	225.650	2711	173	7.430	0.170	57.480	19.800	2.560	0.000	59.000	1501603	1427.000	159.000	56.000	1167.970	4138.000	3391.380	332.000
1	2019/12/12 11:54	美团	8223184	蛙小辣美蛙火锅杯（大宁国际店）	2019-12-10	1.540	261.100	3665	169	4.300	0.130	52.990	18.880	1.530	0.000	35.000	1501605	11.000	86.000	32.000	660.730	3676.000	1854.760	255.000
2	2019/12/12 11:54	美团	8106681	蛙小辣·美蛙火锅杯（长风大悦城店）	2019-12-10	1.380	177.500	2115	129	6.750	0.180	51.490	20.380	2.670	1.000	53.000	1502265	874.000	165.000	53.000	1080.310	2989.000	2728.720	294.000
3	2019/12/12 11:54	美团	8165842	蛙小辣·美蛙火锅杯（虹口足球场店）	2019-12-10	1.470	240.300	2937	164	7.430	0.190	56.350	19.250	2.540	2.000	64.000	1502274	614.000	162.000	63.000	1231.860	3551.000	3606.100	326.000
4	2019/12/12 11:54	饿了么	2001220953	利芳·一人食大盘鸡(国定路店)	2019-12-10	1.550	623.500	4190	401	5.350	0.170	49.690	12.710	1.370	0.000	132.000	1502523	1872.000	387.000	132.000	1677.960	6062.000	6558.490	788.000
5	2019/12/12 11:54	饿了么	2000555792	蛙小辣·美蛙火锅杯(虹口足球场店)	2019-12-10	1.610	207.800	1628	129	3.170	0.090	56.100	18.200	1.030	0.000	32.000	1502706	802.000	223.000	32.000	582.440	2430.000	1795.340	352.000
6	2019/12/12 11:54	饿了么	2001104355	蛙小辣·美蛙火锅杯(宝山店)	2019-12-10	1.250	198.500	2043	159	6.520	0.130	64.650	25.620	2.580	0.000	54.000	1502736	1108.000	254.000	52.000	1383.310	3151.000	3491.020	413.000
7	2019/12/12 11:54	饿了么	2000507076	蛙小辣火锅杯(五角场店)	2019-12-10	1.350	166.400	1705	123	6.400	0.150	57.810	21.280	2.360	1.000	63.000	1502967	1470.000	284.000	61.000	1340.470	3175.000	3642.100	407.000
8	2019/12/12 11:54	饿了么	2001020019	蛙小辣·美蛙火锅杯(真如店)	2019-12-10	1.510	140.300	1376	93	4.820	0.140	53.450	14.660	1.320	0.000	46.000	1502987	1394.000	245.000	46.000	674.160	2770.000	2458.600	338.000
9	2019/12/12 12:49	美团	8106681	蛙小辣·美蛙火锅杯（长风大悦城店）	2019-12-09	1.400	195.350	2329	140	6.870	0.200	47.130	17.200	2.510	3.000	63.000	1503654	839.000	160.000	61.000	1083.590	3168.000	2969.080	300.000

df.drop(columns='updateTime',inplace = True)# after inspecting the data with df.head() in the previous step, drop the columns judged unnecessary
    # inplace=True -- modify df directly instead of returning a new frame

# - 3. Numeric exploration

df.describe()# descriptive statistics for the numeric columns of the table
    # count -- number of non-null values
    # mean -- average
    # std -- standard deviation
    # min -- minimum
    # 25%/50%/75% -- quantiles
    # max -- maximum

	门店ID	cpc单次点击费用	cpc总费用	cpc曝光量	cpc访问量	gmvroi	下单转换率	单均gmv	单均实收	实收roi	无效订单	有效订单	自增主键	自然曝光量	自然访问量	门店下单量	门店实收	门店曝光量	门店营业额	门店访问量
count	1177.000	1165.000	1177.000	1177.000	1177.000	1177.000	1177.000	1177.000	1177.000	1177.000	1168.000	1168.000	1177.000	1165.000	1165.000	1165.000	1168.000	1165.000	1168.000	1165.000
mean	1102707722.104	1.390	129.580	1350.986	91.311	8.550	0.189	57.694	20.699	3.053	0.601	41.988	3023888.608	1245.313	124.750	40.894	789.763	2604.082	2298.817	216.701
std	968178797.902	0.304	134.539	1256.423	87.908	15.724	0.063	9.228	5.244	5.602	1.188	35.579	1433359.634	1206.364	101.608	34.501	566.841	1813.494	1728.387	161.656
min	8052557.000	0.020	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	4.000	1501603.000	-5534.000	-427.000	0.000	30.000	0.000	164.000	0.000
25%	8491999.000	1.240	39.910	466.000	29.000	6.040	0.150	52.990	17.820	2.020	0.000	17.000	1872058.000	492.000	50.000	17.000	375.595	1123.000	1040.517	86.000
50%	2000507076.000	1.380	76.200	863.000	57.000	7.590	0.190	57.810	20.530	2.660	0.000	26.000	2538079.000	812.000	92.000	26.000	587.645	2016.000	1609.425	157.000
75%	2001104355.000	1.540	190.000	1950.000	133.000	9.850	0.230	62.740	23.530	3.600	1.000	61.250	3775030.000	1599.000	169.000	60.000	1100.152	3605.000	3296.950	314.000
max	2001572992.000	2.980	846.400	7812.000	502.000	534.660	0.420	90.760	47.320	189.320	12.000	232.000	7684897.000	7153.000	745.000	224.000	3780.110	11066.000	11012.760	985.000

"""

小练习
我们已经拥有了本地数据cpc，现在，我们需要使用sql获取云端数据shop，与本地数据进行连接，并完成以下操作

1、查看各平台在6、7月的GMV占比
2、查看6、7月各周的总GMV情况
3、查看6、7月各周的转化率情况
4、查看6、7月每天各个门店的GMV，门店实收，并按照门店实收进行排名。（输出字段：日期-门店名称-GMV-门店实收-排名）
答案可参考Practise.ipynb

"""

"""matplotlib"""
# DataFrame与Series可以直接调用自带的plot方法（底层基于matplotlib）
# 想了解更多细节的家人们可以跳转至官网查看
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.html?highlight=plot#pandas.DataFrame.plot

# IPython magics that select the matplotlib backend.
# NOTE(review): two backend magics in a row -- presumably only the later `inline`
# one stays in effect; confirm whether the `notebook` line is still needed.
%matplotlib notebook
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']# use a font that can display Chinese text
plt.rcParams['axes.unicode_minus'] = False # display minus signs correctly

import pandas as pd
olpc = pd.read_csv('Tokyo 2021 dataset.csv') # read the Tokyo Olympics medal table from the CSV file
olpc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Rank           93 non-null     int64 
 1   Team/NOC       93 non-null     object
 2   Gold Medal     93 non-null     int64 
 3   Silver Medal   93 non-null     int64 
 4   Bronze Medal   93 non-null     int64 
 5   Total          93 non-null     int64 
 6   Rank by Total  93 non-null     int64 
 7   NOCCode        93 non-null     object
dtypes: int64(6), object(2)
memory usage: 5.9+ KB

# Keep the teams ranked in the top five (Rank <= 5) and rename the Team/NOC column
# to Team. rename() is chained onto the filter and returns a new DataFrame, so
# nothing is modified in place on a slice of `olpc` -- the in-place rename on the
# filtered slice is what raised SettingWithCopyWarning.
top5 = olpc[olpc.Rank <= 5].rename(columns={'Team/NOC': 'Team'})
top5

C:\Users\LENOVO\AppData\Local\Temp\ipykernel_16248\4082916704.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top5.rename(columns={'Team/NOC':'Team'},inplace=True) #给字段Team/NOC 改名为 Team

	Rank	Team	Gold Medal	Silver Medal	Bronze Medal	Total	Rank by Total	NOCCode
0	1	United States of America	39	41	33	113	1	USA
1	2	People's Republic of China	38	32	18	88	2	CHN
2	3	Japan	27	14	17	58	5	JPN
3	4	Great Britain	22	21	22	65	4	GBR
4	5	ROC	20	28	23	71	3	ROC

# Basic charts
# Give every cell its own figure(); otherwise the chart gets altered by later plotting code
plt.figure()
plt.plot(top5['NOCCode'],top5['Gold Medal'])# plot draws a line chart; the two arguments are the x and y values

[<matplotlib.lines.Line2D at 0x1ba68ed7e10>]

在这里插入图片描述

plt.figure()
plt.bar(top5['NOCCode'],top5['Gold Medal'])# bar draws a vertical bar chart; the arguments are the x positions and the bar heights

<BarContainer object of 5 artists>

在这里插入图片描述

plt.figure()
plt.barh(top5['NOCCode'],top5['Gold Medal']) # barh draws a horizontal bar chart; the first argument gives the bar positions on the vertical axis, the second the bar lengths

<BarContainer object of 5 artists>

在这里插入图片描述

plt.figure()
plt.pie(top5['Gold Medal']) # pie draws a pie chart; it takes a single Series of values

([<matplotlib.patches.Wedge at 0x1ba68c4ab90>,
  <matplotlib.patches.Wedge at 0x1ba69704a90>,
  <matplotlib.patches.Wedge at 0x1ba69705410>,
  <matplotlib.patches.Wedge at 0x1ba69706890>,
  <matplotlib.patches.Wedge at 0x1ba69707d10>],
 [Text(0.7348702653319799, 0.8185143206633013, ''),
  Text(-0.8786572668872328, 0.6617865270208049, ''),
  Text(-0.8025133929575279, -0.7523112747552017, ''),
  Text(0.2579404140090271, -1.0693300439157463, ''),
  Text(0.9996993595783734, -0.45891305326672754, '')])

在这里插入图片描述

plt.figure()
plt.scatter(top5['NOCCode'],top5['Gold Medal']) # scatter draws a scatter plot from x and y values; here x is the text column NOCCode and y the gold-medal counts

<matplotlib.collections.PathCollection at 0x1ba68dfcb10>

在这里插入图片描述

plt.figure()
plt.hist(top5['Gold Medal'],bins=5) # hist draws a histogram of one Series; bins sets the number of bins (5 here)

(array([2., 1., 0., 0., 2.]),
 array([20. , 23.8, 27.6, 31.4, 35.2, 39. ]),
 <BarContainer object of 5 artists>)

在这里插入图片描述

plt.figure()
plt.plot(top5['NOCCode'],top5['Gold Medal'])
plt.ylim(20,40) # set the y-axis range
# to set the x-axis range instead, call plt.xlim()

(20.0, 40.0)

在这里插入图片描述

plt.figure()
plt.plot(top5['NOCCode'],top5['Gold Medal'])
plt.xlabel('国家简称')
plt.ylabel('金牌数目')
# name the two axes

Text(0, 0.5, '金牌数目')

在这里插入图片描述

plt.figure()
plt.plot(top5['NOCCode'],top5['Gold Medal'],label = '金牌数目') # pass the label argument with the text for this line
plt.legend() # this call is required for the label to be shown
# add a legend

<matplotlib.legend.Legend at 0x1ba68e386d0>

在这里插入图片描述

plt.figure()
plt.plot(top5['NOCCode'], top5['Gold Medal'])
# Add data labels: write each gold-medal count just above its point,
# centred horizontally on the point.
for noc_code, gold_count in zip(top5['NOCCode'], top5['Gold Medal']):
    plt.text(
        noc_code,
        gold_count + 0.1,
        '%d' % gold_count,
        ha='center',
        va='bottom',
    )