数据分析之jupyter（数据挖掘工程师进阶之路）

最新推荐文章于 2023-06-05 19:49:51 发布

离开你，我才发现

最新推荐文章于 2023-06-05 19:49:51 发布

阅读量804

点赞数 2

分类专栏：数据分析pandas 文章标签： python numpy 数据分析数据可视化

本文链接：https://blog.csdn.net/k8vg___/article/details/111474886

版权

数据分析pandas 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

欢迎访问个人博客http://www.jkraise.top

jupyter

说明：
Jupyter项目是一个交互式的Python科学计算和数据分析生态系统，最常用的是Jupyter Notebook
交互式的编程环境，友好的界面，便于分享的文档格式，对排版语法、绘图、数学公式的支持，使她成为最流行的Python科学计算工具
install：
```
# 1. 进入虚拟环境
workon py3_numpy
# 2. 安装模块
# Windows:
pip3 install jupyter
pip3 install matplotlib
# ==============
# Linux:
pip3 install notebook
pip3 install matplotlib
```
打开：在终端运行 jupyter-notebook
关闭：在终端按 Ctrl + C，输入 y 确认关闭

Matplotlib 绘图库

1. 基础

折线图

# Import the plotting library
import matplotlib.pyplot as plt
# When drawing a single line the x values may be omitted
plt.plot([0,2,4,6,8]) # only y values are given; x defaults to the indices 0..N-1

plt.plot([0,2,4,6,8],[1,5,3,9,7]) # x values first, then y values

plt.show() # display the figure

在这里插入图片描述

线条颜色，color='g'
线条风格，linestyle='--'
线条粗细，linewidth=5.0
标记风格，marker='o'
标记颜色，markerfacecolor='b'
标记尺寸，markersize=20
=====
透明度，alpha=0.5
案例

 # 汇率
eurcny = [6.8007,6.8007,6.8015,6.8015,6.8060,6.8036,6.8025,6.7877,6.7835,6.7758,6.7700,6.7463,6.7519,6.7595,6.7669,6.7511,6.7511,6.7539,6.7430,6.7374,6.7265]
# 日期
date = [3,4,5,6,7,10,11,12,13,14,17,18,19,20,21,24,25,26,27,28,31]
plt.plot(
	date, # x 轴数据 ， 日期
	eurcny，# y轴数据，  收盘价
	color='r', # 线条颜色，
	linestyle='--',  # 线条风格
	linewidth=2,  # 线条粗细
	marker=‘o’,   #  标记风格
	markerfacecolor='#ffff00' , # 标记颜色
	markersize=5,  # 标记大小

	alpha=0.5, # 透明度
	

)

 plt.plot(eurcny)
 plt.show()

在这里插入图片描述

散点图／气泡图

# Sample points; x[i] pairs with y[i]
x = list(range(1, 18, 2))  # odd numbers 1..17
y = [2, -5, 19, 3, 5, 8, 12, 6, 1]

# Draw one marker per (x, y) pair and display the figure
plt.scatter(x, y)
plt.show()

在这里插入图片描述

条形图／柱状图

x = [1,2,3,4,5]
y = [3,6,1,8,2]

# Vertical bar chart: each x is one bar's position and y is its height
# (bar thickness can be set with the `width` argument)
plt.bar(x, y)
plt.show()

# Horizontal bar chart: note that the roles of x and y are swapped
plt.barh(
    x, # position of each bar along the vertical axis
    y, # bar length
    height=0.5, # bar thickness
)
# Replace the vertical-axis tick labels at positions x with letters
plt.yticks(x,['a','b','c','d','e'])
plt.show()

在这里插入图片描述
案例

# Canvas size and resolution
plt.figure(figsize=(6, 6), dpi=100)

# Average scores per subject: Chinese / Math / English / Physics / Chemistry
boy = [85.5, 91, 72, 59, 66]
girl = [94, 82, 89.5, 62, 49]

# x positions: boys' bars, girls' bars (shifted right by one bar width, 0.3),
# and the midpoint between each pair where the subject label is placed
course = [1, 2, 3, 4, 5]
course2 = [1.3, 2.3, 3.3, 4.3, 5.3]
course3 = [1.15, 2.15, 3.15, 4.15, 5.15]

# Both series share the same bar width and transparency
bar_style = {'width': 0.3, 'alpha': 0.3}
plt.bar(course, boy, color='g', **bar_style)
plt.bar(course2, girl, color='r', **bar_style)

# Replace the numeric ticks with subject names
plt.xticks(course3, ['Chi','Math', 'Eng','Phy', 'Che'])
# Save the figure to disk, then display it
plt.savefig("./avg_data.png")
plt.show()

在这里插入图片描述
还有一大堆图表，这里不一一列举了

2. 提升

图像绘制区域
图像组件
案例：

折线图

# Canvas size and resolution
plt.figure(figsize=(6, 6), dpi=100)

# y values to plot; x is simply the index of each value, so both
# sequences have the same length
y = [12, 32, 2, 34, 23, 12, 56, 34, 23, 44]
x = list(range(len(y)))

plt.plot(x, y)
# Save the figure to disk, then display it
plt.savefig("./data1.png")
plt.show()

在这里插入图片描述
解决中文显示问题
linux 版：

# Example: compare Shanghai and Beijing temperatures, rendering Chinese text
# through an explicitly loaded font file.
import matplotlib.pyplot as plt
import random

from matplotlib import font_manager
# Absolute path to a font file that contains Chinese glyphs.
# NOTE: this sample path is a Windows location; on Linux point it at a font
# file that exists on your machine.
# The path must be passed as `fname`: the first positional parameter of
# FontProperties is `family`, so a positional path would be treated as a
# family name and the file would never be loaded.
my_font = font_manager.FontProperties(fname=r'D:\Pycharm-project\untitled\py3_nmp\arial unicode ms.ttf')
# Canvas size and resolution
plt.figure(figsize=(20, 10), dpi=100)

# x positions: 60 points (x and y must contain the same number of points)
x = range(60)
# 60 random values between 15 and 18 (Shanghai) and between 0 and 5 (Beijing)
y = [random.uniform(15, 18) for i in range(60)]
y_beijing = [random.uniform(0, 5) for i in range(60)]

# Candidate y tick positions 0..49
y_ = [i for i in range(50)]
# Shanghai line
plt.plot(x, y, label="上海")
# Beijing line
plt.plot(x, y_beijing, label="北京", color='r', linestyle="--", linewidth=5)
x_name = ["11点{}分".format(i) for i in range(60)]
# x ticks: every 5th point, labelled with the formatted time
plt.xticks(x[::5], x_name[::5], fontproperties=my_font)
# y ticks: every 5th value of y_
plt.yticks(y_[::5])
# Axis labels, drawn with the Chinese-capable font
plt.xlabel("时间", fontdict={"fontproperties": my_font})
plt.ylabel("温度", fontdict={"fontproperties": my_font})
# Title
plt.title("time in 11 to 12")
# Legend for the two lines, drawn with the Chinese-capable font
plt.legend(loc="best",prop = my_font)
plt.savefig("./data_bjhangh.png")
plt.show()

Windows版：

import matplotlib.pyplot as plt
import random

from matplotlib import font_manager

# Canvas size and resolution
plt.figure(figsize=(20, 10), dpi=100)

# Font setup on Windows
# Option 1: set the default font family and size through plt.rc
plt.rc('font', family='SimHei', size=13)
# Option 2: set the rcParams entries directly
#plt.rcParams['font.sans-serif'] = ['SimHei']   # default font
#plt.rcParams['axes.unicode_minus'] = False # avoid the minus sign '-' being rendered as a box

# x positions: 60 points (x and y must contain the same number of points)
x = range(60)
# 60 random values between 15 and 18 (Shanghai) and between 0 and 5 (Beijing)
y = [random.uniform(15, 18) for i in range(60)]
y_beijing = [random.uniform(0, 5) for i in range(60)]

# Candidate y tick positions 0..49
y_ = [i for i in range(50)]
# Shanghai line
plt.plot(x, y, label="上海")
# Beijing line
plt.plot(x, y_beijing, label="北京", color='r', linestyle="--", linewidth=5)
x_name = ["11点{}分".format(i) for i in range(60)]
# x ticks: every 5th point, labelled with the formatted time
plt.xticks(x[::5], x_name[::5], )
# y ticks: every 5th value of y_
plt.yticks(y_[::5])
# Axis labels
plt.xlabel("时间", )
plt.ylabel("温度", )
# Title
plt.title("time in 11 to 12")
# Legend describing the two lines (Shanghai and Beijing)
plt.legend(loc="best",)
plt.savefig("./data_bjhangh.png")
plt.show()

在这里插入图片描述
折线网格图

import matplotlib.pyplot as plt
plt.rcParams['font.family'] = ['Arial Unicode MS', 'sans-serif']

# Canvas size and resolution
plt.figure(figsize=(20, 10), dpi=100)
# Draw each line and keep its handle (plt.plot returns a list, hence the
# trailing comma to unpack the single element)
n1, = plt.plot(
    [0,2,4,6,8],  # x values
    [1,5,3,9,7],  # y values
    label='number1图例标题1'  # legend entry
)
n2, = plt.plot([1,3,5,7,9],[0,4,2,8,6], label = 'number2图例标题2')

########################

# Chart title
plt.title(
    'hello图表标题',
    color = '#00ff00',
    fontsize = 24,
)

# Axis labels
plt.xlabel('X axis')
plt.ylabel('Y轴标注')

# Ticks and tick labels
plt.yticks([1,3,6,9,12,15,18,20])  # explicit y tick positions
plt.xticks(
    [0,1,2,5,8,10],  # tick positions
    ['2000','2001','2002','2005','2008','2010'],  # tick labels
    fontsize = 14,  # text size
    rotation = 90,  # rotation angle
)

# Axis ranges
# plt.axis([-1,11,-2,12]) # x from -1 to 11, y from -2 to 12
plt.xlim([-1,11])  # x-axis range: -1 to 11
plt.ylim([-2,12])  # y-axis range: -2 to 12 (y ticks above 12 fall outside the view)

########################

# Legend for every labelled artist (requires label=... in the plot calls above)
# plt.legend() # loc: position, frameon: draw a frame or not

# Legend for selected lines only: pass the handles captured from plt.plot
# Available legend positions:
# best (default)
# right
# center,center left,center right
# lower center,lower left,lower right
# upper center,upper left,upper right
plt.legend(
    handles = [n1,n2],  # lines to include in the legend
    loc = 'upper right',  # legend position
    frameon=  False  # no frame around the legend
)

# Grid lines
plt.grid()

# Save as an image file; the default format is png,
# other formats such as jpg are selected by the file suffix.
# A path such as `images/test.jpg` may be used; the directory must already exist.
#plt.savefig('test', dpi = 600)  # dpi: resolution, common values 72 and 300

# Display the figure
plt.show()

在这里插入图片描述

直方图


import matplotlib.pyplot as plt
import random

plt.figure(figsize=(15,6), dpi=100)

# 准备时长
time =[131,  98, 125, 131, 124, 139, 131, 117, 128, 108, 135, 138, 131, 102, 107, 114, 119, 128, 121, 142, 127, 130, 124, 101, 110, 116, 117, 110, 128, 128, 115,  99, 136, 126, 134,  95, 138, 117, 111,78, 132, 124, 113, 150, 110, 117,  86,  95, 144, 105, 126, 130,126, 130, 126, 116, 123, 106, 112, 138, 123,  86, 101,  99, 136,123, 117, 119, 105, 137, 123, 128, 125, 104, 109, 134, 125, 127,105, 120, 107, 129, 116, 108, 132, 103, 136, 118, 102, 120, 114,105, 115, 132, 145, 119, 121, 112, 139, 125, 138, 109, 132, 134,156, 106, 117, 127, 144, 139, 139, 119, 140,  83, 110, 102,123,107, 143, 115, 136, 118, 139, 123, 112, 118, 125, 109, 119, 133,112, 114, 122, 109, 106, 123, 116, 131, 127, 115, 118, 112, 135,115, 146, 137, 116, 103, 144,  83, 123, 111, 110, 111, 100, 154,136, 100, 118, 119, 133, 134, 106, 129, 126, 110, 111, 109, 141,120, 117, 106, 149, 122, 122, 110, 118, 127, 121, 114, 125, 126,114, 140, 103, 130, 141, 117, 106, 114, 121, 114, 133, 137,  92,121, 112, 146,  97, 137, 105,  98, 117, 112,  81,  97, 139, 113,134, 106, 144, 110, 137, 137, 111, 104, 117, 100, 111, 101, 110,105, 129, 137, 112, 120, 113, 133, 112,  83,  94, 146, 133, 101,131, 116, 111,  84, 137, 115, 122, 106, 144, 109, 123, 116, 111,111, 133, 150]

# Bin width (per the original note: count the films in every `width` minutes)
width =3 

# Number of bins: the data range divided by the bin width, truncated to an int
num_bins = int((max(time)- min(time))/width)

# Draw the histogram; density=True plots a normalised density instead of raw counts
plt.hist(x=time, bins=num_bins, density=True)

# x ticks every 5 units, from min(time) up to (not including) max(time)
plt.xticks(range(min(time), max(time))[::5])

# Title and a dashed, half-transparent grid
plt.title('250个电影的时长分布图', )
plt.grid(True, linestyle='--', alpha=0.5)

plt.show()

直方图
饼图api

# Wedge labels; the last entry is a catch-all "other" category
movie_name = ['雷神3：诸神黄昏','正义联盟','东方快车谋杀案','寻梦环游记','全球风暴',
              '降魔传','追捕','七十七天','密战','狂兽','其它']

# Font setup so that Chinese labels can be rendered
plt.rcParams['font.family'] = ['Arial Unicode MS', 'sans-serif']
# Canvas size and resolution
plt.figure(figsize=(20,10), dpi=100)

# One value per entry in movie_name (same order)
place_count = [60605,54546,45819,28243,13270,9945,7679,6799,6101,4621,20105]

# Draw the pie chart

# explode: how far each wedge is pulled out of the pie (0 = not pulled out);
# autopct: format string for the percentage shown on each wedge
p = plt.pie(place_count,labels=movie_name, 
            explode=(0,0.1,0,0,0,0,0,0.3,0,0,0.1), 
            autopct='%0.2f%%',colors=['b','r','g','y','c','m','y','k','c','g','r'])


# Linux alternative
# Apply a Chinese-capable font (a FontProperties object named my_font) to the labels
# for t in p[1]:
#    t.set_fontproperties(my_font)

# plt.legend(loc="best",prop = my_font)
# plt.title("今日排片占比",fontproperties=my_font)           

plt.legend(loc="best")          
plt.title("今日排片占比")
# Equal axis scaling so the pie is drawn as a circle
plt.axis('equal')
plt.show()

在这里插入图片描述
散点图

数值计算库 Numpy

1. 基础

数组

# Creating ndarrays and inspecting their dtype
import numpy as np

# A homogeneous integer array
class1 = np.array([99, 60, 80, 5, 50])
print(class1, class1.dtype, '++++++++=', sep='\n')

# Mixed element types are coerced to a single common dtype; with a string
# present every element becomes a Unicode string
s1 = np.array([True, 80, 177.7, '张小明'])
print(s1, s1.dtype)

# Sample output (the integer dtype may be int64 depending on platform):
# [99 60 80  5 50]
# int32
# ++++++++=
# ['True' '80' '177.7' '张小明'] <U32

dtype 查看数组类型
在这里插入图片描述
ndarray数组对象属性

案例

import numpy as np

# 2-D array: one row per student in class 1
class1 = np.array([
    [True, 80, 177.7, '张小明'],
    [False, 99, 164.5, '李丽'],
    [True, 59, 158, '王华'],
])

# dtype: element type, size: total element count,
# ndim: number of dimensions (rank), shape: length of each axis
for attr in ('dtype', 'size', 'ndim', 'shape'):
    print(getattr(class1, attr))
print("================")

# 3-D array: the same students at two points in time (mid-term, final)
class1 = np.array([
    [
        [True, 80, 177.7, '张小明'],
        [False, 99, 164.5, '李丽'],
        [True, 59, 158, '王华'],
    ],
    [
        [True, 95, 178, '张小明'],
        [False, 99, 164.5, '李丽'],
        [True, 39, 178, '王华'],
    ],
])

for attr in ('dtype', 'size', 'ndim', 'shape'):
    print(getattr(class1, attr))

# The length of the shape tuple equals ndim
print(len(class1.shape))

在这里插入图片描述


import numpy as np

# ---- np.arange ----
# Like range(): an evenly stepped integer sequence, 0..n-1 for one argument
list_01 = np.arange(10)
# Arguments: start, stop (exclusive), step
list_02 = np.arange(1, 9, 2)
# print(list_01, list_02)
# [0 1 2 3 4 5 6 7 8 9] [1 3 5 7]

# ---- np.linspace ----
# Evenly spaced values between a start and a stop value
num01 = np.linspace(1, 10, num=4)
# print(num01) # [ 1.  4.  7. 10.]
# endpoint=False leaves the stop value out of the result
num02 = np.linspace(1, 10, num=4, endpoint=False)
# print(num02)  #[1.   3.25 5.5  7.75]

# ---- np.array ----
# Build an array from nested lists
n1 = np.array([[1, 23, 4], [3, 4, 5]])
# print(n1)
# [[ 1 23  4]
#  [ 3  4  5]]

#  =====
# np.ones(shape)
# 根据shape生成一个全1数组，shape是元组类型

# np.ones_like(ndarray)
# 以另一个数组为参数，根据其形状和dtype创建全1数组

# n2 = np.ones((2,3,4))
# print(n2,)
# print("==============")
# n3 = np.ones_like(n2)
# print(n3)
# [[[1. 1. 1. 1.]
#   [1. 1. 1. 1.]
#   [1. 1. 1. 1.]]
#
#  [[1. 1. 1. 1.]
#   [1. 1. 1. 1.]
#   [1. 1. 1. 1.]]]
# ==============
# [[[1. 1. 1. 1.]
#   [1. 1. 1. 1.]
#   [1. 1. 1. 1.]]
#
#  [[1. 1. 1. 1.]
#   [1. 1. 1. 1.]
#   [1. 1. 1. 1.]]]
#

# =======
# np.zeros(shape)	根据shape生成一个全0数组，shape是元组类型
# np.zeros_like(ndarray)	以另一个数组为参数，根据其形状和dtype创建全0数组
# 全0
# n3 = np.zeros((3, 6), dtype = np.int32)
# n4 = np.zeros_like(n3)
# print(n3)
# print("===========")
# print(n4)
# [[0 0 0 0 0 0]
#  [0 0 0 0 0 0]
#  [0 0 0 0 0 0]]
# ===========
# [[0 0 0 0 0 0]
#  [0 0 0 0 0 0]
#  [0 0 0 0 0 0]]


# ======
# np.empty(shape)	创建新数组只分配内存空间，随意填充一些垃圾值
# np.empty_like(ndarray)	以另一个数组为参数，根据其形状和dtype创建填充值数组
# 填充数组
# n1 = np.empty((6, 7))
# n2 = np.empty_like(n1)
# print(n1)
# print("=========")
# print(n2)
# [[1.18036057e-311 1.18024123e-311 0.00000000e+000 0.00000000e+000
#   1.18036090e-311 0.00000000e+000 1.18036090e-311]
#  [0.00000000e+000 0.00000000e+000 1.18022288e-311 0.00000000e+000
#   1.18022288e-311 0.00000000e+000 1.18022288e-311]
#  [0.00000000e+000 1.18022288e-311 0.00000000e+000 1.18022288e-311
#   0.00000000e+000 1.18022288e-311 0.00000000e+000]
#  [1.18022288e-311 0.00000000e+000 1.18022288e-311 0.00000000e+000
#   1.18022288e-311 0.00000000e+000 1.18022288e-311]
#  [0.00000000e+000 0.00000000e+000 0.00000000e+000 6.95331186e-310
#   0.00000000e+000 0.00000000e+000 6.95331185e-310]
#  [6.95331185e-310 6.95331184e-310 0.00000000e+000 0.00000000e+000
#   0.00000000e+000 0.00000000e+000 0.00000000e+000]]
# =========
# [[1.18024210e-311 1.18024123e-311 9.29433783e+242 1.49174223e+195
#   4.95261533e+223 7.19464630e+159 1.99501687e+161]
#  [2.76518167e+180 4.45511939e-091 2.75383585e+212 2.97762064e+228
#   7.66991258e+170 1.06112891e-153 4.64501053e+151]
#  [2.63265729e+267 1.32882271e-258 3.68777421e+180 4.47593816e-091
#   2.93573416e+222 1.36455813e+161 5.37649537e+242]
#  [9.92152605e+247 5.03734573e+180 5.28595595e-085 4.64501053e+151
#   5.50436598e+257 5.03734573e+180 9.92152728e+247]
#  [5.03734573e+180 4.83245960e+276 8.03408340e-095 1.95575364e-109
#   2.87903286e-152 2.58400946e+161 7.61384359e-010]
#  [5.34083717e+228 5.02383426e+223 4.27195504e+270 2.59345414e+161
#   5.03734574e+180 8.03408340e-095 1.05894728e-153]]




# ======
# np.full(shape,val)	根据shape生成一个数组，每个元素都是val
# np.full_like(a,val)	根据数组a的形状生成一个全 val 数组

# Arrays filled with one constant value
n1 = np.full(shape=6, fill_value=3)
n2 = np.full(shape=(2, 3, 4), fill_value=25)
# Same shape and dtype as n1, filled with 25
n3 = np.full_like(n1, fill_value=25)

divider = '=========='
print(n1, divider, n2, divider, n3, sep='\n')
# ======
# np.eye(n)，np.identity(n)	创建一个正方的n*n单位矩阵，对角线为1，其余为0
# np.diag(list)	创建一个正方形矩阵，对角线为参数值

# n1 = np.eye(5)
# n2 = np.identity(5)
# print(n1)
# print("===========")
# print(n2)


# 正方形矩阵，对角线为参数值
# n1 = np.diag([1,3,5,7,9])
# print(n1)
# [[1 0 0 0 0]
#  [0 3 0 0 0]
#  [0 0 5 0 0]
#  [0 0 0 7 0]
#  [0 0 0 0 9]]

数组运算

import numpy as np

# arr01 = np.ones([30])
# print(arr01)
# print('=================')
#
# arr02 = arr01.reshape((5,6))
# print(arr02)
# print('=================')
#
# arr03 = arr01.reshape((5,2,3))
# print(arr03)
# print('=========')

# arr1 = np.random.normal(1.5,0.2,(4,5))
#
# print(arr1)
# print('-'*30)
#
# print(arr1[1])
# print('-'*30)
#
# print(arr1[0:2])
# print('-'*30)
#
# print(arr1[0:2, 3])
# print('-'*30)
#
# # 0 到 2 的数组 中的 1列到3列
# print(arr1[0:2, 1:3])
# print('-'*30)
#
# print(arr1[0][1])
# print('-'*30)

# arr1 = np.arange(24).reshape((4,6))
# print(arr1)
#
# # numpy 三元运算
# arr2 = np.where(arr1 < 10, 0, 10)
# print(arr2)

# [[ 0  1  2  3  4  5]
#  [ 6  7  8  9 10 11]
#  [12 13 14 15 16 17]
#  [18 19 20 21 22 23]]
# [[ 0  0  0  0  0  0]
#  [ 0  0  0  0 10 10]
#  [10 10 10 10 10 10]
#  [10 10 10 10 10 10]]

###
###  数组间的运算
#
# a1 = np.array([[1, 2, 3], [4, 5, 6]])
# a2 = np.array([[5, 10, 15], [5, 6, 7]])
#
# print(a1 + a2)

# a = np.array([6,15,25])
# print(a.mean())
#
# print(np.average(a, weights=[1,2,3]))


# a = np.array ([
# 	[80, 86], [82, 80], [85, 78], [90, 90], [86, 82], [82, 90], [78, 80],
# 	[92, 94]
#
# ])

# print(np.max(a))
# print(np.min(a))
# print(np.mean(a))   # 平均数
# print(np.mean(a, axis=0))  # 平均值
# print(np.max(a, axis=1))

数值分析库 Pandas

1. 读取csv 文件

import pandas as pd
# Load the CSV file; the path is relative to the current working directory
content_csv = pd.read_csv('./IMDB-Movie-Data.csv')
# Print the loaded table
print(content_csv)

2. 数据处理

import pandas as pd
import numpy as np
#
# s1 = pd.Series([1,2,3,4,5])
#
# arr1 = np.arange(10)
#
# print(pd.Series(arr1))
# print('-------------')
# print(pd.Series([[3,45,6],['hehe','a']]))
# print('-------------')
#
#
# # 取出数据  切片
# print(s1[2])
# print('-'*30)
# print(s1[2:4])
# print('-'*30)
# # 自定义索引
# s2 = pd.Series([1,23,4,5],[10,'a','b','c'])
# print(s2)
# print(s2[10])
# print(s2['a'])
# print(s2['b'])
#
#
#



# csv文件读取

s1 = pd.read_csv('./dogNames2.csv')

# print(s1.shape)
# print('---------------')
# print(s1.dtypes)
# print('-----------')
# print(s1.ndim)  # 维度
# print('------------')
# print(s1.index)  # 行索引
# print('------------')
# print(s1.columns)   # 列索引
# print('------------')
# print(s1.values)  # 值索引
#
#
#
# s1.head(3)  # 显示头部几行，默认5行
# s1.tail(3)  # 显示末尾几行，默认5行
#
# s1.info()  # 相关信息概览：行数，列数，列索引，列非空值个数，列类型，列类型，内存占用
#
# s1.describe()  #快速综合统计结果：计数，均值，标准差，最大值，四分位数，最小s1





### ======================
### Filtering and sorting (s1 is the table loaded with pd.read_csv above)

count_col = 'Count_AnimalName'

# Boolean mask: True where the count is greater than 800
is_popular = s1[count_col] > 800
print(is_popular)
print('-------------')
# Keep only the rows where the mask is True
print(s1[is_popular])

# Sort by count (ascending is the default)
print(s1.sort_values(by=count_col))

# ascending=False sorts from largest to smallest
print('------------')
print(s1.sort_values(by=count_col, ascending=False))

获取nan 打印type 类型

import pandas as pd

# Read the CSV file
s1 = pd.read_csv('./IMDB-Movie-Data.csv')
print(s1['Revenue (Millions)'][7])   # per the original note, this entry is a NaN (missing) value
print(type(s1['Revenue (Millions)'][7]))  # original note: <class 'numpy.float64'>, i.e. NaN is a float

替换缺失值为Nan

import pandas as pd
import numpy as np

# Read the data file
s1 = pd.read_csv('./breast.data')
s2 = s1.replace(to_replace='?', value=np.nan)  # replace every '?' value with NaN
# Print rows 20..33 of the result
print(s2[20:34])

groupby分组


import pandas as pd
import numpy as np

# content_csv = pd.read_csv('./IMDB-Movie-Data.csv')
#
# print(content_csv)


# Sample data: one row per (student, test) with three random integer scores.
# The scores are random, so the sample outputs shown below differ on every run.
df = pd.DataFrame({
    'name': ['张三','李四','王五','李四','王五','王五','赵六'],
    'chinese':np.random.randint(35,100,7),
    'math':np.random.randint(35,100,7),
    'english':np.random.randint(35,100,7),
    'test': ['一','一','一','二','二','三','一']
})

# print(df)


### groupby
###  =================
# Creates the group object only; nothing is computed yet
print(df.groupby('name'))

# Call an aggregation to compute per-group results, here the mean of each group.
# numeric_only=True restricts the mean to numeric columns: without it, recent
# pandas versions raise TypeError because of the string column 'test'.
a = df.groupby('name').mean(numeric_only=True)
# print(a)

#         chinese       math    english
# name
# 张三    69.000000  40.000000  69.000000
# 李四    72.000000  91.000000  49.500000
# 王五    64.333333  60.666667  58.333333
# 赵六    86.000000  88.000000  37.000000

# To keep the grouping column as a regular column instead of the index,
# pass as_index=False
b = df.groupby('name',as_index=False).mean(numeric_only=True)
# print(b)

#   name    chinese       math    english
# 0   张三  69.000000  40.000000  69.000000
# 1   李四  72.000000  91.000000  49.500000
# 2   王五  64.333333  60.666667  58.333333
# 3   赵六  86.000000  88.000000  37.000000

# Group by several columns, then take the mean of the remaining numeric columns
c = df.groupby(['name','chinese']).mean(numeric_only=True)
c2 = df.groupby(['name','chinese'],as_index=False).mean(numeric_only=True)
print(c)
#               math  english
# name chinese
# 张三   40         85       67
# 李四   44         40       57
#      84         85       78
# 王五   35         44       56
#      40         80       81
#      65         35       85
# 赵六   39         92       56

print(c2)

#
#   name  chinese  math  english
# 0   张三       40    85       67
# 1   李四       44    40       57
# 2   李四       84    85       78
# 3   王五       35    44       56
# 4   王五       40    80       81
# 5   王五       65    35       85
# 6   赵六       39    92       56