新手数据分析项目(三)— Students Performance in Exams

本次实践项目是最后一个探索性分析项目,大家将会从这个项目中学到所有基础图表的绘制,并且每类图表都有不同参数绘制的图像,所以本文篇幅会很长,请小伙伴们耐心学习。

项目所用数据为Students Performance in Exams,下载网址如下

Students Performance in Exams:https://www.kaggle.com/datasets/spscientist/students-performance-in-exams

导入本次项目所需要的包

import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

导入数据,并且查看数据

# Suppress every warning (note: warnings, not errors) to keep the output clean.
warnings.filterwarnings(action="ignore")

# Read the exam-results CSV and peek at the first rows.
csv_path = 'E:/Kaggle/StudentsPerformance.csv'
data = pd.read_csv(csv_path)
data.head()

race/ethnicity:种族/民族 、parental level of education:父母的教育水平 、lunch:午餐、test preparation course:考前准备课程

如果细心一点可以进行随机抽样查看数据

data.sample(frac=0.01) # draw a random 1% sample of the rows

除此之外,下面还有许多常用的查看数据函数

# Column names, non-null counts and dtypes.
data.info()

# iloc slices by integer position: all rows, first three columns.
data.iloc[:, 0:3].dtypes

# Summary statistics.
data.describe()

data.dtypes

# Pairwise correlation coefficients between the numeric columns.
# The categorical (string) columns are dropped first: DataFrame.corr() raises
# on non-numeric data in pandas >= 2.0, and select_dtypes works on every
# pandas version.
data.select_dtypes(include='number').corr()

# Number of missing values per column.
data.isnull().sum()

自定义列名,方便我们后续的索引

# Map the raw CSV headers to names without spaces so that columns can also be
# reached with attribute access (e.g. data.Math_Score).
column_names = {
    'gender': 'Gender',
    'race/ethnicity': 'Race/Ethnicity',
    'parental level of education': 'Parental_Level_of_Education',
    'lunch': 'Lunch',
    'test preparation course': 'Test_Preparation_Course',
    'math score': 'Math_Score',
    'reading score': 'Reading_Score',
    'writing score': 'Writing_Score',
}
data.rename(columns=column_names, inplace=True)

Bar Plot:柱状图

seaborn.barplot(x=None, y=None, hue=None, data=None, order=None, hue_order=None, estimator=<function mean>, ci=95, n_boot=1000, units=None, orient=None, color=None, palette=None, saturation=0.75, errcolor='.26', errwidth=None, capsize=None, dodge=True, ax=None, **kwargs)

sns.set(style='whitegrid')

# Count each gender once and reuse the result for both axes.
gender_counts = data['Gender'].value_counts()

# The hue labels are taken from the counts themselves. The original hard-coded
# list misspelled 'female' as 'femal' and silently depended on value_counts()
# returning the genders in that exact order.
ax = sns.barplot(x=gender_counts.index, y=gender_counts.values,
                 palette="Blues_d", hue=list(gender_counts.index))
plt.legend(loc=8)  # legend location code 8 = 'lower center'
plt.xlabel('Gender')
plt.ylabel('Frequency')
plt.title('Show of Gender Bar Plot')
plt.show()

# Gender counts again, this time with seaborn's default colours.
gender_counts = data['Gender'].value_counts()
sns.barplot(x=gender_counts.index, y=gender_counts.values)
plt.ylabel('Rates')
plt.title('Genders other rate')
plt.show()

# Number of students in each race/ethnicity group.
ethnicity_counts = data['Race/Ethnicity'].value_counts()
plt.figure(figsize=(7, 7))
sns.barplot(x=ethnicity_counts.index, y=ethnicity_counts.values)
plt.title('Show of Race/Ethnicity Bar Plot')
plt.xlabel('Race/Ethnicity')
plt.ylabel('Frequency')
plt.show()

# Scores per parental education level, split by gender. The three charts only
# differ in the score column, so the shared arguments are collected once.
by_education = dict(x="Parental_Level_of_Education", hue="Gender", data=data)

plt.figure(figsize=(10, 7))
sns.barplot(y="Writing_Score", **by_education)
plt.xticks(rotation=45)
plt.show()

sns.barplot(y="Reading_Score", **by_education)
plt.xticks(rotation=45)
plt.show()

plt.figure(figsize=(10, 7))
sns.barplot(y="Math_Score", **by_education)
plt.xticks(rotation=45)
plt.show()

# catplot is a figure-level function: it always creates its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty figure behind and the
# requested size is ignored. The size is therefore passed through
# height / aspect (figure width = height * aspect).

# Horizontal bars of math score per gender, split by parental education.
sns.catplot(y="Gender", x="Math_Score", hue="Parental_Level_of_Education",
            data=data, kind="bar", height=7, aspect=12 / 7)
plt.title('for Parental Level Of Education Gender & Math_score')
plt.show()

# Math score per gender, split by test-preparation status.
sns.catplot(x="Gender", y="Math_Score", hue="Test_Preparation_Course",
            data=data, kind="bar", height=10, aspect=1)
plt.title('for Test Preparation Course Gender & Math_Score')
plt.show()

# Outline-only bars: a fully transparent face colour with dark edges.
# x and y are passed by keyword; seaborn >= 0.12 rejects them as positional
# arguments, while keywords work on every version.
ax = sns.barplot(x="Parental_Level_of_Education", y="Writing_Score", data=data,
                 linewidth=2.5, facecolor=(1, 1, 1, 0),
                 errcolor=".2", edgecolor=".2")
plt.xticks(rotation=90)
plt.show()

# NOTE(review): `data_lunch_score` is not defined anywhere in the visible
# source, so this snippet raises NameError as written. Judging by the name and
# the title it is presumably a Series indexed by lunch type (e.g. a score
# aggregated per 'Lunch' value) -- TODO confirm and add the missing definition
# before this line.
plt.title("Lunch - Free/reduced & standard")
sns.barplot(x=data_lunch_score.index, y=data_lunch_score.values)
plt.show()

# Overlay two horizontal count charts (gender and race/ethnicity) on one Axes.
gender_counts = data['Gender'].value_counts()
ethnicity_counts = data['Race/Ethnicity'].value_counts()

f, ax = plt.subplots(figsize=(9, 10))  # single-Axes figure
sns.barplot(x=gender_counts.values, y=gender_counts.index,
            color='red', alpha=0.5, label='Gender')
sns.barplot(x=ethnicity_counts.values, y=ethnicity_counts.index,
            color='blue', alpha=0.7, label='Race/Ethnicity')
ax.set(title='Gender vs Race/Ethnicity', xlabel='Gender , Race/Ethnicity', ylabel='Groups')
ax.legend(frameon=True, loc='upper right')  # frameon=True draws a box around the legend
plt.show()

Point Plot:点图

seaborn.pointplot(x=None, y=None, hue=None, data=None, order=None, hue_order=None, estimator=<function mean>, ci=95, n_boot=1000, units=None, markers='o', linestyles='-', dodge=False, join=True, scale=1, orient=None, color=None, palette=None, errwidth=None, capsize=None, ax=None, **kwargs)

# List the groups, then select the students of group B once and reuse them.
data['Race/Ethnicity'].unique()
group_b = data[data['Race/Ethnicity'] == 'group B']
len(group_b.Math_Score)  # 190 math scores in the article's copy of the data

# Running index 1..N for the x axis. It is derived from the actual group size
# rather than the hard-coded 191, so the plot still works if the row count of
# the CSV differs.
positions = np.arange(1, len(group_b) + 1)

f, ax1 = plt.subplots(figsize=(25, 10))
sns.pointplot(x=positions, y=group_b.Math_Score, color='lime', alpha=0.8)
sns.pointplot(x=positions, y=group_b.Reading_Score, color='red', alpha=0.5)
plt.xlabel('Group B index State')
plt.ylabel('Frequency')
plt.title('Group B Math Score & Reading_Score')
plt.xticks(rotation=90)
plt.grid()
plt.show()

# Writing score against reading score, one line per gender; the two genders
# get different markers and line styles.
per_gender_style = dict(markers=["o", "x"], linestyles=["-", "--"])
ax = sns.pointplot(data=data, x="Reading_Score", y="Writing_Score",
                   hue="Gender", **per_gender_style)
plt.xticks(rotation=90)
plt.show()

Joint Plot:双变量关系图

seaborn.jointplot(x, y, data=None, kind='scatter', stat_func=None, color=None, height=6, ratio=5, space=0.2, dropna=True, xlim=None, ylim=None, joint_kws=None, marginal_kws=None, annot_kws=None, **kwargs)

# jointplot is figure-level and builds its own (square) figure, so the
# original plt.figure(figsize=(10,10)) calls only produced empty figures.
# The size is passed through height= instead.
group_b = data[data['Race/Ethnicity'] == 'group B']
# Running index 1..N derived from the group size instead of a hard-coded 191.
positions = np.arange(1, len(group_b) + 1)

# Default kind: scatter plot with marginal histograms.
sns.jointplot(x=positions, y=group_b.Math_Score, color='lime', alpha=0.8, height=10)
plt.xlabel('Group B index State')
plt.ylabel('Frequency')
plt.title('Group B Frequency Race/Ethnicity')
plt.xticks(rotation=90)
plt.tight_layout()  # re-fit the subplots to the figure area
plt.show()

# kind='hex': hexagonal bins instead of individual points.
sns.jointplot(x=positions, y=group_b.Math_Score, color='lime', kind='hex',
              alpha=0.8, height=10)
plt.xlabel('Group B index State')
plt.ylabel('Frequency')
plt.title('Group B Frequency Race/Ethnicity')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

# kind='kde': kernel density estimate; space=0 removes the gap between the
# joint axes and the marginal axes.
sns.jointplot(x=positions, y=group_b.Math_Score, color='lime', space=0,
              kind='kde', height=10)
plt.xlabel('Group B index State')
plt.ylabel('Frequency')
plt.title('Group B Frequency Race/Ethnicity')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

# jointplot is figure-level and builds its own (square) figure, so the size is
# given through height= rather than a separate plt.figure(figsize=...) call,
# which would only leave an empty figure behind.
group_b = data[data['Race/Ethnicity'] == 'group B']
# Running index 1..N derived from the group size instead of a hard-coded 191.
positions = np.arange(1, len(group_b) + 1)

# Scatter plot with KDE contours drawn behind the points (zorder=0).
grid = sns.jointplot(x=positions, y=group_b.Reading_Score, color='k', height=10)
grid.plot_joint(sns.kdeplot, zorder=0, n_levels=6)
plt.xlabel('Group B index State')
plt.ylabel('Frequency')
plt.title('Group B Math Score & Reading_Score')
plt.xticks(rotation=90)  # was misspelled `retation`, which raises an error
plt.show()

# Plain scatter version of the same data.
sns.jointplot(x=positions, y=group_b.Reading_Score, color='lime', alpha=0.8, height=10)
plt.xlabel('Group B index State')
plt.ylabel('Frequency')
plt.title('Group B Frequency Race/Ethnicity')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

Pie Chart

# Share of students in each race/ethnicity group.
# value_counts() is evaluated once so labels and values are guaranteed to be
# in the same order.
group_counts = data['Race/Ethnicity'].value_counts()
labels = group_counts.index
values = group_counts.values
colors = ['blue', 'red', 'yellow', 'green', 'brown']
explode = [0, 0, 0.1, 0, 0]  # pull the third wedge out of the pie

plt.figure(figsize=(7, 7))
plt.pie(values, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%')
# Title typo fixed: 'Racr' -> 'Race'.
plt.title('Race/Ethnicity According Analysis', color='black', fontsize=10)
plt.show()

# Pie chart comparing the mean math, reading and writing scores.
labels = ['Math Score', 'Reading Score', 'Writing Score']
score_columns = ['Math_Score', 'Reading_Score', 'Writing_Score']
values = [data[column].mean() for column in score_columns]
colors = ['blue', 'red', 'yellow']
explode = [0, 0, 0.1]  # pull the last wedge out of the pie

plt.figure(figsize=(7, 7))
plt.pie(values, explode=explode, labels=labels, colors=colors,
        shadow=True, autopct='%1.1f%%')
plt.legend(['Math Score', 'Reading Score', 'Writing Score'], loc=3)
plt.axis('equal')  # equal axis scaling
plt.tight_layout()
plt.show()

# Mean reading score of each race/ethnicity group.
reading_means = data.groupby('Race/Ethnicity')['Reading_Score'].mean()
# Labels come from the groupby index so they cannot drift out of step with
# the values (the original hard-coded 'group A' .. 'group E').
labels = reading_means.index
sizes = reading_means.values
# One colour per wedge: the original list had four colours for five wedges, so
# the first colour was reused for the last group.
colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue', 'plum']
explode = (0.1, 0, 0, 0, 0)  # pull the first wedge out of the pie

# shadow=True draws a drop shadow; startangle sets the angle of the first wedge.
plt.pie(sizes, explode=explode, labels=labels,
        colors=colors, autopct='%1.1f%%', shadow=True, startangle=140)
plt.title('Reading Score for Every Race/Ethnicity Mean')
plt.axis('equal')
plt.show()

Lm Plot:回归图

seaborn.lmplot(x, y, data, hue=None, col=None, row=None, palette=None, col_wrap=None, height=5, aspect=1, markers='o', sharex=True, sharey=True, hue_order=None, col_order=None, row_order=None, legend=True, legend_out=True, x_estimator=None, x_bins=None, x_ci='ci', scatter=True, fit_reg=True, ci=95, n_boot=1000, units=None, order=1, logistic=False, lowess=False, robust=False, logx=False, x_partial=None, y_partial=None, truncate=False, x_jitter=None, y_jitter=None, scatter_kws=None, line_kws=None, size=None)

# Regression of reading score on math score for all students.
sns.lmplot(data=data, x='Math_Score', y='Reading_Score')
plt.title('Math Score vs Reading Score')
plt.xlabel('Math Score')
plt.ylabel('Reading Score')
plt.show()

# Writing score against math score, with one regression line per gender.
math_vs_writing = dict(x='Math_Score', y='Writing_Score', hue='Gender', data=data)
sns.lmplot(**math_vs_writing)
plt.title('Math Score vs Writing Score')
plt.xlabel('Math Score')
plt.ylabel('Writing Score')
plt.show()

# Same plot with a different marker for each gender.
sns.lmplot(markers=['x', 'o'], **math_vs_writing)
plt.title('Math Score vs Writing_Score')
plt.xlabel('Math Score')
plt.ylabel('Writing Score')
plt.show()

Violin Plot:小提琴图

seaborn.violinplot(x=None, y=None, hue=None, data=None, order=None, hue_order=None, bw='scott', cut=2, scale='area', scale_hue=True, gridsize=100, width=0.8, inner='box', split=False, dodge=True, orient=None, linewidth=None, color=None, palette=None, saturation=0.75, ax=None, **kwargs)

# Every data variable is passed by keyword. seaborn >= 0.12 no longer accepts
# x / y positionally (a lone positional argument is taken as `data`, two raise
# a TypeError); keywords behave the same on every version.

# Distribution of the math score alone.
sns.violinplot(x=data['Math_Score'])
plt.xlabel('Math Score')
plt.ylabel('Frequency')
plt.title('Violin Math Score Show')
plt.show()

# Math score per race/ethnicity group.
sns.violinplot(x=data['Race/Ethnicity'], y=data['Math_Score'])
plt.show()

# Reading score per gender, one violin per race/ethnicity group.
sns.violinplot(x=data['Gender'], y=data['Reading_Score'],
               hue=data['Race/Ethnicity'], palette='muted')
plt.show()

# split=True draws the two genders as the two halves of a single violin.
sns.violinplot(x=data['Race/Ethnicity'], y=data['Writing_Score'],
               hue=data['Gender'], palette='muted', split=True)
plt.legend(loc=8)  # legend location code 8 = 'lower center'
plt.show()

# dodge=False overlays the hue levels at the same position.
sns.violinplot(x=data['Parental_Level_of_Education'], y=data['Math_Score'],
               hue=data['Gender'], dodge=False)
plt.xticks(rotation=90)
plt.show()

Heatmap Plot:热力图

seaborn.heatmap(data, vmin=None, vmax=None, cmap=None, center=None, robust=False, annot=None, fmt='.2g', annot_kws=None, linewidths=0, linecolor='white', cbar=True, cbar_kws=None, cbar_ax=None, square=False, xticklabels='auto', yticklabels='auto', mask=None, ax=None, **kwargs)

# Correlation matrix of the numeric columns. The categorical columns are
# dropped explicitly because DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0; the matrix is computed once and reused.
corr = data.select_dtypes(include='number').corr()

sns.heatmap(corr)
plt.show()

# Fix the colour scale to the range [0, 1].
sns.heatmap(corr, vmin=0, vmax=1)
plt.show()

# annot=True writes the value into every cell.
sns.heatmap(corr, annot=True)
plt.show()

# Correlation matrix of the numeric columns (DataFrame.corr() raises on the
# categorical columns in pandas >= 2.0, so they are dropped first).
corr = data.select_dtypes(include='number').corr()

# Boolean mask hiding the upper triangle (diagonal included).
# The builtin `bool` replaces `np.bool`, which was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)  # diverging colour map

# square=True makes every cell a square.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, square=True, linewidths=.5,
            cbar_kws={"shrink": .5})
# Show this figure before starting the next one; without this call the
# following heatmap was drawn onto the same axes.
plt.show()

sns.heatmap(corr, cmap='YlGnBu')
plt.show()

# Correlation matrix of the numeric columns (DataFrame.corr() raises on the
# categorical columns in pandas >= 2.0, so they are dropped first).
corr = data.select_dtypes(include='number').corr()

# Boolean mask hiding the upper triangle (diagonal included).
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# A bare sns.axes_style("white") call only returns a style dict and changes
# nothing; it has to be used as a context manager for the style to apply to
# the axes created inside the block.
with sns.axes_style("white"):
    sns.heatmap(corr, vmax=.3, mask=mask, square=True)
plt.show()

Box Plot:箱线图

seaborn.boxplot(x=None, y=None, hue=None, data=None, order=None, hue_order=None, orient=None, color=None, palette=None, saturation=0.75, width=0.8, dodge=True, fliersize=5, linewidth=None, whis=1.5, notch=False, ax=None, **kwargs)

sns.set(style='whitegrid')

# Data variables are passed by keyword: seaborn >= 0.12 treats a lone
# positional argument as `data`, not `x`, which changes the orientation.
sns.boxplot(x=data['Math_Score'])
plt.show()

# Math score per gender.
sns.boxplot(x=data['Gender'], y=data['Math_Score'])
plt.show()

# Writing score per race/ethnicity group, split by gender.
sns.boxplot(x=data['Race/Ethnicity'], y=data['Writing_Score'], hue=data['Gender'], palette="Set3")
plt.show()

# orient='h' draws the box horizontally (the original note said vertically).
sns.boxplot(x=data['Math_Score'], orient='h', palette="Set2")
plt.show()

# Letter-value ("boxen") plot of the writing score per race/ethnicity group.
sns.boxenplot(data=data, x="Race/Ethnicity", y="Writing_Score", color='b', scale="linear")
plt.show()

# Box plot with the genders overlaid at the same position (dodge=False).
sns.boxplot(data=data, x='Race/Ethnicity', y='Writing_Score', hue='Gender', dodge=False)
plt.show()

# Box plot with the individual observations drawn on top as a swarm.
education = data['Parental_Level_of_Education']
math_score = data['Math_Score']
sns.boxplot(x=education, y=math_score)
plt.xticks(rotation=90)
sns.swarmplot(x=education, y=math_score, color=".25")
plt.xticks(rotation=90)
plt.show()

Swarm Plot:分簇散点图

seaborn.swarmplot(x=None, y=None, hue=None, data=None, order=None, hue_order=None, dodge=False, orient=None, color=None, palette=None, size=5, edgecolor='gray', linewidth=0, ax=None, **kwargs)

# Swarm of the math scores along the x axis.
sns.set(style='whitegrid')
sns.swarmplot(data=data, x='Math_Score')
plt.show()

sns.set(style="whitegrid")

# Writing (red) and reading (blue) scores drawn into the same Axes.
for score_column, colour in (("Writing_Score", 'red'), ("Reading_Score", 'blue')):
    sns.swarmplot(y=data[score_column], color=colour)
plt.title('Writing & Reading Scores')
plt.show()

# Reading score per lunch type.
sns.swarmplot(data=data, x='Lunch', y='Reading_Score')
plt.show()

# Math score per test-preparation status, coloured by gender.
sns.swarmplot(data=data, x='Test_Preparation_Course', y='Math_Score', hue='Gender')
plt.show()

# dodge=True separates the hue levels side by side.
sns.swarmplot(data=data, x='Test_Preparation_Course', y='Writing_Score',
              hue='Race/Ethnicity', dodge=True, palette='Set2')
plt.show()

# Box plot whose whiskers span the full range (whis=np.inf, i.e. +infinity),
# with the observations on top.
lunch = data['Lunch']
math_score = data['Math_Score']
sns.boxplot(x=lunch, y=math_score, whis=np.inf)
sns.swarmplot(x=lunch, y=math_score, color='.2')
plt.show()

# Violin without inner annotations, with the observations on top.
preparation = data['Test_Preparation_Course']
reading_score = data['Reading_Score']
sns.violinplot(x=preparation, y=reading_score, inner=None)
sns.swarmplot(x=preparation, y=reading_score, color='white', edgecolor='gray')
plt.show()

Pair Plot:成对的变量关系图

seaborn.pairplot(data, hue=None, hue_order=None, palette=None, vars=None, x_vars=None, y_vars=None, kind='scatter', diag_kind='auto', markers=None, height=2.5, aspect=1, dropna=True, plot_kws=None, diag_kws=None, grid_kws=None, size=None)

# Pairwise relationships with the default settings.
sns.pairplot(data=data)
plt.show()

# Kernel density estimates on the diagonal.
sns.pairplot(data=data, diag_kind='kde')
plt.show()

# Regression fits in the off-diagonal panels.
sns.pairplot(data=data, kind='reg')
plt.show()

# Custom "+" markers and shaded KDE curves on the diagonal.
scatter_style = dict(s=50, edgecolor="b", linewidth=1)
diagonal_style = dict(shade=True)
sns.pairplot(data=data, diag_kind="kde", markers="+",
             plot_kws=scatter_style, diag_kws=diagonal_style)
plt.show()

# Colour the observations by reading score.
sns.pairplot(data=data, hue="Reading_Score")
plt.show()

Count Plot:分类直方图

seaborn.countplot(x=None, y=None, hue=None, data=None, order=None, hue_order=None, orient=None, color=None, palette=None, saturation=0.75, dodge=True, ax=None, **kwargs)

# The counted variable is passed as x= by keyword: seaborn >= 0.12 treats a
# lone positional argument as `data`, not `x`.

# Number of students per race/ethnicity group.
sns.countplot(x=data['Race/Ethnicity'])
plt.show()

# Number of students per gender.
sns.countplot(x=data['Gender'])
plt.show()

# Group sizes split by gender.
sns.countplot(x=data['Race/Ethnicity'], hue=data['Gender'])
plt.show()

# Horizontal bars (variable on y) per parental education level, split by gender.
sns.countplot(y=data['Parental_Level_of_Education'], palette="Set3", hue=data['Gender'])
plt.legend(loc=4)  # legend location code 4 = 'lower right'
plt.show()

# Outline-only bars: fully transparent faces with thick coloured edges.
sns.countplot(x=data['Lunch'], facecolor=(0, 0, 0, 0), linewidth=5, edgecolor=sns.color_palette("dark", 3))
plt.show()

# Lunch type within each parental education level.
sns.countplot(x="Parental_Level_of_Education", hue="Lunch", data=data)
plt.xticks(rotation=45)
plt.show()

Strip Plot:散点图

seaborn.stripplot(x=None, y=None, hue=None, data=None, order=None, hue_order=None, jitter=True, dodge=False, orient=None, color=None, palette=None, size=5, edgecolor='gray', linewidth=0, ax=None, **kwargs)

# Reading scores as a one-dimensional strip.
sns.stripplot(data=data, x='Reading_Score')
plt.show()

# Writing score per parental education level.
sns.stripplot(data=data, x="Parental_Level_of_Education", y='Writing_Score')
plt.xticks(rotation=45)
plt.show()

# Writing score per gender with jitter enabled.
sns.stripplot(data=data, x="Gender", y="Writing_Score", jitter=True)
plt.xticks(rotation=45)
plt.show()

# A small explicit jitter amount keeps the points in a narrow band.
sns.stripplot(data=data, x="Lunch", y='Reading_Score', jitter=0.05)
plt.xticks(rotation=45)
plt.show()

# Reading score per test-preparation status, coloured by gender.
sns.stripplot(data=data, x='Test_Preparation_Course', y='Reading_Score',
              hue='Gender', jitter=True)
plt.show()

# dodge=True separates the hue levels side by side.
dodged = dict(jitter=True, dodge=True, palette="Set2", data=data)
sns.stripplot(x='Race/Ethnicity', y='Math_Score', hue='Lunch', **dodged)
plt.show()

# Large, translucent diamond markers.
sns.stripplot(x='Lunch', y='Math_Score', hue='Lunch',
              size=20, marker='D', edgecolor='gray', alpha=.25, **dodged)
plt.legend(loc=10)
plt.show()

Factor Plot:用于绘制两维变量的关系图

seaborn.catplot(x=None, y=None, hue=None, data=None, row=None, col=None, col_wrap=None, estimator=<function mean>, ci=95, n_boot=1000, units=None, order=None, hue_order=None, row_order=None, col_order=None, kind='strip', height=5, aspect=1, orient=None, color=None, palette=None, legend=True, legend_out=True, sharex=True, sharey=True, margin_titles=False, facet_kws=None, **kwargs)

# factorplot was renamed to catplot in seaborn 0.9 and has since been removed,
# so catplot is called directly. factorplot drew point plots by default while
# catplot defaults to kind="strip", hence the explicit kind="point" wherever
# the original call gave no kind.

# Math score per lunch type, one line per gender.
sns.catplot(x="Lunch", y="Math_Score", hue="Gender", kind="point", data=data)
plt.show()

# Violin version: reading score per gender, split by lunch type.
sns.catplot(x='Gender', y='Reading_Score', hue="Lunch", kind='violin', data=data)
plt.show()

# col='Lunch' draws one panel per lunch type.
sns.catplot(x="Race/Ethnicity", y="Math_Score", hue="Gender", col='Lunch',
            kind="point", data=data)
plt.show()

# One panel per gender.
g = sns.catplot(x="Parental_Level_of_Education", y="Writing_Score", hue="Lunch",
                col="Gender", kind="point", data=data)
plt.tight_layout()
plt.show()

Dist Plot:带有核密度估计曲线的直方图

seaborn.distplot(a, bins=None, hist=True, kde=True, rug=False, fit=None, hist_kws=None, kde_kws=None, rug_kws=None, fit_kws=None, color=None, vertical=False, norm_hist=False, axlabel=None, label=None, ax=None)

# KDE curve plus rug marks; hist=False hides the histogram bars.
sns.distplot(a=data['Reading_Score'], hist=False, rug=True)
plt.show()

# vertical=True puts the score on the y axis.
ax = sns.distplot(a=data['Writing_Score'], vertical=True)
plt.show()

ax = sns.distplot(a=data['Math_Score'])
plt.show()

sx = sns.distplot(a=data['Reading_Score'], color='y')
plt.show()

# Four renderings of the same 100 normal draws on a 2x2 grid.
sns.set(style='white', palette="muted", color_codes=True)
rs = np.random.RandomState(10)

f, axes = plt.subplots(2, 2, figsize=(7, 7), sharex=True)
sns.despine(left=True)  # also drop the left spine

d = rs.normal(size=100)
(top_left, top_right), (bottom_left, bottom_right) = axes

sns.distplot(d, ax=top_left, kde=False, color='b')                              # histogram only
sns.distplot(d, ax=top_right, hist=False, rug=True, color="r")                  # KDE with rug
sns.distplot(d, ax=bottom_left, hist=False, color="g", kde_kws={"shade": True})  # shaded KDE
sns.distplot(d, ax=bottom_right, color="m")                                     # histogram and KDE

plt.setp(axes, yticks=[])  # remove the y ticks from every subplot
plt.tight_layout()
plt.show()

Line Plot:折线图

seaborn.lineplot(x=None, y=None, hue=None, size=None, style=None, data=None, palette=None, hue_order=None, hue_norm=None, sizes=None, size_order=None, size_norm=None, dashes=True, markers=None, style_order=None, units=None, estimator='mean', ci=95, n_boot=1000, sort=True, err_style='band', err_kws=None, legend='brief', ax=None, **kwargs)

# How often each math score occurs, one line per gender (male blue, female red).
for gender, colour in (('male', 'b'), ('female', 'r')):
    math_scores = data.loc[data['Gender'] == gender, 'Math_Score']
    math_scores.value_counts().sort_index().plot.line(color=colour)
plt.title('Math_Score vs Frequency')
plt.xlabel('Math_Score')
plt.ylabel('Frequency')
plt.show()

# Reading score as a function of math score.
sns.lineplot(data=data, x='Math_Score', y='Reading_Score')
plt.show()

# One line per lunch type.
sns.lineplot(data=data, x='Reading_Score', y='Writing_Score', hue='Lunch')
plt.show()

# Colour encodes the lunch type, line style encodes the gender.
sns.lineplot(data=data, x='Writing_Score', y='Reading_Score', hue='Lunch', style='Gender')
plt.show()

# Female students only; dashes=False keeps every line solid.
female_filter = data[data['Gender'] == 'female']
sns.lineplot(data=female_filter, x='Reading_Score', y='Writing_Score',
             hue='Lunch', style='Test_Preparation_Course', dashes=False)
plt.show()

# 68% confidence interval drawn as error bars instead of a band.
sns.lineplot(data=data, x="Math_Score", y="Writing_Score", hue="Lunch",
             ci=68, err_style="bars")
plt.show()

# Male students only; estimator=None with units= draws one raw line per unit
# instead of aggregating.
male_students = data.query("Gender == 'male'")
ax = sns.lineplot(data=male_students, x="Math_Score", y="Reading_Score",
                  hue="Test_Preparation_Course", units="Lunch", estimator=None, lw=1)
plt.show()

ax = sns.lineplot(data=data, x="Math_Score", y="Writing_Score", hue="Lunch", style="Gender")
plt.show()

# NOTE(review): `x` is not defined anywhere in the visible source, so this call
# raises NameError as written. The label suggests it was meant to hold a
# 'Race/Ethnicity'-derived series -- TODO confirm and define it before this line.
sns.lineplot(data=x, color='coral', label='Race/Ethnicity')
plt.show()

Despine:移除坐标轴

seaborn.despine(fig=None, ax=None, top=True, right=True, left=False, bottom=False, offset=None, trim=False)

# Reading vs math score; both colour and marker size encode the gender.
f, ax = plt.subplots(figsize=(6.5, 6.5))
sns.despine(fig=f, left=True, bottom=True)
sns.scatterplot(data=data, x="Math_Score", y="Reading_Score", hue="Gender", size="Gender")
plt.show()

# Figure-level relplot on the plain white style.
sns.set(style="white")
sns.relplot(data=data, x="Reading_Score", y="Math_Score", hue="Gender",
            palette="muted", alpha=.5, sizes=(40, 400), height=6)
plt.show()

# Writing vs reading score; colour encodes lunch type, size encodes gender.
f, ax = plt.subplots(figsize=(6.5, 6.5))
sns.despine(fig=f, left=True, bottom=True)
sns.scatterplot(data=data, x="Reading_Score", y="Writing_Score", hue="Lunch", size="Gender")
plt.show()

# Same plot restricted to students whose parents have 'some college'.
some_college = data['Parental_Level_of_Education'] == 'some college'
f, ax = plt.subplots(figsize=(6.5, 6.5))
sns.despine(fig=f, left=True, bottom=True)
sns.scatterplot(data=data[some_college], x="Reading_Score", y="Writing_Score",
                hue="Lunch", size="Gender")
plt.show()

# Restricted further to group C; despine removes each spine whose flag is True.
group_c = data['Race/Ethnicity'] == 'group C'
f, ax = plt.subplots(figsize=(6.5, 6.5))
sns.despine(fig=f, left=True, bottom=True)
sns.scatterplot(data=data[group_c & some_college], x="Reading_Score", y="Writing_Score",
                hue="Lunch", size="Gender")
plt.show()

除了各类图形的绘制,最后也为大家介绍了一个常用的函数despine,这个函数主要用于移除图形的边框线(spine,即坐标轴的轴脊,默认移除上、右两条),在美化图形时经常用到。

实践项目3到这里就已经结束了,同时,我们的探索性分析到这里也已经结束了。本次项目没有对数据进行深入的分析,而是将重点放在了图形的绘制上面,所以其中不少图表只是为了演示绘图方法,本身并没有太多分析意义。当然,要真正掌握探索性分析不仅仅需要学会各类图形的绘制,更需要业务思维和分析思维能力的结合,如果有兴趣的小伙伴可以试着以本次项目为例,对其展开深入的分析并得出分析结果。

这里提一下题外话,如果数据量小,例如几万、几十万其实是可以简单使用excel完成的,但是如果数据量大,那你还需要掌握SQL对数据进行清洗整理,然后再用python进行分析。当然,除了考虑用python进行分析,还可以使用BI工具进行分析,个人更建议使用BI工具,因为在美化图形时更为方便,且选择更多。这里简单说一下,美化图形也是数据分析时最需要注意的几点之一,你分析出来的结果,更多的是给别人看,所以不仅要通俗易懂,而且在美化方面也需要更加注意。

另外,kaggle其实更多的是预测性分析项目,所以博主后续也会出相应的实践项目,其实从kaggle更注重预测性分析来看,我们作为数据分析师也应该更注重预测性分析,当然探索性分析是基础,但是你如果掌握预测性分析薪资肯定会噌噌噌的往上升。在此之前,我们需要学习预测性分析的核心——机器学习。

  • 0
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值