目的:研究红酒品质和理化性质的关系
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
读取数据
# Load the wine-quality CSV; fields in this file are separated by ';'.
# The path is written as a raw string so the Windows backslashes stay literal:
# "\S", "\A" and "\w" are invalid escape sequences in a normal string literal
# and trigger a warning on recent Python versions.
df = pd.read_csv(r"D:\Study\AI_data\winequality-red.csv", sep=';')
df.head(5)  # show the first five rows
fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 |
2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 |
3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 |
4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
df.info()  # basic information about the data table
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 fixed acidity 1599 non-null float64
1 volatile acidity 1599 non-null float64
2 citric acid 1599 non-null float64
3 residual sugar 1599 non-null float64
4 chlorides 1599 non-null float64
5 free sulfur dioxide 1599 non-null float64
6 total sulfur dioxide 1599 non-null float64
7 density 1599 non-null float64
8 pH 1599 non-null float64
9 sulphates 1599 non-null float64
10 alcohol 1599 non-null float64
11 quality 1599 non-null int64
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
df.shape  # check the dimensions: (number of rows, number of columns)
(1599, 12)
df.describe()  # view the common summary statistics
fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 |
mean | 8.319637 | 0.527821 | 0.270976 | 2.538806 | 0.087467 | 15.874922 | 46.467792 | 0.996747 | 3.311113 | 0.658149 | 10.422983 | 5.636023 |
std | 1.741096 | 0.179060 | 0.194801 | 1.409928 | 0.047065 | 10.460157 | 32.895324 | 0.001887 | 0.154386 | 0.169507 | 1.065668 | 0.807569 |
min | 4.600000 | 0.120000 | 0.000000 | 0.900000 | 0.012000 | 1.000000 | 6.000000 | 0.990070 | 2.740000 | 0.330000 | 8.400000 | 3.000000 |
25% | 7.100000 | 0.390000 | 0.090000 | 1.900000 | 0.070000 | 7.000000 | 22.000000 | 0.995600 | 3.210000 | 0.550000 | 9.500000 | 5.000000 |
50% | 7.900000 | 0.520000 | 0.260000 | 2.200000 | 0.079000 | 14.000000 | 38.000000 | 0.996750 | 3.310000 | 0.620000 | 10.200000 | 6.000000 |
75% | 9.200000 | 0.640000 | 0.420000 | 2.600000 | 0.090000 | 21.000000 | 62.000000 | 0.997835 | 3.400000 | 0.730000 | 11.100000 | 6.000000 |
max | 15.900000 | 1.580000 | 1.000000 | 15.500000 | 0.611000 | 72.000000 | 289.000000 | 1.003690 | 4.010000 | 2.000000 | 14.900000 | 8.000000 |
# Figure 1: one vertical boxplot per column, laid out on a 2 x 6 grid.
plt.style.use('ggplot')
colnm = df.columns.tolist()  # column index converted to a plain Python list
fig = plt.figure(figsize=(12, 6))
# Only the first 12 columns are drawn because the grid has 12 panels.
for i, name in enumerate(colnm[:12]):
    plt.subplot(2, 6, i + 1)  # split the figure into 2 rows x 6 columns
    # Pass the values explicitly as `y=` so the box is drawn on the vertical
    # axis regardless of how a positional argument would be interpreted.
    sns.boxplot(y=df[name], orient="v", width=0.5, color='r')
    plt.ylabel(name, fontsize=12)
# Automatically adjust the subplot parameters so the panels fill the figure
# area; it only accounts for tick marks, tick labels and titles.
plt.tight_layout()
print('\nFigure 1: Univariate Boxplots(单变量箱线图)')
Figure 1: Univariate Boxplots(单变量箱线图)
箱线图的各线释义
箱子的上下底分别是上四分位数(Q3,75%)和下四分位数(Q1,25%),箱体包含50%的数据,箱体中间的线是中位数。箱子高度在一定程度上反映了数据的波动程度。
上边界和下边界之外的值可以理解为异常值
还有一些极端情况,箱子被压得很扁,甚至只剩下一条线,同时还存在着很多异常值。这些情况的出现,有两个常见的原因。第一,样本数据中,存在特别大或者特别小的异常值,这种离群的表现,导致箱子整体被压缩,反而凸显出来这些异常,第二,样本数据特别少,因此箱体受单个数据的影响被放大了。
# Figure 2: one histogram per column, laid out on a 4 x 3 grid.
colnm1 = df.columns.tolist()
plt.figure(figsize=(12, 8))
# Only the first 12 columns are drawn because the grid has 12 panels.
for i, name in enumerate(colnm1[:12]):
    plt.subplot(4, 3, i + 1)  # 4 rows x 3 columns, 12 panels in total
    df[name].hist(bins=100, color='b')  # histogram via Series.hist()
    plt.xlabel(name, fontsize=12)  # x axis: column name
    plt.ylabel('Frequency')  # y axis: frequency
plt.tight_layout()
print('\nFigure 2:Univariate Histograms单变量直方图')
Figure 2:Univariate Histograms单变量直方图
品质评价范围是0-10,这个数据集中范围是3到8,约82%的红酒品质是5或6(681+638=1319,占1599个样本的82.5%)
这个数据集有7个酸度相关的特征:fixed acidity(非挥发性酸)、volatile acidity(挥发性酸)、citric acid(柠檬酸)、free sulfur dioxide(游离二氧化硫)、total sulfur dioxide(总二氧化硫)、sulphates(硫酸盐类)和pH。
前6个特征都与红酒的pH相关。pH是对数的尺度,下面对前6个特征取对数然后作直方图。
另外,pH值主要是与fixed acidity有关,fixed acidity比volatile acidity和citric acid高1到2个数量级
# Figure 3: histograms of the six acidity-related features on a log10 scale.
acidityFeat = ['fixed acidity', 'volatile acidity', 'citric acid',
               'free sulfur dioxide', 'total sulfur dioxide', 'sulphates']
plt.figure(figsize=(12, 4))
for i, name in enumerate(acidityFeat):
    plt.subplot(2, 3, i + 1)  # 2 rows x 3 columns, six panels
    # Clip to a floor of 0.001 before taking log10 so that zero values map
    # to -3 instead of -inf.
    v = np.log10(np.clip(df[name].values, a_min=0.001, a_max=None))
    plt.hist(v, bins=50, color='g')
    plt.xlabel('log(' + name + ')', fontsize=12)
    plt.ylabel('Frequency')
plt.tight_layout()
print('\nFigure:3 acidity Features in log10 Scale')
Figure:3 acidity Features in log10 Scale
# Figure 4: the three acid concentrations drawn on one log-scaled x axis.
plt.figure(figsize=(12, 6))
# Bin edges spaced evenly in log10 between 10^-2 and 10^2.
bins = 10 ** (np.linspace(-2, 2))
acid_series = (
    ('fixed acidity', 'Fixed acidity'),
    ('volatile acidity', 'volatile acidity'),
    ('citric acid', 'citric acid'),
)
for column, legend_label in acid_series:
    plt.hist(df[column], bins=bins, edgecolor='k', label=legend_label)
plt.xscale('log')
plt.xlabel('Acid Concentration(g/dm^3)')
plt.ylabel('Frequency')
plt.title('Histogram of Acid Concentration')
plt.legend()
plt.tight_layout()
print('Figure 4')
Figure 4 Histogram of Acid Concentration
# Figure 5: total acid (fixed + volatile + citric), raw and natural-log scale.
df['total acid'] = df['fixed acidity'] + df['volatile acidity'] + df['citric acid']
plt.figure(figsize=(12, 5))

# Left panel: raw values.
plt.subplot(121)
plt.hist(df['total acid'], bins=50, color='m')
plt.xlabel('total acid')
plt.ylabel('Frequency')

# Right panel: the same values after np.log.
plt.subplot(122)
log_total_acid = np.log(df['total acid'])
plt.hist(log_total_acid, bins=50, color='m')
plt.xlabel('log(total acid)')
plt.ylabel('Frequency')

plt.tight_layout()
print("Figure 5: Total Acid Histogram")
Figure 5: Total Acid Histogram
甜度(sweetness)
Residual sugar(剩余糖分) 与酒的甜度有关,通常用来区别各种红酒,干红(<= 4g/L),半干(4-12 g/L),半甜(12-45g/L)和甜(>45g/L)。这个数据中,主要为干红,没有甜葡萄酒。
# List the current column names as a plain Python list.
print(df.columns.tolist())
['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality', 'total acid']
# Figure 6: bucket residual sugar into sweetness classes and count each class.
sugar_edges = [0, 4, 12, 45]
sugar_labels = ['dry', 'medium dry', 'semi-sweet']
df['sweetness'] = pd.cut(df['residual sugar'], bins=sugar_edges, labels=sugar_labels)
plt.figure(figsize=(8, 4))
# value_counts() counts how many rows fall into each distinct value of the
# column; the counts are then drawn as a bar chart.
sweetness_counts = df['sweetness'].value_counts()
sweetness_counts.plot.bar(color='y')
plt.xticks(rotation=0)  # keep the category labels on the x axis horizontal
plt.xlabel('sweetness', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.tight_layout()
print('Figure 6:sweetness')
Figure 6:sweetness
双变量分析
红酒品质与理化性质分析
# Figure 7: each physicochemical property grouped by wine quality (boxplots).
sns.set_style('ticks')
sns.set_context('notebook', font_scale=1.1)
# The first 11 columns of df plus the derived 'total acid' column.
colnm = df.columns.tolist()[:11] + ['total acid']
plt.figure(figsize=(10, 8))
for i, name in enumerate(colnm):
    plt.subplot(4, 3, i + 1)  # 4 rows x 3 columns, one panel per property
    sns.boxplot(x='quality', y=name, data=df, color='g', width=0.6)
    plt.ylabel(name, fontsize=12)
plt.tight_layout()
print('\nFigure 7:Physicochemical Propertirs and Wine Quality by Boxplot')
Figure 7:Physicochemical Propertirs and Wine Quality by Boxplot
品质更好的酒有更高的柠檬酸,硫酸盐(sulphates)和酒精度数。硫酸盐(硫酸钙)的加入通常是调整酒的酸度的。其中酒精度数和品质的相关性最高。
品质好的酒有较低的挥发性酸类、密度和pH。
残留糖分,氯离子,二氧化硫似乎对酒的品质影响不大。
# Figure 8: pairwise correlation heatmap showing only the lower triangle.
sns.set_style('dark')  # switch the seaborn theme to 'dark'
plt.figure(figsize=(12, 8))
colnm = df.columns.tolist()[:11] + ['total acid', 'quality']
mcorr = df[colnm].corr()
# Boolean mask that is True on and above the diagonal, so those cells are
# hidden in the heatmap.
mask = np.triu(np.ones_like(mcorr, dtype=np.bool_))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
g = sns.heatmap(
    mcorr,
    mask=mask,
    cmap=cmap,
    square=True,
    annot=True,
    fmt='0.2f',
)
print('\nFigure 8:Pairwise Correlation Plot 两两相关图')
Figure 8:Pairwise Correlation Plot 两两相关图
密度和酒精浓度
密度和酒精浓度是相关的,但物理上两者并不是线性关系。Figure 9展示了两者的关系。另外密度还与酒中其他物质的含量有关,但是关系很小。
# Figure 9: alcohol against density with a fitted linear regression line.
sns.set_style('ticks')
sns.set_context("notebook", font_scale=1.4)
plt.figure(figsize=(6, 4))
point_style = {'s': 10}
# sns.regplot() draws the scatter data together with a linear regression fit.
sns.regplot(data=df, x='density', y='alcohol', color='g', scatter_kws=point_style)
plt.xlim(0.989, 1.005)  # x-axis range
plt.ylim(7, 16)  # y-axis range
print('Figure 9: Density vs Alchol')
Figure 9: Density vs Alchol
酸性物质含量和pH
pH和非挥发性酸性物质有-0.683的相关性。因为非挥发性酸性物质的含量远远高于其它酸性物质,总酸性物质(total acidity)这个特征并没有太多意义
# Figure 10: regression plots of several acid-related features against pH.
otherFeat = ['fixed acidity', 'volatile acidity', 'total sulfur dioxide',
             'sulphates', 'total acid']
plt.figure(figsize=(12, 6))
for i, name in enumerate(otherFeat):
    plt.subplot(2, 3, i + 1)  # 2 x 3 grid; the sixth panel stays empty
    sns.regplot(x='pH', y=name, data=df, scatter_kws={'s': 10}, color='k')
plt.tight_layout()
print("Figure 10: pH vs acid")
Figure 10: pH vs acid
多变量分析
与品质相关性最高的三个特征是酒精浓度、挥发性酸度和柠檬酸。下图显示了酒精浓度、挥发性酸和品质之间的关系。
# Figure 11-1: alcohol vs volatile acidity, points coloured by quality.
plt.style.use('ggplot')
# lmplot draws a regression plot; with fit_reg=False the fitted line is
# omitted and only the scatter points are shown.
sns.lmplot(
    data=df,
    x='alcohol',
    y='volatile acidity',
    hue='quality',
    fit_reg=False,
    scatter_kws={'s': 10},
)
plt.title("Figure 11-1: Scatter Plots of Alcohol,Volatile Acid and Quality")
plt.show()
酒精浓度,挥发性酸和品质对于好酒(7,8)以及差酒(3,4),关系很明显。但是对于中等酒(5,6),酒精浓度和挥发性酸度有很大程度的交叉。
# Figure 11-2: the same scatter split into one panel per quality level,
# wrapped after every three panels; no regression line is fitted.
sns.lmplot(
    data=df,
    x='alcohol',
    y='volatile acidity',
    col='quality',
    hue='quality',
    col_wrap=3,
    aspect=0.9,
    fit_reg=False,
    scatter_kws={'s': 20},
)
print("Figure 11-2: Scatter Plots of Alcohol,Volatile Acid and quality")
plt.show()
Figure 11-2: Scatter Plots of Alcohol,Volatile Acid and quality
pH,非挥发性酸,和柠檬酸
pH和非挥发性的酸以及柠檬酸有相关性。整体趋势也很合理,即浓度越高,pH越低。
# Figure 12: citric acid against fixed acidity, with points coloured by pH.
sns.set_style('ticks')
sns.set_context("notebook", font_scale=1.4)
plt.figure(figsize=(12, 10))
# Refer to the colormap by name rather than calling plt.cm.get_cmap(), which
# has been deprecated since Matplotlib 3.7 and is scheduled for removal.
cm = 'RdBu'
sc = plt.scatter(df['fixed acidity'], df['citric acid'], c=df['pH'],
                 vmin=2.6, vmax=4, s=15, cmap=cm)
bar = plt.colorbar(sc)
bar.set_label('pH', rotation=0)  # keep the colorbar label horizontal
plt.xlabel('fixed acidity')
plt.ylabel('citric acid')
plt.xlim(4, 18)
plt.ylim(0, 1)
print('Figure 12: pH with Fixed Acidity and Citric Acid')
C:\Users\60545\AppData\Local\Temp\ipykernel_8216\1399992010.py:6: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.
cm = plt.cm.get_cmap('RdBu')
Figure 12: pH with Fixed Acidity and Citric Acid
红酒品质主要与酒精浓度,挥发性酸和柠檬酸有关。对于品质不低于7,或者不高于4的酒,直观上是线性可分的。但是品质为5,6的酒很难线性区分。
# NOTE(review): these lines repeat the tail of the Figure 12 cell and look
# like a paste artifact — confirm whether this fragment should be removed.
# The first line had been truncated to `lt.colorbar(sc)`, which raises a
# NameError; it is restored to the full statement here.
bar = plt.colorbar(sc)
bar.set_label('pH', rotation=0)
plt.xlabel('fixed acidity')
plt.ylabel('citric acid')
plt.xlim(4, 18)
plt.ylim(0, 1)
print('Figure 12: pH with Fixed Acidity and Citric Acid')
Figure 12: pH with Fixed Acidity and Citric Acid
红酒品质主要与酒精浓度,挥发性酸和柠檬酸有关。对于品质不低于7,或者不高于4的酒,直观上是线性可分的。但是品质为5,6的酒很难线性区分。
参考:https://aistudio.baidu.com/aistudio/projectdetail/757183?channelType=0&channe