红酒数据集下载:https://github.com/jsusu/wine_analysis/tree/master/data_wine
# 红酒数据分析
'''这个notebook分析了红酒的通用数据集。这个数据集有1599个样本,11个红酒的理化性质,以及红酒的品质(评分从0到10)。
这里主要目的在于展示进行数据分析的常见python包的调用,以及数据可视化。主要内容分为:单变量,双变量,和多变量分析。'''
#%matplotlib inline
#%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Default seaborn palette; individual entries are reused as plot colours below.
color = sns.color_palette()
# Show three decimals when pandas prints frames. The fully qualified option
# key is used because the bare 'precision' pattern is not accepted by recent
# pandas releases.
pd.set_option('display.precision', 3)
# The red-wine quality CSV is read with ';' as the field separator.
df = pd.read_csv("./data_wine/winequality-red.csv", sep=";")
df.head(5)
fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.998 | 3.51 | 0.56 | 9.4 | 5 |
1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.997 | 3.20 | 0.68 | 9.8 | 5 |
2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.997 | 3.26 | 0.65 | 9.8 | 5 |
3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.998 | 3.16 | 0.58 | 9.8 | 6 |
4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.998 | 3.51 | 0.56 | 9.4 | 5 |
# Print the frame summary: index range, per-column non-null counts and dtypes.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 fixed acidity 1599 non-null float64
1 volatile acidity 1599 non-null float64
2 citric acid 1599 non-null float64
3 residual sugar 1599 non-null float64
4 chlorides 1599 non-null float64
5 free sulfur dioxide 1599 non-null float64
6 total sulfur dioxide 1599 non-null float64
7 density 1599 non-null float64
8 pH 1599 non-null float64
9 sulphates 1599 non-null float64
10 alcohol 1599 non-null float64
11 quality 1599 non-null int64
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
# Variable analysis: per-column summary statistics (count, mean, std, quartiles).
df.describe()
fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1599.000 | 1599.000 | 1599.000 | 1599.000 | 1599.000 | 1599.000 | 1599.000 | 1599.000 | 1599.000 | 1599.000 | 1599.000 | 1599.000 |
mean | 8.320 | 0.528 | 0.271 | 2.539 | 0.087 | 15.875 | 46.468 | 0.997 | 3.311 | 0.658 | 10.423 | 5.636 |
std | 1.741 | 0.179 | 0.195 | 1.410 | 0.047 | 10.460 | 32.895 | 0.002 | 0.154 | 0.170 | 1.066 | 0.808 |
min | 4.600 | 0.120 | 0.000 | 0.900 | 0.012 | 1.000 | 6.000 | 0.990 | 2.740 | 0.330 | 8.400 | 3.000 |
25% | 7.100 | 0.390 | 0.090 | 1.900 | 0.070 | 7.000 | 22.000 | 0.996 | 3.210 | 0.550 | 9.500 | 5.000 |
50% | 7.900 | 0.520 | 0.260 | 2.200 | 0.079 | 14.000 | 38.000 | 0.997 | 3.310 | 0.620 | 10.200 | 6.000 |
75% | 9.200 | 0.640 | 0.420 | 2.600 | 0.090 | 21.000 | 62.000 | 0.998 | 3.400 | 0.730 | 11.100 | 6.000 |
max | 15.900 | 1.580 | 1.000 | 15.500 | 0.611 | 72.000 | 289.000 | 1.004 | 4.010 | 2.000 | 14.900 | 8.000 |
# Figure 1: one vertical boxplot per column on a 2 x 6 grid.
plt.style.use("ggplot")
colnm = df.columns.to_list()
fig = plt.figure(figsize=(10, 6))
# The grid has 12 cells, so only the first 12 columns are drawn.
for i, col in enumerate(colnm[:12]):
    plt.subplot(2, 6, i + 1)
    # Pass the column as `y=` explicitly: how seaborn interprets a positional
    # Series differs between releases, the keyword form does not.
    sns.boxplot(y=df[col], orient="v", width=0.5, color=color[0])
    plt.ylabel(col, fontsize=12)
plt.tight_layout()
print('\nFigure 1: Univariate Boxplots')
Figure 1: Univariate Boxplots
# Figure 2: a 100-bin histogram per column on a 4 x 3 grid.
colnm = df.columns.tolist()
plt.figure(figsize=(10, 8))
# The grid has 12 cells, so only the first 12 columns are drawn.
for i, col in enumerate(colnm[:12]):
    plt.subplot(4, 3, i + 1)
    df[col].hist(bins=100, color=color[0])
    plt.xlabel(col, fontsize=12)
    plt.ylabel('Frequency')
plt.tight_layout()
print('\nFigure 2: Univariate Histograms')
Figure 2: Univariate Histograms
# 品质:
'''这个数据集的目的是研究红酒品质和理化性质之间的关系。品质的评价范围是0-10,这个数据集中范围是3到8,有82%的红酒品质是5或6'''
# 酸度相关的特征
'''这个数据集有7个酸度相关的特征:fixed acidity, volatile acidity, citric acid, free sulfur dioxide, total sulfur dioxide, sulphates, pH。前6个特征都与红酒的pH相关。
pH是对数尺度的量,因此下面对前6个特征取对数(log10)后作直方图(Figure 3)。另外,pH值主要与fixed acidity有关,fixed acidity比volatile acidity和citric acid高1到2个数量级(Figure 4),比free sulfur dioxide, total sulfur dioxide, sulphates高3个数量级。
新特征total acid是前三个特征(fixed acidity, volatile acidity, citric acid)之和。'''
# Figure 3: log10 histograms of the six acidity-related features (2 x 3 grid).
acidityFeat = ['fixed acidity', 'volatile acidity', 'citric acid',
               'free sulfur dioxide', 'total sulfur dioxide', 'sulphates']
plt.figure(figsize=(10, 4))
for i, feat in enumerate(acidityFeat):
    ax = plt.subplot(2, 3, i + 1)
    # Clip to a floor of 0.001 before log10 so zero-valued samples stay finite.
    v = np.log10(np.clip(df[feat].values, a_min=0.001, a_max=None))
    plt.hist(v, bins=50, color=color[0])
    plt.xlabel('log(' + feat + ')', fontsize=12)
    plt.ylabel('Frequency')
plt.tight_layout()
print('\nFigure 3: Acidity Features in log10 Scale')
Figure 3: Acidity Features in log10 Scale
# Figure 4: overlay the three acid concentrations on a shared log-scaled x axis.
plt.figure(figsize=(6, 3))
# Bin edges evenly spaced in log10 between 1e-2 and 1e2.
bins = 10 ** np.linspace(-2, 2)
# (column, extra hist kwargs) in drawing order; only the last layer is translucent.
acid_layers = (
    ('fixed acidity', {'label': 'Fixed Acidity'}),
    ('volatile acidity', {'label': 'Volatile Acidity'}),
    ('citric acid', {'alpha': 0.8, 'label': 'Citric Acid'}),
)
for acid_column, layer_kwargs in acid_layers:
    plt.hist(df[acid_column], bins=bins, edgecolor='k', **layer_kwargs)
plt.xscale('log')
plt.xlabel('Acid Concentration (g/dm^3)')
plt.ylabel('Frequency')
plt.title('Histogram of Acid Concentration')
plt.legend()
plt.tight_layout()
print('Figure 4')
Figure 4
# Total acid: the sum of the fixed, volatile and citric acid columns.
df['total acid'] = (
    df['fixed acidity']
    .add(df['volatile acidity'])
    .add(df['citric acid'])
)

# Figure 5: raw distribution on the left, natural-log distribution on the right.
total_acid_fig, (raw_ax, log_ax) = plt.subplots(1, 2, figsize=(8, 3))

raw_ax.hist(df['total acid'], bins=50, color=color[0])
raw_ax.set_xlabel('total acid')
raw_ax.set_ylabel('Frequency')

# np.log is the natural logarithm; the axis label just says "log".
log_ax.hist(np.log(df['total acid']), bins=50, color=color[0])
log_ax.set_xlabel('log(total acid)')
log_ax.set_ylabel('Frequency')

plt.tight_layout()
print("Figure 5: Total Acid Histogram")
Figure 5: Total Acid Histogram
# 甜度
'''
Residual sugar 与酒的甜度相关,通常用来区别各种红酒,干红(<=4 g/L), 半干(4-12 g/L),半甜(12-45 g/L),和甜(>45 g/L)。
这个数据中,主要为干红,没有甜葡萄酒。
'''
# Bucket residual sugar into sweetness classes using edges at 0, 4, 12 and 45.
sugar_edges = [0, 4, 12, 45]
sugar_labels = ["dry", "medium dry", "semi-sweet"]
df['sweetness'] = pd.cut(df['residual sugar'], bins=sugar_edges, labels=sugar_labels)

# Figure 6: number of wines in each sweetness class.
plt.figure(figsize=(5, 3))
sweetness_counts = df['sweetness'].value_counts()
sweetness_counts.plot.bar(color=color[0])
plt.xticks(rotation=0)
plt.xlabel('sweetness', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.tight_layout()
print("Figure 6: Sweetness")
Figure 6: Sweetness
# 双变量分析
'''
下面Figure 7和8分别显示了红酒理化特征和品质的关系。其中可以看出的趋势有:
品质好的酒有更高的柠檬酸,硫酸盐,和酒精度数。硫酸盐(硫酸钙)的加入通常是调整酒的酸度的。其中酒精度数和品质的相关性最高。
品质好的酒有较低的挥发性酸类,密度,和pH。
残留糖分,氯离子,二氧化硫似乎对酒的品质影响不大。
'''
# Figure 7: distribution of each feature per quality score (4 x 3 grid).
sns.set_style('ticks')
sns.set_context("notebook", font_scale=1.1)
# First 11 columns of the frame plus the derived 'total acid' column.
colnm = df.columns.tolist()[:11] + ['total acid']
plt.figure(figsize=(10, 8))
for i, col in enumerate(colnm):
    plt.subplot(4, 3, i + 1)
    sns.boxplot(x='quality', y=col, data=df, color=color[2], width=0.6)
    plt.ylabel(col, fontsize=12)
plt.tight_layout()
print("\nFigure 7: Physicochemical Properties and Wine Quality by Boxplot")
Figure 7: Physicochemical Properties and Wine Quality by Boxplot
# Figure 8: annotated heatmap of pairwise correlations.
sns.set_style("dark")
plt.figure(figsize=(10, 8))
# First 11 columns of the frame plus 'total acid' and the 'quality' target.
colnm = df.columns.tolist()[:11] + ['total acid', 'quality']
mcorr = df[colnm].corr()
# Use the builtin `bool`: the `np.bool` alias was removed from NumPy.
mask = np.zeros_like(mcorr, dtype=bool)
# Mask the upper triangle (diagonal included) so each pair is shown once.
mask[np.triu_indices_from(mask)] = True
cmap = sns.diverging_palette(220, 10, as_cmap=True)
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')
print("\nFigure 8: Pairwise Correlation Plot")
Figure 8: Pairwise Correlation Plot
# 密度和酒精浓度
'''
密度和酒精浓度是相关的,物理上,两者并不是线性关系。
Figure 9展示了两者的关系。另外密度还与酒中其他物质的含量有关,但是关系很小。
'''
# Plot styling for Figure 9.
sns.set_style('ticks')
sns.set_context("notebook", font_scale=1.4)

# Figure 9: alcohol against density, scatter plus fitted regression line.
plt.figure(figsize=(6, 4))
density_ax = sns.regplot(
    x='density',
    y='alcohol',
    data=df,
    scatter_kws={'s': 10},
    color=color[2],
)
# Fixed axis limits for the density and alcohol axes.
density_ax.set_xlim(0.989, 1.005)
density_ax.set_ylim(7, 16)
print('Figure 9: Density vs Alcohol')
Figure 9: Density vs Alcohol
# 酸性物质含量和pH
'''pH和非挥发性酸性物质有-0.683的相关性。因为非挥发性酸性物质的含量远远高于其他酸性物质,总酸性物质(total acid)这个特征并没有太多意义。
'''
# Figure 10: regression of each acidity-related feature against pH (2 x 3 grid).
# Plain ASCII quotes are required; typographic quotes are a SyntaxError.
acidity_related = ['fixed acidity', 'volatile acidity', 'total sulfur dioxide',
                   'sulphates', 'total acid']
plt.figure(figsize=(10, 6))
for i, feat in enumerate(acidity_related):
    plt.subplot(2, 3, i + 1)
    sns.regplot(x='pH', y=feat, data=df, scatter_kws={'s': 10}, color=color[2])
plt.tight_layout()
print("Figure 10: pH vs acid")
# 多变量分析
'''
与品质相关性最高的三个特征是酒精浓度,挥发性酸度,和柠檬酸。下面图中显示的酒精浓度,挥发性酸和品质的关系。
酒精浓度,挥发性酸和品质
对于好酒(7,8)以及差酒(3,4),关系很明显。但是对于中等酒(5,6),酒精浓度和挥发性酸度有很大程度的交叉。
'''
# Figure 11-1: alcohol vs volatile acidity, coloured by quality, no fitted line.
plt.style.use('ggplot')
# `height` replaces the old `size` keyword, which seaborn renamed.
sns.lmplot(x='alcohol', y='volatile acidity', hue='quality',
           data=df, fit_reg=False, scatter_kws={'s': 10}, height=5)
print("Figure 11-1: Scatter Plots of Alcohol, Volatile Acid and Quality")
/Users/susu/opt/anaconda3/envs/data_analysis/lib/python3.7/site-packages/seaborn/regression.py:574: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
warnings.warn(msg, UserWarning)
Figure 11-1: Scatter Plots of Alcohol, Volatile Acid and Quality
# Figure 11-2: the same scatter, one facet per quality score, three per row.
# `height` replaces the old `size` keyword, which seaborn renamed.
sns.lmplot(x='alcohol', y='volatile acidity', col='quality', hue='quality',
           data=df, fit_reg=False, height=3, aspect=0.9, col_wrap=3,
           scatter_kws={'s': 20})
print("Figure 11-2: Scatter Plots of Alcohol, Volatile Acid and Quality")
Figure 11-2: Scatter Plots of Alcohol, Volatile Acid and Quality
# pH, fixed (non-volatile) acidity and citric acid.
# pH correlates with both fixed acidity and citric acid; the overall trend is
# as expected: the higher the acid concentration, the lower the pH.
# Plot styling for Figure 12.
sns.set_style('ticks')
sns.set_context("notebook", font_scale=1.4)
plt.figure(figsize=(6, 5))
# `plt.get_cmap` is used because `plt.cm.get_cmap` is deprecated and no longer
# available in recent Matplotlib releases.
cm = plt.get_cmap('RdBu')
# Colour each point by pH, with the colour scale pinned to the range 2.6-4.
sc = plt.scatter(df['fixed acidity'], df['citric acid'], c=df['pH'],
                 vmin=2.6, vmax=4, s=15, cmap=cm)
bar = plt.colorbar(sc)
bar.set_label('pH', rotation=0)
plt.xlabel('fixed acidity')
plt.ylabel('citric acid')
plt.xlim(4, 18)
plt.ylim(0, 1)
print('Figure 12: pH with Fixed Acidity and Citric Acid')
Figure 12: pH with Fixed Acidity and Citric Acid
# 总结:
# 整体而言,红酒的品质主要与酒精浓度,挥发性酸,和柠檬酸有关。对于品质优于7,或者劣于4的酒,直观上是线性可分的。但是品质为5,6的酒很难线性区分。