pandas案例分析,附加numpy matplotlib

import pandas as pd

df=pd.read_csv('',sep=';')

如果数据文件中的字段是用 ; 分隔的,就需要指定 sep=';',因为 read_csv 默认的分隔符是逗号 ,

 

import pandas as pd
# Read both wine CSV files, using ';' as the field separator
red_df = pd.read_csv('winequality-red.csv', sep=';')
white_df = pd.read_csv('winequality-white.csv', sep=';')
# Preview the first rows of each frame
red_df.head()
white_df.head()

fixed_acidityvolatile_aciditycitric_acidresidual_sugarchloridesfree_sulfur_dioxidetotal_sulfur_dioxidedensitypHsulphatesalcoholquality
07.00.270.3620.70.04545.0170.01.00103.000.458.86
16.30.300.341.60.04914.0132.00.99403.300.499.56
28.10.280.406.90.05030.097.00.99513.260.4410.16
37.20.230.328.50.05847.0186.00.99563.190.409.96
47.20.230.328.50.05847.0186.00.99563.190.409.96

 

print(red_df.shape)

(1599, 12)

print(white_df.shape)

(4898, 12)
red_df.isnull().sum()
fixed_acidity           0
volatile_acidity        0
citric_acid             0
residual_sugar          0
chlorides               0
free_sulfur_dioxide     0
total_sulfur-dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

white_df.isnull().sum()

fixed_acidity           0
volatile_acidity        0
citric_acid             0
residual_sugar          0
chlorides               0
free_sulfur_dioxide     0
total_sulfur_dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64
white_df.duplicated().sum()    # 统计重复行的数量
937    (注意:这些重复行不应删除)

红葡萄酒数据集中有多少唯一的质量值?

red_df.quality.nunique()

6

红葡萄酒数据集中的平均密度是多少?

red_df.density.mean()

0.996746679174484

 

import numpy as np

a=np.random.random(1000)

生成一个包含 1000 个 [0, 1) 区间随机数的一维数组

np.mean(a)

求 a 的平均值
 

 

# 导入 numpy 和 pandas
import numpy as np
import pandas as pd
# Load the red and white wine datasets, using ';' as the field separator
red_df = pd.read_csv('winequality-red.csv', sep=';')
white_df = pd.read_csv('winequality-white.csv', sep=';')

# Build a color label array for the red wine frame, one entry per row
color_red = np.repeat('red', red_df.shape[0])
# Build a color label array for the white wine frame, one entry per row
color_white = np.repeat('white', white_df.shape[0])

# Attach the labels as a 'color' column on BOTH frames; without the white
# column, white rows would have a missing color once the frames are combined.
red_df['color'] = color_red
white_df['color'] = color_white
red_df.head()

fixed_acidityvolatile_aciditycitric_acidresidual_sugarchloridesfree_sulfur_dioxidetotal_sulfur-dioxidedensitypHsulphatesalcoholqualitycolor
07.40.700.001.90.07611.034.00.99783.510.569.45red
17.80.880.002.60.09825.067.00.99683.200.689.85red
27.80.760.042.30.09215.054.00.99703.260.659.85red
311.20.280.561.90.07517.060.00.99803.160.589.86red
47.40.700.001.90.07611.034.00.99783.510.569.45red

 

# Combine the two data frames. DataFrame.append was removed in pandas 2.0;
# pd.concat is the supported replacement and keeps each frame's row labels.
# NOTE: columns are matched by name, so a column whose name differs between
# the frames appears twice, with NaN filling the side that lacks it.
wine_df = pd.concat([red_df, white_df])

# 查看数据框,检查是否成功
wine_df.head()

alcoholchloridescitric_acidcolordensityfixed_acidityfree_sulfur_dioxidepHqualityresidual_sugarsulphatestotal_sulfur-dioxidetotal_sulfur_dioxidevolatile_acidity
09.40.0760.00red0.99787.411.03.5151.90.5634.0NaN0.70
19.80.0980.00red0.99687.825.03.2052.60.6867.0NaN0.88
29.80.0920.04red0.99707.815.03.2652.30.6554.0NaN0.76
39.80.0750.56red0.998011.217.03.1661.90.5860.0NaN0.28
49.40.0760.00red0.99787.411.03.5151.90.5634.0NaN0.70

 

 

 

将新组合的数据框保存为 winequality_edited.csv。务必设置 index=False,以避免保存未命名列!

wine_df.to_csv('winequality_edited.csv', index=False)

 

# Rename the red wine frame's misspelled 'total_sulfur-dioxide' column by name
# rather than by position, so the fix does not depend on the column order.
red_df = red_df.rename(columns={'total_sulfur-dioxide': 'total_sulfur_dioxide'})

 

groupby函数

 

# Mean of every numeric column per quality rating. numeric_only=True skips
# text columns such as 'color', which pandas 2.x refuses to average.
red_df.groupby('quality').mean(numeric_only=True)

求出平均值

red_df.groupby(['quality','color']).mean()

red_df.groupby(['quality','color'],as_index=False).mean()

设置 as_index=False 后,quality 和 color 不再作为索引,而是作为普通列返回

只对某一列做平均值

# Mean pH per (quality, color) group. The column is named 'pH' (capital H);
# the lowercase key 'ph' does not exist and raises a KeyError.
red_df.groupby(['quality', 'color'], as_index=False)['pH'].mean()

 

# Mean quality rating for each wine color
df.groupby('color')['quality'].mean()
# Summary statistics of the pH column
df.describe().pH
# Bin edges separating the four acidity levels
# (presumably the min, 25%, 50%, 75% and max from the summary above -- confirm)
bin_edges = [2.72, 3.11, 3.21, 3.32, 4.01]
# Lower pH means higher acidity, so the lowest bin is labelled 'high'
bin_names = ['high', 'mod_high', 'medium', 'low']
# include_lowest=True keeps rows whose pH equals the first edge; pd.cut bins
# are right-closed by default, so those rows would otherwise become NaN.
df['acidity_levels'] = pd.cut(df['pH'], bin_edges, labels=bin_names, include_lowest=True)
df.head()
# Select 'quality' before aggregating so non-numeric columns are never averaged
df.groupby('acidity_levels')['quality'].mean()
# Save the frame with the new column; index=False avoids writing an unnamed index column
df.to_csv('winequality_edited.csv', index=False)

等效语句

# selecting malignant records in cancer data
df_m = df[df['diagnosis'] == 'M']
df_m = df.query('diagnosis == "M"')

# selecting records of people making over $50K
df_a = df[df['income'] == ' >50K']
df_a = df.query('income == " >50K"')
# Get the median alcohol content
alcohol_median = df.alcohol.median()
alcohol_median

# Select samples with alcohol content below the median. The computed median is
# used instead of a hard-coded 10.3 so the split always tracks the data.
low_alcohol = df[df.alcohol < alcohol_median]

# Select samples with alcohol content greater than or equal to the median
high_alcohol = df[df.alcohol >= alcohol_median]

# Make sure every sample landed in exactly one of the two groups
num_samples = df.shape[0]
num_samples == low_alcohol['quality'].count() + high_alcohol['quality'].count()  # should be True

# Mean quality rating of the low-alcohol and high-alcohol groups
low_alcohol.quality.mean(), high_alcohol.quality.mean()

# Get the median residual sugar
sugar_median = df.residual_sugar.median()
sugar_median

# Select samples with residual sugar below the median. The computed median is
# used instead of a hard-coded 3 so the split always tracks the data.
low_sugar = df[df.residual_sugar < sugar_median]

# Select samples with residual sugar greater than or equal to the median
high_sugar = df[df.residual_sugar >= sugar_median]

# Make sure every sample landed in exactly one of the two groups
num_samples == low_sugar['quality'].count() + high_sugar['quality'].count()  # should be True

# Mean quality rating of the low-sugar and high-sugar groups
low_sugar.quality.mean(), high_sugar.quality.mean()

 

colors = ['red', 'white']

# Bar chart of the mean quality rating per wine color. The keyword is `kind`
# (not `kkind`), and the bar colors are passed through `color`.
wine_df.groupby('color')['quality'].mean().plot(kind='bar', title='ceshi1', color=colors, alpha=.7)

 

 

 

引入sns  matplotlib 

import pandas as pd

import matplotlib.pyplot  as  plt

import seaborn as sns

%matplotlib inline

...

...

...

colors = ['red', 'white']

# Mean quality rating for each wine color
color_means = wine_df.groupby('color')['quality'].mean()

# The keyword is `kind` (not `kkind`), and bar colors are passed through `color`
color_means.plot(kind='bar', title='ceshi1', color=colors, alpha=.7)

plt.xlabel("colors", fontsize=18)

# The y axis shows the mean quality rating, not the wine color
plt.ylabel("quality", fontsize=18)

 

 

 

# Count of non-null pH values for every (quality, color) pair
counts = wine_df.groupby(['quality', 'color'])['pH'].count()

counts

# Count of non-null pH values for each color
totals = wine_df.groupby('color')['pH'].count()

# Share of each color's samples that falls in each quality rating
# (relies on pandas aligning the two series on the shared 'color' level)
proportions = counts / totals

# Bar colors are passed through the `color` keyword
proportions.plot(kind='bar', title='ceshi1', color=colors, alpha=.7)

 

 

 

import matplotlib.pyplot as plt
% matplotlib inline

plt.bar([1, 2, 3], [224, 620, 425]);

 

# Draw the bars
plt.bar([1, 2, 3], [224, 620, 425])

# Set the x-axis tick positions and their labels
plt.xticks([1, 2, 3], ['a', 'b', 'c']);

# Draw the bars with the tick labels passed directly to plt.bar
plt.bar([1, 2, 3], [224, 620, 425], tick_label=['a', 'b', 'c']);

 

plt.bar([1, 2, 3], [224, 620, 425], tick_label=['a', 'b', 'c'])
plt.title('Some Title')
plt.xlabel('Some X Label')
plt.ylabel('Some Y Label');

 

# Split the samples at the median alcohol content and take each group's mean quality
median = df['alcohol'].median()
low = df[df['alcohol'] < median]
high = df[df['alcohol'] >= median]

mean_quality_low = low.quality.mean()
mean_quality_high = high.quality.mean()

 

# Bar chart comparing the two groups' mean quality, with descriptive labels
locations = [1, 2]
labels = ['Low', 'High']
heights = [mean_quality_low, mean_quality_high]
plt.bar(locations, heights, tick_label=labels)
plt.xlabel('Alcohol Content')
plt.title('Average Quality Ratings by Alcohol Content')
plt.ylabel('Average Quality Rating');

 

 

用 Matplotlib 绘制酒的类型和质量视图

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

wine_df = pd.read_csv('winequality_edited.csv')

# Count of non-null pH values for every (color, quality) pair
color_counts = wine_df.groupby(['color', 'quality']).count()['pH']
color_counts

# Count of non-null pH values for each color
color_totals = wine_df.groupby('color').count()['pH']
color_totals

# Divide the red counts per quality rating by the red total to get proportions
red_proportions = color_counts['red'] / color_totals['red']

# Divide the white counts per quality rating by the white total to get proportions
white_proportions = color_counts['white'] / color_totals['white']

# The two colors do not cover the same set of quality ratings, so the bar
# heights, x positions and tick labels would have mismatched lengths.
# Align both series on the union of ratings BEFORE plotting, filling ratings
# a color never received with 0 (integer labels, matching the index dtype).
quality_levels = red_proportions.index.union(white_proportions.index)
red_proportions = red_proportions.reindex(quality_levels, fill_value=0)
red_proportions
white_proportions = white_proportions.reindex(quality_levels, fill_value=0)
white_proportions

ind = np.arange(len(quality_levels))  # x positions of the groups
width = 0.35       # width of each bar

# Draw the bars, white shifted right by one bar width so the pairs sit side by side
red_bars = plt.bar(ind, red_proportions, width, color='r', alpha=.7, label='Red Wine')
white_bars = plt.bar(ind + width, white_proportions, width, color='w', alpha=.7, label='White Wine')

# Title and axis labels
plt.ylabel('Proportion')
plt.xlabel('Quality')
plt.title('Proportion by Wine Color and Quality')
locations = ind + width / 2  # tick positions, centered between each pair of bars
labels = [str(level) for level in quality_levels]  # tick labels derived from the data
plt.xticks(locations, labels)

# Legend
plt.legend()

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值