配置环境
$ pip3 install jupyterlab # 安装jupyter
$ jupyter notebook # 运行jupyter
$ pip3 install pandas # 安装pandas
$ pip3 install matplotlib # 安装matplotlib
加载数据
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('salaries_10000.csv')
df.head()
|   | emp_no | salary | from_date | to_date |
|---|---|---|---|---|
| 0 | 10001 | 60117 | 1986-06-26 | 1987-06-26 |
| 1 | 10001 | 62102 | 1987-06-26 | 1988-06-25 |
| 2 | 10001 | 66074 | 1988-06-25 | 1989-06-25 |
| 3 | 10001 | 66596 | 1989-06-25 | 1990-06-25 |
| 4 | 10001 | 66961 | 1990-06-25 | 1991-06-25 |
df.describe()
|   | emp_no | salary |
|---|---|---|
| count | 10000.000000 | 10000.000000 |
| mean | 10526.233800 | 64287.636300 |
| std | 304.535669 | 16940.369764 |
| min | 10001.000000 | 39265.000000 |
| 25% | 10262.750000 | 50853.750000 |
| 50% | 10526.000000 | 61585.000000 |
| 75% | 10791.000000 | 75238.000000 |
| max | 11053.000000 | 136004.000000 |
数据的集中趋势
df['salary'].mean()
64287.6363
df['salary'].median()
61585.0
df['salary'].mode()
0 40000
dtype: int64
print("最小值", df['salary'].min())
print("最大值", df['salary'].max())
最小值 39265
最大值 136004
print ("第一分位数: ", df['salary'].quantile(q=0.25))
print ("第二分位数: ", df['salary'].quantile(q=0.50))
print ("第三分位数: ", df['salary'].quantile(q=0.75))
第一分位数: 50853.75
第二分位数: 61585.0
第三分位数: 75238.0
数据的离中趋势
df['salary'].var()
286976127.73639596
df['salary'].std()
16940.36976386277
df['salary'].quantile(q=0.75)-df['salary'].quantile(q=0.25)
24384.25
df['salary'].std() / df['salary'].mean()
0.2635089846017993
plt.hist(df['salary'],100)
(array([390., 105., 150., 166., 231., 180., 177., 220., 231., 210., 222.,
223., 233., 228., 228., 247., 210., 245., 234., 206., 192., 227.,
235., 205., 216., 190., 193., 200., 189., 167., 171., 174., 163.,
165., 160., 143., 153., 119., 136., 119., 123., 139., 106., 118.,
106., 89., 111., 92., 98., 83., 80., 62., 74., 79., 69.,
74., 58., 66., 52., 55., 52., 37., 36., 31., 30., 28.,
21., 22., 17., 22., 5., 12., 12., 12., 4., 8., 6.,
6., 4., 3., 7., 6., 7., 2., 5., 2., 2., 2.,
2., 5., 1., 1., 0., 0., 0., 1., 0., 0., 0.,
2.]),
array([ 39265. , 40232.39, 41199.78, 42167.17, 43134.56, 44101.95,
45069.34, 46036.73, 47004.12, 47971.51, 48938.9 , 49906.29,
50873.68, 51841.07, 52808.46, 53775.85, 54743.24, 55710.63,
56678.02, 57645.41, 58612.8 , 59580.19, 60547.58, 61514.97,
62482.36, 63449.75, 64417.14, 65384.53, 66351.92, 67319.31,
68286.7 , 69254.09, 70221.48, 71188.87, 72156.26, 73123.65,
74091.04, 75058.43, 76025.82, 76993.21, 77960.6 , 78927.99,
79895.38, 80862.77, 81830.16, 82797.55, 83764.94, 84732.33,
85699.72, 86667.11, 87634.5 , 88601.89, 89569.28, 90536.67,
91504.06, 92471.45, 93438.84, 94406.23, 95373.62, 96341.01,
97308.4 , 98275.79, 99243.18, 100210.57, 101177.96, 102145.35,
103112.74, 104080.13, 105047.52, 106014.91, 106982.3 , 107949.69,
108917.08, 109884.47, 110851.86, 111819.25, 112786.64, 113754.03,
114721.42, 115688.81, 116656.2 , 117623.59, 118590.98, 119558.37,
120525.76, 121493.15, 122460.54, 123427.93, 124395.32, 125362.71,
126330.1 , 127297.49, 128264.88, 129232.27, 130199.66, 131167.05,
132134.44, 133101.83, 134069.22, 135036.61, 136004. ]),
<a list of 100 Patch objects>)
print ("偏态系数: ", df['salary'].skew())
print ("峰态系数: ", df['salary'].kurt())
偏态系数: 0.6784771437175181
峰态系数: -0.03592105965629688