统计推断基础
- 数据说明:本数据是地区房价增长率数据
- 名称-中文含义
- dis_name-小区名称
- rate-房价同比增长率
import os
os.chdir('Q:/data')
os.getcwd()
'Q:\\data'
import pandas as pd
house_price_gr = pd.read_csv('Q:/data/house_price_gr.csv', encoding='gbk')
house_price_gr
dis_name | rate | |
---|---|---|
0 | 东城区甘南小区 | 0.169747 |
1 | 东城区察慈小区 | 0.165484 |
2 | 东城区胡家园小区 | 0.141358 |
3 | 东城区台基厂小区 | 0.063197 |
4 | 东城区青年湖小区 | 0.101528 |
5 | 东城区小黄庄小区 | 0.068467 |
6 | 东城区和平里六区 | 0.118572 |
7 | 东城区京香福苑小区 | 0.161386 |
8 | 东城区安贞苑50号院 | 0.085863 |
9 | 东城区安馨园小区 | 0.104397 |
10 | 东城区外交部街33号院 | 0.178980 |
11 | 西城区新文化街小区 | 0.057328 |
12 | 西城区新融苑小区 | 0.089179 |
13 | 西城区裕中西里小区 | 0.067066 |
14 | 西城区国英园小区 | 0.063661 |
15 | 西城区国家广电总局新302住宅小区 | 0.074919 |
16 | 崇文区东四块玉小区 | 0.108691 |
17 | 崇文区金鱼池危改小区 | 0.171723 |
18 | 崇文区新景家园 | 0.162617 |
19 | 宣武区法源寺小区 | 0.222625 |
20 | 宣武区建功南里小区 | 0.129224 |
21 | 宣武区椿树园小区 | 0.036800 |
22 | 宣武区恒昌花园 | 0.098843 |
23 | 宣武区康乐里小区 | 0.113615 |
24 | 宣武区小马厂电信住宅小区 | 0.109990 |
25 | 宣武区天桥小区 | 0.177385 |
26 | 宣武区牛街东里民族团结小区 | 0.067636 |
27 | 宣武区云河公寓 | 0.143818 |
28 | 朝阳区团结湖小区 | 0.106157 |
29 | 朝阳区西坝河东里 | 0.071425 |
... | ... | ... |
120 | 通州区天赐良园小区 | 0.080639 |
121 | 通州区翠屏北里西区 | 0.083920 |
122 | 通州区京贸国际公寓 | 0.060896 |
123 | 大兴区义和庄北里小区 | 0.121176 |
124 | 大兴区清源西里小区 | 0.139761 |
125 | 大兴县宏福园小区 | 0.114332 |
126 | 大兴区菊源里小区 | 0.110707 |
127 | 怀柔区南华园一区 | 0.148249 |
128 | 怀柔县龙湖花园小区 | 0.120356 |
129 | 怀柔县梅苑花园小区 | 0.096335 |
130 | 怀柔县南华园四区 | 0.116468 |
131 | 怀柔县馥郁苑小区 | 0.086261 |
132 | 怀柔区迎宾北路12号院 | 0.113126 |
133 | 顺义区华中园别墅 | 0.112126 |
134 | 顺义区裕龙花园 | 0.112064 |
135 | 顺义区双裕小区 | 0.067941 |
136 | 顺义区西辛小区 | 0.097185 |
137 | 顺义区中央电视台影视培训中心 | 0.104725 |
138 | 顺义区裕祥花园 | 0.178573 |
139 | 房山区桥梁厂生活区 | 0.126083 |
140 | 房山区原子能科学研究院生活区 | 0.142602 |
141 | 房山区碧桂园温泉小区 | 0.029540 |
142 | 房山区北京输油公司生活小区 | 0.159211 |
143 | 房山区西厢苑小区 | 0.135552 |
144 | 延庆县川北小区 | 0.161761 |
145 | 密云县沿湖小区 | 0.121524 |
146 | 密云县东菜园小区 | 0.104666 |
147 | 密云县花园小区 | 0.137225 |
148 | 开发区鹿鸣苑 | 0.073119 |
149 | 开发区星岛嘉园 | 0.048391 |
150 rows × 2 columns
参数估计
进行描述性统计分析
house_price_gr.describe(include='all')
dis_name | rate | |
---|---|---|
count | 150 | 150.000000 |
unique | 150 | NaN |
top | 东城区甘南小区 | NaN |
freq | 1 | NaN |
mean | NaN | 0.110061 |
std | NaN | 0.041333 |
min | NaN | 0.029540 |
25% | NaN | 0.080027 |
50% | NaN | 0.104908 |
75% | NaN | 0.140066 |
max | NaN | 0.243743 |
Histogram
%matplotlib inline
import seaborn as sns
from scipy import stats
# sns.distplot(house_price_gr.rate, kde=True, fit=stats.norm) # Histograph
Q-Q
import statsmodels.api as sm
from matplotlib import pyplot as plt
# Q-Q plot of the growth rate; fit=True standardises the sample using the
# fitted distribution parameters so it can be compared with the 45-degree line.
fig = sm.qqplot(house_price_gr.rate, fit=True, line='45')
# Render through pyplot: Figure.show() needs a GUI backend and only emits a
# UserWarning under the inline (non-GUI) notebook backend.
plt.show()
E:\Anaconda3\lib\site-packages\matplotlib\figure.py:418: UserWarning: matplotlib is currently using a non-GUI backend, so cannot show the figure
"matplotlib is currently using a non-GUI backend, "
[外链图片转存(img-8zXHT82y-1562725663278)(output_8_1.png)]
Box Plots
house_price_gr.plot(kind='box') # Box Plots
<matplotlib.axes._subplots.AxesSubplot at 0x135079e8>
[外链图片转存(img-MjxUaxCX-1562725663279)(output_10_1.png)]
置信度区间估计
# Sample mean of the growth rate, computed once and reused for both bounds.
rate_mean = house_price_gr.rate.mean()
# Standard error of the mean: sample standard deviation over sqrt(n).
se = house_price_gr.rate.std() / len(house_price_gr) ** 0.5
# Interval extending three standard errors on either side of the mean.
LB = rate_mean - 3 * se
UB = rate_mean + 3 * se
(LB, UB)
(0.09993649947438818, 0.12018549392945813)
# 如果要求任意置信度下的置信区间的话,可以自己编一个函数
def confint(x, alpha=0.05):
    """Return the two-sided t confidence interval for the mean of ``x``.

    Args:
        x: Sized collection of observations that provides ``.mean()`` and
            ``.std()`` (called here with a pandas Series). The spread is
            taken from ``x.std()`` as-is.
        alpha: Significance level; the interval has confidence ``1 - alpha``.

    Returns:
        dict with keys ``'Mean'``, ``'Degree of Freedom'``, ``'LB'`` and
        ``'UB'`` (sample mean, n - 1, lower bound, upper bound).

    Raises:
        ValueError: If ``alpha`` is not strictly between 0 and 1, or if
            ``x`` holds fewer than two observations.
    """
    if not 0 < alpha < 1:
        raise ValueError('alpha must be strictly between 0 and 1')
    n = len(x)
    if n < 2:
        # With n - 1 == 0 degrees of freedom the t quantile is undefined.
        raise ValueError('at least two observations are required')
    xb = x.mean()
    df = n - 1
    # Half-width = standard error of the mean * t critical value.
    tmp = (x.std() / n ** 0.5) * stats.t.ppf(1 - alpha / 2, df)
    return {'Mean': xb, 'Degree of Freedom': df, 'LB': xb - tmp, 'UB': xb + tmp}
confint(house_price_gr.rate, 0.05)
{'Degree of Freedom': 149,
'LB': 0.10339228338892809,
'Mean': 0.11006099670192315,
'UB': 0.11672971001491822}
# 或者使用DescrStatsW
d1 = sm.stats.DescrStatsW(house_price_gr.rate)
d1.tconfint_mean(0.05) #
(0.10339228338892814, 0.11672971001491828)
假设检验与单样本T检验
当年住宅价格的增长率是否超过了10%的阈值
print('t-statistic=%6.4f, p-value=%6.4f, df=%s' %d1.ttest_mean(0.1))
# Note: ttest_mean is two-sided by default; to test only whether the growth rate exceeds 10%, pass alternative='larger'.
t-statistic=2.9812, p-value=0.0034, df=149.0
两样本T检验
- 数据说明:本数据是一份信用卡消费数据
- 名称 中文含义
- id id
- Acc 是否开卡(1=已开通)
- avg_exp 月均信用卡支出(元)
- avg_exp_ln 月均信用卡支出的自然对数
- gender 性别(男=1)
- Age 年龄
- Income 年收入(万元)
- Ownrent 是否自有住房(有=1;无=0)
- Selfempl 是否自谋职业(1=yes, 0=no)
- dist_home_val 所住小区房屋均价(万元)
- dist_avg_income 当地人均收入
- high_avg 高出当地平均收入
- edu_class 教育等级:小学及以下=0,中学=1,本科=2,研究生=3
导入数据和数据清洗
creditcard_exp = pd.read_csv('creditcard_exp.csv', skipinitialspace=True)
creditcard_exp = creditcard_exp.dropna(how='any')
creditcard_exp.head()
id | Acc | avg_exp | avg_exp_ln | gender | Age | Income | Ownrent | Selfempl | dist_home_val | dist_avg_income | age2 | high_avg | edu_class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 19 | 1 | 1217.03 | 7.104169 | 1 | 40 | 16.03515 | 1 | 1 | 99.93 | 15.932789 | 1600 | 0.102361 | 3 |
1 | 5 | 1 | 1251.50 | 7.132098 | 1 | 32 | 15.84750 | 1 | 0 | 49.88 | 15.796316 | 1024 | 0.051184 | 2 |
3 | 86 | 1 | 856.57 | 6.752936 | 1 | 41 | 11.47285 | 1 | 0 | 16.10 | 11.275632 | 1681 | 0.197218 | 3 |
4 | 50 | 1 | 1321.83 | 7.186772 | 1 | 28 | 13.40915 | 1 | 0 | 100.39 | 13.346474 | 784 | 0.062676 | 2 |
5 | 67 | 1 | 816.03 | 6.704451 | 1 | 41 | 10.03015 | 0 | 1 | 119.76 | 10.332263 | 1681 | -0.302113 | 3 |
根据性别比较支出
creditcard_exp['avg_exp'].groupby(creditcard_exp['gender']).describe()
gender
0 count 50.000000
mean 925.705200
std 430.833365
min 163.180000
25% 593.312500
50% 813.650000
75% 1204.777500
max 1992.390000
1 count 20.000000
mean 1128.531000
std 462.281389
min 648.150000
25% 829.860000
50% 1020.005000
75% 1238.202500
max 2430.030000
dtype: float64
- 第一步:方差齐性检验
# Monthly average spending split by the gender flag (0 / 1).
gender0 = creditcard_exp.loc[creditcard_exp['gender'] == 0, 'avg_exp']
gender1 = creditcard_exp.loc[creditcard_exp['gender'] == 1, 'avg_exp']
# Levene's test for equal variances, centred on the group medians.
leveneTestRes = stats.levene(gender0, gender1, center='median')
print('w-value=%6.4f, p-value=%6.4f' % leveneTestRes)
w-value=0.0683, p-value=0.7946
- 第二步:T-test
# Two-sample t-test with pooled variance (equal_var=True).
# Call the public scipy.stats API directly; the nested `scipy.stats.stats`
# namespace is deprecated.
stats.ttest_ind(gender0, gender1, equal_var=True)
# Or Try: sm.stats.ttest_ind(gender0, gender1, usevar='pooled')
Ttest_indResult(statistic=-1.7429013868086289, pvalue=0.085871228784484485)
方差分析
- 单因素方差分析
import pandas as pd
pd.set_option('display.max_columns', None) # 设置显示所有列
creditcard_exp.groupby('edu_class')[['avg_exp']].describe()
avg_exp | ||
---|---|---|
edu_class | ||
0 | count | 2.000000 |
mean | 207.370000 | |
std | 62.494097 | |
min | 163.180000 | |
25% | 185.275000 | |
50% | 207.370000 | |
75% | 229.465000 | |
max | 251.560000 | |
1 | count | 23.000000 |
mean | 641.937826 | |
std | 147.577741 | |
min | 418.780000 | |
25% | 525.595000 | |
50% | 593.920000 | |
75% | 736.140000 | |
max | 987.660000 | |
2 | count | 23.000000 |
mean | 973.321304 | |
std | 229.163196 | |
min | 610.250000 | |
25% | 807.820000 | |
50% | 959.830000 | |
75% | 1075.270000 | |
max | 1472.820000 | |
3 | count | 22.000000 |
mean | 1422.280909 | |
std | 435.281442 | |
min | 816.030000 | |
25% | 1166.997500 | |
50% | 1343.025000 | |
75% | 1661.412500 | |
max | 2430.030000 |
import numpy as np
A = np.ones([2, 3, 4])
A?
# One Series of avg_exp per education level, ordered by edu_class value.
# Deriving the groups from the data avoids hard-coding the number of levels
# and never yields an empty group.
edu = [spend for _, spend in creditcard_exp.groupby('edu_class')['avg_exp']]
# One-way ANOVA across all education levels.
stats.f_oneway(*edu)
F_onewayResult(statistic=31.825683356937645, pvalue=7.658361691248968e-13)
- 多因素方差分析
from statsmodels.formula.api import ols
sm.stats.anova_lm(ols('avg_exp ~ C(edu_class) + C(gender)',data=creditcard_exp).fit())
df | sum_sq | mean_sq | F | PR(>F) | |
---|---|---|---|---|---|
C(edu_class) | 3.0 | 8.126056e+06 | 2.708685e+06 | 31.578365 | 1.031496e-12 |
C(gender) | 1.0 | 4.178273e+04 | 4.178273e+04 | 0.487111 | 4.877082e-01 |
Residual | 65.0 | 5.575481e+06 | 8.577662e+04 | NaN | NaN |
# Two-way ANOVA of avg_exp on education level and gender, including their interaction.
# NOTE(review): the resulting table has exactly one row per main effect plus one
# C(edu_class):C(gender) row, so `C(edu_class)*C(gender)` appears to already imply
# both main effects and the explicit `C(edu_class) + C(gender)` terms look
# redundant — confirm before simplifying the formula.
ana = ols('avg_exp ~ C(edu_class) + C(gender) +C(edu_class)*C(gender)', data= creditcard_exp).fit()
sm.stats.anova_lm(ana)
df | sum_sq | mean_sq | F | PR(>F) | |
---|---|---|---|---|---|
C(edu_class) | 3.0 | 8.126056e+06 | 2.708685e+06 | 33.839350 | 3.753889e-13 |
C(gender) | 1.0 | 4.178273e+04 | 4.178273e+04 | 0.521988 | 4.726685e-01 |
C(edu_class):C(gender) | 3.0 | 5.406989e+05 | 1.802330e+05 | 2.251633 | 9.097723e-02 |
Residual | 63.0 | 5.042862e+06 | 8.004544e+04 | NaN | NaN |
相关分析
散点图
creditcard_exp.plot(x='Income', y='avg_exp', kind='scatter')
<matplotlib.axes._subplots.AxesSubplot at 0x13af56d8>
[外链图片转存(img-DXpyaRQC-1562725663280)(output_34_1.png)]
相关性分析:“spearman”,“pearson” 和 “kendall”
creditcard_exp[['Income', 'avg_exp']].corr(method='pearson')
Income | avg_exp | |
---|---|---|
Income | 1.000000 | 0.674011 |
avg_exp | 0.674011 | 1.000000 |
卡方检验
accepts = pd.read_csv('accepts.csv')
# Build the contingency table from the full data set. A random 30-row
# subsample would change on every run and leave too few observations per
# cell for the chi-square test that follows.
cross_table = pd.crosstab(accepts.bankruptcy_ind, columns=accepts.bad_ind)
# Or try this: accepts.pivot_table(index='bankruptcy_ind',columns='bad_ind', values='application_id', aggfunc='count')
cross_table
bad_ind | 0 | 1 |
---|---|---|
bankruptcy_ind | ||
N | 4163 | 1017 |
Y | 345 | 103 |
print('chisq = %6.4f\n p-value = %6.4f\n dof = %i\n expected_freq = %s' %stats.chi2_contingency(cross_table))
chisq = 2.7098
p-value = 0.0997
dof = 1
expected_freq = [[ 4149.15422886 1030.84577114]
[ 358.84577114 89.15422886]]
import pandas as pd

# pd.Panel has been removed from pandas; hold the same (item, date) x column
# data in a DataFrame with a two-level row index instead.
dates = pd.date_range('1/1/2000', periods=5)
wp = pd.concat(
    {
        item: pd.DataFrame(np.random.randn(5, 4), index=dates, columns=['A', 'B', 'C', 'D'])
        for item in ['item1', 'item2']
    },
    names=['item', 'date'],
)
wp
# set_index() needs an existing column: give the frame an explicit 'Date'
# column, otherwise the call raises KeyError: 'Date'.
df = pd.DataFrame({'Date': dates, 'value': [1, 2, 3, 4, 5]})
df.set_index("Date", inplace=True)
# display(A.head(1))
# A.set_index('Date'=True)
# A
# indexed_df = df.set_index(['A', 'B'])
# indexed_df2 = df.set_index(['A', [0, 1, 2, 0, 1, 2]])
# indexed_df3 = df.set_index([[0, 1, 2, 0, 1, 2]])
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
TypeError: an integer is required
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2524 try:
-> 2525 return self._engine.get_loc(key)
2526 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
KeyError: 'Date'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()
TypeError: an integer is required
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-45-ed76529a4bdb> in <module>()
3 wp
4 df=pd.DataFrame([1,2,3,4,5])
----> 5 df.set_index("Date", inplace=True)
6 # display(A.head(1))
7 # A.set_index('Date'=True)
~\Anaconda3\lib\site-packages\pandas\core\frame.py in set_index(self, keys, drop, append, inplace, verify_integrity)
3144 names.append(None)
3145 else:
-> 3146 level = frame[col]._values
3147 names.append(col)
3148 if drop:
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2137 return self._getitem_multilevel(key)
2138 else:
-> 2139 return self._getitem_column(key)
2140
2141 def _getitem_column(self, key):
~\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
2144 # get column
2145 if self.columns.is_unique:
-> 2146 return self._get_item_cache(key)
2147
2148 # duplicate columns & possible reduce dimensionality
~\Anaconda3\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
1840 res = cache.get(item)
1841 if res is None:
-> 1842 values = self._data.get(item)
1843 res = self._box_item_values(item, values)
1844 cache[item] = res
~\Anaconda3\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
3841
3842 if not isna(item):
-> 3843 loc = self.items.get_loc(item)
3844 else:
3845 indexer = np.arange(len(self.items))[isna(self.items)]
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2525 return self._engine.get_loc(key)
2526 except KeyError:
-> 2527 return self._engine.get_loc(self._maybe_cast_indexer(key))
2528
2529 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
KeyError: 'Date'