1、将参数处理为一维或者多维数组后
#x为一维数组(只有一个参数),多维数组(多个参数)。处理方式参见下文
x=sm.add_constant(x)#添加一个常数
model=sm.OLS(salary,x).fit()#拟合
print(model.summary())#输出关系
import statsmodels.api as sm
import numpy as np
import math
import pandas as pd
class ceosal():
def __init__(self):
self.data=pd.read_stata('CEOSAL1.DTA')
pd.set_option('max_columns',None)
pd.set_option('max_rows',None)
def log(self,num):
return [math.log(i)for i in num]#取对数
def t(self,num):
return np.transpose(num)#两种方式转置二维数组
#return list(map(list, zip(*x)))
def ols(self):
salary=list(self.data['salary'])
salary=self.log(salary)
sales=list(self.data['sales'])
sales=self.log(sales)
roe=list(self.data['roe'])
ros=list(self.data['ros'])
x=[sales,roe,ros]
x=self.t(x)
x=sm.add_constant(x)#添加一个常数
model=sm.OLS(salary,x).fit()#拟合
print(model.summary())#输出关系
#输出内容均是与计量经济学专业术语相关
if __name__=='__main__':
d=ceosal()
d.ols()
# 提取回归系数
model.params
# 提取回归系数标准差
model.bse
# 提取回归系数p值
model.pvalues
# 提取回归系数t值
model.tvalues
# 提取回归系数置信区间 默认5%,括号中可填具体数字 比如0.05, 0.1
model.conf_int()
# 提取模型预测值
model.fittedvalues
# 提取残差
model.resid
# 模型自由度(系数自由度)
model.df_model
# 残差自由度(样本自由度)
model.df_resid
# 模型样本数量
model.nobs
# 提取R方
model.rsquared
# 提取调整R方
model.rsquared_adj
# 提取AIC
model.aic
# 提取BIC
model.bic
# 提取F-statistic
model.fvalue
# 提取F-statistic 的pvalue
model.f_pvalue
# 模型mse
model.mse_model
# 残差mse
model.mse_resid
# 总体mse
model.mse_total
# 协方差矩阵比例因子
model.scale
# White异方差稳健标准误
model.HC0_se
# MacKinnon和White(1985)的异方差稳健标准误
model.HC1_se
# White异方差矩阵
model.cov_HC0
# MacKinnon和White(1985)的异方差矩阵
model.cov_HC1
标准化处理
from scipy.stats.mstats import zscore
arg = [dist, dist2,intst, intst2, area, land, rooms, baths, age]
arg = self.t(arg)
price = zscore(price)
arg = zscore(arg)
# arg=sm.add_constant(arg)#数据标准化之后不需要截距项
model = sm.OLS(price, arg).fit()
print(model.summary('price', ['dist', 'dist2','intst', 'intst2', 'area', 'land', 'rooms', 'baths', 'age'],
'question3')) # 分别设置因变量名称,自变量名称(包括常数),回归的标题
#zscore可以处理一维或者多维数组,但是要做添加截距项之前,而且一般情况下,标准化之后就不需要截距项。
2、ols回归的另一种方式
import statsmodels.formula.api as smf
price=self.data
fit = smf.ols('price~sqrft+bdrms', data=price).fit()
# 模型概览的反馈
print(fit.summary())
3、异方差检验的方式
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import statsmodels.api as sm#引入statasmodel模型
import statsmodels.formula.api as smf
from matplotlib.font_manager import FontProperties#中文处理
from mpl_toolkits.mplot3d import Axes3D as ax
from numpy import *
import numpy as np
#异方差性检验
class Testheter():
def __init__(self):
mpl.rcParams['font.sans-serif'] = ['SimHei'] # 配置显示中文
plt.rcParams['axes.unicode_minus'] = False
self.data = pd.read_stata('WAGE1.DTA')
self.data.dropna(axis=0, how='any', inplace=True) # 删除缺失值
pd.set_option('max_columns', None) # 显示所以行
pd.set_option('max_rows', None) # 显示所有列
def graph(self):
fit=smf.ols('wage~educ+exper',data=self.data).fit()
a=np.linalg.cond(fit.model.exog)
print(fit.summary())
print("检验多重共线性 ",a)
print("#"*100)
plt.scatter(fit.predict(), (fit.resid-fit.resid.mean())/fit.resid.std())
plt.xlabel('预测值')
plt.ylabel('标准化残差')
# 添加水平参考线
plt.axhline(y=0, color='r', linewidth=2)
plt.show()
def white(self):
fit = smf.ols('wage~educ+exper', data=self.data).fit()
a=sm.stats.diagnostic.het_white(fit.resid, exog=fit.model.exog)
name = ['Lagrange multiplier statistic', 'p-value','f-value', 'f p-value']
print("white test")
for n,a in zip(name,a):
print(n,a)
print("#"*100)
#print("拉格朗日乘子的统计(lagrange multiplier statistic(lm : float)) ",a[0])
#print("拉格朗日乘子检验的p值(p-value of lagrange multiplier test(lm_pvalue :float)) ", a[1])
#print("误差方差不依赖于x的假设的f统计量(f-statistic of the hypothesis that the error variance does not depend on x(fvalue : float)) ", a[2])
#print("f统计量的p值(p-value for the f-statistic(f_pvalue : float))", a[3])
def BreushPagan(self):
fit = smf.ols('wage~educ+exper', data=self.data).fit()
a = sm.stats.diagnostic.het_breuschpagan(fit.resid, exog_het=fit.model.exog)
name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
print("Breush-Pagan test")
for n, a in zip(name, a):
print(n, a)
print("#" * 100)
def GoldfeldQuandt(self):
fit = smf.ols('wage~educ+exper', data=self.data).fit()
a = sm.stats.diagnostic.het_breuschpagan(fit.resid, exog_het=fit.model.exog)
name = ['F statistic', 'p-value']
print("GoldfeldQuandt test")
for n, a in zip(name, a):
print(n, a)
print("#" * 100)
def arch(self):
fit = smf.ols('wage~educ+exper', data=self.data).fit()
a = sm.stats.diagnostic.het_arch(fit.resid)
name = ['Lagrange multiplier statistic', 'p-value', 'f p-value','ResultsStore']
print("arch test")
for n, a in zip(name, a):
print(n, a)
print("#" * 100)
if __name__=='__main__':
d=Testheter()
d.graph()
d.white()
d.BreushPagan()
d.GoldfeldQuandt()
d.arch()
4、异方差的修正
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import statsmodels.api as sm#引入statasmodel模型
import statsmodels.formula.api as smf
from matplotlib.font_manager import FontProperties#中文处理
from mpl_toolkits.mplot3d import Axes3D as ax
from numpy import *
import math
import numpy as np
#异方差性检验
class Testheter():
def __init__(self):
mpl.rcParams['font.sans-serif'] = ['SimHei'] # 配置显示中文
plt.rcParams['axes.unicode_minus'] = False
self.data = pd.read_stata('WAGE1.DTA')
self.data.dropna(axis=0, how='any', inplace=True) # 删除缺失值
pd.set_option('max_columns', None) # 显示所以行
pd.set_option('max_rows', None) # 显示所有列
def log(self, num): # 取对数
return [math.log(i) for i in num]
def t(self, num): # 将二维数组转置
return np.transpose(num)
def logs(self):
wage =self.log(list(self.data['wage'])) # 将dataframe转化为列表
educ = list(self.data['educ'])
exper = self.log(list(self.data['exper']))
print("取对数法")
print("_"*1000)
arg = [educ, exper] # 构造多维数组
arg = self.t(arg) # 转置
arg = sm.add_constant(arg) # 添加一个常数
model = sm.OLS(wage, arg).fit() # 利用statsmodels进行拟合
print(model.summary('log(wage)', ['const', 'educ', 'log(exper)'],'取对数')) # 分别设置因变量名称,自变量名称(包括常数)
a = sm.stats.diagnostic.het_white(model.resid, exog=model.model.exog)
name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
print("white test")
for n, a in zip(name, a):
print(n, a)
print("#" * 100)
def olss(self):
print('\n\n\n\n\n\n')
print("加权最小二乘法")
print("_" * 1000)
fit = smf.ols('wage~educ+exper', data=self.data).fit()
w1 = 1 / np.abs(fit.resid)
fit2 = sm.formula.wls('wage~educ+exper', data=self.data, weights=w1).fit()
print(fit2.summary())
a = sm.stats.diagnostic.het_white(fit2.resid, exog=fit2.model.exog)
name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
print("white test")
for n, a in zip(name, a):
print(n, a)
print("#" * 100)
if __name__=='__main__':
d=Testheter()
d.logs()
d.olss()