%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Load the raw credit-card data set from the working directory.
cred = pd.read_csv("creditcard_exp.csv", skipinitialspace=True)
cred.head()

# Data cleaning for modelling: split the rows on whether avg_exp is present.
# cred2 holds rows with a known avg_exp, cred3 holds rows where it is missing;
# both are independent copies of the selected rows.
cred2 = cred.loc[cred['avg_exp'].notna()].copy()
cred3 = cred.loc[cred['avg_exp'].isna()].copy()
cred2.head()
# Correlation analysis: scatter plot of avg_exp against Income.
cred2.plot.scatter(x='Income', y='avg_exp')

# Simple linear regression of avg_exp on Income, fitted on cred2
# (the rows where avg_exp is not missing).
lm_s = ols(formula='avg_exp ~ Income', data=cred2).fit()
print(lm_s.params)
lm_s.summary()
# Prediction: score the original, unfiltered data with the fitted model.
# Each record's y is predicted from the regression coefficients; the original
# note records the fitted line as avg_exp = 258.04 + 97.72 * Income
# (NOTE(review): these values depend on the CSV contents -- confirm against
# the printed lm_s.params).
pre = cred['pre'] = lm_s.predict(cred)
cred.head()