建立一个逻辑回归模型来预测一个学生是否被大学录取,根据两次考试的结果来决定每个申请人的录取机会,有以前的申请人的历史数据, 可以用它作为逻辑回归的训练集。
用 Python 实现逻辑回归。目标:建立分类器(求解出三个参数 θ0、θ1、θ2),即得出决策边界。备注:θ1 对应 Exam1 成绩,θ2 对应 Exam2 成绩。设定阈值,根据阈值判断录取结果。备注:阈值是把模型输出的概率值转化成类别的分界点,一般概率 ≥0.5 判为被录取,<0.5 判为未被录取。
一:__init__.py (逻辑回归主函数)
#逻辑回归主函数
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns #绘制散点图
plt.style.use('fivethirtyeight') #样式美化
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report#这个包是评价报告
import sigmoid as sg #sigmoid函数
import Cost_function as cf #代价函数
import gradient_descent as gd #梯度下降
import scipy.optimize as opt #在这里寻找高级优化算法
# Load the historical applicants: two exam scores plus the admission label.
data = pd.read_csv(r'******', names=['exam1', 'exam2', 'admitted'])
print(data.head())

# Scatter plot of the two exam scores, coloured by admission outcome.
sns.set(
    context="notebook",
    style="darkgrid",
    palette=sns.color_palette("RdBu", 2),
    color_codes=False,
)
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the old positional form raises a TypeError there.
sns.lmplot(
    x='exam1',
    y='exam2',
    hue='admitted',  # draw both label values in the same axes, one colour each
    data=data,
    height=6,
    fit_reg=False,  # fit_reg controls whether a fitted regression line is drawn
    scatter_kws={"s": 50},
)
# plt.show()
def get_X(df):
    """Build the design matrix from a frame whose last column is the label.

    A leading column of ones (the intercept feature x0 = 1) is prepended and
    the last column of ``df`` is dropped.

    Args:
        df: pandas DataFrame with the feature columns followed by one label
            column.

    Returns:
        numpy.ndarray with ``len(df)`` rows: the ones column followed by
        every column of ``df`` except the last.
    """
    # The ones column must share df's index: pd.concat(axis=1) aligns rows by
    # index label, so a default RangeIndex would yield NaN-padded, misaligned
    # rows whenever df has been filtered, shuffled or otherwise re-indexed.
    ones = pd.DataFrame({'Ones': np.ones(len(df))}, index=df.index)
    data = pd.concat([ones, df], axis=1)
    # .values returns a plain ndarray (not a matrix).
    return data.iloc[:, :-1].values
def get_y(df):
    """Return the labels, i.e. the last column of ``df``, as a numpy array."""
    label_column = df.iloc[:, -1]
    return np.array(label_column)
def normalize_feature(df):
    """Standardize every column of ``df`` (feature scaling).

    Each column is centred on its mean and divided by its standard deviation
    (pandas' ``Series.std``).

    Args:
        df: pandas DataFrame of numeric columns.

    Returns:
        A DataFrame of the same shape holding the scaled columns. A column
        whose standard deviation is exactly 0 (all values equal) is returned
        as zeros rather than NaN.
    """
    def _standardize(column):
        spread = column.std()
        # A constant column has std 0; dividing would give 0/0 = NaN, which
        # then poisons any cost or gradient computed from it.
        if spread == 0:
            spread = 1.0
        return (column - column.mean()) / spread

    return df.apply(_standardize)
# Assemble the design matrix X, the label vector y and the initial parameters.
X = get_X(data)
y = get_y(data)
theta = np.zeros(3)  # 1-D array of shape (3,): one weight per column of X

# Sanity check at theta = 0: print the cost and its gradient. Nothing is
# optimized here; the gradient is only evaluated once.
initial_cost = cf.cost(theta, X, y)
print(initial_cost)
initial_gradient = gd.gradient(theta, X, y)
print(initial_gradient)

# Fit theta with scipy.optimize.minimize, supplying the analytic gradient.
res = opt.minimize(
    fun=cf.cost,
    x0=theta,
    args=(X, y),
    method='Newton-CG',
    jac=gd.gradient,
)
print(res)
# 用训练集预测和验证
def predict(x, theta):
    """Classify each row of ``x`` as 0 or 1 with a 0.5 probability threshold.

    Args:
        x: design matrix; ``x.dot(theta)`` must be defined.
        theta: parameter vector.

    Returns:
        Integer array holding 1 where ``sigmoid(x . theta) >= 0.5`` and 0
        elsewhere.
    """
    # Use the `x` argument rather than the module-level training matrix `X`,
    # so the function predicts for whatever data the caller passes in.
    probability = sg.sigmoid(x.dot(theta))
    return (probability >= 0.5).astype(int)
# Evaluate the fitted parameters on the training set.
final_theta = res.x
y_pred = predict(X, final_theta)
print(classification_report(y, y_pred))

# Decision boundary: theta0 + theta1 * exam1 + theta2 * exam2 = 0.
# Dividing by -theta2 gives exam2 = coef[0] + coef[1] * exam1, which is easy
# to draw as a straight line.
coef = -(res.x / res.x[2])
# Dedicated names for the line's coordinates, so the label vector `y` used
# for the report above is not overwritten.
boundary_x = np.arange(130, step=0.1)
boundary_y = coef[0] + coef[1] * boundary_x

# Scatter plot of the data with the decision boundary drawn on top.
sns.set(context="notebook", style="ticks", font_scale=1.5)
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally.
sns.lmplot(
    x='exam1',
    y='exam2',
    hue='admitted',
    data=data,
    height=6,
    fit_reg=False,
    scatter_kws={"s": 25},
)
plt.plot(boundary_x, boundary_y, 'grey')
plt.xlim(0, 130)
plt.ylim(0, 130)
plt.title('Decision Boundary')
二:Regularized.py(正则化主函数)
#正则化主函数
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns #绘制散点图
plt.style.use('fivethirtyeight') #样式美化
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report#这个包是评价报告
import feature_mapping as fm
import regularized_function as rf
import sigmoid as sg #sigmoid函数
import scipy.optimize as opt #用高级算法拟合参数
def predict(x, theta):
    """Classify each row of ``x`` as 0 or 1 with a 0.5 probability threshold.

    Args:
        x: design matrix; ``x.dot(theta)`` must be defined.
        theta: parameter vector.

    Returns:
        Integer array holding 1 where ``sigmoid(x . theta) >= 0.5`` and 0
        elsewhere.
    """
    # Use the `x` argument rather than the module-level `X`, which is not even
    # assigned until later in this script.
    probability = sg.sigmoid(x.dot(theta))
    return (probability >= 0.5).astype(int)
# Load the two test scores and the accepted/rejected label.
df = pd.read_csv(r'******', names=['test1', 'test2', 'accepted'])
# print(df.head())

# Scatter plot of the raw data, coloured by label.
sns.set(context="notebook", style="ticks", font_scale=1.5)
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the old positional form raises a TypeError there.
sns.lmplot(
    x='test1',
    y='test2',
    hue='accepted',
    data=df,
    height=6,
    fit_reg=False,
    scatter_kws={"s": 50},
)
plt.title('Regularized Logistic Regression')
# plt.show()
# Map the two raw test scores to polynomial features up to degree 6.
x1 = np.array(df['test1'])
x2 = np.array(df['test2'])
data2 = fm.feature_mapping(x1, x2, power=6)  # DataFrame form
X = fm.feature_mapping(x1, x2, power=6, as_ndarray=True)  # ndarray form
y = np.array(df.iloc[:, -1])
theta = np.zeros(data2.shape[1])  # one weight per mapped feature

# Regularized cost and gradient at the all-zero starting point.
print(rf.regularized_cost(theta, X, y, l=1))
print(rf.regularized_gradient(theta, X, y))

# Fit the parameters.
print('init cost = {}'.format(rf.regularized_cost(theta, X, y)))
res = opt.minimize(
    fun=rf.regularized_cost,
    x0=theta,
    args=(X, y),
    method='Newton-CG',
    jac=rf.regularized_gradient,
)
print(res)

# Predict on the training set and report the scores.
final_theta = res.x
y_pred = predict(X, final_theta)
print(classification_report(y, y_pred))
# Plot the decision boundary for different values of λ (the regularization strength)
def draw_boundary(power, l):
    """Fit a regularized model and plot the data with its decision boundary.

    Args:
        power: highest polynomial degree used for the feature mapping.
        l: regularization strength passed to the fit.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    density = 1000  # grid points per axis when searching for the boundary
    threshold = 2 * 10 ** -3  # |X . theta| below this counts as "on the boundary"

    final_theta = feature_mapped_logistic_regression(power, l)
    x, y = find_decision_boundary(density, power, final_theta, threshold)

    df = pd.read_csv(r'******', names=['test1', 'test2', 'accepted'])
    # x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
    # positionally.
    sns.lmplot(
        x='test1',
        y='test2',
        hue='accepted',
        data=df,
        height=6,
        fit_reg=False,
        scatter_kws={"s": 100},
    )
    # Lowercase 'r': uppercase single-letter colour codes are rejected by
    # current Matplotlib releases.
    plt.scatter(x, y, c='r', s=10)
    plt.title('Decision boundary')
    plt.show()
def feature_mapped_logistic_regression(power, l):
    """Fit regularized logistic regression on polynomial-mapped features.

    Reads the data set from disk, maps the two test scores to polynomial
    features up to ``power`` and minimizes the regularized cost with TNC.

    Args:
        power: highest polynomial degree used for the feature mapping.
        l: regularization strength forwarded to the cost and gradient.

    Returns:
        The fitted parameter vector (``res.x`` of the optimizer result).
    """
    frame = pd.read_csv(r'******', names=['test1', 'test2', 'accepted'])
    labels = np.array(frame.iloc[:, -1])
    design = fm.feature_mapping(
        np.array(frame.test1),
        np.array(frame.test2),
        power,
        as_ndarray=True,
    )
    start = np.zeros(design.shape[1])

    result = opt.minimize(
        fun=rf.regularized_cost,
        x0=start,
        args=(design, labels, l),
        method='TNC',
        jac=rf.regularized_gradient,
    )
    return result.x
#寻找决策边界函数
def find_decision_boundary(density, power, theta, threshhold):
    """Find grid points that lie approximately on the decision boundary.

    A ``density`` x ``density`` grid over [-1, 1.5] x [-1, 1.5] is feature
    mapped, and the points whose linear score ``|features . theta|`` is below
    ``threshhold`` are kept.

    Args:
        density: number of grid points along each axis.
        power: highest polynomial degree used for the feature mapping.
        theta: fitted parameter vector, one entry per mapped feature.
        threshhold: maximum absolute score for a point to count as lying on
            the boundary.

    Returns:
        Tuple ``(f10, f01)`` of pandas Series: the first and second
        coordinates of the selected grid points.
    """
    t1 = np.linspace(-1, 1.5, density)
    t2 = np.linspace(-1, 1.5, density)

    # Every (x, y) pair of the grid, with x varying slowest. This is the same
    # order as a nested "for x in t1 for y in t2" loop, but built as arrays
    # instead of a density**2 long Python list of tuples.
    x_cord = np.repeat(t1, len(t2))
    y_cord = np.tile(t2, len(t1))

    mapped_cord = fm.feature_mapping(x_cord, y_cord, power)  # a DataFrame
    inner_product = mapped_cord.values @ theta
    decision = mapped_cord[np.abs(inner_product) < threshhold]
    # Columns f10 and f01 are x^1 * y^0 and x^0 * y^1, i.e. the raw coordinates.
    return decision.f10, decision.f01
draw_boundary(power=6, l=0) # lambda = 0 here, i.e. no regularization penalty
三:sigmoid.py(sigmoid函数)
import numpy as np
import matplotlib.pyplot as plt
def sigmoid(z):
    """Logistic function g(z) = 1 / (1 + exp(-z)), evaluated element-wise.

    Args:
        z: scalar or array-like of real numbers.

    Returns:
        The value(s) of g(z), in [0, 1]; same shape as ``z``.
    """
    z = np.asarray(z)  # also lets plain lists and tuples be negated below
    # g(z) = exp(-log(1 + exp(-z))). np.logaddexp(0, -z) computes
    # log(1 + exp(-z)) without evaluating exp(-z) itself, so a large negative
    # z no longer overflows (and no RuntimeWarning is emitted).
    gz = np.exp(-np.logaddexp(0, -z))
    return gz
#绘制sigmoid函数的图像
# fig, ax = plt.subplots(figsize=(8, 6))
# ax.plot(np.arange(-10, 10, step=0.01),
# sigmoid(np.arange(-10, 10, step=0.01)))
# ax.set_ylim((-0.1,1.1))
# ax.set_xlabel('z', fontsize=18)
# ax.set_ylabel('g(z)', fontsize=18)
# ax.set_title('sigmoid function', fontsize=18)
# plt.show()
四: Cost_function.py(代价函数)
#代价函数
import numpy as np
import sigmoid as sg #sigmoid函数
def cost(theta, X, y):
    """Mean cross-entropy (logistic regression) cost.

    Computes ``sum(-y * log(h) - (1 - y) * log(1 - h)) / len(X)`` with
    ``h = sigmoid(X . theta)``.

    Args:
        theta: parameter vector.
        X: design matrix, one row per sample.
        y: array of 0/1 labels, one per row of ``X``.

    Returns:
        The cost as a float.
    """
    z = X.dot(theta)
    # Work from the logits instead of h: log(h) = -logaddexp(0, -z) and
    # log(1 - h) = -logaddexp(0, z). Once the sigmoid saturates to exactly
    # 0 or 1, log(h) or log(1 - h) would be -inf and the cost would turn into
    # inf or NaN; this form stays finite.
    per_sample = y * np.logaddexp(0, -z) + (1 - y) * np.logaddexp(0, z)
    costf = np.sum(per_sample) / len(X)
    return costf
五:gradient_descent.py(梯度下降函数)
#梯度下降函数
import sigmoid as sg
def gradient(theta, X, y):
    """Gradient of the logistic regression cost with respect to ``theta``.

    Args:
        theta: parameter vector.
        X: design matrix, one row per sample.
        y: label array, one entry per row of ``X``.

    Returns:
        ``X.T . (sigmoid(X . theta) - y) / len(X)``.
    """
    residual = sg.sigmoid(X.dot(theta)) - y  # prediction error per sample
    sample_count = len(X)
    return X.T.dot(residual) / sample_count
六:feature_mapping.py(特征映射)
#特征映射
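# When there are many samples and the classification problem is complex but the only raw features are x1 and x2, polynomial terms can be generated as extra features: every x1^a * x2^b with a + b <= power, i.e. 1, x1, x2, x1^2, x1*x2, x2^2, ..., x2^power.
# With these higher-order features the fitted decision boundary is no longer restricted to a straight line; it can take the shape of a higher-degree polynomial curve.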
import numpy as np
import pandas as pd
def feature_mapping(x, y, power, as_ndarray=False):
    """Return the polynomial features of ``x`` and ``y`` up to degree ``power``.

    For every total degree ``i`` in ``0..power`` and every ``p`` in ``0..i``
    one column named ``"f{i-p}{p}"`` holding ``x**(i-p) * y**p`` is produced
    (so ``f00`` is all ones, ``f10`` is ``x`` and ``f01`` is ``y``).

    Args:
        x: 1-D array-like with the first raw feature.
        y: 1-D array-like with the second raw feature, same length as ``x``.
        power: highest total polynomial degree (non-negative int).
        as_ndarray: when True return a numpy array instead of a DataFrame.

    Returns:
        pandas DataFrame, or numpy.ndarray when ``as_ndarray`` is True, with
        one row per sample and one column per generated feature.

    Note:
        The two exponents are concatenated without a separator, so column
        names are only unique while ``power <= 10``.
    """
    # Convert plain sequences (lists, tuples) to arrays once. Otherwise every
    # np.power call below repeats the conversion, which is costly for long
    # inputs. Arrays and Series are passed through untouched.
    if not isinstance(x, (np.ndarray, pd.Series)):
        x = np.asarray(x)
    if not isinstance(y, (np.ndarray, pd.Series)):
        y = np.asarray(y)

    data = {
        "f{}{}".format(i - p, p): np.power(x, i - p) * np.power(y, p)
        for i in np.arange(power + 1)
        for p in np.arange(i + 1)
    }
    mapped = pd.DataFrame(data)
    return mapped.values if as_ndarray else mapped
七:regularized_function.py(正则化相关函数)
#正则化的相关函数
import numpy as np
import Cost_function as cf
import gradient_descent as gd
#正则化代价函数
def regularized_cost(theta, X, y, l = 1):
    """Logistic regression cost plus an L2 penalty on ``theta[1:]``.

    Args:
        theta: parameter vector; ``theta[0]`` is left out of the penalty.
        X: design matrix, one row per sample.
        y: label array, one entry per row of ``X``.
        l: regularization strength (default 1).

    Returns:
        ``cost(theta, X, y) + l / (2 * len(X)) * sum(theta[1:] ** 2)``.
    """
    # Parenthesize 2 * len(X) so the whole product ends up in the denominator.
    scale = l / (2 * len(X))
    penalty = scale * np.sum(np.power(theta[1:], 2))
    return cf.cost(theta, X, y) + penalty
#正则化梯度
def regularized_gradient(theta, X, y, l = 1):
    """Gradient of the regularized cost with respect to ``theta``.

    Args:
        theta: parameter vector; ``theta[0]`` receives no penalty term.
        X: design matrix, one row per sample.
        y: label array, one entry per row of ``X``.
        l: regularization strength (default 1).

    Returns:
        The unregularized gradient plus ``(l / len(X)) * theta[j]`` for every
        ``j >= 1``.
    """
    penalty_tail = (l / len(X)) * theta[1:]
    # The penalty starts at theta[1], so a leading 0 is prepended to give the
    # penalty vector the same length as theta.
    penalty = np.concatenate([np.array([0]), penalty_tail])
    return gd.gradient(theta, X, y) + penalty