Some small reflections after watching Andrew Ng's videos
Why use the sigmoid function as the hypothesis:
Because the final output can only be one of two fixed classes, the predicted value has to be confined to a definite range; otherwise there is no criterion for making the call. Here the prediction is taken to lie in (0, 1): a value above 0.5 is judged as class 1, a value below 0.5 as class 0. A prediction of exactly 0.5 means the sample sits on the decision boundary and belongs to either class with equal probability. Near the boundary the distinction between the classes is blurry, so the output should move toward 0.5 quickly; far from the boundary the class is already obvious, so the output should flatten out, approaching 1 or 0 ever more slowly. These are exactly the properties of the sigmoid function.
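To make this concrete, here is a minimal sketch (my own illustration, not from the video) of the sigmoid g(z) = 1 / (1 + e^(-z)): it maps any real input into (0, 1), equals exactly 0.5 at z = 0, changes fastest near the boundary, and flattens out as |z| grows.

import numpy as np

def sigmoid(z):
    # maps any real z into (0, 1); steepest at z = 0, saturating for large |z|
    return 1.0 / (1.0 + np.exp(-z))

print(sigmoid(0))    # 0.5 -- exactly on the decision boundary
print(sigmoid(2))    # ~0.88 -- clearly on the positive side
print(sigmoid(10))   # ~0.99995 -- far from the boundary, nearly saturated at 1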
Linearly separable binary classification
# -*- coding: utf-8 -*-
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt  # for plotting
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report  # evaluation report
import numpy as np
# Read the data file
df = pd.read_csv("ex2data1.txt",names=['exam1', 'exam2', 'admitted'])
# Optional: quick look at the raw data
# sns.set(context="notebook", style="ticks", font_scale=1.5)
# sns.lmplot(x='exam1', y='exam2', hue='admitted', data=df,
#            height=6,
#            fit_reg=False,
#            scatter_kws={"s": 25})
X = df.iloc[:,:-1].values
Y = df.iloc[:,-1].values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print(type(X_train))
# Standardize: fit the scaler on the training set only, then apply it to both sets
S = StandardScaler()
S.fit(X_train)
x_train_stand = S.transform(X_train)
x_test_stand = S.transform(X_test)
Log = LogisticRegression(C=10.0)  # C is the inverse of the regularization strength
Log.fit(x_train_stand, Y_train)  # train the model
prediction = Log.predict(x_test_stand)  # predict the test set with the trained model Log
# The decision boundary is the line where w1*x1 + w2*x2 + b = 0,
# so x2 can be solved for in terms of x1:
def x2(x1):
    return (-Log.coef_[0, 0] * x1 - Log.intercept_[0]) / Log.coef_[0, 1]
x1_plot = x_test_stand[:,0]
x2_plot = x2(x1_plot)
df1 = pd.DataFrame(x_train_stand,columns=["x1","x2"])
df1["y"] = Y_train
sns.lmplot("x1", "x2", hue="y", data=df1,
height=6,
fit_reg=False
)
plt.plot(x1_plot,x2_plot,c='r')
plt.title('Decision Boundary')
plt.show()
print(classification_report(Y_test, prediction))
Result:
Non-linearly separable binary classification
# Read the data file
df = pd.read_csv("ex2data2.txt",names=['exam1', 'exam2', 'admitted'])
print(df.describe())
X = df.iloc[:,:-1].values
Y = df.iloc[:,-1].values
# Feature mapping: expand (x, y) into all monomials x^i * y^j with i + j <= power
def feature_mapping(x, y, power, as_ndarray=False):
    data = {"f{}{}".format(i - p, p): np.power(x, i - p) * np.power(y, p)
            for i in np.arange(power + 1)
            for p in np.arange(i + 1)}
    if as_ndarray:
        return pd.DataFrame(data).values
    else:
        return pd.DataFrame(data)
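# For example (a quick sanity check of my own, not part of the exercise):
# with power=2 the mapping produces six columns,
#   f00 = 1, f10 = x, f01 = y, f20 = x^2, f11 = x*y, f02 = y^2
# print(feature_mapping(np.array([2.0]), np.array([3.0]), 2))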
x1 = np.array(df.exam1)
x2 = np.array(df.exam2)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
# ex2data2's two features already lie roughly in [-1, 1.2], so no standardization
# is applied here; mapping the raw values keeps training, prediction, and the
# decision-boundary grid below in the same coordinate system.
power = 6
data = feature_mapping(X_train[:, 0], X_train[:, 1], power=power)
print(data.head())
dataTest = feature_mapping(X_test[:, 0], X_test[:, 1], power=power)  # same mapping as the training set
Log = LogisticRegression(C=1000.0, penalty='l2', solver='liblinear')
Log.fit(data, Y_train)  # train the model
prediction = Log.predict(dataTest)  # predict the test set with the trained model Log
print(classification_report(Y_test, prediction))
def draw_boundary(power):
    density = 1000
    threshold = 2e-3
    final_theta = Log.coef_
    x, y = find_decision_boundary(density, power, final_theta, Log.intercept_, threshold)
    print(final_theta)
    df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
    sns.lmplot(x='test1', y='test2', hue='accepted', data=df, height=6, fit_reg=False, scatter_kws={"s": 100})
    plt.scatter(x, y, c='r', s=10)  # near-boundary grid points drawn in red
    plt.title('Decision boundary')
    plt.show()
def find_decision_boundary(density, power, theta, intercept, threshold):
    # Evaluate the decision function on a dense grid over the raw feature range
    t1 = np.linspace(-1, 1.5, density)
    t2 = np.linspace(-1, 1.5, density)
    coordinates = [(x, y) for x in t1 for y in t2]
    print(len(coordinates))
    x_cord, y_cord = zip(*coordinates)
    mapped_cord = feature_mapping(x_cord, y_cord, power)  # this is a DataFrame
    print(mapped_cord.shape)
    print(theta.shape)
    # theta has shape (1, n_features); flatten it and include the intercept
    inner_product = mapped_cord.values @ theta.ravel() + intercept
    print(inner_product.shape)
    # keep the grid points where the decision function is approximately zero
    decision = mapped_cord[np.abs(inner_product) < threshold]
    return decision.f10, decision.f01  # the x and y coordinates of those points
draw_boundary(power=power)
Why did logistic regression with sklearn perform so poorly here at first? The main culprit was inconsistent preprocessing: the training features were mapped from the raw data while the test features were mapped from the standardized data, so the model was asked to predict points on a completely different scale. With the mapping applied consistently as above, what is left to tune is mainly the regularization strength C.
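One direction worth trying, sketched below as an assumption-laden alternative rather than the exercise's reference solution: replace the hand-rolled feature_mapping with sklearn's PolynomialFeatures (which generates the same monomials, up to column ordering), wrap mapping and scaling in a single Pipeline so train and test data are guaranteed to go through identical transformations, and let cross-validation choose C instead of fixing it at 1000.

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LogisticRegressionCV

# degree-6 polynomial expansion -> standardization -> logistic regression,
# with the regularization strength C chosen by 5-fold cross-validation
pipe = make_pipeline(
    PolynomialFeatures(degree=6, include_bias=False),
    StandardScaler(),
    LogisticRegressionCV(Cs=10, cv=5, max_iter=5000),
)
pipe.fit(X_train, Y_train)
print(pipe.score(X_test, Y_test))  # mean accuracy on the held-out test set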