# Import NumPy under its conventional alias ``np``.
import numpy as np
# Import pandas under its conventional alias ``pd``.
import pandas as pd
# Import matplotlib's pyplot interface under the alias ``plt``.
import matplotlib.pyplot as plt
# Import the logistic regression classifier from scikit-learn.
from sklearn.linear_model import LogisticRegression

# Load the training set (original note: the data can be found at UCI).
df_train = pd.read_csv('breast-cancer-train.csv')
# Load the test set.
df_test = pd.read_csv('breast-cancer-test.csv')
'''
该数据总共有 4 列,除了编号之外,包含 Clump Thickness、Cell Size、Type 三个特征,
其中前两个特征是数值型的,最后一项是布尔型,分别代表肿瘤患者的良性与恶性。
'''
# Use 'Clump Thickness' and 'Cell Size' as the features and split the test
# set into its negative (Type == 0) and positive (Type == 1) samples.
# NOTE(review): the original comment here said 0 = malignant and 1 = benign,
# while the plotting comments further down treat Type == 0 as benign and
# Type == 1 as malignant -- confirm the encoding against the dataset.
feature_columns = ['Clump Thickness', 'Cell Size']
is_negative = df_test['Type'] == 0
is_positive = df_test['Type'] == 1
df_test_negative = df_test.loc[is_negative, feature_columns]
df_test_positive = df_test.loc[is_positive, feature_columns]
# Scatter-plot the test samples: the Type == 0 subset as red circles
# (described as benign in the original comment) and the Type == 1 subset as
# black crosses (described as malignant in the original comment).
for samples, marker, size, colour in (
    (df_test_negative, 'o', 200, 'red'),
    (df_test_positive, 'x', 150, 'black'),
):
    plt.scatter(
        samples['Clump Thickness'],
        samples['Cell Size'],
        marker=marker,
        s=size,
        c=colour,
    )

# Label the x and y axes, then display the figure.
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.show()
# Sample a random intercept and random coefficients for a straight line
# using numpy's random sampler.
intercept = np.random.random([1])
coef = np.random.random([2])
lx = np.arange(0, 12)
# Solve intercept + coef[0] * x + coef[1] * y = 0 for y at every x in lx.
ly = (-intercept - coef[0] * lx) / coef[1]

# Draw the random line in yellow on top of the test samples.
plt.plot(lx, ly, c='yellow')
for samples, marker, size, colour in (
    (df_test_negative, 'o', 200, 'red'),
    (df_test_positive, 'x', 150, 'black'),
):
    plt.scatter(
        samples['Clump Thickness'],
        samples['Cell Size'],
        marker=marker,
        s=size,
        c=colour,
    )
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.show()
# Create the logistic regression classifier.
lr = LogisticRegression()

# Learn the line's coefficients and intercept from the first 10 training
# samples only.
feature_columns = ['Clump Thickness', 'Cell Size']
train_features = df_train[feature_columns][:10]
train_labels = df_train['Type'][:10]
lr.fit(train_features, train_labels)

# Report the accuracy on the full test set.
accuracy = lr.score(df_test[feature_columns], df_test['Type'])
print("Testing accuracy(10 training samples):", accuracy)

# Turn the fitted parameters into a line: solve
# intercept + coef[0] * x + coef[1] * y = 0 for y at every x in lx.
intercept = lr.intercept_
coef = lr.coef_[0, :]
ly = (-intercept - coef[0] * lx) / coef[1]

# Draw the learned line in green on top of the test samples.
plt.plot(lx, ly, c='green')
for samples, marker, size, colour in (
    (df_test_negative, 'o', 200, 'red'),
    (df_test_positive, 'x', 150, 'black'),
):
    plt.scatter(
        samples['Clump Thickness'],
        samples['Cell Size'],
        marker=marker,
        s=size,
        c=colour,
    )
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.show()
# Learn the line's coefficients and intercept from all training samples.
feature_columns = ['Clump Thickness', 'Cell Size']
lr.fit(df_train[feature_columns], df_train['Type'])

# Report the accuracy on the full test set.
accuracy = lr.score(df_test[feature_columns], df_test['Type'])
print("Testing accuracy(all training samples):", accuracy)

# Turn the fitted parameters into a line: solve
# intercept + coef[0] * x + coef[1] * y = 0 for y at every x in lx.
intercept = lr.intercept_
coef = lr.coef_[0, :]
ly = (-intercept - coef[0] * lx) / coef[1]

# Draw the learned line in blue on top of the test samples.
plt.plot(lx, ly, c='blue')
for samples, marker, size, colour in (
    (df_test_negative, 'o', 200, 'red'),
    (df_test_positive, 'x', 150, 'black'),
):
    plt.scatter(
        samples['Clump Thickness'],
        samples['Cell Size'],
        marker=marker,
        s=size,
        c=colour,
    )
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.show()
# NOTE(review): from this point the file repeats the script above verbatim
# (imports, data loading, plots and both fits) -- presumably a paste
# duplication; confirm and remove if unintended.

# Import NumPy under its conventional alias ``np``.
import numpy as np
# Import pandas under its conventional alias ``pd``.
import pandas as pd
# Import matplotlib's pyplot interface under the alias ``plt``.
import matplotlib.pyplot as plt
# Import the logistic regression classifier from scikit-learn.
from sklearn.linear_model import LogisticRegression

# Load the training set (original note: the data can be found at UCI).
df_train = pd.read_csv('breast-cancer-train.csv')
# Load the test set.
df_test = pd.read_csv('breast-cancer-test.csv')
'''
该数据总共有 4 列,除了编号之外,包含 Clump Thickness、Cell Size、Type 三个特征,
其中前两个特征是数值型的,最后一项是布尔型,分别代表肿瘤患者的良性与恶性。
'''
# Use 'Clump Thickness' and 'Cell Size' as the features and split the test
# set into its negative (Type == 0) and positive (Type == 1) samples.
# NOTE(review): the original comment here said 0 = malignant and 1 = benign,
# while the plotting comments further down treat Type == 0 as benign and
# Type == 1 as malignant -- confirm the encoding against the dataset.
feature_columns = ['Clump Thickness', 'Cell Size']
is_negative = df_test['Type'] == 0
is_positive = df_test['Type'] == 1
df_test_negative = df_test.loc[is_negative, feature_columns]
df_test_positive = df_test.loc[is_positive, feature_columns]
# Scatter-plot the test samples: the Type == 0 subset as red circles
# (described as benign in the original comment) and the Type == 1 subset as
# black crosses (described as malignant in the original comment).
for samples, marker, size, colour in (
    (df_test_negative, 'o', 200, 'red'),
    (df_test_positive, 'x', 150, 'black'),
):
    plt.scatter(
        samples['Clump Thickness'],
        samples['Cell Size'],
        marker=marker,
        s=size,
        c=colour,
    )

# Label the x and y axes, then display the figure.
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.show()
# Sample a random intercept and random coefficients for a straight line
# using numpy's random sampler.
intercept = np.random.random([1])
coef = np.random.random([2])
lx = np.arange(0, 12)
# Solve intercept + coef[0] * x + coef[1] * y = 0 for y at every x in lx.
ly = (-intercept - coef[0] * lx) / coef[1]

# Draw the random line in yellow on top of the test samples.
plt.plot(lx, ly, c='yellow')
for samples, marker, size, colour in (
    (df_test_negative, 'o', 200, 'red'),
    (df_test_positive, 'x', 150, 'black'),
):
    plt.scatter(
        samples['Clump Thickness'],
        samples['Cell Size'],
        marker=marker,
        s=size,
        c=colour,
    )
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.show()
# Create the logistic regression classifier.
lr = LogisticRegression()

# Learn the line's coefficients and intercept from the first 10 training
# samples only.
feature_columns = ['Clump Thickness', 'Cell Size']
train_features = df_train[feature_columns][:10]
train_labels = df_train['Type'][:10]
lr.fit(train_features, train_labels)

# Report the accuracy on the full test set.
accuracy = lr.score(df_test[feature_columns], df_test['Type'])
print("Testing accuracy(10 training samples):", accuracy)

# Turn the fitted parameters into a line: solve
# intercept + coef[0] * x + coef[1] * y = 0 for y at every x in lx.
intercept = lr.intercept_
coef = lr.coef_[0, :]
ly = (-intercept - coef[0] * lx) / coef[1]

# Draw the learned line in green on top of the test samples.
plt.plot(lx, ly, c='green')
for samples, marker, size, colour in (
    (df_test_negative, 'o', 200, 'red'),
    (df_test_positive, 'x', 150, 'black'),
):
    plt.scatter(
        samples['Clump Thickness'],
        samples['Cell Size'],
        marker=marker,
        s=size,
        c=colour,
    )
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.show()
# Learn the line's coefficients and intercept from all training samples.
feature_columns = ['Clump Thickness', 'Cell Size']
lr.fit(df_train[feature_columns], df_train['Type'])

# Report the accuracy on the full test set.
accuracy = lr.score(df_test[feature_columns], df_test['Type'])
print("Testing accuracy(all training samples):", accuracy)

# Turn the fitted parameters into a line: solve
# intercept + coef[0] * x + coef[1] * y = 0 for y at every x in lx.
intercept = lr.intercept_
coef = lr.coef_[0, :]
ly = (-intercept - coef[0] * lx) / coef[1]

# Draw the learned line in blue on top of the test samples.
plt.plot(lx, ly, c='blue')
for samples, marker, size, colour in (
    (df_test_negative, 'o', 200, 'red'),
    (df_test_positive, 'x', 150, 'black'),
):
    plt.scatter(
        samples['Clump Thickness'],
        samples['Cell Size'],
        marker=marker,
        s=size,
        c=colour,
    )
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.show()