肿瘤检测
逻辑斯蒂回归分类(LogisticRegression)和随机梯度下降(SGD)
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# scikit-learn's logistic-regression classifier
from sklearn.linear_model import LogisticRegression

# Load the pre-split breast-cancer training and test sets.
df_train = pd.read_csv('../dataSets/breast-cancer-train.csv')
df_test = pd.read_csv('../dataSets/breast-cancer-test.csv')

# Separate the test samples by tumor type so each class gets its own marker.
plot_cols = ['Clump Thickness', 'Cell Size', 'Type']
df_test_negative = df_test.loc[df_test['Type'] == 0][plot_cols]
df_test_positive = df_test.loc[df_test['Type'] == 1][plot_cols]

# Negative (Type == 0) samples as red circles, positive (Type == 1) as black crosses.
plt.scatter(df_test_negative['Clump Thickness'], df_test_negative['Cell Size'], marker='o', s=200, c='red')
plt.scatter(df_test_positive['Clump Thickness'], df_test_positive['Cell Size'], marker='x', s=150, c='black')
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')

# Train a logistic-regression classifier on the two visible features and
# report its accuracy on the held-out test set.
lr = LogisticRegression()
lr.fit(df_train[['Clump Thickness', 'Cell Size']], df_train['Type'])
score = lr.score(df_test[['Clump Thickness', 'Cell Size']], df_test['Type'])
print('Testing accuracy:', score)

# Recover the learned decision boundary. The model defines
#   intercept + coef[0]*x + coef[1]*y = 0
# so the boundary line is y = (-intercept - coef[0]*x) / coef[1].
intercept = lr.intercept_
coef = lr.coef_[0, :]  # coef_ is 2-D; take the single row of weights
line_x = np.arange(1, 12)
line_y = (-intercept - line_x * coef[0]) / coef[1]
plt.plot(line_x, line_y, c='green')
plt.show()
这里是使用完整的数据集,并且用LogisticRegression和SGD两种方法
import numpy as np
import pandas as pd
# sklearn.cross_validation is deprecated; model_selection is its replacement
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier

'''
Wisconsin breast-cancer dataset layout:
#  Attribute                     Domain
-- -----------------------------------------
1. Sample code number            id number
2. Clump Thickness               1 - 10
3. Uniformity of Cell Size       1 - 10
4. Uniformity of Cell Shape      1 - 10
5. Marginal Adhesion             1 - 10
6. Single Epithelial Cell Size   1 - 10
7. Bare Nuclei                   1 - 10
8. Bland Chromatin               1 - 10
9. Normal Nucleoli               1 - 10
10. Mitoses                      1 - 10
11. Class: (2 for benign, 4 for malignant)
'''

# Column names for the raw .data file (it ships without a header row).
column_names = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size',
                'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size',
                'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli',
                'Mitoses', 'Class']

data = pd.read_csv('../dataSets/breast-cancer-wisconsin.data', names=column_names)

# The file encodes missing values as '?'. Map them to NaN, then drop any row
# with at least one missing field (how='any').
data = data.replace(to_replace='?', value=np.nan)
data = data.dropna(how='any')

# (rows, columns) after cleaning — rows are samples, columns are attributes.
print(data.shape)

# Hold out 25% of the cleaned data as a test set; random_state fixed so the
# split (and the printed numbers below) are reproducible.
X_train, X_test, y_train, y_test = train_test_split(data[column_names[1:10]], data[column_names[10]], test_size=0.25,
                                                    random_state=33)

# Class balance of each split (labels are 2 = benign, 4 = malignant).
print(y_train.value_counts())  # y_train is a pandas Series
print(y_test.value_counts())

# Standardize features: fit the scaler on the training set only, then apply
# the SAME transform to the test set. Re-fitting on the test set would leak
# test statistics and is simply wrong.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# --- Classifier 1: logistic regression -----------------------------------
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Inspect the fitted linear model (bias plus the first two feature weights).
intercept = lr.intercept_
coef = lr.coef_[0, :]
print('intercept:',intercept)
print(coef[0], coef[1], sep='\t')

lr_y_predict = lr.predict(X_test)
score = lr.score(X_test,y_test)
print('Accuracy of LR Classifier:', score)
report = classification_report(y_test, lr_y_predict, target_names=['Benign', 'Malignant'])
print(report)

# --- Classifier 2: linear model trained by stochastic gradient descent ----
sgdc = SGDClassifier(max_iter=5,tol=None)
sgdc.fit(X=X_train, y=y_train)
sgdc_y_predict = sgdc.predict(X_test)

intercept = sgdc.intercept_
coef = sgdc.coef_[0, :]
print('intercept:',intercept)
print(coef[0], coef[1], sep='\t')
report = classification_report(y_test, sgdc_y_predict, target_names=['Benign', 'Malignant'])
print(report)
程序的运行结果
(683, 11)
2 344
4 168
Name: Class, dtype: int64
2 100
4 71
Name: Class, dtype: int64
intercept: [-1.22395889]
1.27854421503 0.162362376046
Accuracy of LR Classifier: 0.988304093567
precision recall f1-score support
Benign 0.99 0.99 0.99 100
Malignant 0.99 0.99 0.99 71
avg / total 0.99 0.99 0.99 171
intercept: [-0.82457626]
8.7408800265 -0.0709122072493
precision recall f1-score support
Benign 1.00 0.98 0.99 100
Malignant 0.97 1.00 0.99 71
avg / total 0.99 0.99 0.99 171
手写数字识别
使用Scikit-learn中提供的支持向量机分类器LinearSVC。
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# Handwritten-digit recognition with scikit-learn's bundled digits dataset
# (a Bunch with .data of shape (1797, 64) and integer labels 0-9).
digits = load_digits()

# Hold out 25% for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.25, random_state=33)

# Standardize: fit on the training split only, reuse the same transform on test.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Linear support-vector classifier.
l_svc = LinearSVC()
l_svc.fit(X_train, y_train)
y_predict = l_svc.predict(X_test)

# Overall accuracy plus per-class precision/recall/F1 (class names are the
# digit labels rendered as strings).
score = l_svc.score(X_test, y_test)
print('the accuracy of linearSVC is ', score)
report = classification_report(y_test, y_predict, target_names=digits.target_names.astype(str), digits=2)
print(report)
结果
the accuracy of linearSVC is 0.953333333333
precision recall f1-score support
0 0.92 1.00 0.96 35
1 0.96 0.98 0.97 54
2 0.98 1.00 0.99 44
3 0.93 0.93 0.93 46
4 0.97 1.00 0.99 35
5 0.94 0.94 0.94 48
6 0.96 0.98 0.97 51
7 0.92 1.00 0.96 35
8 0.98 0.84 0.91 58
9 0.95 0.91 0.93 44
avg / total 0.95 0.95 0.95 450
特别说明:本文所有案例都是来自书籍《Python机器学习及实践》