Classification Learning

Tumor Detection

  Logistic regression classification (LogisticRegression) and stochastic gradient descent (SGD)

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Import the logistic regression classifier from sklearn
from sklearn.linear_model import LogisticRegression

df_train = pd.read_csv('../dataSets/breast-cancer-train.csv')
df_test = pd.read_csv('../dataSets/breast-cancer-test.csv')

df_test_negative = df_test.loc[df_test['Type'] == 0][['Clump Thickness', 'Cell Size', 'Type']]
df_test_positive = df_test.loc[df_test['Type'] == 1][['Clump Thickness', 'Cell Size', 'Type']]

# print(df_test_negative, df_test_positive, sep='\n******************************\n')

plt.scatter(df_test_negative['Clump Thickness'], df_test_negative['Cell Size'], marker='o', s=200, c='red')
plt.scatter(df_test_positive['Clump Thickness'], df_test_positive['Cell Size'], marker='x', s=150, c='black')
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')

# intercept = np.random.random_sample()
# coef = np.random.random_sample(2)
# lx = np.arange(1,12)
# ly = (intercept+lx*coef[0])/coef[1]
# plt.plot(lx,ly,c='yellow')
# plt.show()

# Logistic regression classifier
lr = LogisticRegression()
lr.fit(df_train[['Clump Thickness', 'Cell Size']], df_train['Type'])
score = lr.score(df_test[['Clump Thickness', 'Cell Size']], df_test['Type'])
print('Testing accuracy:', score)
intercept = lr.intercept_
coef = lr.coef_[0, :]
# print(type(coef))  # returned type: <class 'numpy.ndarray'>
# print(coef[0], coef[1], sep='\t')
lx = np.arange(1, 12)
ly = (-intercept - lx * coef[0]) / coef[1] # intercept+coef[0]*lx+coef[1]*ly=0
plt.plot(lx, ly, c='green')
plt.show()

[Figure: scatter plot of the test samples with the learned decision boundary (green line)]
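The green line is the decision boundary learned from the two features: the set of points (x1, x2) where intercept + coef[0]*x1 + coef[1]*x2 = 0, which is why the code solves that equation for ly. As a sanity check, a minimal sketch that reuses the lr, lx and ly variables from the script above (not part of the original code):

# Points on the plotted line should lie (approximately) on the decision boundary,
# i.e. their decision_function values should be close to zero.
boundary_points = np.column_stack([lx, ly])
print(lr.decision_function(boundary_points))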
  
  Here the complete dataset is used, and both LogisticRegression and SGD are applied.

import numpy as np
import pandas as pd
# from sklearn.cross_validation import train_test_split  # deprecated
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier

'''
#  Attribute                     Domain
   -- -----------------------------------------
   1. Sample code number            id number
   2. Clump Thickness               1 - 10
   3. Uniformity of Cell Size       1 - 10
   4. Uniformity of Cell Shape      1 - 10
   5. Marginal Adhesion             1 - 10
   6. Single Epithelial Cell Size   1 - 10
   7. Bare Nuclei                   1 - 10
   8. Bland Chromatin               1 - 10
   9. Normal Nucleoli               1 - 10
  10. Mitoses                       1 - 10
  11. Class:                        (2 for benign, 4 for malignant)

'''
# Create the list of column names
column_names = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size',
                'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size',
                'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli',
                'Mitoses', 'Class']
# Read the data
data = pd.read_csv('../dataSets/breast-cancer-wisconsin.data', names=column_names)
# Inspect the data
# print(data[['Sample code number','Class']])
# Data cleaning: replace '?' with the standard missing value NaN, then drop incomplete rows
# (how='any': a row is dropped as soon as any one of its columns is missing)
data = data.replace(to_replace='?', value=np.nan)
data = data.dropna(how='any')
# Check the data size and dimensionality: rows are samples, columns are features
print(data.shape)

'''
  Split the dataset:
  25% as the test set, 75% as the training set
'''
# training features, test features, training labels, test labels
X_train, X_test, y_train, y_test = train_test_split(data[column_names[1:10]], data[column_names[10]], test_size=0.25,
                                                    random_state=33)
# Count the samples in each class [e.g. 2  100]
print(y_train.value_counts())  # y_train:  <class 'pandas.core.series.Series'>
print(y_test.value_counts())

'''
  Data preprocessing
'''
ss = StandardScaler()
# Standardize the training features (zero mean, unit variance)
X_train = ss.fit_transform(X_train)
# What happens if the test set is standardized with its own statistics instead?
X_test = ss.transform(X_test)  # score is: 0.988304093567
# X_test = ss.fit_transform(X_test) # score is: 0.970760233918  # this is absolutely wrong!!!
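# Added note: the test set must be scaled with the mean/std learned from the training
# set (ss.transform). Re-fitting the scaler on the test data (fit_transform) scales it
# with different statistics, so the test features no longer match the scale the model
# was trained on, which is why the score above drops.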

'''
  Create the classifiers:
  LogisticRegression    SGDClassifier
'''
lr = LogisticRegression()
# Call fit on the LogisticRegression model to learn the model parameters
lr.fit(X_train, y_train)
intercept = lr.intercept_
coef = lr.coef_[0, :]
# The learned linear function
print('intercept:',intercept)
print(coef[0], coef[1], sep='\t')
lr_y_predict = lr.predict(X_test)
score = lr.score(X_test,y_test)
print('Accuracy of LR Classifier:', score)
report = classification_report(y_test, lr_y_predict, target_names=['Benign', 'Malignant'])
print(report)

sgdc = SGDClassifier(max_iter=5,tol=None)
sgdc.fit(X=X_train, y=y_train)
sgdc_y_predict = sgdc.predict(X_test)
intercept = sgdc.intercept_
coef = sgdc.coef_[0, :]
print('intercept:',intercept)
print(coef[0], coef[1], sep='\t')
report = classification_report(y_test, sgdc_y_predict, target_names=['Benign', 'Malignant'])
print(report)
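classification_report gives per-class precision, recall and F1. If the raw counts behind those scores are also of interest, a confusion matrix can be appended to the script; a minimal sketch (confusion_matrix is not used in the original code):

from sklearn.metrics import confusion_matrix
# Rows are the true classes (2 = benign, 4 = malignant), columns are the predicted classes
print(confusion_matrix(y_test, lr_y_predict))
print(confusion_matrix(y_test, sgdc_y_predict))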

 Program output

(683, 11)
2    344
4    168
Name: Class, dtype: int64
2    100
4     71
Name: Class, dtype: int64
intercept: [-1.22395889]
1.27854421503   0.162362376046
Accuracy of LR Classifier: 0.988304093567
             precision    recall  f1-score   support

     Benign       0.99      0.99      0.99       100
  Malignant       0.99      0.99      0.99        71

avg / total       0.99      0.99      0.99       171

intercept: [-0.82457626]
8.7408800265    -0.0709122072493
             precision    recall  f1-score   support

     Benign       1.00      0.98      0.99       100
  Malignant       0.97      1.00      0.99        71

avg / total       0.99      0.99      0.99       171
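Reading the SGDClassifier report back from the support counts: a Benign recall of 0.98 with support 100 means 98 of the 100 benign samples were classified correctly and 2 were labelled malignant, while all 71 malignant samples were recovered (recall 1.00); the malignant precision is therefore 71 / (71 + 2) ≈ 0.97 and the benign precision 98 / 98 = 1.00, matching the table.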

Handwritten Digit Recognition

  Use the support vector machine classifier LinearSVC provided by Scikit-learn.

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

'''
 Load the dataset: a dataset bundled inside Scikit-learn
'''
digits = load_digits()  # digits: <class 'sklearn.utils.Bunch'>
# print(digits.data.shape) # (1797, 64)   digits.data: <class 'numpy.ndarray'>
# print(digits.target) # <class 'numpy.ndarray'> [0 1 2 ..., 8 9 8]
# print(type(digits.target_names)) # <class 'numpy.ndarray'> [0 1 2 3 4 5 6 7 8 9]
# print(type(digits.target_names.astype(str))) # <class 'numpy.ndarray'> ['0' '1' '2' '3' '4' '5' '6' '7' '8' '9']
'''
 Split the dataset
'''
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.25, random_state=33)
'''
 Standardize the dataset
'''
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
'''
 Support vector machine classifier
'''
l_svc = LinearSVC()
l_svc.fit(X_train, y_train)
y_predict = l_svc.predict(X_test)
'''
 Performance metrics
'''
score = l_svc.score(X_test, y_test)
print('the accuracy of linearSVC is ', score)
report = classification_report(y_test, y_predict, target_names=digits.target_names.astype(str), digits=2)
print(report)

Output

the accuracy of linearSVC is  0.953333333333
             precision    recall  f1-score   support

          0       0.92      1.00      0.96        35
          1       0.96      0.98      0.97        54
          2       0.98      1.00      0.99        44
          3       0.93      0.93      0.93        46
          4       0.97      1.00      0.99        35
          5       0.94      0.94      0.94        48
          6       0.96      0.98      0.97        51
          7       0.92      1.00      0.96        35
          8       0.98      0.84      0.91        58
          9       0.95      0.91      0.93        44

avg / total       0.95      0.95      0.95       450
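Each sample in load_digits is an 8×8 grayscale image flattened into the 64 features of digits.data. A minimal sketch to look at one raw image (not part of the original script; it assumes matplotlib is available):

import matplotlib.pyplot as plt
from sklearn.datasets import load_digits

digits = load_digits()
# digits.images[0] is the 8x8 pixel matrix behind digits.data[0]
plt.matshow(digits.images[0], cmap='gray')
plt.title('label: ' + str(digits.target[0]))
plt.show()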

Special note: all the examples in this article are taken from the book 《机器学习及实战》.
