# coding=utf-8
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
def logistic():
    """
    Predict breast-cancer malignancy with binary logistic regression.

    Reads ``./breast-cancer-wisconsin.data`` using the column names defined
    below, drops every row that contains the ``'?'`` missing-value marker,
    splits the data 75/25 into train and test sets, standardizes the nine
    feature columns, fits a ``LogisticRegression`` model, and prints the
    model coefficients, the test-set score and a classification report
    for the two class labels (2 and 4).

    :return: None
    """
    # Column names for the data file: an id column, nine features, the label.
    column = [
        'sample code number',
        'clump thickness',
        'uniformity of cell size',
        'yniformity of cell shape',
        'marginal adhesion',
        'single epithelial cell size',
        'bare nuclei',
        'bland chromatin',
        'normal nucleoli',
        'mitoses',
        'class',
    ]
    feature_columns = column[1:10]
    label_column = column[10]

    # Load the data
    data = pd.read_csv("./breast-cancer-wisconsin.data", names=column)
    print(len(data))
    print(data.head(10))

    # Missing values are marked with '?': turn them into NaN and drop those rows
    data = data.replace(to_replace='?', value=np.nan)
    data = data.dropna()

    # A column that contained '?' may have been read as strings, so make the
    # features explicitly numeric before they reach the scaler and the model.
    features = data[feature_columns].astype(float)

    # Train/test split
    x_train, x_test, y_train, y_test = train_test_split(
        features, data[label_column], test_size=0.25
    )
    print("-" * 100)

    # Standardize. The scaler is fitted on the training split only and then
    # reused for the test split, so both are scaled with the same mean/std and
    # no test-set statistics leak into preprocessing.
    std = StandardScaler()
    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)

    # Fit the logistic regression model
    log = LogisticRegression(C=1.0)
    log.fit(x_train, y_train)
    print(log.coef_)

    y_predict = log.predict(x_test)
    # score() is the mean accuracy on the test split
    print("预测值: ", log.score(x_test, y_test))
    print("召回率:" , classification_report(y_test, y_predict, labels=[2, 4], target_names=["良性", "恶性"]))
    return None
if __name__ == "__main__":
    # Run the demo only when this file is executed as a script, not on import.
    logistic()