4.15-多元线性回归、逻辑回归、随机梯度

最新推荐文章于 2021-03-26 10:53:54 发布

weixin_40387941

最新推荐文章于 2021-03-26 10:53:54 发布

阅读量337

点赞数

分类专栏： python分类 文章标签： python分类模型

python分类专栏收录该内容

4 篇文章 0 订阅

订阅专栏

#1.数据预处理
import numpy as np
import matplotlib.pyplot as plt
# Containers for the input values (x) and the target values (y).
x, y = [], []
# Read the data set line by line; each non-empty line is one "x,y" sample.
# The context manager closes the file as soon as reading is finished.
with open("price.txt", "r") as price_file:
    for sample in price_file:
        # Skip blank lines (e.g. a trailing empty line), which would
        # otherwise fail to unpack into exactly two fields.
        if not sample.strip():
            continue
        # Split the line on the comma separator.
        _x, _y = sample.split(",")
        # Convert the string fields to floats.
        x.append(float(_x))
        y.append(float(_y))
# Convert the collected values to NumPy arrays for the next steps.
x, y = np.array(x), np.array(y)
print(x, y)
# Standardize x: subtract its mean and divide by its standard deviation.
x = (x - x.mean()) / x.std()
# Draw the data set as a scatter plot.
plt.figure()
plt.scatter(x, y, c="g", s=6)
plt.show()
# 2. Choose and train the model
# Take 100 evenly spaced points in [-2, 4] as the x-coordinates for plotting.
x0 = np.linspace(-2, 4, 100)
# NumPy provides the two functions used here:
#   np.polyfit(x, y, deg) is the training function; it returns the
#   polynomial coefficients p that minimize the loss.
#   np.polyval(p, x) returns the value of the polynomial at the given x.
# deg is the polynomial degree n; try several values to compare models.


def get_model(deg):
    """Return a callable that evaluates the degree-``deg`` polynomial fit.

    The polynomial is fitted to the module-level samples ``x`` and ``y``.
    The returned callable takes the points to evaluate at (defaulting to the
    plotting grid ``x0``) and returns the fitted polynomial's values there.
    The fit is recomputed each time the returned callable is invoked.
    """
    return lambda input_x=x0: np.polyval(np.polyfit(x, y, deg), input_x)
# 3. Evaluate and visualize the results
# The model with the smallest loss is considered the best fit on these data.


def get_cost(deg, input_x, input_y):
    """Return the loss of the degree-``deg`` model on the given samples.

    The loss is half the sum of squared differences between the model's
    predictions at ``input_x`` and the targets ``input_y``.
    """
    return 0.5 * ((get_model(deg)(input_x) - input_y) ** 2).sum()
# Polynomial degrees to experiment with.
test_set = (1, 4, 10)
# Print the training loss for each degree.
for d in test_set:
    print(get_cost(d, x, y))
# A low training loss may just mean overfitting, so also judge from the plot.
plt.scatter(x, y, c="g", s=20)
for d in test_set:
    # Calling the model with no argument evaluates it on the x0 grid.
    plt.plot(x0, get_model(d)(), label="degree={}".format(d))
# Limit the axes to x in (-2, 4) and y in (1e5, 8e5); 1e5 means 10**5.
plt.xlim(-2, 4)
plt.ylim(1e5, 8e5)
# legend() makes the label of each curve show up on the figure.
plt.legend()
plt.show()
# Conclusion from the original notes: degree n=1 works best.

#####
# Notes from the Kaggle book, part 1
# Membership test: `in` works on lists, tuples and dicts, but on a dict it
# only checks the keys.
# Example 1: iterate over the keys of dict d and print each key/value pair.
d = {1: '1', "abc": 0.1, 0.4: 80}
for k in d:
    print(k, ":", d[k])
#本篇练习总结:线性分类器。训练集在10万量级以上，考虑时间耗用，推荐使用随机梯度对模型参数进行估计。
import pandas as pd
import numpy as np
# 1. Data preprocessing
# Column names to assign to the data: 'Id', nine columns '1'..'9', 'class'.
column_names = ['Id', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'class']
# Read the data from the internet. '?' is the file's missing-value marker;
# parsing it as NaN at read time keeps the affected column numeric instead
# of leaving it as strings.
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',
    names=column_names,
    na_values='?',
)
# Drop every row that has a missing value in any column.
data = data.dropna(how='any')
# Print the number of rows and columns (a bare `data.shape` expression
# shows nothing when run as a script).
print(data.shape)
# 2. Split the labelled data into a training set and a test set.
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module is no longer shipped by scikit-learn.
from sklearn.model_selection import train_test_split
# Randomly sample 25% for testing and keep the rest for training.
# column_names[1:10] selects indices 1..9: it includes the 2nd column and
# excludes the 11th, which is used as the target.
X_train, X_test, y_train, y_test = train_test_split(
    data[column_names[1:10]],
    data[column_names[10]],
    test_size=0.25,
    random_state=33,
)
# Print the class distribution of the training and test samples.
print(y_train.value_counts())
print(y_test.value_counts())
# Standardize the data, then train both classifiers.
from sklearn.preprocessing import StandardScaler  # feature standardization
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.linear_model import SGDClassifier  # stochastic-gradient estimation
# Standardize so that each feature has zero mean and unit variance.
ss = StandardScaler()
# fit_transform() learns the scaling from the training set and applies it.
X_train = ss.fit_transform(X_train)
# transform() reuses the scaling learned above on the test set.
X_test = ss.transform(X_test)
# Initialize the two classifiers.
lr = LogisticRegression()
sgdc = SGDClassifier()
# Train the LogisticRegression model with fit().
lr.fit(X_train, y_train)
# Predict X_test with the trained lr model.
lr_y_predict = lr.predict(X_test)
# Train the SGDClassifier model with fit().
sgdc.fit(X_train, y_train)
# Predict X_test with the trained sgdc model. This must call sgdc, not lr;
# otherwise the SGD report would describe the logistic-regression output.
sgdc_y_predict = sgdc.predict(X_test)
# Evaluate both models: accuracy via each classifier's built-in score(),
# plus precision, recall and F1 per class via classification_report.
from sklearn.metrics import classification_report
print('Accuracy of LR Classifier: ', lr.score(X_test, y_test))
print(classification_report(y_test, lr_y_predict, target_names=['Benign', 'Malignant']))
print('Accuracy of SGD Classifier: ', sgdc.score(X_test, y_test))
print(classification_report(y_test, sgdc_y_predict, target_names=['Benign', 'Malignant']))
# Observation from the original notes: logistic regression scored slightly
# higher than the SGD classifier.
# NOTE(review): the notes attribute this to LogisticRegression being solved
# "analytically" while SGDClassifier uses gradient estimates; scikit-learn
# documents iterative solvers for LogisticRegression, so confirm the claim
# against the library documentation before relying on it.

weixin_40387941

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
4.15-多元线性回归、逻辑回归、随机梯度

#1.数据预处理import numpy as npimport matplotlib.pyplot as plt#定义存储数据（x）和目标数据(y)的数组x,y=[],[]#遍历数据集，变量sample是一个样本for sample in open("price.txt","r"): #调用split方法并将逗号作为参数传入 _x,_y=sample.split(",...
复制链接

扫一扫