sklearn学习

最新推荐文章于 2018-12-07 17:35:00 发布

亲爱的水告先生

最新推荐文章于 2018-12-07 17:35:00 发布

阅读量114

点赞数

分类专栏：数据分析

本文链接：https://blog.csdn.net/weixin_43488626/article/details/84676956

版权

数据分析专栏收录该内容

5 篇文章 0 订阅

订阅专栏

sklearn

字典特征抽取(将类别特征转化为 one-hot 编码)

import jieba
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


from sklearn.preprocessing import MinMaxScaler, StandardScaler, Imputer

def dictvenc():
    """One-hot encode a small list of city records with ``DictVectorizer``.

    String-valued fields ('city', 'pos') become one 0/1 column per distinct
    value; the numeric 'temperature' field is kept as a single numeric column.
    Prints the feature names, the dense feature matrix and the first record
    rebuilt with ``inverse_transform``.

    Returns:
        None
    """
    # Named ``vectorizer`` rather than ``dict`` so the builtin is not shadowed.
    # sparse=False asks for a dense array instead of a sparse matrix.
    vectorizer = DictVectorizer(sparse=False)
    records = [
        {'city': '北京', 'pos': '北方', 'temperature': 100},
        {'city': '上海', 'pos': '东方', 'temperature': 60},
        {'city': '深圳', 'pos': '南方', 'temperature': 30},
        {'city': '重庆', 'pos': '南方', 'temperature': 70},
    ]
    # fit_transform learns the feature layout and encodes the records at once.
    data = vectorizer.fit_transform(records)
    # NOTE(review): newer scikit-learn releases replace get_feature_names()
    # with get_feature_names_out(); kept as-is to match the version this file
    # targets (it still imports Imputer).
    print(vectorizer.get_feature_names())
    print(data)
    print(vectorizer.inverse_transform(data)[0])
    return None

文本特征抽取(词频向量化)

def countvec():
    """Print the vocabulary and the term-count matrix of two English sentences.

    Returns:
        None
    """
    corpus = ["this is a test test", "we have a test"]
    vectorizer = CountVectorizer()
    # NOTE: with default settings the one-letter word "a" is dropped by the
    # default token pattern (tokens of two or more characters), not by a
    # stop-word list; no stop words are configured here.
    counts = vectorizer.fit_transform(corpus)
    print(vectorizer.get_feature_names())
    # fit_transform returns a sparse matrix; toarray() makes it printable.
    print(counts.toarray())
    return None


def cutword():
    """Segment three fixed Chinese sentences with jieba.

    Returns:
        tuple: three strings, each holding one sentence's tokens joined by a
        single space so that a whitespace-based vectorizer can split them.
    """
    sentences = (
        '床前明月光, 我要学python.',
        '床前明月光, 疑是地上霜.',
        '生存或死亡, 这是一个问题.',
    )
    # jieba.cut returns an iterable of tokens, which str.join consumes directly.
    c1, c2, c3 = (' '.join(jieba.cut(sentence)) for sentence in sentences)
    return c1, c2, c3


def hanzivec():
    """Count-vectorize the pre-segmented Chinese sentences from ``cutword``.

    Prints the space-joined sentences, then every vocabulary entry on its own
    line, then the dense count matrix.

    Returns:
        None
    """
    first, second, third = cutword()
    vectorizer = CountVectorizer()
    print(first, second, third)
    counts = vectorizer.fit_transform([first, second, third])

    # One vocabulary entry per line.
    for feature in vectorizer.get_feature_names():
        print(feature)

    print(counts.toarray())
    return None

# Module-level call: runs the Chinese-text vectorization demo as soon as this
# code is executed (including on import).
hanzivec()

归一化处理

数据归一化处理：按列将所有数值缩放到区间 [2, 3]

def mm():
    """Min-max scale a 3x4 sample matrix into the range [2, 3] and print it.

    Scaling is applied column by column (one feature at a time).
    """
    samples = [[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]]
    scaler = MinMaxScaler(feature_range=(2, 3))
    scaled = scaler.fit_transform(samples)

    print(scaled)

标准化缩放(将每一列缩放为均值 0、标准差 1,结果并不限定在 [-1, 1] 区间内)

def stand():
    """Standardize a 3x3 sample matrix and print the result.

    After scaling, each column has mean 0 and standard deviation 1.
    """
    samples = [
        [1., -1., 3.],
        [2., 4., 2.],
        [4., 6., -1.],
    ]
    scaler = StandardScaler()
    scaled = scaler.fit_transform(samples)
    print(scaled)

线性模型(最小二乘法)二维回归预测

import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
# Historical (floor area, price) pairs, e.g. as loaded from a CSV file, held
# as a 2-D array: column 0 is the area (x axis), column 1 the price (y axis).
data = np.array([
    [150, 6450],
    [200, 7450],
    [250, 8450],
    [300, 9450],
    [350, 11450],
    [400, 15450],
    [600, 18450],
])
plt.scatter(data[:, 0], data[:, 1], color='blue')
# Ordinary least-squares fit. The estimator expects a 2-D feature matrix, so
# the area column is selected with a list index to keep its (n, 1) shape.
regr = linear_model.LinearRegression().fit(data[:, [0]], data[:, 1])
# Slope(s) and intercept of the fitted line.
a = regr.coef_
b = regr.intercept_
# Draw the fitted line over the scatter plot.
plt.plot(data[:, 0], regr.predict(data[:, [0]]), color='red')
# Predict the price of a house whose floor area is 175.
print(regr.predict([[175]]))
plt.show()

多维回归(多个参数预测)

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

def price_predict():
    """Fit a linear model on five sample houses and print one predicted price.

    Every house has three features: distance to the subway, number of nearby
    primary schools and the green-space ratio of the estate. Features and
    prices are standardized before fitting, and the prediction is mapped back
    to the original price scale before it is printed.
    """
    features = np.array([
        [500.0, 3.0, 0.3],
        [1000.0, 1.0, 0.6],
        [750.0, 2.0, 0.3],
        [600.0, 5.0, 0.2],
        [1200.0, 1.0, 0.6],
    ])
    # Price of each house above, in the same order.
    prices = np.array([10000., 9000., 8000., 12000., 8500.])

    feature_scaler = StandardScaler()
    scaled_features = feature_scaler.fit_transform(features)

    target_scaler = StandardScaler()
    # The scaler needs 2-D input, so the 1-D prices become a single column.
    scaled_prices = target_scaler.fit_transform(prices.reshape(-1, 1))

    model = LinearRegression()
    model.fit(scaled_features, scaled_prices)

    # The new house [1300, 3.0, 0.4] is scaled with the scaler fitted on the
    # training features so it lives in the same space as the training data.
    new_house = feature_scaler.transform(np.array([[1300, 3.0, 0.4]]))
    print(target_scaler.inverse_transform(model.predict(new_house)))

# Run the multi-feature price demo only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    price_predict()

示例

# Use the housing dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this snippet needs an older release - confirm the target version.
from sklearn.datasets import load_boston

lb = load_boston()
# Feature matrix; show the first sample.
X = lb.data
print(X[0])
# Target values (house prices).
Y = lb.target

# Standardize the features and the targets with separate scalers.
std_x = StandardScaler()
x_house = std_x.fit_transform(X)
std_y = StandardScaler()
# The scaler needs 2-D input, so the 1-D targets become a single column.
y_house = std_y.fit_transform(Y.reshape(-1, 1))

# Build the linear model and train it on the historical data.
lr = LinearRegression().fit(x_house, y_house)

# Predict the price of one house. The sample appears to be the first dataset
# row printed above (TODO confirm); it is scaled with the fitted scaler first.
x_predict = std_x.transform(np.array([[
    6.320e-03, 1.800e+01, 2.310e+00, 0.000e+00, 5.380e-01, 6.575e+00,
    6.520e+01, 4.090e+00, 1.000e+00, 2.960e+02, 1.530e+01, 3.969e+02,
    4.980e+00,
]]))
# Map the standardized prediction back to the original price scale.
print(std_y.inverse_transform(lr.predict(x_predict)))

将数据划分为训练集和测试集,用来训练并评估预测模型

def boston_linear():
    """Train, persist, reload and evaluate a linear regression on Boston data.

    The dataset is split 75/25 into training and test sets. Features and
    targets are standardized with scalers fitted on the training split only.
    The fitted model is saved to ``./test.pkl``, loaded back, and the reloaded
    model predicts the test split. The R^2 score, computed on the original
    price scale, is printed.

    Returns:
        None
    """
    # Function-scope imports so every name used below is guaranteed to be in
    # scope when this function runs.
    import joblib
    from sklearn.metrics import r2_score
    from sklearn.model_selection import train_test_split

    lb = load_boston()
    # Hold out 25% of the samples for evaluation.
    x_train, x_test, y_train, y_test = train_test_split(
        lb.data, lb.target, test_size=0.25)

    # Fit the feature scaler on the training split only, then reuse it on the
    # test split so both are scaled the same way.
    std_x = StandardScaler()
    x_train = std_x.fit_transform(x_train)
    x_test = std_x.transform(x_test)

    # Same for the targets; reshape(-1, 1) turns the 1-D targets into a single
    # column (-1 lets numpy infer the number of rows).
    std_y = StandardScaler()
    y_train = std_y.fit_transform(y_train.reshape(-1, 1))
    y_test = std_y.transform(y_test.reshape(-1, 1))

    lr = LinearRegression()
    lr.fit(x_train, y_train)

    # Persist the trained model and load it back. The reloaded model is the
    # one used for prediction, so the save/load round trip is exercised.
    joblib.dump(lr, './test.pkl')
    model = joblib.load("./test.pkl")

    # True and predicted test prices, both mapped back to the original scale.
    y_true = std_y.inverse_transform(y_test)
    y_predict = std_y.inverse_transform(model.predict(x_test))

    # R^2 (coefficient of determination) as the goodness-of-fit measure.
    a = r2_score(y_true, y_predict)
    print(a)

亲爱的水告先生

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
sklearn学习

sklearn字典实例化(转化为二进制数据)import jiebafrom sklearn.feature_extraction import DictVectorizerfrom sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizerfrom sklearn.preprocessing imp...
复制链接

扫一扫

专栏目录