生信机器学习入门1 - 数据预处理与线性回归(Linear regression)预测

数据预处理与线性回归(Linear regression)预测

数据集下载

# data文件夹中包含数据集文件
git https://github.com/LittleGlowRobot/machine_learning.git

数据预处理主要步骤

参考github。
数据预处理

读取数据集

import numpy as np
import pandas as pd

########### 数据预处理实践 ###########
dataset = pd.read_csv('data/Data.csv')
print(dataset)

dataset

1. 数据预处理

X = dataset.loc[:, ['Country', 'Age', 'Salary']].values
print(X)

Y = dataset.loc[:, 'Purchased'].values
print(Y)

####### X #######
# [['France' 44.0 72000.0]
#  ['Spain' 27.0 48000.0]
#  ['Germany' 30.0 54000.0]
#  ['Spain' 38.0 61000.0]
#  ['Germany' 40.0 nan]
#  ['France' 35.0 58000.0]
#  ['Spain' nan 52000.0]
# ['France' 48.0 79000.0]
# ['Germany' 50.0 83000.0]
# ['France' 37.0 67000.0]]

####### Y #######
# ['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']

2. 填充缺失值

from sklearn.impute import SimpleImputer
# 使用平均值填充缺失值
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# 重新填充
imputer = imputer.fit(X[:, 1:3])

# 赋值
X[:, 1:3] = imputer.transform(X[:, 1:3])
print(X)

####### X #######

# [['France' 44.0 72000.0]
#  ['Spain' 27.0 48000.0]
#  ['Germany' 30.0 54000.0]
#  ['Spain' 38.0 61000.0]
#  ['Germany' 40.0 63777.77777777778]
#  ['France' 35.0 58000.0]
#  ['Spain' 38.77777777777778 52000.0]
#  ['France' 48.0 79000.0]
#  ['Germany' 50.0 83000.0]
# ['France' 37.0 67000.0]]

3. 解析分类数据

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# 解析分类数据, 转换为数字表示类别
categories = ColumnTransformer([("", OneHotEncoder(), [0])], remainder='passthrough')

X = categories.fit_transform(X)
label_encoder_Y = LabelEncoder()
Y = label_encoder_Y.fit_transform(Y)

print(X)
# 0表示NO, 1表示YES

####### X #######
# [[0.0 1.0 0.0 0.0 44.0 72000.0]
#  [1.0 0.0 0.0 1.0 27.0 48000.0]
#  [1.0 0.0 1.0 0.0 30.0 54000.0]
#  [1.0 0.0 0.0 1.0 38.0 61000.0]
#  [1.0 0.0 1.0 0.0 40.0 63777.77777777778]
#  [0.0 1.0 0.0 0.0 35.0 58000.0]
#  [1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
#  [0.0 1.0 0.0 0.0 48.0 79000.0]
#  [1.0 0.0 1.0 0.0 50.0 83000.0]
# [0.0 1.0 0.0 0.0 37.0 67000.0]]

print(Y)
####### Y #######
# [0 1 0 0 1 1 0 1 0 1]

4. 划分数据集

from sklearn.model_selection import train_test_split

# 将数据集按80%:20%划分为训练数据集和测试数据集
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

print(X_train)
# [[1.0 0.0 1.0 0.0 40.0 63777.77777777778]
# [0.0 1.0 0.0 0.0 37.0 67000.0]
# [1.0 0.0 0.0 1.0 27.0 48000.0]

print(X_test)
# [1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
# [0.0 1.0 0.0 0.0 48.0 79000.0]
# [1.0 0.0 0.0 1.0 38.0 61000.0]
# [0.0 1.0 0.0 0.0 44.0 72000.0]
# [0.0 1.0 0.0 0.0 35.0 58000.0]]
# [[1.0 0.0 1.0 0.0 30.0 54000.0]
# [1.0 0.0 1.0 0.0 50.0 83000.0]]

print(Y_train)
# [1 1 1 0 1 0 0 1]

print(Y_test)
# [0 0]

5. 特征缩放

from sklearn.preprocessing import StandardScaler

# 特征缩放
standard_scaler_X = StandardScaler()
x_tarin = standard_scaler_X.fit_transform(X_train)
X_test = standard_scaler_X.transform(X_test)

print(X_train)
# [[1.0 0.0 1.0 0.0 40.0 63777.77777777778]
#  [0.0 1.0 0.0 0.0 37.0 67000.0]
#  [1.0 0.0 0.0 1.0 27.0 48000.0]
#  [1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
#  [0.0 1.0 0.0 0.0 48.0 79000.0]
#  [1.0 0.0 0.0 1.0 38.0 61000.0]
#  [0.0 1.0 0.0 0.0 44.0 72000.0]
#  [0.0 1.0 0.0 0.0 35.0 58000.0]]

print(X_test)
# [[ 1.         -7.         22.66890466 -5.67954246 -7.80844702 -6.56578547]
#  [ 1.         -7.         22.66890466 -5.67954246 -7.70634099 -6.56578543]]

简单线性回归(Linear regression)实践

线性回归主要步骤

线性回归

代码实现Linear regression

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

dataset_scores = pd.read_csv('data/studentscores.csv')
# 按Scores列降序排序数据
# dataset_scores_sorted = dataset_scores.sort_values(by='Scores', ascending=False)

print(dataset_scores)
# Hours  Scores
# 0     2.5      21
# 1     5.1      47
# 2     3.2      27
# 3     8.5      75
# 4     3.5      30
# 5     1.5      20
# 6     9.2      88
# 7     5.5      60
# 8     8.3      81
# 9     2.7      25
# 10    7.7      85
# 11    5.9      62
# 12    4.5      41
# 13    3.3      42
# 14    1.1      17
# 15    8.9      95
# 16    2.5      30
# 17    1.9      24
# 18    6.1      67
# 19    7.4      69
# 20    2.7      30
# 21    4.8      54
# 22    3.8      35
# 23    6.9      76
# 24    7.8      86
# 25    2.1      93
# 26    2.2      93
# 27    2.5      93

# 数据预处理, iloc基于行号和列号索引
X = dataset_scores.iloc[0:25, :1].values
Y = dataset_scores.iloc[0:25, -1:].values
print("X:\n",X)
print("Y:\n",Y)

# X:
#  [[2.5]
#  [5.1]
#  [3.2]
#  [8.5]
#  [3.5]
#  [1.5]
#  [9.2]
#  [5.5]
#  [8.3]
#  [2.7]
#  [7.7]
#  [5.9]
#  [4.5]
#  [3.3]
#  [1.1]
#  [8.9]
#  [2.5]
#  [1.9]
#  [6.1]
#  [7.4]
#  [2.7]
#  [4.8]
#  [3.8]
#  [6.9]
#  [7.8]]

# Y:
#  [[21]
#  [47]
#  [27]
#  [75]
#  [30]
#  [20]
#  [88]
#  [60]
#  [81]
#  [25]
#  [85]
#  [62]
#  [41]
#  [42]
#  [17]
#  [95]
#  [30]
#  [24]
#  [67]
#  [69]
#  [30]
#  [54]
#  [35]
#  [76]
#  [86]]

# 划分训练集和测试集
from sklearn.model_selection import train_test_split
# 3/4训练集, 1/4测试集
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1/4, random_state=0)
print(X_train)
# [[7.8]
#  [6.9]
#  [1.1]
#  [5.1]
#  [7.7]
#  [3.3]
#  [8.3]
#  [9.2]
#  [6.1]
#  [3.5]
#  [2.7]
#  [5.5]
#  [2.7]
#  [8.5]
#  [2.5]
#  [4.8]
#  [8.9]
#  [4.5]]

print(X_test)
# [[1.5]
#  [3.2]
#  [7.4]
#  [2.5]
#  [5.9]
#  [3.8]
#  [1.9]]

print(Y_train)
# [[86]
#  [76]
#  [17]
#  [47]
#  [85]
#  [42]
#  [81]
#  [88]
#  [67]
#  [30]
#  [25]
#  [60]
#  [30]
#  [75]
#  [21]
#  [54]
#  [95]
#  [41]]

print(Y_test)
# [[20]
#  [27]
#  [69]
#  [30]
#  [62]
#  [35]
#  [24]]

# 训练线性回归模型
from sklearn.linear_model import LinearRegression
linear_regressor = LinearRegression()
linear_regressor = linear_regressor.fit(X_train, Y_train)

# 使用测试集进行预测
Y_predict = linear_regressor.predict(X_test)
print(Y_predict)
# [[16.84472176]
#  [33.74557494]
#  [75.50062397]
#  [26.7864001 ]
#  [60.58810646]
#  [39.71058194]
#  [20.8213931 ]]

# 训练集数据可视化
import matplotlib.pyplot as plt

# 训练集数据可视化, 红色点
plt.scatter(X_train, Y_train, color='red')

# 使用X_train训练集作为输入,线性模型预测结果
plt.plot(X_train, linear_regressor.predict(X_train), 'bo-')
plt.show()

训练集数据可视化

# 测试集数据可视化
plt.scatter(X_test, Y_test, color='red')

# 使用X_test测试集作为输入,线性模型预测结果
plt.plot(X_test, Y_predict, 'bo-')
plt.show()

 测试集数据可视化

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

生信与基因组学

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值