生信机器学习入门1 - 数据预处理与线性回归（Linear regression）预测

最新推荐文章于 2025-08-12 14:51:20 发布

原创最新推荐文章于 2025-08-12 14:51:20 发布 · 406 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#机器学习 #线性回归 #人工智能

生物信息与机器学习专栏收录该内容

8 篇文章

订阅专栏

部署运行你感兴趣的模型镜像

数据预处理与线性回归（Linear regression）预测

数据集下载

# data文件夹中包含数据集文件
git https://github.com/LittleGlowRobot/machine_learning.git

数据预处理主要步骤

参考github。
数据预处理

读取数据集

import numpy as np
import pandas as pd

########### 数据预处理实践 ###########
dataset = pd.read_csv('data/Data.csv')
print(dataset)

dataset

1. 数据预处理

X = dataset.loc[:, ['Country', 'Age', 'Salary']].values
print(X)

Y = dataset.loc[:, 'Purchased'].values
print(Y)

####### X #######
# [['France' 44.0 72000.0]
#  ['Spain' 27.0 48000.0]
#  ['Germany' 30.0 54000.0]
#  ['Spain' 38.0 61000.0]
#  ['Germany' 40.0 nan]
#  ['France' 35.0 58000.0]
#  ['Spain' nan 52000.0]
# ['France' 48.0 79000.0]
# ['Germany' 50.0 83000.0]
# ['France' 37.0 67000.0]]

####### Y #######
# ['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']

2. 填充缺失值

from sklearn.impute import SimpleImputer
# 使用平均值填充缺失值
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# 重新填充
imputer = imputer.fit(X[:, 1:3])

# 赋值
X[:, 1:3] = imputer.transform(X[:, 1:3])
print(X)

####### X #######

# [['France' 44.0 72000.0]
#  ['Spain' 27.0 48000.0]
#  ['Germany' 30.0 54000.0]
#  ['Spain' 38.0 61000.0]
#  ['Germany' 40.0 63777.77777777778]
#  ['France' 35.0 58000.0]
#  ['Spain' 38.77777777777778 52000.0]
#  ['France' 48.0 79000.0]
#  ['Germany' 50.0 83000.0]
# ['France' 37.0 67000.0]]

3. 解析分类数据

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# 解析分类数据， 转换为数字表示类别
categories = ColumnTransformer([("", OneHotEncoder(), [0])], remainder='passthrough')

X = categories.fit_transform(X)
label_encoder_Y = LabelEncoder()
Y = label_encoder_Y.fit_transform(Y)

print(X)
# 0表示NO, 1表示YES

####### X #######
# [[0.0 1.0 0.0 0.0 44.0 72000.0]
#  [1.0 0.0 0.0 1.0 27.0 48000.0]
#  [1.0 0.0 1.0 0.0 30.0 54000.0]
#  [1.0 0.0 0.0 1.0 38.0 61000.0]
#  [1.0 0.0 1.0 0.0 40.0 63777.77777777778]
#  [0.0 1.0 0.0 0.0 35.0 58000.0]
#  [1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
#  [0.0 1.0 0.0 0.0 48.0 79000.0]
#  [1.0 0.0 1.0 0.0 50.0 83000.0]
# [0.0 1.0 0.0 0.0 37.0 67000.0]]

print(Y)
####### Y #######
# [0 1 0 0 1 1 0 1 0 1]

4. 划分数据集

from sklearn.model_selection import train_test_split

# 将数据集按80%:20%划分为训练数据集和测试数据集
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

print(X_train)
# [[1.0 0.0 1.0 0.0 40.0 63777.77777777778]
# [0.0 1.0 0.0 0.0 37.0 67000.0]
# [1.0 0.0 0.0 1.0 27.0 48000.0]

print(X_test)
# [1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
# [0.0 1.0 0.0 0.0 48.0 79000.0]
# [1.0 0.0 0.0 1.0 38.0 61000.0]
# [0.0 1.0 0.0 0.0 44.0 72000.0]
# [0.0 1.0 0.0 0.0 35.0 58000.0]]
# [[1.0 0.0 1.0 0.0 30.0 54000.0]
# [1.0 0.0 1.0 0.0 50.0 83000.0]]

print(Y_train)
# [1 1 1 0 1 0 0 1]

print(Y_test)
# [0 0]

5. 特征缩放

from sklearn.preprocessing import StandardScaler

# 特征缩放
standard_scaler_X = StandardScaler()
x_tarin = standard_scaler_X.fit_transform(X_train)
X_test = standard_scaler_X.transform(X_test)

print(X_train)
# [[1.0 0.0 1.0 0.0 40.0 63777.77777777778]
#  [0.0 1.0 0.0 0.0 37.0 67000.0]
#  [1.0 0.0 0.0 1.0 27.0 48000.0]
#  [1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
#  [0.0 1.0 0.0 0.0 48.0 79000.0]
#  [1.0 0.0 0.0 1.0 38.0 61000.0]
#  [0.0 1.0 0.0 0.0 44.0 72000.0]
#  [0.0 1.0 0.0 0.0 35.0 58000.0]]

print(X_test)
# [[ 1.         -7.         22.66890466 -5.67954246 -7.80844702 -6.56578547]
#  [ 1.         -7.         22.66890466 -5.67954246 -7.70634099 -6.56578543]]

简单线性回归(Linear regression)实践

线性回归主要步骤

线性回归

代码实现Linear regression

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

dataset_scores = pd.read_csv('data/studentscores.csv')
# 按Scores列降序排序数据
# dataset_scores_sorted = dataset_scores.sort_values(by='Scores', ascending=False)

print(dataset_scores)
# Hours  Scores
# 0     2.5      21
# 1     5.1      47
# 2     3.2      27
# 3     8.5      75
# 4     3.5      30
# 5     1.5      20
# 6     9.2      88
# 7     5.5      60
# 8     8.3      81
# 9     2.7      25
# 10    7.7      85
# 11    5.9      62
# 12    4.5      41
# 13    3.3      42
# 14    1.1      17
# 15    8.9      95
# 16    2.5      30
# 17    1.9      24
# 18    6.1      67
# 19    7.4      69
# 20    2.7      30
# 21    4.8      54
# 22    3.8      35
# 23    6.9      76
# 24    7.8      86
# 25    2.1      93
# 26    2.2      93
# 27    2.5      93

# 数据预处理, iloc基于行号和列号索引
X = dataset_scores.iloc[0:25, :1].values
Y = dataset_scores.iloc[0:25, -1:].values
print("X:\n",X)
print("Y:\n",Y)

# X:
#  [[2.5]
#  [5.1]
#  [3.2]
#  [8.5]
#  [3.5]
#  [1.5]
#  [9.2]
#  [5.5]
#  [8.3]
#  [2.7]
#  [7.7]
#  [5.9]
#  [4.5]
#  [3.3]
#  [1.1]
#  [8.9]
#  [2.5]
#  [1.9]
#  [6.1]
#  [7.4]
#  [2.7]
#  [4.8]
#  [3.8]
#  [6.9]
#  [7.8]]

# Y:
#  [[21]
#  [47]
#  [27]
#  [75]
#  [30]
#  [20]
#  [88]
#  [60]
#  [81]
#  [25]
#  [85]
#  [62]
#  [41]
#  [42]
#  [17]
#  [95]
#  [30]
#  [24]
#  [67]
#  [69]
#  [30]
#  [54]
#  [35]
#  [76]
#  [86]]

# 划分训练集和测试集
from sklearn.model_selection import train_test_split
# 3/4训练集， 1/4测试集
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1/4, random_state=0)
print(X_train)
# [[7.8]
#  [6.9]
#  [1.1]
#  [5.1]
#  [7.7]
#  [3.3]
#  [8.3]
#  [9.2]
#  [6.1]
#  [3.5]
#  [2.7]
#  [5.5]
#  [2.7]
#  [8.5]
#  [2.5]
#  [4.8]
#  [8.9]
#  [4.5]]

print(X_test)
# [[1.5]
#  [3.2]
#  [7.4]
#  [2.5]
#  [5.9]
#  [3.8]
#  [1.9]]

print(Y_train)
# [[86]
#  [76]
#  [17]
#  [47]
#  [85]
#  [42]
#  [81]
#  [88]
#  [67]
#  [30]
#  [25]
#  [60]
#  [30]
#  [75]
#  [21]
#  [54]
#  [95]
#  [41]]

print(Y_test)
# [[20]
#  [27]
#  [69]
#  [30]
#  [62]
#  [35]
#  [24]]

# 训练线性回归模型
from sklearn.linear_model import LinearRegression
linear_regressor = LinearRegression()
linear_regressor = linear_regressor.fit(X_train, Y_train)

# 使用测试集进行预测
Y_predict = linear_regressor.predict(X_test)
print(Y_predict)
# [[16.84472176]
#  [33.74557494]
#  [75.50062397]
#  [26.7864001 ]
#  [60.58810646]
#  [39.71058194]
#  [20.8213931 ]]

# 训练集数据可视化
import matplotlib.pyplot as plt

# 训练集数据可视化, 红色点
plt.scatter(X_train, Y_train, color='red')

# 使用X_train训练集作为输入，线性模型预测结果
plt.plot(X_train, linear_regressor.predict(X_train), 'bo-')
plt.show()

训练集数据可视化

# 测试集数据可视化
plt.scatter(X_test, Y_test, color='red')

# 使用X_test测试集作为输入，线性模型预测结果
plt.plot(X_test, Y_predict, 'bo-')
plt.show()

测试集数据可视化

您可能感兴趣的与本文相关的镜像

Python3.8

Conda

Python

Python 是一种高级、解释型、通用的编程语言，以其简洁易读的语法而闻名，适用于广泛的应用，包括Web开发、数据分析、人工智能和自动化脚本