前言
本期给大家分享介绍如何用Dataloader创建数据集
背景
示例代码
from torch import nn
import torch
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import torch.functional as F
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
#####----绘制混淆矩阵图----#
from sklearn.metrics import confusion_matrix
from matplotlib.pylab import style
# style.use('ggplot')
from matplotlib import rcParams
config = {
"font.family": 'serif', # 衬线字体
"font.size": 10, # 相当于小四大小
"font.serif": ['SimSun'], # 宋体
"mathtext.fontset": 'stix', # matplotlib渲染数学字体时使用的字体,和Times New Roman差别不大
'axes.unicode_minus': False # 处理负号,即-号
}
rcParams.update(config)
加载数据、创建数据集
def data_read(file_path):
"""
:fun: 读取cwru mat格式数据
:param file_path: .mat文件路径 eg: r'D:.../01_示例数据/1750_12k_0.021-OuterRace3.mat'
:return accl_data: 读取到的加速度数据
"""
import scipy.io as scio
data = scio.loadmat(file_path) # 加载mat数据
data_key_list = list(data.keys()) # mat文件为字典类型,将key变为list类型
accl_key = data_key_list[3] # mat文件为字典类型,其加速度列在key_list的第4个
accl_data = data[accl_key].flatten() # 获取加速度信号,并展成1维数据
accl_data = (accl_data-np.mean(accl_data))/np.std(accl_data) #Z-score标准化数据集
return accl_data
def data_spilt(data, num_2_generate=20, each_subdata_length=1024):
"""
:Desription: 将数据分割成n个小块。输入数据data采样点数是400000,分成100个子样本数据,每个子样本数据就是4000个数据点
:param data: 要输入的数据
:param num_2_generate: 要生成的子样本数量
:param each_subdata_length: 每个子样本长度
:return spilt_datalist: 分割好的数据,类型为2维list
"""
data = list(data)
total_length = len(data)
start_num = 0 # 子样本起始值
end_num = each_subdata_length # 子样本终止值
step_length = int((total_length - each_subdata_length) / (num_2_generate - 1)) # step_length: 向前移动长度
i = 1
spilt_datalist = []
while i <= num_2_generate:
each_data = data[start_num: end_num]
each_data = (each_data-np.mean(each_data))/(np.std(each_data)) # 做Z-score归一化
spilt_datalist.append(each_data)
start_num = 0 + i * step_length;
end_num = each_subdata_length + i * step_length
i = i + 1
spilt_data_arr = np.array(spilt_datalist)
return spilt_data_arr
划分子样本
data_base_dir = r'D:\22-学习记录\01_自己学习积累\03_基于CNN的轴承故障诊断(4分类)\dataset/train_data'
fault_type_list = os.listdir(data_base_dir)
file_path_list = list()
train_data = []
train_label = []
for fault_type in fault_type_list:
file_name_list = os.listdir(os.path.join(data_base_dir, fault_type))
for file_name in file_name_list:
print(file_name)
num_2_generate = 60 # 每个mat文件生成60个子样本
if 'Normal' in file_name:
num_2_generate = num_2_generate*6 # Normal状态,每个mat文件生成360个子样本
file_path = os.path.join(data_base_dir, fault_type, file_name)
fault_type2fault_label = {'BF':'0', 'OF':'1', 'IF':'2', 'Normal':'3'}
##=========获取数据===========##
data = data_read(file_path) # 读取单个mat数据
##======基于滑动窗方法划分获取更多子样本数据======##
sub_data_list = data_spilt(data=data, num_2_generate=num_2_generate)
train_label.extend(list(fault_type2fault_label[fault_type]*len(sub_data_list))) # 训练标签 eg: ['0', '0', ...,'3',]
train_data.extend(sub_data_list) # 训练数据
输出结果:
1730_12k_0.007-Ball.mat
1730_12k_0.014-Ball.mat
1730_12k_0.021-Ball.mat
1730_48k_0.007-Ball.mat
1730_48k_0.014-Ball.mat
1730_48k_0.021-Ball.mat
...
1772_12k_0.021-OuterRace3.mat
1772_48k_0.007-OuterRace3.mat
1772_48k_0.014-OuterRace6.mat
1772_48k_0.021-OuterRace3.mat
data_base_dir = r'D:\22-学习记录\01_自己学习积累\03_基于CNN的轴承故障诊断(4分类)/dataset/test_data'
fault_type_list = os.listdir(data_base_dir)
file_path_list = list()
test_data = []
test_label = []
for fault_type in fault_type_list:
file_name_list = os.listdir(os.path.join(data_base_dir, fault_type))
for file_name in file_name_list:
print(file_name)
num_2_generate = 30 # 每个mat文件生成30个子样本
if 'Normal' in file_name:
num_2_generate = num_2_generate*6 # Normal状态,每个mat文件生成180个子样本
file_path = os.path.join(data_base_dir, fault_type, file_name)
fault_type2fault_label = {'BF':'0', 'OF':'1', 'IF':'2', 'Normal':'3'}
##=========获取数据===========##
data = data_read(file_path) # 读取单个mat数据
##======基于滑动窗方法划分获取更多子样本数据======##
sub_data_list = data_spilt(data=data, num_2_generate=num_2_generate)
test_label.extend(list(fault_type2fault_label[fault_type]*len(sub_data_list))) # 测试标签 eg: ['0', '0', ...,'3',]
test_data.extend(sub_data_list) # 测试数据
输出结果:
1797_12k_0.007-Ball.mat
1797_12k_0.014-Ball.mat
1797_12k_0.021-Ball.mat
1797_48k_0.007-Ball.mat
...
1797_12k_0.021-OuterRace3.mat
1797_48k_0.007-OuterRace3.mat
1797_48k_0.014-OuterRace6.mat
1797_48k_0.021-OuterRace3.mat
创建dataloader
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
train_data = np.array(train_data) #训练数据
train_label = np.array((train_label), dtype=int) #训练标签 array([0, 0, 0, ..., 1, 1, 1])
train_x = torch.from_numpy(train_data).type(torch.float32) ##torch.Size([2880, 1024])
train_y = torch.from_numpy(train_label).type(torch.LongTensor) ##torch.Size([2880])
train_x = torch.unsqueeze(train_x, dim=1) ##扩展成3维 torch.Size([2880, 1, 1024])
test_data = np.array(test_data) #测试数据
test_label = np.array((test_label), dtype=int) #测试标签array([0, 0, 0, ..., 1, 1, 1])
test_x = torch.from_numpy(test_data).type(torch.float32) ##torch.Size([720, 1024])
test_y = torch.from_numpy(test_label).type(torch.LongTensor) ##torch.Size([720])
test_x = torch.unsqueeze(test_x, dim=1) ##torch.Size([720, 1, 1024])
看看train_data的维度
train_data.shape
# 输出结果
(4320, 1024)
#创建dataset
from sklearn.model_selection import train_test_split
train_ds = TensorDataset(train_x, train_y)
train_ds,valid_ds = train_test_split(train_ds, test_size = 0.2,random_state = 42) #将训练集分隔维训练集和测试集,分割比例5:1
test_ds = TensorDataset(test_x, test_y)
#创建dataloader
batch_size = 64 # 批次大小
train_dl = DataLoader(dataset = train_ds, batch_size = batch_size, shuffle=True) #shuffle是进行乱序
valid_dl = DataLoader(dataset = valid_ds, shuffle=True, batch_size = batch_size)
test_dl = DataLoader(dataset = test_ds, batch_size = batch_size, shuffle=False)
for i, (imgs, labels) in enumerate(train_dl):
print(i, imgs, labels)
## 输出结果
0
tensor([[[-0.3515, 0.3796, 1.1076, ..., -1.5466, -1.2220, -0.3242]],
[[ 1.3776, 1.4640, 0.7423, ..., 0.1370, 0.3632, 0.6326]],
[[-0.4793, -0.6978, -0.5840, ..., -0.6917, -1.3257, -1.4273]],
...,
[[-0.2387, -0.5382, -1.1767, ..., 0.8166, 0.7749, -2.3905]],
[[-0.9560, -1.3880, -1.7271, ..., 0.4720, 0.2392, 0.1732]],
[[ 0.2588, 0.0780, -0.5720, ..., -0.0647, 0.4554, 1.2545]]])
tensor([3, 3, 3, 1, 2, 3, 3, 0, 2, 3, 2, 1, 3, 2, 2, 2, 3, 2, 2, 0, 2, 1, 2, 2,
1, 2, 3, 3, 1, 2, 1, 0, 2, 1, 3, 0, 1, 3, 1, 3, 3, 1, 1, 3, 2, 0, 0, 1,
0, 0, 0, 2, 1, 0, 0, 3, 1, 3, 3, 0, 2, 2, 0, 3])
...