import os
from collections import Counter

import numpy as np
import pandas as pd
# Root folder of the raw UCI HAR dataset files.
DATASET_PATH = "./UCI_data/raw/UCI HAR Dataset/"

# Common leading part of the inertial-signal file names: three sensor
# readings, each along the x, y and z axes (nine prefixes in total).
INPUT_SIGNAL_TYPES = [
    f"{sensor}_{axis}_"
    for sensor in ("body_acc", "body_gyro", "total_acc")
    for axis in ("x", "y", "z")
]
def load_x(X_signals_paths):
    """Load per-signal text files into a single 3-D float32 array.

    Each file holds one signal: one sample per line, with that sample's
    readings separated by one or more whitespace characters.

    Args:
        X_signals_paths: Iterable of file paths, one per signal. All files
            must contain the same number of samples and the same number of
            readings per sample.

    Returns:
        numpy.ndarray of dtype float32 with shape
        (n_samples, n_timesteps, n_signals); the last axis follows the order
        of ``X_signals_paths``.

    Raises:
        OSError: If a file cannot be opened.
        ValueError: If a token is not a number, or the parsed data cannot be
            arranged as a regular 3-D array (ragged rows, files of different
            length, or no data at all).
    """
    X_signals = []
    for signal_type_path in X_signals_paths:
        with open(signal_type_path, "r") as f:
            # str.split() without arguments splits on any run of whitespace
            # and drops leading/trailing blanks, so columns separated by one,
            # two or more spaces are all parsed without producing empty
            # tokens.
            X_signals.append(
                [
                    np.array(row.split(), dtype=np.float32)
                    for row in f
                    if row.strip()  # skip blank lines, e.g. at end of file
                ]
            )
    # Parsed layout is (signal, sample, timestep); move the signal axis last
    # to obtain (sample, timestep, signal).
    return np.transpose(X_signals, (1, 2, 0))
def load_y(y_path):
    """Load class labels from a text file as 0-based integer indices.

    The labels in the file start at 1; every value is shifted down by one
    so the result can be used directly as a 0-based class index.

    Args:
        y_path: Path to a text file of whitespace-separated integer labels
            (one per line in the expected layout).

    Returns:
        1-D numpy.ndarray of dtype int32 containing ``label - 1`` for every
        label in the file, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a token is not an integer.
    """
    with open(y_path, "r") as f:
        # Splitting the whole file on arbitrary whitespace tolerates extra
        # spaces around a label and blank lines, and already yields the
        # flat sequence the caller expects.
        labels = [int(token) for token in f.read().split()]
    y = np.array(labels, dtype=np.int32)
    # Subtract 1 from each label for friendly 0-based indexing.
    return y - 1
# Paths of the signal files: one file per entry of INPUT_SIGNAL_TYPES for
# each of the train and test splits.
train_x_signals_paths = [
    DATASET_PATH + "train/Inertial Signals/" + signal + "train.txt"
    for signal in INPUT_SIGNAL_TYPES
]
test_x_signals_paths = [
    DATASET_PATH + "test/Inertial Signals/" + signal + "test.txt"
    for signal in INPUT_SIGNAL_TYPES
]

# Paths of the label files.
train_y_path = DATASET_PATH + "train/y_train.txt"
test_y_path = DATASET_PATH + "test/y_test.txt"

# Signal data: all signal files of a split are merged into one 3-D array.
train_x = load_x(train_x_signals_paths)
test_x = load_x(test_x_signals_paths)

# Labels are kept as 0-based integer class ids; no one-hot encoding is
# applied here.
train_y = load_y(train_y_path)
test_y = load_y(test_y_path)

# np.save does not create missing directories, so make sure the output
# folder exists before writing the arrays.
OUTPUT_DIR = "./UCI_data/np/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

np.save(OUTPUT_DIR + "x_train.npy", train_x)
np.save(OUTPUT_DIR + "y_train.npy", train_y)
np.save(OUTPUT_DIR + "x_test.npy", test_x)
np.save(OUTPUT_DIR + "y_test.npy", test_y)