Real-world data representation with tensors
NO. 1 Tabular data
- Python offers several options for loading a CSV file quickly.
Start with the Wine Quality data set (stored as a semicolon-delimited CSV file).
- Load your file and turn the resulting NumPy array into a PyTorch tensor.
# Load the Wine Quality CSV into a float32 NumPy array (turned into a torch tensor later).
import csv
import numpy as np
import torch
file = "E:/Deep Learning and PyTorch/No2 Deep Learning with PyTorch/dlwpt-code-master/data/p1ch4/tabular-wine/WineData.csv" # local path to the Wine Quality CSV
wine_np = np.loadtxt(file, dtype=np.float32, delimiter=";", skiprows=1)  # skiprows=1 drops the header row; the column names are read separately with csv below
---------------
wine_np >>>
array([[ 7. , 0.27, 0.36, ..., 0.45, 8.8 , 6. ],
[ 6.3 , 0.3 , 0.34, ..., 0.49, 9.5 , 6. ],
[ 8.1 , 0.28, 0.4 , ..., 0.44, 10.1 , 6. ],
...,
[ 6.5 , 0.24, 0.19, ..., 0.46, 9.4 , 6. ],
[ 5.5 , 0.29, 0.3 , ..., 0.38, 12.8 , 7. ],
[ 6. , 0.21, 0.38, ..., 0.32, 11.8 , 6. ]], dtype=float32)
---------------
col_list = next(csv.reader(open(file), delimiter=';')) # next() pulls one row from the reader; called once, it yields the first row, i.e. the header with the column names
wine_np.shape >>> (4898, 12)
col_list >>> ['fixed acidity',
'volatile acidity',
'citric acid',
'residual sugar',
'chlorides',
'free sulfur dioxide',
'total sulfur dioxide',
'density',
'pH',
'sulphates',
'alcohol',
'quality']
--------------
wine_tensor = torch.from_numpy(wine_np)  # zero-copy: the tensor shares memory with wine_np
wine_tensor >>> tensor([[ 7.0000, 0.2700, 0.3600, ..., 0.4500, 8.8000, 6.0000],
[ 6.3000, 0.3000, 0.3400, ..., 0.4900, 9.5000, 6.0000],
[ 8.1000, 0.2800, 0.4000, ..., 0.4400, 10.1000, 6.0000],
...,
[ 6.5000, 0.2400, 0.1900, ..., 0.4600, 9.4000, 6.0000],
[ 5.5000, 0.2900, 0.3000, ..., 0.3800, 12.8000, 7.0000],
[ 6.0000, 0.2100, 0.3800, ..., 0.3200, 11.8000, 6.0000]])
- Interval, ordinal, and categorical values
- you typically remove the score from the tensor of input data and keep it in a separate tensor, so that you can use the score as the ground truth without it being input to your model(分开输入和输出)
x = wine_tensor[:, :-1]  # input features: every column except the last one ('quality')
x >>> tensor([[ 7.0000, 0.2700, 0.3600, ..., 3.0000, 0.4500, 8.8000],
[ 6.3000, 0.3000, 0.3400, ..., 3.3000, 0.4900, 9.5000],
[ 8.1000, 0.2800, 0.4000, ..., 3.2600, 0.4400, 10.1000],
...,
[ 6.5000, 0.2400, 0.1900, ..., 2.9900, 0.4600, 9.4000],
[ 5.5000, 0.2900, 0.3000, ..., 3.3400, 0.3800, 12.8000],
[ 6.0000, 0.2100, 0.3800, ..., 3.2600, 0.3200, 11.8000]])
y = wine_tensor[:,-1]  # ground-truth target: the last column ('quality' score), kept separate from the inputs
y >>> tensor([6., 6., 6., ..., 6., 7., 6.])
y.long() >>> tensor([6, 6, 6, ..., 6, 7, 6])
- One-hot encoding
在机器学习算法中,我们经常会遇到分类特征,例如:人的性别有男女,祖国有中国,美国,法国等。
这些特征值并不是连续的,而是离散的,无序的。通常我们需要对其进行特征数字化。
One-Hot编码,又称为一位有效编码,主要是采用N位状态寄存器来对N个状态进行编码,每个状态都由他独立的寄存器位,并且在任意时候只有一位有效。
One-Hot编码是分类变量作为二进制向量的表示。这首先要求将分类值映射到整数值。然后,每个整数值被表示为二进制向量,除了整数的索引之外,它都是零值,它被标记为1。
torch.unsqueeze
增加一个维度
a = torch.tensor([1,2,3,4,5])
a.shape
>>> torch.Size([5])
b = a.unsqueeze(0)
>>> tensor([[1, 2, 3, 4, 5]])
b.shape
>>> torch.Size([1, 5])
c = a.unsqueeze(1)
>>> tensor([[1],
[2],
[3],
[4],
[5]])
c.shape
>>> torch.Size([5, 1])
torch.scatter
用法:(目标张量).scatter(dim, index, src) —— 把 src(可以是整型标量)写入 index 指定的位置
dim 决定索引时在目标张量里是沿行还是沿列展开
index 是要编码的数在目标张量中的位置索引(必须为整型张量 LongTensor)
One-hot编码
box = torch.zeros(5,7) # rows = number of samples, cols = number of one-hot positions (classes)
encode = box.scatter(1, c, 1)  # along dim=1, write 1 at the column given by each index in c (c has shape [5, 1])
encode >>> tensor([[0., 1., 0., 0., 0., 0., 0.],
[0., 0., 1., 0., 0., 0., 0.],
[0., 0., 0., 1., 0., 0., 0.],
[0., 0., 0., 0., 1., 0., 0.],
[0., 0., 0., 0., 0., 1., 0.]])
- First, obtain means and standard deviations for each column(数据标准化):
mean = torch.mean(x, dim=0)
mean >>> tensor([6.8548e+00, 2.7824e-01, 3.3419e-01, 6.3914e+00, 4.5772e-02, 3.5308e+01,
1.3836e+02, 9.9403e-01, 3.1883e+00, 4.8985e-01, 1.0514e+01])
var = torch.var(x, dim=0)
var >>> tensor([7.1211e-01, 1.0160e-02, 1.4646e-02, 2.5726e+01, 4.7733e-04, 2.8924e+02,
1.8061e+03, 8.9455e-06, 2.2801e-02, 1.3025e-02, 1.5144e+00])
Normalize the data by subtracting the mean and dividing by the standard deviation, which helps with the learning process.
data_normalized = (x-mean)/torch.sqrt(var)  # z-score standardization: zero mean, unit variance per column
data_normalized >>> tensor([[ 1.7209e-01, -8.1764e-02, 2.1325e-01, ..., -1.2468e+00,
-3.4914e-01, -1.3930e+00],
[-6.5743e-01, 2.1587e-01, 4.7991e-02, ..., 7.3992e-01,
1.3467e-03, -8.2418e-01],
[ 1.4756e+00, 1.7448e-02, 5.4378e-01, ..., 4.7502e-01,
-4.3677e-01, -3.3662e-01],
...,
[-4.2042e-01, -3.7940e-01, -1.1915e+00, ..., -1.3131e+00,
-2.6152e-01, -9.0544e-01],
[-1.6054e+00, 1.1666e-01, -2.8253e-01, ..., 1.0048e+00,
-9.6250e-01, 1.8574e+00],
[-1.0129e+00, -6.7703e-01, 3.7852e-01, ..., 4.7502e-01,
-1.4882e+00, 1.0448e+00]])
- 将y的数据划分
bad_indexes = torch.le(y, 3)  # boolean mask for wines with quality <= 3; renamed from "bad_indexs" — the lookups below index with "bad_indexes", so the typo raised NameError
bad_indexs >>> tensor([False, False, False, ..., False, False, False])
bad_indexs.sum() >>> tensor(20)
# 找到quality中<=3的数据
y[bad_indexes] >>> tensor([3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.])  # y is float32 (no .long() was stored), so the values print with trailing dots
# 找到除quality外的x数据
x[bad_indexes] >>> tensor([[8.5000e+00, 2.6000e-01, 2.1000e-01, 1.6200e+01, 7.4000e-02, 4.1000e+01,
1.9700e+02, 9.9800e-01, 3.0200e+00, 5.0000e-01, 9.8000e+00],
[5.8000e+00, 2.4000e-01, 4.4000e-01, 3.5000e+00, 2.9000e-02, 5.0000e+00,
1.0900e+02, 9.9130e-01, 3.5300e+00, 4.3000e-01, 1.1700e+01],
[9.1000e+00, 5.9000e-01, 3.8000e-01, 1.6000e+00, 6.6000e-02, 3.4000e+01,
1.8200e+02, 9.9680e-01, 3.2300e+00, 3.8000e-01, 8.5000e+00],
[7.1000e+00, 3.2000e-01, 3.2000e-01, 1.1000e+01, 3.8000e-02, 1.6000e+01,
6.6000e+01, 9.9370e-01, 3.2400e+00, 4.0000e-01, 1.1500e+01],
[6.9000e+00, 3.9000e-01, 4.0000e-01, 4.6000e+00, 2.2000e-02, 5.0000e+00,
1.9000e+01, 9.9150e-01, 3.3100e+00, 3.7000e-01, 1.2600e+01],
[1.0300e+01, 1.7000e-01, 4.7000e-01, 1.4000e+00, 3.7000e-02, 5.0000e+00,
3.3000e+01, 9.9390e-01, 2.8900e+00, 2.8000e-01, 9.6000e+00],
[7.9000e+00, 6.4000e-01, 4.6000e-01, 1.0600e+01, 2.4400e-01, 3.3000e+01,
2.2700e+02, 9.9830e-01, 2.8700e+00, 7.4000e-01, 9.1000e+00],
[8.3000e+00, 3.3000e-01, 4.2000e-01, 1.1500e+00, 3.3000e-02, 1.8000e+01,
9.6000e+01, 9.9110e-01, 3.2000e+00, 3.2000e-01, 1.2400e+01],
[8.6000e+00, 5.5000e-01, 3.5000e-01, 1.5550e+01, 5.7000e-02, 3.5500e+01,
3.6650e+02, 1.0001e+00, 3.0400e+00, 6.3000e-01, 1.1000e+01],
[7.5000e+00, 3.2000e-01, 2.4000e-01, 4.6000e+00, 5.3000e-02, 8.0000e+00,
1.3400e+02, 9.9580e-01, 3.1400e+00, 5.0000e-01, 9.1000e+00],
[6.7000e+00, 2.5000e-01, 2.6000e-01, 1.5500e+00, 4.1000e-02, 1.1850e+02,
2.1600e+02, 9.9490e-01, 3.5500e+00, 6.3000e-01, 9.4000e+00],
[7.1000e+00, 4.9000e-01, 2.2000e-01, 2.0000e+00, 4.7000e-02, 1.4650e+02,
3.0750e+02, 9.9240e-01, 3.2400e+00, 3.7000e-01, 1.1000e+01],
[1.1800e+01, 2.3000e-01, 3.8000e-01, 1.1100e+01, 3.4000e-02, 1.5000e+01,
1.2300e+02, 9.9970e-01, 2.9300e+00, 5.5000e-01, 9.7000e+00],
[7.6000e+00, 4.8000e-01, 3.7000e-01, 1.2000e+00, 3.4000e-02, 5.0000e+00,
5.7000e+01, 9.9256e-01, 3.0500e+00, 5.4000e-01, 1.0400e+01],
[6.1000e+00, 2.0000e-01, 3.4000e-01, 9.5000e+00, 4.1000e-02, 3.8000e+01,
2.0100e+02, 9.9500e-01, 3.1400e+00, 4.4000e-01, 1.0100e+01],
[4.2000e+00, 2.1500e-01, 2.3000e-01, 5.1000e+00, 4.1000e-02, 6.4000e+01,
1.5700e+02, 9.9688e-01, 3.4200e+00, 4.4000e-01, 8.0000e+00],
[9.4000e+00, 2.4000e-01, 2.9000e-01, 8.5000e+00, 3.7000e-02, 1.2400e+02,
2.0800e+02, 9.9395e-01, 2.9000e+00, 3.8000e-01, 1.1000e+01],
[6.2000e+00, 2.3000e-01, 3.5000e-01, 7.0000e-01, 5.1000e-02, 2.4000e+01,
1.1100e+02, 9.9160e-01, 3.3700e+00, 4.3000e-01, 1.1000e+01],
[6.8000e+00, 2.6000e-01, 3.4000e-01, 1.5100e+01, 6.0000e-02, 4.2000e+01,
1.6200e+02, 9.9705e-01, 3.2400e+00, 5.2000e-01, 1.0500e+01],
[6.1000e+00, 2.6000e-01, 2.5000e-01, 2.9000e+00, 4.7000e-02, 2.8900e+02,
4.4000e+02, 9.9314e-01, 3.4400e+00, 6.4000e-01, 1.0500e+01]])
其中, <=3 中y的shape为torch.Size([20]);x的shape为torch.Size([20, 11])
划分数据并求均值和方差
bad中的所有数据为:
bad_data = wine_tensor[torch.le(y, 3)]
bad_data.shape >>> torch.Size([20, 12])
middle:
mid_data = wine_tensor[torch.lt(y,7) & torch.gt(y, 3)]
mid_data.shape >>> torch.Size([3818, 12])
great:
good_data = wine_tensor[torch.ge(y,7)]
good_data.shape >>> torch.Size([1060, 12])
均值和方差
bad_mean = torch.mean(bad_data, dim=0)
mid_mean = torch.mean(mid_data, dim=0)
good_mean = torch.mean(good_data, dim=0)
——————————————————————
bad_var = torch.var(bad_data, dim=0)
mid_var = torch.var(mid_data, dim=0)
good_var = torch.var(good_data, dim=0)
- last:
# Print the per-column means side by side for bad / mid / good wines.
for i, args in enumerate(zip(col_list, bad_mean, mid_mean, good_mean)):
    print('{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'.format(i, *args))
>>> 0 fixed acidity 7.60 6.89 6.73
1 volatile acidity 0.33 0.28 0.27
2 citric acid 0.34 0.34 0.33
3 residual sugar 6.39 6.71 5.26
4 chlorides 0.05 0.05 0.04
5 free sulfur dioxide 53.33 35.42 34.55
6 total sulfur dioxide 170.60 141.83 125.25
7 density 0.99 0.99 0.99
8 pH 3.19 3.18 3.22
9 sulphates 0.47 0.49 0.50
10 alcohol 10.34 10.26 11.42
11 quality 3.00 5.53 7.17