#!H:\pytorch
# -*- coding:utf-8 -*-
#Author: Tangzhao
#content:pytorch
"""
we begin by creating an artificial dataset that is stored in a csv (comma-separated values)
file ../data/house_tiny.csv. Data
"""
import os
def mkdir_if_not_exist(path):  #@save
    """Create the directory ``path``, including missing parents, if needed.

    Args:
        path: Either a single path (``str`` or ``os.PathLike``) or an
            iterable of path components, which are combined with
            ``os.path.join`` before the directory is created.

    Raises:
        FileExistsError: If ``path`` already exists but is not a directory.
    """
    if not isinstance(path, (str, os.PathLike)):
        # A sequence of components such as ('..', 'data') is joined first.
        path = os.path.join(*path)
    # exist_ok=True makes the call idempotent and avoids the race between a
    # separate os.path.exists() check and the actual directory creation.
    os.makedirs(path, exist_ok=True)
# Write the artificial dataset row by row into a CSV file.
data_file = '../data/house_tiny.csv'
# Derive the directory from data_file so the two can never drift apart.
mkdir_if_not_exist(os.path.dirname(data_file))
# An explicit encoding keeps the file identical across platforms.
with open(data_file, 'w', encoding='utf-8') as f:
    f.write('NumRooms,Alley,Price\n')  # Column names
    f.write('NA,Pave,127500\n')  # Each row is a data instance
    f.write('2,NA,106000\n')
    f.write('4,NA,178100\n')
    f.write('NA,NA,140000\n')
# Load the raw dataset from the CSV file with pandas. The dataset has
# 4 rows and 3 columns: each row describes the number of rooms ("NumRooms"),
# the alley type ("Alley"), and the price ("Price") of a house.
import pandas as pd

data = pd.read_csv(data_file)
print(data)
# Expected output:
#    NumRooms Alley   Price
# 0       NaN  Pave  127500
# 1       2.0   NaN  106000
# 2       4.0   NaN  178100
# 3       NaN   NaN  140000
# 2.2.2 Handling missing data
# "NaN" entries are missing values. Typical ways to handle them are
# imputation (replace a missing value with a substitute) and deletion
# (ignore the missing values). Imputation is used here: the dataset is split
# into inputs (first two columns) and outputs (last column), and missing
# numeric entries are replaced by the mean of the existing values in the
# same column.
inputs, outputs = data.iloc[:, 0:2], data.iloc[:, 2]
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises a TypeError on the string column "Alley".
inputs = inputs.fillna(inputs.mean(numeric_only=True))
print(inputs)
# Expected output:
#    NumRooms Alley
# 0       3.0  Pave
# 1       2.0   NaN
# 2       4.0   NaN
# 3       3.0   NaN
"""
For categorical or discrete values in inputs, we consider “NaN” as a category. Since the “Alley” column only
takes two types of categorical values “Pave” and “NaN”, pandas can automatically convert this column to two
columns “Alley_Pave” and “Alley_nan”. A row whose alley type is “Pave” will set values of “Alley_Pave” and “Alley_nan”
to 1 and 0. A row with a missing alley type will set their values to 0 and 1.
"""
inputs = pd.get_dummies(inputs, dummy_na=True)
print(inputs)
"""
NumRooms Alley_Pave Alley_nan
0 3.0 1 0
1 2.0 0 1
2 4.0 0 1
3 3.0 0 1
"""
# 2.2.3 Conversion to the tensor format
# All entries in inputs and outputs are numerical now, so they can be
# converted to tensors.
import torch

# Request a float array explicitly: a frame that mixes float and boolean
# columns would otherwise become an object array, which torch.tensor rejects.
X = torch.tensor(inputs.to_numpy(dtype=float))
y = torch.tensor(outputs.to_numpy())
print(X, y)
# Expected output:
# tensor([[3., 1., 0.],
#         [2., 0., 1.],
#         [4., 0., 1.],
#         [3., 0., 1.]], dtype=torch.float64) tensor([127500, 106000, 178100, 140000])
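# Source: "Dive into Deep Learning" (PyTorch, latest edition) - data preprocessing
# Blog metadata: latest recommended article published on 2024-03-14 23:03:57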