#%%
from __future__ import absolute_import, division, print_function, unicode_literals
import pathlib
import os
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, losses
# Set the TensorFlow C++ log-level environment variable to '2'.
# NOTE(review): this runs after `import tensorflow`; the variable usually has
# to be set before the import to take effect - confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Report which TensorFlow version is in use.
print(tf.__version__)
# Fetch the Auto MPG (car fuel-efficiency) dataset from the UCI repository
# and show the local path it was stored at.
dataset_path = keras.utils.get_file(
    fname="auto-mpg.data",
    origin="http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data",
)
print(dataset_path)
# Column meanings, in file order: fuel efficiency (MPG, miles per gallon),
# number of cylinders, displacement, horsepower, weight, acceleration,
# model year, and origin.
column_names = [
    'MPG',
    'Cylinders',
    'Displacement',
    'Horsepower',
    'Weight',
    'Acceleration',
    'Model Year',
    'Origin',
]
# Parse the space-separated file: "?" is read as a missing value, and
# everything from a tab character onward on a line is ignored as a comment.
raw_dataset = pd.read_csv(
    dataset_path,
    sep=" ",
    skipinitialspace=True,
    comment='\t',
    na_values="?",
    names=column_names,
)
print(raw_dataset)
# Work on a copy so that raw_dataset keeps the data exactly as loaded.
dataset = raw_dataset.copy()
# Report how many missing values each column contains ...
print(dataset.isna().sum())
# ... and then actually remove the incomplete rows. The original comment
# announced this step, but the rows were never dropped.
dataset = dataset.dropna()
# 'Origin' is a categorical column whose codes 1, 2 and 3 stand for the
# place of manufacture: USA, Europe and Japan respectively.
# Remove it from the frame and replace it with one 0.0/1.0 indicator column
# per region (one-hot encoding).
origin = dataset.pop('Origin')
for region_code, region_name in {1: 'USA', 2: 'Europe', 3: 'Japan'}.items():
    dataset[region_name] = (origin == region_code) * 1.0
# Show the last rows to check the new indicator columns.
print(dataset.tail())
# Split the data into a training set (a reproducible random 80% sample) and
# a test set made of the remaining rows. train_dataset was previously used
# below without ever being defined, which raised a NameError.
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)
# Summary statistics of the training set.
train_stats = train_dataset.describe()
print(train_stats)
# MPG is the prediction target, so take its statistics out of the table;
# what remains describes only the input features X.
a = train_stats.pop("MPG")
print(a)
# Transpose so that each row is a feature and each column a statistic.
train_stats = train_stats.transpose()
print(train_stats)