# 定义解压.zip包函数
def unzip_file(zip_filepath, dest_path):
with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
zip_ref.extractall(dest_path)
# 对.zip进行解包
unzip_file('used_car_train_20200313.zip','./')
unzip_file('used_car_testB_20200421.zip','./')
# 定义One-Hot编码函数
def oneHotEncode(df, colNames):
for col in colNames:
dummies = pd.get_dummies(df[col], prefix=col)
df = pd.concat([df, dummies],axis=1)
df.drop([col], axis=1, inplace=True)
return df
# 处理离散数据
for col in cate_cols:
data[col] = data[col].fillna('-1')
data = oneHotEncode(data, cate_cols)
# 处理连续数据
for col in num_cols:
data[col] = data[col].fillna(0)
data[col] = (data[col]-data[col].min()) / (data[col].max()-data[col].min())
# 处理(可能)无关数据
data.drop(['name', 'regionCode'], axis=1, inplace=True)
data.columns
# 拿出测试集
data=data.reset_index(drop=True)
data = data.astype(float)
test_data = data[pd.isna(data.price)]
X_id=test_data['SaleID']
del test_data['SaleID']
del test_data['price']
X_result=torch.tensor(test_data.values, dtype=torch.float32)
test_data.to_csv('one_hot_testB.csv')
# 拿出训练集
train_data = data.drop(data[pd.isna(data.price)].index)
train_data.to_csv('one_hot_train.csv')
y=train_data['price']
del train_data['price']
del train_data['SaleID']
X=torch.tensor(train_data.values, dtype=torch.float32)
y=torch.Tensor(y)