阿里二手车

%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import zipfile  
import re
import numpy as np
import torch
# Helper for unpacking a .zip archive
def unzip_file(zip_filepath, dest_path):
    """Extract every member of a zip archive into a directory.

    Args:
        zip_filepath: Path of the .zip file, opened for reading.
        dest_path: Directory the archive members are extracted into.
    """
    archive = zipfile.ZipFile(zip_filepath, mode='r')
    # The context manager closes the archive even if extraction fails.
    with archive:
        archive.extractall(path=dest_path)
# Unpack the training and test archives into the current directory
unzip_file('used_car_train_20200313.zip', './')
unzip_file('used_car_testB_20200421.zip', './')
# The extracted files are space-separated; load each one and save a copy
train_data = pd.read_csv('used_car_train_20200313.csv', sep=' ')
test_data = pd.read_csv('used_car_testB_20200421.csv', sep=' ')
train_data.to_csv('used_car_train.csv')
test_data.to_csv('used_car_testB.csv')
# Stack the train and test rows into a single frame
data = pd.concat([train_data, test_data])
# '-' entries become '-1' so that notRepairedDamage can be cast to a number
data = data.replace(to_replace='-', value='-1')
data['notRepairedDamage'] = data['notRepairedDamage'].astype('float32')
# Cap power at 600
data.loc[data['power'].gt(600), 'power'] = 600
# Columns that get one-hot encoded
cate_cols = [
    'model',
    'brand',
    'bodyType',
    'fuelType',
    'gearbox',
    'seller',
    'notRepairedDamage',
]
# Columns that get min-max scaled: four named features plus v_0 .. v_14
num_cols = ['regDate', 'creatDate', 'power', 'kilometer'] + [f'v_{i}' for i in range(15)]
# One-hot encoding helper
def oneHotEncode(df, colNames):
    """Replace the given columns of ``df`` with one-hot indicator columns.

    Each column named in ``colNames`` is expanded with ``pd.get_dummies``
    (indicator columns are prefixed with the source column name) and the
    source column itself is removed.  The remaining columns keep their
    original order; the indicator columns are appended after them in the
    order the names appear in ``colNames``.

    Args:
        df: Input DataFrame; it is not modified.
        colNames: Iterable of column names to encode.

    Returns:
        A new DataFrame with the encoded columns, or ``df`` itself when
        ``colNames`` is empty.
    """
    colNames = list(colNames)
    if not colNames:
        return df
    # Build every indicator frame first and concatenate once; concatenating
    # inside the loop would copy the whole frame once per encoded column.
    dummies = [pd.get_dummies(df[col], prefix=col) for col in colNames]
    return pd.concat([df.drop(colNames, axis=1)] + dummies, axis=1)
# Categorical features: replace missing values with the placeholder '-1'
# (so they end up in an indicator column of their own), then one-hot encode.
for col in cate_cols:
    data[col] = data[col].fillna('-1')
data = oneHotEncode(data, cate_cols)

# Continuous features: fill missing values with 0, then min-max scale.
for col in num_cols:
    data[col] = data[col].fillna(0)
    col_min = data[col].min()
    col_range = data[col].max() - col_min
    if col_range == 0:
        # A constant column has zero range; dividing by it would turn the
        # whole column into NaN, so map it to 0 instead.
        data[col] = 0.0
    else:
        data[col] = (data[col] - col_min) / col_range

# Drop columns that are (possibly) irrelevant
data.drop(['name', 'regionCode'], axis=1, inplace=True)

data.columns
Index(['SaleID', 'regDate', 'power', 'kilometer', 'offerType', 'creatDate',
       'price', 'v_0', 'v_1', 'v_2',
       ...
       'fuelType_6.0', 'fuelType_-1', 'gearbox_0.0', 'gearbox_1.0',
       'gearbox_-1', 'seller_0', 'seller_1', 'notRepairedDamage_-1.0',
       'notRepairedDamage_0.0', 'notRepairedDamage_1.0'],
      dtype='object', length=336)
# Extract the test set: the rows whose price is missing
data = data.reset_index(drop=True)
data = data.astype(float)
# Copy the selection so the column deletions below act on an independent
# frame instead of a slice of `data` (avoids SettingWithCopyWarning).
test_data = data[pd.isna(data.price)].copy()
X_id = test_data['SaleID']
del test_data['SaleID']
del test_data['price']
X_result = torch.tensor(test_data.values, dtype=torch.float32)
test_data.to_csv('one_hot_testB.csv')
# Extract the training set: the rows that have a price
train_data = data.drop(data[pd.isna(data.price)].index)
train_data.to_csv('one_hot_train.csv')
y = train_data['price']
del train_data['price']
del train_data['SaleID']
X = torch.tensor(train_data.values, dtype=torch.float32)
# Build the target tensor the same way as X rather than through the legacy
# torch.Tensor constructor.
y = torch.tensor(y.values, dtype=torch.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=512)
# Random forest regression model. n_jobs=-1 builds the trees on all cores;
# with the default setting the trees are fitted one after another.
lr1 = RandomForestRegressor(n_jobs=-1).fit(X_train, y_train)
---------------------------------------------------------------------------

KeyboardInterrupt                         Traceback (most recent call last)

Cell In[12], line 1
----> 1 lr1=RandomForestRegressor().fit(X_train,y_train)


File ~\anaconda3\envs\pytorch\lib\site-packages\sklearn\base.py:1152, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1145     estimator._validate_params()
   1147 with config_context(
   1148     skip_parameter_validation=(
   1149         prefer_skip_nested_validation or global_skip_validation
   1150     )
   1151 ):
-> 1152     return fit_method(estimator, *args, **kwargs)


File ~\anaconda3\envs\pytorch\lib\site-packages\sklearn\ensemble\_forest.py:456, in BaseForest.fit(self, X, y, sample_weight)
    445 trees = [
    446     self._make_estimator(append=False, random_state=random_state)
    447     for i in range(n_more_estimators)
    448 ]
    450 # Parallel loop: we prefer the threading backend as the Cython code
    451 # for fitting the trees is internally releasing the Python GIL
    452 # making threading more efficient than multiprocessing in
    453 # that case. However, for joblib 0.12+ we respect any
    454 # parallel_backend contexts set at a higher level,
    455 # since correctness does not rely on using threads.
--> 456 trees = Parallel(
    457     n_jobs=self.n_jobs,
    458     verbose=self.verbose,
    459     prefer="threads",
    460 )(
    461     delayed(_parallel_build_trees)(
    462         t,
    463         self.bootstrap,
    464         X,
    465         y,
    466         sample_weight,
    467         i,
    468         len(trees),
    469         verbose=self.verbose,
    470         class_weight=self.class_weight,
    471         n_samples_bootstrap=n_samples_bootstrap,
    472     )
    473     for i, t in enumerate(trees)
    474 )
    476 # Collect newly grown trees
    477 self.estimators_.extend(trees)


File ~\anaconda3\envs\pytorch\lib\site-packages\sklearn\utils\parallel.py:65, in Parallel.__call__(self, iterable)
     60 config = get_config()
     61 iterable_with_config = (
     62     (_with_config(delayed_func, config), args, kwargs)
     63     for delayed_func, args, kwargs in iterable
     64 )
---> 65 return super().__call__(iterable_with_config)


File ~\anaconda3\envs\pytorch\lib\site-packages\joblib\parallel.py:1918, in Parallel.__call__(self, iterable)
   1916     output = self._get_sequential_output(iterable)
   1917     next(output)
-> 1918     return output if self.return_generator else list(output)
   1920 # Let's create an ID that uniquely identifies the current call. If the
   1921 # call is interrupted early and that the same instance is immediately
   1922 # re-used, this id will be used to prevent workers that were
   1923 # concurrently finalizing a task from the previous call to run the
   1924 # callback.
   1925 with self._lock:


File ~\anaconda3\envs\pytorch\lib\site-packages\joblib\parallel.py:1847, in Parallel._get_sequential_output(self, iterable)
   1845 self.n_dispatched_batches += 1
   1846 self.n_dispatched_tasks += 1
-> 1847 res = func(*args, **kwargs)
   1848 self.n_completed_tasks += 1
   1849 self.print_progress()


File ~\anaconda3\envs\pytorch\lib\site-packages\sklearn\utils\parallel.py:127, in _FuncWrapper.__call__(self, *args, **kwargs)
    125     config = {}
    126 with config_context(**config):
--> 127     return self.function(*args, **kwargs)


File ~\anaconda3\envs\pytorch\lib\site-packages\sklearn\ensemble\_forest.py:188, in _parallel_build_trees(tree, bootstrap, X, y, sample_weight, tree_idx, n_trees, verbose, class_weight, n_samples_bootstrap)
    185     elif class_weight == "balanced_subsample":
    186         curr_sample_weight *= compute_sample_weight("balanced", y, indices=indices)
--> 188     tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
    189 else:
    190     tree.fit(X, y, sample_weight=sample_weight, check_input=False)


File ~\anaconda3\envs\pytorch\lib\site-packages\sklearn\base.py:1152, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1145     estimator._validate_params()
   1147 with config_context(
   1148     skip_parameter_validation=(
   1149         prefer_skip_nested_validation or global_skip_validation
   1150     )
   1151 ):
-> 1152     return fit_method(estimator, *args, **kwargs)


File ~\anaconda3\envs\pytorch\lib\site-packages\sklearn\tree\_classes.py:1320, in DecisionTreeRegressor.fit(self, X, y, sample_weight, check_input)
   1290 @_fit_context(prefer_skip_nested_validation=True)
   1291 def fit(self, X, y, sample_weight=None, check_input=True):
   1292     """Build a decision tree regressor from the training set (X, y).
   1293 
   1294     Parameters
   (...)
   1317         Fitted estimator.
   1318     """
-> 1320     super()._fit(
   1321         X,
   1322         y,
   1323         sample_weight=sample_weight,
   1324         check_input=check_input,
   1325     )
   1326     return self


File ~\anaconda3\envs\pytorch\lib\site-packages\sklearn\tree\_classes.py:443, in BaseDecisionTree._fit(self, X, y, sample_weight, check_input, missing_values_in_feature_mask)
    432 else:
    433     builder = BestFirstTreeBuilder(
    434         splitter,
    435         min_samples_split,
   (...)
    440         self.min_impurity_decrease,
    441     )
--> 443 builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask)
    445 if self.n_outputs_ == 1 and is_classifier(self):
    446     self.n_classes_ = self.n_classes_[0]


KeyboardInterrupt: 
lr2 = LinearRegression().fit(X_train, y_train)  # linear regression model
# Scores are printed for the random forest (lr1) first, then for the
# linear regression (lr2); each pair is train split, then held-out split.
print('训练集得分:{:.3f}'.format(lr1.score(X_train, y_train)))
print('测试集得分:{:.3f}'.format(lr1.score(X_test, y_test)))
print('训练集得分:{:.3f}'.format(lr2.score(X_train, y_train)))
print('测试集得分:{:.3f}'.format(lr2.score(X_test, y_test)))
# Predict prices for the unlabeled rows with the random forest
pred = lr1.predict(X_result)
res = pd.DataFrame(pred, columns=['price'])
# Re-number the ids from 0 so they line up row-by-row with `res`
X_id = X_id.reset_index(drop=True)
# SaleID was converted to float together with the rest of the frame
# (`data.astype(float)`); write it out as an integer id, not e.g. 200000.0.
submission = pd.concat([X_id.astype('int64'), res['price']], axis=1)
submission.to_csv('submission.csv', index=False)
submission

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值