Kaggle: Categorical Feature Encoding Challenge II

import pandas as pd
import scipy.sparse
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder

# Load the competition data from the current working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Set aside the test ids (needed for the submission file) and the training
# labels; `pop` returns the column and removes it from the frame in one step.
id_test = df_test.pop('id')
y_train = df_train.pop('target')

# Remove the remaining columns that are not used as features.
# NOTE(review): the reason for excluding bin_3 and nom_6 is not shown here.
df_train = df_train.drop(columns=['id', 'bin_3', 'nom_6'])
df_test = df_test.drop(columns=['bin_3', 'nom_6'])

# Split the columns by dtype (as seen in the training frame) so that each
# group receives a fill value of the matching type.
obj = df_train.select_dtypes(include='object').columns
con = df_train.select_dtypes(exclude='object').columns

# Replace missing values with a sentinel: -1 for non-object columns and the
# string '-1' for object columns.  The fill is a single DataFrame-level call
# whose result is re-bound.  `df[col].fillna(..., inplace=True)` is avoided
# because it works on a temporary Series: it is deprecated in pandas 2.x and
# leaves the frame unchanged when Copy-on-Write is enabled.
fill_values = {col: -1 for col in con}
fill_values.update({col: '-1' for col in obj})
df_train = df_train.fillna(fill_values)
df_test = df_test.fillna(fill_values)

# Split `ord_5` into its first character (`ord_6`) and second character
# (`ord_7`), then discard the original column.  Assuming `ord_5` is an object
# column, its missing values are '-1' at this point and split into '-' / '1'.
for frame in (df_train, df_test):
    frame['ord_6'] = frame['ord_5'].map(lambda value: value[0])
    frame['ord_7'] = frame['ord_5'].map(lambda value: value[1])
    frame.drop(columns=['ord_5'], inplace=True)

# Column roles for encoding: `ord_7` is ordinal-encoded and then min-max
# scaled; every other remaining column is one-hot encoded.
obj = ['ord_7']
con = [col for col in df_train.columns if col not in obj]


def _fit_on_both_then_apply(transformer, columns):
    """Fit `transformer` on train+test `columns` and overwrite them in both frames.

    Fitting on the stacked rows of both frames means values that occur only
    in the test set are known at transform time.  Returns the fitted
    transformer.
    """
    stacked = pd.concat([df_train[columns], df_test[columns]], ignore_index=True)
    transformer.fit(stacked.to_numpy())
    for frame in (df_train, df_test):
        # Wrap the result with the frame's own index and column labels so the
        # assignment lines up row-for-row whatever index the frame carries.
        frame[columns] = pd.DataFrame(
            transformer.transform(frame[columns].to_numpy()),
            index=frame.index,
            columns=columns,
        )
    return transformer


# Named `ordinal_encoder` rather than `ord` so the builtin `ord` is not shadowed.
ordinal_encoder = _fit_on_both_then_apply(OrdinalEncoder(), obj)
# Rescale the ordinal codes with MinMaxScaler's default feature range.
scaler = _fit_on_both_then_apply(MinMaxScaler(), obj)

# One-hot encode the remaining columns, again fitting on train+test together.
# The encoder returns a sparse matrix by default; the explicit `sparse=True`
# keyword is omitted because scikit-learn deprecated and later removed it in
# favour of `sparse_output`.
hot = OneHotEncoder()
hot.fit(pd.concat([df_train[con], df_test[con]], ignore_index=True).to_numpy())
train = hot.transform(df_train[con].to_numpy())
test = hot.transform(df_test[con].to_numpy())

# Final design matrices: [one-hot block | scaled ord_7] in CSR format.
# NOTE: from here on `df_train` / `df_test` are sparse matrices, not DataFrames.
df_train = scipy.sparse.hstack([train, scipy.sparse.coo_matrix(df_train[obj])]).tocsr()
df_test = scipy.sparse.hstack([test, scipy.sparse.coo_matrix(df_test[obj])]).tocsr()

# L2-penalised logistic regression with a heavier weight on class 1.
model = LogisticRegression(
    penalty='l2',
    C=0.07334286390056426,
    class_weight={0: 1, 1: 1.32},
    random_state=42,
    solver='lbfgs',
    max_iter=2011,
    fit_intercept=True,
    verbose=0,
    n_jobs=-1,
)
model.fit(df_train, y_train)

# Take column 1 of `predict_proba` as the submitted `target` value and write
# it to `sub.csv`, one row per test id.
positive_proba = model.predict_proba(df_test)[:, 1]
mean_pred = pd.DataFrame(positive_proba, index=id_test, columns=['target'])
mean_pred.to_csv('sub.csv', index_label='id', index=True)

Score: 0.78797

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值