1. 特征工程决定了模型效果的上限！
使用梯度下降算法时，需要先对数据进行标准化。
自己做的代码:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn import preprocessing
# Read the raw training and test tables from their CSV files.
train = pd.read_csv("C:\\Users\\Administrator.SC-201903262346\\Desktop\\train_set - 22.csv")
test = pd.read_csv("C:\\Users\\Administrator.SC-201903262346\\Desktop\\test_set -11.csv")

# Relabel the "unknown" job value as "blue-collar" in both tables, using one
# shared column-scoped mapping so the two replacements cannot drift apart.
_job_fix = {"job": {"unknown": "blue-collar"}}
train, test = (frame.replace(_job_fix) for frame in (train, test))
# Bucket age into three ordinal groups stored in a new "age_code" column:
#   0 -> age < 40, 1 -> 40 <= age < 60, 2 -> age >= 60.
# The middle bucket must combine its two bounds with `&`: with `|` the mask
# is true for every row and would overwrite the under-40 bucket with 1.
for _frame in (train, test):
    _frame.loc[_frame["age"] < 40, "age_code"] = 0
    _frame.loc[(_frame["age"] >= 40) & (_frame["age"] < 60), "age_code"] = 1
    _frame.loc[_frame["age"] >= 60, "age_code"] = 2
    # "age" is superseded by "age_code"; "ID", "day" and "month" are removed
    # so they are not fed to the model as features.
    _frame.drop(["age", "ID", "day", "month"], axis=1, inplace=True)
# Encode the "job" category as integer labels in a new "job_label" column.
# The encoder is fitted on the training data only and that same mapping is
# applied to the test data. Refitting on the test set would rebuild the mapping
# from the test categories, so the same job could receive different codes in
# the two tables. A job value that appears only in the test set now raises
# ValueError from `transform` instead of being silently mis-encoded.
le = preprocessing.LabelEncoder()
le_job = le.fit(train["job"])
job_label = le_job.transform(train["job"])
# Assign the array directly (positional) rather than via an index-aligned Series.
train["job_label"] = job_label

le_job_test = le_job  # same fitted encoder; name kept for compatibility
job_label_test = le_job.transform(test["job"])
test["job_label"] = job_label_test
job_label = pd.Series(job_label_test)
# One-hot encode the categorical columns. "loan" is listed explicitly in
# `columns`: the second positional argument of pd.get_dummies is `prefix`,
# not `columns`, so passing "loan" there would instead encode every remaining
# object column and give all of the resulting dummies a "loan_" prefix.
_categorical_columns = [
    "job_label",
    "marital",
    "education",
    "default",
    "housing",
    "contact",
    "poutcome",
    "loan",
]
train = pd.get_dummies(train, columns=_categorical_columns)
test = pd.get_dummies(test, columns=_categorical_columns)

# The raw string "job" column is already represented by the job_label dummies;
# drop it so no non-numeric column reaches the model.
train.drop("job", axis=1, inplace=True)
test.drop("job", axis=1, inplace=True)
# Standardise the numeric columns BEFORE building the feature matrix.
# `train.drop(...)` returns a copy, so scaling `train` after `x` has been
# extracted would leave `x` unscaled. The scaler is fitted on the training
# data and the same transform is applied to the test data so both tables are
# on the same scale at prediction time.
_numeric_columns = ["balance", "duration", "campaign", "pdays", "previous"]
SS = preprocessing.StandardScaler()
SS.fit(train[_numeric_columns])
train[_numeric_columns] = SS.transform(train[_numeric_columns])
test[_numeric_columns] = SS.transform(test[_numeric_columns])

# Feature matrix and target, taken from the already-scaled training table.
x = train.drop("y", axis=1)
y = train["y"]
# Fit a logistic regression on 70% of the training data and evaluate it on
# the held-out 30%. The split is seeded so the reported score is reproducible.
lr = LogisticRegression()
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)
lr.fit(X_train, y_train)
y_pre = lr.predict(X_test)

# Report the hold-out F1 score; a bare expression would be discarded when the
# file is run as a script.
print(f1_score(y_test, y_pre))

# Give the test table exactly the training feature columns, in the same order.
# A dummy column absent from the test data is filled with 0, and any column
# the model was not trained on is dropped.
test = test.reindex(columns=x.columns, fill_value=0)

# NOTE: `y` is rebound here from the training target to the test predictions.
y = lr.predict(test)
a = pd.Series(y)