数据来源
数据集是在 Kaggle 上下载的,或者来自某个数据社区;点赞后私信我即可获取数据集哦~
全流程
1,导入包
import warnings
from pandas_profiling import ProfileReport
from lightgbm import LGBMClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from collections import Counter
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Use seaborn's "darkgrid" theme for the plots drawn below.
sns.set_style("darkgrid")
# Suppress all warnings for the rest of the session.
warnings.filterwarnings("ignore")
# IPython/Jupyter magic (not plain Python): render matplotlib figures inline.
%matplotlib inline
2,导入数据并查看前五行数据
# Locations of the training and test CSV files on the author's machine.
train_csv = r'C:\Users\hp\Desktop\python\kaggle\train_set.csv'
test_csv = r'C:\Users\hp\Desktop\python\kaggle\test_set.csv'

# Load both splits into DataFrames: `df` is the training set, `tf` the test set.
df = pd.read_csv(train_csv)
tf = pd.read_csv(test_csv)

# Preview the first five training rows (displayed as the notebook cell output).
df.head()

3,据常识判断,利用df.drop()函数将无用变量删掉
# Columns judged uninformative; removed from both the training and test sets.
unused_columns = ['ID', 'marital', 'month']
df = df.drop(columns=unused_columns)
tf = tf.drop(columns=unused_columns)

# Preview the training frame after the drop.
df.head()
4,查看数据整体信息
# Print a summary of the training frame (columns, non-null counts, dtypes).
df.info()

可以看到,整个训练集共有25317条数据,并且无缺失值,因此省去了处理缺失值的步骤。
5,为方便处理,将取值为文本且类别较多的特征直接删掉。正常情况下,可采用独热编码或者二进制编码对这类特征加以处理
# Text-valued columns removed from both splits instead of being encoded.
text_columns = ['job', 'education']
df = df.drop(columns=text_columns)
tf = tf.drop(columns=text_columns)
6,查看剩余信息中的类别特征(图略)
# For each of three categorical features, count the target `y` within every
# level of the feature and draw the counts as a stacked bar chart.

# Target counts split by the `default` column.
Default_no = df.loc[df['default'] == 'no', 'y'].value_counts()
Default_yes = df.loc[df['default'] == 'yes', 'y'].value_counts()
dff = pd.DataFrame({'de': Default_yes, 'notde': Default_no})

# Target counts split by the `contact` column.
u = df.loc[df['contact'] == 'unknown', 'y'].value_counts()
c = df.loc[df['contact'] == 'cellular', 'y'].value_counts()
t = df.loc[df['contact'] == 'telephone', 'y'].value_counts()
dfff = pd.DataFrame({'u': u, 't': t, 'c': c})

# Target counts split by the `loan` column.
ln = df.loc[df['loan'] == 'no', 'y'].value_counts()
ly = df.loc[df['loan'] == 'yes', 'y'].value_counts()
dffff = pd.DataFrame({'yes': ly, 'no': ln})

# One stacked bar chart per feature, in the same order as above.
for frame in (dff, dfff, dffff):
    frame.plot(kind='bar', stacked=True)
plt.show()
7,将default,contact,loan特征进行编码
def binaryFeature(data)

最低0.47元/天 解锁文章

被折叠的 条评论
为什么被折叠?



