一段 Python 算法实战的代码
# coding: utf-8
# In[1]:
# Render matplotlib figures inline in the notebook. run_line_magic() is the
# supported replacement for the deprecated InteractiveShell.magic() call.
get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
# In[2]:
# Load the dataset; the first CSV column becomes the row index.
data = pd.read_csv("rankingcard.csv",index_col=0)
# In[5]:
# Remove duplicated rows.
data.drop_duplicates(inplace=True)
# In[6]:
data.info()
# In[10]:
# Fill missing NumberOfDependents with the column mean. The result is assigned
# back to the column: calling fillna(inplace=True) on the selection
# data['NumberOfDependents'] is a chained in-place operation, which pandas has
# deprecated and which leaves `data` unchanged under copy-on-write.
data['NumberOfDependents'] = data['NumberOfDependents'].fillna(data['NumberOfDependents'].mean())
# In[11]:
data.info()
# In[16]:
# Assemble the inputs for imputing MonthlyIncome: predictors are every column
# after the first except MonthlyIncome itself, plus SeriousDlqin2yrs.
y = data['SeriousDlqin2yrs']
x = data.iloc[:, 1:]
# In[17]:
y_fill = data['MonthlyIncome']
# In[34]:
predictor_columns = x.columns != 'MonthlyIncome'
X = pd.concat([x.loc[:, predictor_columns], y], axis=1)
Y = y_fill
# Rows with a known income form the training part; rows where it is missing
# form the part to be predicted.
income_known = y_fill.notnull()
X_train, Y_train = X[income_known], Y[income_known]
X_test, Y_test = X[~income_known], Y[~income_known]
# In[36]:
# Impute the missing MonthlyIncome values with a random-forest regressor.
from sklearn.ensemble import RandomForestRegressor
# Seed the forest like the other models in this script (random_state=10) so the
# imputed incomes, and every result computed from them, are reproducible.
rfc_mode = RandomForestRegressor(n_estimators=10, random_state=10)
rfc_mode.fit(X_train,Y_train)
# In[37]:
Y_predict = rfc_mode.predict(X_test)
# In[38]:
Y_predict
# In[39]:
# Write the predictions into the rows whose MonthlyIncome is missing; this mask
# picks the same rows, in the same order, as the one used to build X_test.
data.loc[data['MonthlyIncome'].isnull(),"MonthlyIncome"] = Y_predict
# In[40]:
data.info()
# In[43]:
# Count the rows whose age equals 0 (shown as the cell output), then drop them.
data["age"].eq(0).sum()
data = data.loc[data['age'].ne(0)]
# In[45]:
# Renumber the remaining rows 0..n-1.
data = data.reset_index(drop=True)
# In[54]:
# Target is SeriousDlqin2yrs; features are every column after the first.
y_data = data['SeriousDlqin2yrs']
x_data = data.iloc[:, 1:]
# In[55]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.3,
    random_state=10,
)
# In[56]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 50 trees, fixed seed, using all available cores.
forst_mode = RandomForestClassifier(n_jobs=-1, random_state=10, n_estimators=50)
forst_mode.fit(x_train, y_train)
# In[57]:
yp = forst_mode.predict(x_test)
# In[58]:
def cm_plot(y, yp):
    """Plot the confusion matrix of true labels ``y`` against predictions ``yp``.

    Every cell is shaded by its count and annotated with that count. True
    labels run along the x axis and predicted labels along the y axis.

    Args:
        y: True labels.
        yp: Predicted labels, in the same order as ``y``.

    Returns:
        The ``matplotlib.pyplot`` module, so the caller can show or save the
        figure.
    """
    from sklearn.metrics import confusion_matrix
    import matplotlib.pyplot as plt

    # confusion_matrix() indexes its result as cm[true, predicted].
    cm = confusion_matrix(y, yp)
    # matshow() draws element [row, col] at (x=col, y=row). Plotting the
    # transpose puts cm[true, pred] at (x=true, y=pred), exactly where its
    # count is annotated below; plotting cm itself shaded each off-diagonal
    # cell with the other off-diagonal cell's count.
    plt.matshow(cm.T, cmap=plt.cm.Greens)
    plt.colorbar()
    # Dedicated index names avoid rebinding the parameter ``y`` inside the loop.
    for true_idx in range(cm.shape[0]):
        for pred_idx in range(cm.shape[1]):
            plt.annotate(
                str(cm[true_idx, pred_idx]),
                xy=(true_idx, pred_idx),
                horizontalalignment='center',
                verticalalignment='center',
            )
    plt.ylabel('Predicted label')
    plt.xlabel('True label')
    return plt
# In[59]:
# Plot the confusion matrix of the predictions against the held-out labels.
cm_plot(y_test,yp)
# In[62]:
data.shape
# In[63]:
# Hand-typed ratio; presumably the number of class-1 rows (10009) over the
# total number of rows (149390) -- TODO confirm against the data.
## 1 10009
10009/149390
# In[64]:
from imblearn.over_sampling import SMOTE
# In[65]:
sm = SMOTE(random_state=10)
# In[66]:
# Oversample with SMOTE. fit_resample() is the current imbalanced-learn API;
# the old fit_sample() alias was deprecated and then removed.
# NOTE(review): resampling happens before the train/test split, so synthetic
# points interpolated from rows that end up in the test set can leak into
# training, and the test set itself contains synthetic rows. The scores below
# are therefore likely optimistic; consider splitting first and resampling
# only the training part.
x_new_data,y_new_data = sm.fit_resample(x_data,y_data)
# In[70]:
from sklearn.model_selection import train_test_split
# Same 70/30 split and seed as before, now on the resampled data.
x_train,x_test,y_train,y_test = train_test_split(x_new_data,y_new_data,test_size = 0.3,random_state = 10)
# In[71]:
from sklearn.ensemble import RandomForestClassifier
forst_mode = RandomForestClassifier(n_estimators=50,random_state=10,n_jobs=-1)
forst_mode.fit(x_train,y_train)
# In[73]:
yp = forst_mode.predict(x_test)
# In[74]:
cm_plot(y_test,yp)
# In[78]:
# Hand-typed arithmetic; the counts were presumably read off the confusion
# matrix plotted above -- TODO confirm.
# Two cells over the total 83629: looks like overall accuracy.
(38379+38074)/83629
# In[80]:
# One cell over the sum of itself and another cell: a precision- or
# recall-style ratio, depending on which cells these are.
38074/(38074+3771)
逐句解析
# Render matplotlib figures inline in the notebook. run_line_magic() is the
# supported replacement for the deprecated InteractiveShell.magic() call.
get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
# Load the dataset; the first CSV column becomes the row index.
data = pd.read_csv("rankingcard.csv",index_col=0)
data.info()
# Remove duplicated rows.
data.drop_duplicates(inplace=True)
data.info()
# Fill missing NumberOfDependents with the column mean. The result is assigned
# back to the column: calling fillna(inplace=True) on the selection
# data['NumberOfDependents'] is a chained in-place operation, which pandas has
# deprecated and which leaves `data` unchanged under copy-on-write.
data['NumberOfDependents'] = data['NumberOfDependents'].fillna(data['NumberOfDependents'].mean())
data.info()
# Fill the missing MonthlyIncome values with a random forest.
# Predictors are every column after the first except MonthlyIncome itself,
# plus SeriousDlqin2yrs.
y = data['SeriousDlqin2yrs']
x = data.iloc[:, 1:]
# In[17]:
y_fill = data['MonthlyIncome']
# In[34]:
predictor_columns = x.columns != 'MonthlyIncome'
X = pd.concat([x.loc[:, predictor_columns], y], axis=1)
Y = y_fill
# Rows with a known income form the training part; rows where it is missing
# form the part to be predicted.
income_known = y_fill.notnull()
X_train, Y_train = X[income_known], Y[income_known]
X_test, Y_test = X[~income_known], Y[~income_known]
# Random-forest regressor that predicts the missing MonthlyIncome values.
from sklearn.ensemble import RandomForestRegressor
# Seed the forest like the other models in this script (random_state=10) so the
# imputed incomes, and every result computed from them, are reproducible.
rfc_mode = RandomForestRegressor(n_estimators=10, random_state=10)
rfc_mode.fit(X_train,Y_train)
Y_predict = rfc_mode.predict(X_test)
# Write the predictions into the rows whose MonthlyIncome is missing; this mask
# picks the same rows, in the same order, as the one used to build X_test.
data.loc[data['MonthlyIncome'].isnull(),"MonthlyIncome"] = Y_predict
# Outlier handling: count the rows whose age equals 0, then drop them.
data["age"].eq(0).sum()
data = data.loc[data['age'].ne(0)]
# Renumber the remaining rows 0..n-1 (the original author's note questioned
# whether this line is needed).
data = data.reset_index(drop=True)
# Target is SeriousDlqin2yrs; features are every column after the first.
y_data = data['SeriousDlqin2yrs']
x_data = data.iloc[:, 1:]
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.3,
    random_state=10,
)
from sklearn.ensemble import RandomForestClassifier
# Random forest with 50 trees, fixed seed, using all available cores.
forst_mode = RandomForestClassifier(n_jobs=-1, random_state=10, n_estimators=50)
forst_mode.fit(x_train, y_train)
# In[57]:
yp = forst_mode.predict(x_test)
# In[58]:
def cm_plot(y, yp):
    """Plot the confusion matrix of true labels ``y`` against predictions ``yp``.

    Every cell is shaded by its count and annotated with that count. True
    labels run along the x axis and predicted labels along the y axis.

    Args:
        y: True labels.
        yp: Predicted labels, in the same order as ``y``.

    Returns:
        The ``matplotlib.pyplot`` module, so the caller can show or save the
        figure.
    """
    from sklearn.metrics import confusion_matrix
    import matplotlib.pyplot as plt

    # confusion_matrix() indexes its result as cm[true, predicted].
    cm = confusion_matrix(y, yp)
    # matshow() draws element [row, col] at (x=col, y=row). Plotting the
    # transpose puts cm[true, pred] at (x=true, y=pred), exactly where its
    # count is annotated below; plotting cm itself shaded each off-diagonal
    # cell with the other off-diagonal cell's count.
    plt.matshow(cm.T, cmap=plt.cm.Greens)
    plt.colorbar()
    # Dedicated index names avoid rebinding the parameter ``y`` inside the loop.
    for true_idx in range(cm.shape[0]):
        for pred_idx in range(cm.shape[1]):
            plt.annotate(
                str(cm[true_idx, pred_idx]),
                xy=(true_idx, pred_idx),
                horizontalalignment='center',
                verticalalignment='center',
            )
    plt.ylabel('Predicted label')
    plt.xlabel('True label')
    return plt
# In[59]:
# Plot the confusion matrix of the first model on its held-out split.
cm_plot(y_test,yp)
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=10)
# Oversample with SMOTE. fit_resample() is the current imbalanced-learn API;
# the old fit_sample() alias was deprecated and then removed.
# NOTE(review): resampling happens before the train/test split, so synthetic
# points interpolated from rows that end up in the test set can leak into
# training, and the test set itself contains synthetic rows. The scores below
# are therefore likely optimistic; consider splitting first and resampling
# only the training part.
x_new_data,y_new_data = sm.fit_resample(x_data,y_data)
# In[70]:
from sklearn.model_selection import train_test_split
# Same 70/30 split and seed as before, now on the resampled data.
x_train,x_test,y_train,y_test = train_test_split(x_new_data,y_new_data,test_size = 0.3,random_state = 10)
from sklearn.ensemble import RandomForestClassifier
forst_mode = RandomForestClassifier(n_estimators=50,random_state=10,n_jobs=-1)
forst_mode.fit(x_train,y_train)
# In[73]:
yp = forst_mode.predict(x_test)
# In[74]:
cm_plot(y_test,yp)