一段 Python 算法实战的代码

最新推荐文章于 2023-11-29 10:38:03 发布

AnalogElectronic

最新推荐文章于 2023-11-29 10:38:03 发布

阅读量462

点赞数

分类专栏： python数据人工智能

本文链接：https://blog.csdn.net/AnalogElectronic/article/details/112195330

版权

python数据人工智能专栏收录该内容

23 篇文章 0 订阅

订阅专栏

一段 Python 算法实战的代码


# coding: utf-8

# In[1]:


# Render matplotlib figures inline in the notebook output.
# InteractiveShell.magic() is deprecated; run_line_magic() is the supported API.
get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR


# In[2]:


# Read the dataset; the first CSV column becomes the row index.
data = pd.read_csv("rankingcard.csv", index_col=0)


# In[5]:


# Keep only the first occurrence of each fully duplicated row.
data = data.drop_duplicates()


# In[6]:


# Print column dtypes and non-null counts.
data.info()


# In[10]:


# Fill missing NumberOfDependents values with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: the in-place form operates on
# an intermediate object, emits a chained-assignment warning on recent pandas
# and leaves `data` unchanged when Copy-on-Write is enabled.
data['NumberOfDependents'] = data['NumberOfDependents'].fillna(
    data['NumberOfDependents'].mean()
)


# In[11]:


# Print the non-null counts again to check the result of the fill.
data.info()


# In[16]:


# Impute missing MonthlyIncome values with a random-forest regressor that is
# trained on the rows where MonthlyIncome is present.
# NOTE(review): iloc[:, 1:] assumes SeriousDlqin2yrs is the first column -- confirm.
x = data.iloc[:,1:]
y = data['SeriousDlqin2yrs']


# In[17]:


# The column being imputed acts as the regression target.
y_fill = data['MonthlyIncome']


# In[34]:


# Predictors: every column of `x` except MonthlyIncome, plus SeriousDlqin2yrs.
X = pd.concat([x.loc[:,x.columns !='MonthlyIncome'],y],axis=1)
Y = y_fill

# Compute the null mask once and reuse it for the split and the write-back.
income_missing = y_fill.isnull()

X_train = X[~income_missing]
X_test = X[income_missing]
Y_train = Y[~income_missing]
Y_test = Y[income_missing]  # all-NaN by construction; kept for inspection only


# In[36]:


from sklearn.ensemble import  RandomForestRegressor
# Seed the regressor so the imputed incomes are reproducible; 10 matches the
# random_state used by the train/test split and classifier later in the script.
rfc_mode = RandomForestRegressor(n_estimators=10, random_state=10)
rfc_mode.fit(X_train,Y_train)


# In[37]:


# Predict MonthlyIncome for the rows where it is missing.
Y_predict = rfc_mode.predict(X_test)


# In[38]:


Y_predict


# In[39]:


# Write the predictions into the rows that had no MonthlyIncome.
data.loc[income_missing,"MonthlyIncome"] = Y_predict


# In[40]:


data.info()


# In[43]:


# Count the rows whose age is 0 (the value is only displayed in a notebook),
# then drop those rows.
(data["age"] == 0).sum()
data = data[~(data['age'] == 0)]


# In[45]:


# Renumber the row labels 0..n-1 now that rows have been removed.
data.index = range(len(data))


# In[54]:


# Features are every column after the first; the target is SeriousDlqin2yrs.
x_data = data.iloc[:, 1:]
y_data = data['SeriousDlqin2yrs']


# In[55]:


from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.3,
    random_state=10,
)


# In[56]:


from sklearn.ensemble import RandomForestClassifier
# 50-tree random forest, seeded, using all available cores.
forst_mode = RandomForestClassifier(
    n_estimators=50,
    random_state=10,
    n_jobs=-1,
)
forst_mode.fit(x_train, y_train)


# In[57]:


# Class predictions for the held-out rows.
yp = forst_mode.predict(x_test)


# In[58]:


def cm_plot(y, yp):
    """Plot the confusion matrix of true labels ``y`` against predictions ``yp``.

    The true label runs along the x axis and the predicted label along the
    y axis: the cell drawn at (x=i, y=j) shows the number of samples whose
    true class index is ``i`` and whose predicted class index is ``j``.

    Args:
        y: True labels, as accepted by ``sklearn.metrics.confusion_matrix``.
        yp: Predicted labels, aligned with ``y``.

    Returns:
        The ``matplotlib.pyplot`` module, with the confusion-matrix figure as
        the current figure, so the caller can chain ``.show()`` or
        ``.savefig(...)``.
    """
    from sklearn.metrics import confusion_matrix  # confusion-matrix helper
    import matplotlib.pyplot as plt

    cm = confusion_matrix(y, yp)  # cm[i, j]: true class i, predicted class j
    # matshow draws element [row, col] at (x=col, y=row). The counts below are
    # annotated at (x=true, y=predicted), so the transpose is plotted to make
    # each cell's colour agree with the number written on it and with the
    # axis labels.
    plt.matshow(cm.T, cmap=plt.cm.Greens)
    plt.colorbar()
    # Distinct loop names avoid shadowing the ``y`` parameter.
    for true_idx, row in enumerate(cm):
        for pred_idx, count in enumerate(row):
            plt.annotate(str(count), xy=(true_idx, pred_idx), horizontalalignment='center', verticalalignment='center')
    plt.ylabel('Predicted label')
    plt.xlabel('True label')
    return plt


# In[59]:


# Plot the confusion matrix of the held-out labels against the predictions of
# the classifier trained on the original (not resampled) data.
cm_plot(y_test,yp)


# In[62]:


# (rows, columns) of the cleaned dataset.
data.shape


# In[63]:


## 1 10009
# NOTE(review): presumably 10009 is the number of rows with
# SeriousDlqin2yrs == 1 and 149390 the total row count, making this the
# positive-class share -- confirm against the data.shape output above.
10009/149390


# In[64]:


from imblearn.over_sampling import SMOTE


# In[65]:


# SMOTE oversampler with a fixed seed.
sm = SMOTE(random_state=10)


# In[66]:


# fit_sample() has been removed from imbalanced-learn; fit_resample() is the
# supported method and returns the resampled features and labels.
# NOTE(review): resampling happens before the train/test split below, so
# synthetic samples derived from rows that land in the test set can influence
# training, and the resulting scores are likely optimistic. Consider splitting
# first and resampling only the training part.
x_new_data,y_new_data = sm.fit_resample(x_data,y_data)


# In[70]:


from sklearn.model_selection import train_test_split
# Hold out 30% of the resampled rows for evaluation, with a fixed seed.
x_train,x_test,y_train,y_test = train_test_split(x_new_data,y_new_data,test_size = 0.3,random_state = 10)


# In[71]:


from sklearn.ensemble import RandomForestClassifier
# Same classifier configuration as the baseline run, fitted on resampled data.
forst_mode = RandomForestClassifier(n_estimators=50,random_state=10,n_jobs=-1)
forst_mode.fit(x_train,y_train)


# In[73]:


yp = forst_mode.predict(x_test)


# In[74]:


# Confusion matrix for the classifier trained on the resampled data.
cm_plot(y_test,yp)


# In[78]:


# NOTE(review): these literals look like counts read off the confusion matrix
# plotted above; presumably this is overall accuracy (correct predictions
# divided by 83629 test samples) -- confirm.
(38379+38074)/83629


# In[80]:


# NOTE(review): presumably the share of correctly identified positives among
# 38074 + 3771 samples (recall or precision, depending on which axis 3771 was
# read from) -- confirm.
38074/(38074+3771)

逐句解析

# Render matplotlib figures inline in the notebook output.
# InteractiveShell.magic() is deprecated; run_line_magic() is the supported API.
get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
# Read the dataset; the first CSV column becomes the row index.
data = pd.read_csv("rankingcard.csv",index_col=0)
# Print column dtypes and non-null counts.
data.info()

在这里插入图片描述

## Remove duplicate rows, keeping the first occurrence of each.
data = data.drop_duplicates()
# Print the non-null counts again to see the effect.
data.info()

在这里插入图片描述

# Fill missing NumberOfDependents values with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: the in-place form operates on
# an intermediate object, emits a chained-assignment warning on recent pandas
# and leaves `data` unchanged when Copy-on-Write is enabled.
data['NumberOfDependents'] = data['NumberOfDependents'].fillna(
    data['NumberOfDependents'].mean()
)
data.info()

在这里插入图片描述

# Fill missing MonthlyIncome values using a random forest trained on the rows
# where MonthlyIncome is present.
# NOTE(review): iloc[:, 1:] assumes SeriousDlqin2yrs is the first column -- confirm.
x = data.iloc[:,1:]
y = data['SeriousDlqin2yrs']


# In[17]:


# The column being imputed acts as the regression target.
y_fill = data['MonthlyIncome']


# In[34]:


# Predictors: every column of `x` except MonthlyIncome, plus SeriousDlqin2yrs.
X = pd.concat([x.loc[:,x.columns !='MonthlyIncome'],y],axis=1)
Y = y_fill

# Compute the null mask once and reuse it for the split and the write-back.
income_missing = y_fill.isnull()

X_train = X[~income_missing]
X_test = X[income_missing]
Y_train = Y[~income_missing]
Y_test = Y[income_missing]  # all-NaN by construction; kept for inspection only
# Random-forest regressor that predicts the missing target values.
from sklearn.ensemble import  RandomForestRegressor
# Seed the regressor so the imputed incomes are reproducible; 10 matches the
# random_state used by the train/test split and classifier later in the script.
rfc_mode = RandomForestRegressor(n_estimators=10, random_state=10)
rfc_mode.fit(X_train,Y_train)
Y_predict = rfc_mode.predict(X_test)
# Write the predictions into the rows that had no MonthlyIncome.
data.loc[income_missing,"MonthlyIncome"] = Y_predict

# Outlier handling: count the rows whose age is 0 (the value is only displayed
# in a notebook), then drop those rows.
(data["age"] == 0).sum()

data = data[~(data['age'] == 0)]
# Renumber the row labels 0..n-1 now that rows have been removed.
data.index = range(len(data))
# Features are every column after the first; the target is SeriousDlqin2yrs.
x_data = data.iloc[:, 1:]
y_data = data['SeriousDlqin2yrs']

from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.3,
    random_state=10,
)

from sklearn.ensemble import RandomForestClassifier
# 50-tree random forest, seeded, using all available cores.
forst_mode = RandomForestClassifier(
    n_estimators=50,
    random_state=10,
    n_jobs=-1,
)
forst_mode.fit(x_train, y_train)


# In[57]:


# Class predictions for the held-out rows.
yp = forst_mode.predict(x_test)


# In[58]:


def cm_plot(y, yp):
    """Plot the confusion matrix of true labels ``y`` against predictions ``yp``.

    The true label runs along the x axis and the predicted label along the
    y axis: the cell drawn at (x=i, y=j) shows the number of samples whose
    true class index is ``i`` and whose predicted class index is ``j``.

    Args:
        y: True labels, as accepted by ``sklearn.metrics.confusion_matrix``.
        yp: Predicted labels, aligned with ``y``.

    Returns:
        The ``matplotlib.pyplot`` module, with the confusion-matrix figure as
        the current figure, so the caller can chain ``.show()`` or
        ``.savefig(...)``.
    """
    from sklearn.metrics import confusion_matrix  # confusion-matrix helper
    import matplotlib.pyplot as plt

    cm = confusion_matrix(y, yp)  # cm[i, j]: true class i, predicted class j
    # matshow draws element [row, col] at (x=col, y=row). The counts below are
    # annotated at (x=true, y=predicted), so the transpose is plotted to make
    # each cell's colour agree with the number written on it and with the
    # axis labels.
    plt.matshow(cm.T, cmap=plt.cm.Greens)
    plt.colorbar()
    # Distinct loop names avoid shadowing the ``y`` parameter.
    for true_idx, row in enumerate(cm):
        for pred_idx, count in enumerate(row):
            plt.annotate(str(count), xy=(true_idx, pred_idx), horizontalalignment='center', verticalalignment='center')
    plt.ylabel('Predicted label')
    plt.xlabel('True label')
    return plt


# In[59]:


# Plot the confusion matrix of the held-out labels against the predictions.
cm_plot(y_test,yp)

在这里插入图片描述


from imblearn.over_sampling import SMOTE

# SMOTE oversampler with a fixed seed.
sm = SMOTE(random_state=10)
# fit_sample() has been removed from imbalanced-learn; fit_resample() is the
# supported method and returns the resampled features and labels.
# NOTE(review): resampling happens before the train/test split below, so
# synthetic samples derived from rows that land in the test set can influence
# training, and the resulting scores are likely optimistic. Consider splitting
# first and resampling only the training part.
x_new_data,y_new_data = sm.fit_resample(x_data,y_data)


# In[70]:


from sklearn.model_selection import train_test_split
# Hold out 30% of the resampled rows for evaluation, with a fixed seed.
x_train,x_test,y_train,y_test = train_test_split(x_new_data,y_new_data,test_size = 0.3,random_state = 10)

from sklearn.ensemble import RandomForestClassifier
# Same classifier configuration as the baseline run, fitted on resampled data.
forst_mode = RandomForestClassifier(n_estimators=50,random_state=10,n_jobs=-1)
forst_mode.fit(x_train,y_train)


# In[73]:


yp = forst_mode.predict(x_test)


# In[74]:

# Confusion matrix for the classifier trained on the resampled data.
cm_plot(y_test,yp)