# IBM HR attrition data analysis (part 1)


import numpy as np 
import pandas as pd
from subprocess import check_output

import plotly.offline as py
py.init_notebook_mode(connected=True)

import seaborn as sns
import matplotlib.pyplot as plt
# %matplotlib inline  -- IPython notebook magic; not valid in a plain .py script

from sklearn.metrics import f1_score
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, log_loss
# Load the IBM HR employee-attrition dataset from a local CSV export.
# NOTE(review): hard-coded absolute Windows path -- adjust for other machines.
attrition=pd.read_csv(r"C:\Users\Administrator\Desktop\hr\HR_Attrition_analysis-master\data\WA_Fn-UseC_-HR-Employee-Attrition.csv")
attrition.head()  # notebook-style preview; no effect when run as a script

# Inspect dimensions and column dtypes.
print(attrition.shape)
attrition.dtypes

# Bar chart of the class balance of the target variable ("Attrition").
# Fix: the original called sns.categorical.barplot -- a private submodule
# attribute -- with positional x/y arguments; both are unsupported in
# modern seaborn.  Use the public sns.barplot with keyword arguments.
fig = plt.figure(figsize=(5, 5))
labels = ["No", "Yes"]
freqs = np.array(attrition.Attrition.value_counts(normalize=True))
ax = sns.barplot(x=labels, y=freqs, saturation=1)
ax.set_xticklabels(labels)
ax.set_title("Attrition")
ax.set_xlabel("")
ax.set_ylabel("Frequency")
ax.set_ylim([0, 1])
plt.show()

# Encode the target as 0/1 for the correlation analysis below.
# Fix: `attrition.Attrition_numeric = ...` sets an *attribute* on the
# DataFrame object, not a column (pandas warns and the assignment is
# silently lost); the original only worked because the subsequent .loc
# assignments happened to create the column.  Create it explicitly.
attrition['Attrition_numeric'] = (attrition['Attrition'] == 'Yes').astype(int)
attrition.describe()

# Per-feature distribution split by the target: stacked bar charts for
# categorical columns, stacked histograms for numeric ones.
fig = plt.figure(figsize=(10, 60))
cols = 3
target_column = "Attrition"
# Fixes: np.ceil(...) produced a float row count, which modern matplotlib
# rejects in add_subplot; np.object was removed in NumPy 1.24 -- compare
# against the builtin `object` instead.
rows = int(np.ceil(attrition.shape[1] / cols))
for i, column in enumerate(attrition.columns):
    if target_column == column:
        continue
    ax = fig.add_subplot(rows, cols, i + 1)
    ax.set_title(column)
    if attrition.dtypes[column] == object:
        # Categorical: counts of (Attrition, level) pairs, stacked by class.
        cts = attrition[[target_column, column]]
        cts = cts.groupby([target_column, column]).size()
        cts.unstack().T.plot(kind="bar", ax=ax, stacked=True, alpha=1)
    else:
        # Numeric: one histogram per Attrition class, stacked.
        cts = attrition[[target_column, column]]
        cts.groupby(target_column)[column].plot(
            bins=16,
            kind="hist",
            stacked=True,
            alpha=1,
            legend=True,
            ax=ax,
        )
plt.tight_layout()

# ---- correlation analysis ----

# Correlation heatmap over the numeric columns.
# numeric_only=True keeps the old pandas behavior of silently skipping
# string columns (omitting it is an error in pandas 2.x).
f, ax = plt.subplots(figsize=(35, 15))
sns.heatmap(attrition.corr(numeric_only=True), annot=True)

# Horizontal bar chart of each feature's correlation with attrition.
# Fix: DataFrame.ix was removed in pandas 1.0 -- use .loc instead.
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(1, 1, 1)
corr = attrition.corr(numeric_only=True).loc["Attrition_numeric"]
# Drop the target itself and the two constant columns before plotting.
corr = corr.drop(["Attrition_numeric", "StandardHours", "EmployeeCount"])
ax = corr.sort_values().plot(kind="barh", figsize=(6, 8), ax=ax)
ax.tick_params(axis='y', which='major', labelsize=12)
ax.set_title("Attrititon Corelation")
plt.tight_layout()

# Keep the original feature set (this also discards the helper
# Attrition_numeric column) with the target as the first column.
attrition = attrition[['Attrition',
                   'Age',
                   'BusinessTravel',
                   'DailyRate',
                   'Department',
                   'DistanceFromHome',
                   'Education',
                   'EducationField',
                   'EmployeeCount',
                   'EmployeeNumber',
                   'EnvironmentSatisfaction',
                   'Gender',
                   'HourlyRate',
                   'JobInvolvement',
                   'JobLevel',
                   'JobRole',
                   'JobSatisfaction',
                   'MaritalStatus',
                   'MonthlyIncome',
                   'MonthlyRate',
                   'NumCompaniesWorked',
                   'Over18',
                   'OverTime',
                   'PercentSalaryHike',
                   'PerformanceRating',
                   'RelationshipSatisfaction',
                   'StandardHours',
                   'StockOptionLevel',
                   'TotalWorkingYears',
                   'TrainingTimesLastYear',
                   'WorkLifeBalance',
                   'YearsAtCompany',
                   'YearsInCurrentRole',
                   'YearsSinceLastPromotion',
                   'YearsWithCurrManager']]
# Drop constant, identifier, and near-constant columns in one pass
# (the original chained five separate .drop calls).
dataset = attrition.drop(
    ['EmployeeCount', 'StandardHours', 'Over18', 'EmployeeNumber', 'PerformanceRating'],
    axis=1,
)
dataset.head()
# Target is the first column; everything else is a feature.
X = dataset.iloc[:, 1:]
y = dataset.iloc[:, 0]
X.head()

y.head()

from sklearn.preprocessing import LabelEncoder


def lbl_enc(var_x):
    """Label-encode column *var_x* of the global DataFrame ``y`` in place.

    Cleanup: the original shadowed the function's own name with a local
    ``lbl_enc`` variable (harmless but confusing), kept a pointless
    ``var_x_lbl = var_x`` alias, and called ``fit`` + ``transform``
    separately where ``fit_transform`` is equivalent.
    """
    encoder = LabelEncoder()
    y[var_x] = encoder.fit_transform(y[var_x])


# y is a Series at this point; promote it to a one-column DataFrame so
# the encoder helper can index it by column name.
y = pd.DataFrame(y)
lbl_enc('Attrition')
# One-hot encode every categorical feature column.
X = pd.get_dummies(X)
X.shape
X.head()  # NOTE: dummy encoding inflates the feature dimension considerably

from sklearn.model_selection import train_test_split
# Hold out 20% for testing.  NOTE(review): no random_state / stratify is
# passed, so the split (and all scores below) differ on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
from sklearn.preprocessing import StandardScaler
# Standardize features: fit on the training set only, then apply the same
# transform to the test set (avoids test-set leakage).
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# NOTE(review): dropout/epochs/batch_size/optimizer/k look like leftovers
# from a neural-network experiment -- they are never used in this script.
dropout = 0.1
epochs = 100
batch_size = 30
optimizer = 'adam'
k = 20
seed = 0  # used below as the random forest's random_state
# Random-forest hyper-parameters.
# Fix: the original dict listed 'max_features' twice (0.3, then 'sqrt');
# Python keeps only the last duplicate key, so 0.3 was silently ignored.
parameter = {
    'n_jobs': -1,
    'n_estimators': 800,
    'warm_start': True,
    'max_depth': 9,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'random_state': seed,
    'verbose': 0
}
rf = RandomForestClassifier(**parameter)
# .values.ravel() flattens the one-column DataFrame into the 1-D array
# sklearn expects (avoids a DataConversionWarning).
rf = rf.fit(X_train, y_train.values.ravel())
print("Fitting of Random Forest as finished")
rf_predictions = rf.predict(X_test)
print("Predictions finished")
accuracy_score(y_test, rf_predictions)  # notebook output: 0.8503401360544217
from sklearn.metrics import f1_score
f1_score(y_test, rf_predictions)  # notebook output: 0.33333333333333337
importances = rf.feature_importances_
importances


import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

# Interactive scatter of random-forest feature importances: one marker
# per feature, colour-mapped to the importance value.
feat_names = X.columns.values
feat_importance = rf.feature_importances_

marker_style = dict(
    sizemode='diameter',
    sizeref=1,
    size=6,
    color=feat_importance,
    colorscale='Portland',
    showscale=True,
)
trace = go.Scatter(
    y=feat_importance,
    x=feat_names,
    mode='markers',
    marker=marker_style,
    text=feat_names,
)
data = [trace]

x_axis = dict(
    ticklen=5,
    showgrid=False,
    zeroline=False,
    showline=False,
)
y_axis = dict(
    title='Feature Importance',
    showgrid=False,
    zeroline=False,
    ticklen=5,
    gridwidth=2,
)
layout = go.Layout(
    autosize=True,
    title='Random Forest Feature Importance',
    hovermode='closest',
    xaxis=x_axis,
    yaxis=y_axis,
    showlegend=False,
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='scatter2010')

from sklearn import tree
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image, ImageDraw, ImageFont
import re
import pydotplus

# Train a shallow decision tree purely for a human-readable visualization.
decision_tree = tree.DecisionTreeClassifier(max_depth=4)
# .values.ravel() gives sklearn the 1-D target it expects.
decision_tree.fit(X_train, y_train.values.ravel())

y_pred = decision_tree.predict(X_test)

# Export the tree to Graphviz .dot and render it to PNG.
# Fix: the original rebound the open file handle `f` to the return value
# of export_graphviz (None when out_file is given) inside the `with`
# block -- pointless and confusing; just call the function.
with open("tree1.dot", 'w') as f:
    tree.export_graphviz(decision_tree,
                         out_file=f,
                         max_depth=4,
                         impurity=False,
                         feature_names=X.columns.values,
                         class_names=['No', 'Yes'],
                         rounded=True,
                         filled=True)
# Requires the Graphviz `dot` binary on PATH.
check_call(['dot', '-Tpng', 'tree1.dot', '-o', 'tree1.png'])
img = Image.open("tree1.png")
draw = ImageDraw.Draw(img)
img.save('sample-out.png')
PImage("sample-out.png")  # display the rendered tree in the notebook

# Stochastic-gradient-descent linear classifier (logistic loss).
# Fixes: `train_data_df`/`test_data_df` are never defined anywhere in
# this script (NameError at runtime) -- the scaled X_train/X_test were
# clearly meant; the model was fitted twice redundantly; `n_iter` was
# renamed `max_iter` in scikit-learn 0.21 and `loss='log'` was renamed
# `'log_loss'` in 1.1 (removed in 1.3).
sgd_params = {'alpha': 0.0001,
              'class_weight': None,
              'l1_ratio': 1,
              'loss': 'log_loss',
              'max_iter': 908,
              'penalty': 'elasticnet',
              'random_state': 1,
              'shuffle': True}
clf = SGDClassifier(**sgd_params)
sgd = clf.fit(X_train, y_train.values.ravel())
print("Fitting of SGDClassifier as finished")
print("Accuracy: {}".format(clf.score(X_test, y_test)))
# notebook output: Accuracy: 0.8333333333333334
sgd_pred = clf.predict(X_test)
f1_score(y_test, sgd_pred)  # notebook output: 0.3098591549295775
















  • 3
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值