import numpy as np
import pandas as pd
from subprocess import check_output
import plotly.offline as py
py.init_notebook_mode(connected=True)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import f1_score
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, log_loss
# Load the IBM HR employee-attrition dataset and take a first look.
csv_path = r"C:\Users\Administrator\Desktop\hr\HR_Attrition_analysis-master\data\WA_Fn-UseC_-HR-Employee-Attrition.csv"
attrition = pd.read_csv(csv_path)
attrition.head()
print(attrition.shape)
attrition.dtypes
# Bar plot of the attrition class balance ("No" vs "Yes").
fig = plt.figure(figsize=(5, 5))
y = ["No", "Yes"]
# FIX: `sns.categorical.barplot` reaches into a private seaborn module, and
# positional x/y arguments were removed in modern seaborn.  Use the public
# API with keyword arguments instead.
ax = sns.barplot(
    x=y,
    y=np.array(attrition.Attrition.value_counts(normalize=True)),
    saturation=1,
)
ax.set_xticklabels(y)
ax.set_title("Attrition")
ax.set_xlabel("")
ax.set_ylabel("Frequency")
ax.set_ylim([0, 1])
plt.show()
# Add a numeric 0/1 version of the target for correlation analysis.
# BUG FIX: `attrition.Attrition_numeric = attrition.Attrition` creates a plain
# instance *attribute*, not a DataFrame column (pandas warns about this); the
# original only got a real column as a side effect of the later `.loc`
# enlargement, with a messy mixed dtype.  Map the labels explicitly instead,
# which also yields a proper integer column.
attrition['Attrition_numeric'] = attrition['Attrition'].map({'Yes': 1, 'No': 0})
attrition.describe()
# One subplot per feature: stacked bars for categorical columns, stacked
# histograms for numeric columns, each split by the Attrition target.
fig = plt.figure(figsize=(10, 60))
cols = 3
target_column = "Attrition"
# FIX: add_subplot requires integer row/column counts; np.ceil returns a float.
rows = int(np.ceil(attrition.shape[1] / float(cols)))
for i, column in enumerate(attrition.columns):
    if target_column == column:
        continue
    ax = fig.add_subplot(rows, cols, i + 1)
    ax.set_title(column)
    # FIX: np.object was removed in NumPy 1.24; the builtin `object` is the
    # documented replacement for dtype comparisons.
    if attrition.dtypes[column] == object:
        # Categorical: count (target, level) pairs, plot as stacked bars.
        cts = attrition[[target_column, column]]
        cts = cts.groupby([target_column, column]).size()
        cts.unstack().T.plot(kind="bar", ax=ax, stacked=True, alpha=1)
    else:
        # Numeric: overlayed/stacked histograms per target class.
        cts = attrition[[target_column, column]]
        cts.groupby(target_column)[column].plot(
            bins=16,
            kind="hist",
            stacked=True,
            alpha=1,
            legend=True,
            ax=ax,
        )
plt.tight_layout()
![](https://i-blog.csdnimg.cn/blog_migrate/eaf0b78d8e1dd07dbb71fcb3eb8bc228.png)
(The per-feature distribution plots are rendered above.)
# Correlation heatmap over the numeric features.
f, ax = plt.subplots(figsize=(35, 15))
# numeric_only=True: pandas >= 2.0 raises a TypeError on non-numeric columns
# otherwise (parameter available since pandas 1.5).
sns.heatmap(attrition.corr(numeric_only=True), annot=True)

# Horizontal bar chart of each feature's correlation with the numeric target.
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(1, 1, 1)
# FIX: `.ix` was removed from pandas; `.loc` is the label-based accessor.
# StandardHours / EmployeeCount are dropped because their correlations are
# not informative here (they were dropped by the original as well).
corr = attrition.corr(numeric_only=True)
ax = (
    corr.loc["Attrition_numeric"]
    .drop("Attrition_numeric")
    .drop("StandardHours")
    .drop("EmployeeCount")
    .sort_values()
    .plot(kind="barh", figsize=(6, 8), ax=ax)
)
ax.tick_params(axis='y', which='major', labelsize=12)
ax.set_title("Attrition Correlation")  # typo fixed: was "Attrititon Corelation"
plt.tight_layout()
# Keep the target column first, followed by the candidate feature columns.
# This also discards the helper Attrition_numeric column used for the
# correlation plots above.
selected_columns = [
    'Attrition',
    'Age',
    'BusinessTravel',
    'DailyRate',
    'Department',
    'DistanceFromHome',
    'Education',
    'EducationField',
    'EmployeeCount',
    'EmployeeNumber',
    'EnvironmentSatisfaction',
    'Gender',
    'HourlyRate',
    'JobInvolvement',
    'JobLevel',
    'JobRole',
    'JobSatisfaction',
    'MaritalStatus',
    'MonthlyIncome',
    'MonthlyRate',
    'NumCompaniesWorked',
    'Over18',
    'OverTime',
    'PercentSalaryHike',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StandardHours',
    'StockOptionLevel',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'WorkLifeBalance',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]
attrition = attrition[selected_columns]
# Drop the columns excluded from modelling in a single call instead of five
# chained DataFrame copies (identical result, less churn).
dataset = attrition.drop(
    ['EmployeeCount', 'StandardHours', 'Over18', 'EmployeeNumber',
     'PerformanceRating'],
    axis=1,
)
dataset.head()
![](https://i-blog.csdnimg.cn/blog_migrate/c49bb610e1713f8d0233c3b1da0b85b0.png)
# The first column of `dataset` is the target ("Attrition"); everything
# after it is a feature.
X, y = dataset.iloc[:, 1:], dataset.iloc[:, 0]
X.head()
y.head()
from sklearn.preprocessing import LabelEncoder


def lbl_enc(var_x):
    """Label-encode column *var_x* of the global DataFrame ``y`` in place.

    FIX: the original bound its LabelEncoder to a local named ``lbl_enc``,
    shadowing the function itself inside its own body, and introduced a
    pointless ``var_x_lbl`` alias.  Behaviour is unchanged.
    """
    encoder = LabelEncoder()
    encoder.fit(y[var_x])
    y[var_x] = encoder.transform(y[var_x])


# Wrap the target Series in a DataFrame so it can be indexed by column name,
# then encode 'Yes'/'No' to integers.
y = pd.DataFrame(y)
lbl_enc('Attrition')
# One-hot encode all categorical features (the dimensionality grows a lot
# after dummy encoding).
X = pd.get_dummies(X)
X.shape
X.head()

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing, then standardise the features.
# The scaler is fit on the training split only, to avoid leakage into test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Misc training configuration (some values are unused in the visible code).
dropout = 0.1
epochs = 100
batch_size = 30
optimizer = 'adam'
k = 20
seed = 0

# Random-forest hyper-parameters.
# BUG FIX: the original dict listed 'max_features' twice (0.3, then 'sqrt');
# Python silently keeps only the last duplicate key, so 'sqrt' was the
# effective value all along.  The duplicate is removed to make that explicit.
parameter = {
    'n_jobs': -1,
    'n_estimators': 800,
    'warm_start': True,
    'max_depth': 9,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'random_state': seed,
    'verbose': 0,
}
rf = RandomForestClassifier(**parameter)
# y_train is a single-column DataFrame; ravel it to a 1-D array so sklearn
# does not emit a DataConversionWarning (predictions are unchanged).
rf = rf.fit(X_train, y_train.values.ravel())
print("Fitting of Random Forest has finished")  # typo fixed: was "as finished"
rf_predictions = rf.predict(X_test)
print("Predictions finished")
accuracy_score(y_test, rf_predictions)
# notebook output: 0.8503401360544217
# f1_score is already imported at the top of the file, so the duplicate
# import is dropped.  F1 is checked in addition to accuracy because the
# Attrition classes are far from balanced (see the first bar plot).
f1_score(y_test, rf_predictions)
# notebook output: 0.33333333333333337

# Per-feature importances from the fitted random forest.
importances = rf.feature_importances_
importances
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

# Interactive scatter plot of per-feature importances, with the marker
# colour encoding the importance value.
importance_trace = go.Scatter(
    x=X.columns.values,
    y=rf.feature_importances_,
    mode='markers',
    text=X.columns.values,
    marker=dict(
        sizemode='diameter',
        sizeref=1,
        size=6,
        color=rf.feature_importances_,
        colorscale='Portland',
        showscale=True,
    ),
)

scatter_layout = go.Layout(
    autosize=True,
    title='Random Forest Feature Importance',
    hovermode='closest',
    showlegend=False,
    xaxis=dict(
        ticklen=5,
        showgrid=False,
        zeroline=False,
        showline=False,
    ),
    yaxis=dict(
        title='Feature Importance',
        showgrid=False,
        zeroline=False,
        ticklen=5,
        gridwidth=2,
    ),
)

fig = go.Figure(data=[importance_trace], layout=scatter_layout)
py.iplot(fig, filename='scatter2010')
from sklearn import tree
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image, ImageDraw, ImageFont
import re
import pydotplus

# Fit a shallow decision tree and render it to a PNG via graphviz.
decision_tree = tree.DecisionTreeClassifier(max_depth=4)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

# BUG FIX: the original wrote `f = tree.export_graphviz(...)`, rebinding the
# open file handle inside the `with` block.  export_graphviz returns None
# when out_file is given, so `f` was clobbered; call it for its side effect.
with open("tree1.dot", 'w') as f:
    tree.export_graphviz(decision_tree,
                         out_file=f,
                         max_depth=4,
                         impurity=False,
                         feature_names=X.columns.values,
                         class_names=['No', 'Yes'],
                         rounded=True,
                         filled=True)

# Requires the graphviz `dot` binary on PATH.
check_call(['dot', '-Tpng', 'tree1.dot', '-o', 'tree1.png'])

img = Image.open("tree1.png")
draw = ImageDraw.Draw(img)
img.save('sample-out.png')
PImage("sample-out.png")  # display the rendered tree in the notebook
# BUG FIXES:
# * The original fitted/scored on undefined names `train_data_df` /
#   `test_data_df`; the prepared matrices in this script are X_train / X_test.
# * SGDClassifier's `n_iter` parameter was renamed to `max_iter`
#   (deprecated in scikit-learn 0.19, removed in 0.21).
# * `clf.fit(...)` was called twice back to back; one call suffices.
# NOTE(review): newer scikit-learn renamed loss 'log' to 'log_loss' — adjust
# if this raises on your sklearn version.
sgd_params = {'alpha': 0.0001,
              'class_weight': None,
              'l1_ratio': 1,
              'loss': 'log',
              'max_iter': 908,
              'penalty': 'elasticnet',
              'random_state': 1,
              'shuffle': True}

clf = SGDClassifier(**sgd_params)
sgd = clf.fit(X_train, y_train.values.ravel())
print("Fitting of SGDClassifier has finished")  # typo fixed: was "as finished"
print("Accuracy: {}".format(clf.score(X_test, y_test)))
# notebook output: Accuracy: 0.8333333333333334

sgd_pred = clf.predict(X_test)
f1_score(y_test, sgd_pred)
# notebook output: 0.3098591549295775