import pandas as pd
# Load the training table; the path is resolved relative to the current
# working directory.
talent_data = pd.read_csv("./train.csv")

# Feature columns, kept in three named groups (num / cat / ord).
# NOTE(review): the grouping presumably reflects numeric, categorical and
# ordinal handling later on — confirm against the code that consumes them.
num_cols = [
    "Age",
    "MonthlyIncome",
    "NumCompaniesWorked",
    "PercentSalaryHike",
    "PerformanceRating",
    "StandardHours",
    "TotalWorkingYears",
    "YearsAtCompany",
    "YearsInCurrentRole",
    "YearsSinceLastPromotion",
]
cat_cols = [
    "Gender",
    "MaritalStatus",
    "OverTime",
]
ord_cols = [
    "DistanceFromHome",
    "Education",
    "EnvironmentSatisfaction",
    "JobInvolvement",
    "JobLevel",
    "JobSatisfaction",
    "RelationshipSatisfaction",
    "StockOptionLevel",
    "TrainingTimesLastYear",
    "WorkLifeBalance",
]
target_col = ["Attrition"]

# Column order of the working frame: numeric, ordinal, categorical, target.
total_data = num_cols + ord_cols + cat_cols
use_data = talent_data[total_data + target_col]

# Split rows by the target value: 0 -> negative class, 1 -> positive class.
neg_data = use_data.loc[use_data["Attrition"] == 0]
pos_data = use_data.loc[use_data["Attrition"] == 1]

# Report the class balance. The labels read "positive/negative sample
# ratio", "left" (Attrition == 1) and "not left" (Attrition == 0).
# NOTE(review): the ratio raises ZeroDivisionError when no row has
# Attrition == 0.
print("正负样本比例:", len(pos_data) / len(neg_data))
print("离职:", len(pos_data))
print("未离职:", len(neg_data))