Person-Job Precise Matching Model
1. Competition and Data Overview
- Task: identify the applicants whose resumes match each recruiting position. Predictions are evaluated with f1_score.
- The competition provides six tables: basic applicant information (person.csv), job intentions (person_cv.csv), work history (person_job_hist.csv), professional certificates (person_pro_cert.csv), project experience (person_project.csv), and position information (recruit.csv). Details are shown in the figure below.
The recruit_folder table contains 70,774 rows.
Together with the application-record table, the files are: person.csv, person_cv.csv, person_job_hist.csv, person_project.csv, person_pro_cert.csv, recruit.csv, and recruit_folder.csv.
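A minimal loading sketch, assuming all CSVs sit in a local data/ directory (the path is an assumption, not from the original):

import pandas as pd

# Hypothetical data/ directory -- adjust to wherever the competition files live.
tables = ['person', 'person_cv', 'person_job_hist', 'person_project',
          'person_pro_cert', 'recruit', 'recruit_folder']
dfs = {name: pd.read_csv('data/%s.csv' % name) for name in tables}
print(dfs['recruit_folder'].shape)  # 70,774 rows, per the count above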
2. Derived Variables
2.1 Feature Derivation Ideas
2.1.1 person_cv table
- Length of the self-introduction: SELF_COMMENT_COUNT
2.1.2 person_job_hist table
- Length of the achievement text: ACHIEVEMENT_COUNT
- Per person: number of work experiences, mean achievement length, and total achievement length: POSITION_COUNT, ACHIEVEMENT_MEAN, ACHIEVEMENT_SUM (see the sketch below).
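A minimal sketch of these per-person aggregates, assuming df_person_job_hist carries PERSON_ID, POSITION, and a per-row ACHIEVEMENT_COUNT column (the column names are assumptions based on the feature names above):

# Per-person aggregates over the work-history table.
df_person_job_hist_info = (
    df_person_job_hist.groupby('PERSON_ID')
    .aggregate(POSITION_COUNT=('POSITION', 'count'),
               ACHIEVEMENT_MEAN=('ACHIEVEMENT_COUNT', 'mean'),
               ACHIEVEMENT_SUM=('ACHIEVEMENT_COUNT', 'sum'))
    .reset_index()
)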
2.1.3 person_project table
- Number of project experiences per person: PROJECT_NAME_COUNT
2.1.4 person_pro_cert table
- Number of professional certificates per person: PRO_CERT_DSP_COUNT
2.1.5 recruit table
- Length of the job-requirement text: DETAIL_COUNT
- Average application success rate per person (mean LABEL over their applications): AVERAGE_LABEL_OF_PERSON (computed out-of-fold with k-fold to avoid label leakage; see Section 5).
- Average success rate per position: AVERAGE_LABEL_OF_RECRUIT (same k-fold scheme).
2.1.6 recruit_folder table
- Number of positions each person applied to: RECRUIT_COUNT
- Number of applicants per position: PERSON_COUNT (see the sketch below).
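Both counts come straight out of recruit_folder. A sketch, assuming df_recruit_folder is the loaded recruit_folder.csv; the variable names here deliberately match the merge code in Section 5, though that mapping is an assumption:

# Applications per person and applicants per position.
train_test_pi_info = (df_recruit_folder.groupby('PERSON_ID').size()
                      .reset_index(name='RECRUIT_COUNT'))
train_test_ri_info = (df_recruit_folder.groupby('RECRUIT_ID').size()
                      .reset_index(name='PERSON_COUNT'))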
2.1.7 Raw variables
- Applicant's major, encoded: MAJOR_PERSON_ENCODER
- Major required by the position, encoded: MAJOR_ENCODER
- Most recent industry, encoded: LAST_INDUSTRY_ENCODER
- Position, encoded: POSITION_ENCODER
- Most recent position, encoded: LAST_POSITION_ENCODER
- Recruiting job title, encoded: JOB_TITLE_ENCODER (see the encoding sketch after this list)
- Gender: GENDER
- Years of work experience: WORK_YEARS
- Earliest availability, in days: AVAILABLE_IN_DAYS
- Highest education level attained: HIGHEST_EDU
- Minimum education level required by the position: LOWER_EDU
- Age: AGE
- Whether the applicant and the position are in the same location: if_in_same_place
- Employee type: PERSON_TYPE_CODE
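The *_ENCODER variables are integer codes for categorical columns. The original does not say how they were produced; one common way, sketched with sklearn's LabelEncoder on the merged table df_all_info from Section 5 (the raw column names are assumptions):

from sklearn.preprocessing import LabelEncoder

# Hypothetical raw column names; each encoded result feeds a *_ENCODER feature.
for raw_col, enc_col in [('MAJOR_PERSON', 'MAJOR_PERSON_ENCODER'),
                         ('MAJOR_RECRUIT', 'MAJOR_ENCODER'),
                         ('JOB_TITLE', 'JOB_TITLE_ENCODER')]:
    le = LabelEncoder()
    # astype(str) maps missing values to the string 'nan' so they get a code too
    df_all_info[enc_col] = le.fit_transform(df_all_info[raw_col].astype(str))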
2.2 Selected Features
selected_features = [
    'RECRUIT_ID', 'PERSON_ID', 'LABEL', 'GENDER', 'WORK_YEARS', 'HIGHEST_EDU',
    'AGE', 'AVAILABLE_IN_DAYS', 'SELF_COMMENT_COUNT', 'PERSON_TYPE_CODE',
    'LOWER_EDU', 'WORK_YEARS_RANGE', 'DETAIL_COUNT', 'PROJECT_NAME_COUNT',
    'POSITION_COUNT', 'ACHIEVEMENT_MEAN', 'ACHIEVEMENT_SUM', 'PRO_CERT_DSP_COUNT',
    'RECRUIT_COUNT', 'PERSON_COUNT', 'if_in_same_place', 'MAJOR_PERSON_ENCODER',
    'MAJOR_ENCODER', 'JOB_TITLE_ENCODER', 'AVERAGE_LABEL_OF_PERSON',
    'AVERAGE_LABEL_OF_RECRUIT'
]
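Splitting these into model input and target might look like the following sketch, assuming df_all_info is the merged table built in Section 5:

# IDs and LABEL are kept for bookkeeping but excluded from the model input.
feature_cols = [c for c in selected_features
                if c not in ('RECRUIT_ID', 'PERSON_ID', 'LABEL')]
x_train = df_all_info[feature_cols]
y_train = df_all_info['LABEL']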
3. Model Training
import lightgbm
from sklearn import metrics

# Train a binary LightGBM model; bagging_fraction/feature_fraction subsample
# rows and columns for mild regularization.
lgbm = lightgbm.train(
    params={"objective": "binary", "learning_rate": 0.03, "max_depth": 6,
            "num_leaves": 32, "verbose": -1,
            "bagging_fraction": 0.8, "feature_fraction": 0.8},
    train_set=lightgbm.Dataset(x_train, label=y_train),
    num_boost_round=500,
)
y_test["预测打分"] = lgbm.predict(x_test)  # predicted score
y_test = y_test.sort_values("预测打分", ascending=False, ignore_index=True)
y_test["预测"] = 0  # predicted label
# Positive samples make up ~15% of the data, so mark the top 15% by score as positive.
y_test.loc[:int(0.15 * len(y_test)), "预测"] = 1
print('f1_score is %s' % metrics.f1_score(y_test['LABEL'], y_test['预测']))
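Hardcoding 0.15 mirrors the training positive rate; deriving the cutoff from the data keeps the two in sync automatically (a sketch, not the author's original code):

# Use the observed positive rate instead of a hardcoded 0.15.
pos_rate = y_train.mean()
y_test.loc[:int(pos_rate * len(y_test)), "预测"] = 1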
4. Model Performance
- Validation-set f1_score: 0.8540
- Online submission score: 0.8559
The online test result is shown in the figure below.
5. Challenges
1. How to obtain each person's successful-match statistics without label leakage
# Merge all information. df_1 is the fold being labeled; df_2 supplies the
# out-of-fold label statistics. df_person, df_person_cv, df_recruit, the
# *_info tables and train_test_*_info are the pre-built feature tables.
def all_info(df_1, df_2):
    df_2_PERSON_LABEL = df_2.groupby('PERSON_ID').aggregate({'LABEL': 'mean'}).reset_index()
    df_2_PERSON_LABEL.columns = ['PERSON_ID', 'AVERAGE_LABEL_OF_PERSON']
    df_2_RECRUIT_LABEL = df_2.groupby('RECRUIT_ID').aggregate({'LABEL': 'mean'}).reset_index()
    df_2_RECRUIT_LABEL.columns = ['RECRUIT_ID', 'AVERAGE_LABEL_OF_RECRUIT']
    df_1 = pd.merge(df_1, df_person, on='PERSON_ID', how='left')
    df_1 = pd.merge(df_1, df_person_cv, on='PERSON_ID', how='left')
    df_1 = pd.merge(df_1, df_recruit, on='RECRUIT_ID', how='left')
    df_1 = pd.merge(df_1, df_person_project_info, on='PERSON_ID', how='left')
    df_1 = pd.merge(df_1, df_person_job_hist_info, on='PERSON_ID', how='left')
    df_1 = pd.merge(df_1, df_person_pro_cert_info, on='PERSON_ID', how='left')
    df_1 = pd.merge(df_1, train_test_pi_info, on='PERSON_ID', how='left')
    df_1 = pd.merge(df_1, train_test_ri_info, on='RECRUIT_ID', how='left')
    df_1 = pd.merge(df_1, df_2_PERSON_LABEL, on='PERSON_ID', how='left')
    df_1 = pd.merge(df_1, df_2_RECRUIT_LABEL, on='RECRUIT_ID', how='left')
    df_1['if_in_same_place'] = (df_1['LOCATION_PERSON_CV'] == df_1['LOCATION']).astype('float')
    return df_1
# df_all_info is the full training table. Each fold's label statistics are
# computed only from the other folds, so the encoding never sees its own labels.
fold = 4
df_all_info = pd.DataFrame()
for i in range(fold):
    df_1 = train.loc[train.index % fold == i].reset_index(drop=True)
    df_2 = train.loc[train.index % fold != i].reset_index(drop=True)
    df_all_info_1 = all_info(df_1, df_2)
    df_all_info = pd.concat([df_all_info, df_all_info_1], ignore_index=True)
print(df_all_info.columns)
2. NLP: processing the text fields
Load the stop words:
# Read one stop word per line; the context manager closes the file afterwards.
stop_word_list = []
with open('D:\\proj\\sodic_2021\\stop_words_theodore.txt', 'r', encoding='utf-8') as stop_word:
    for line in stop_word.readlines():
        stop_word_list.append(line.strip())
print(stop_word_list[0:10])
Build a vocabulary of majors, so the similarity between major_person and major_recruit can be computed:
import jieba
# Load the custom dictionary
jieba.load_userdict('D:\\proj\\sodic_2021\\job_title.txt')
major_list = []
# Tokenize the recruit-side majors first, then the person-side majors.
for col in ['MAJOR_RECRUIT', 'MAJOR_PERSON']:
    for i in range(len(df_all_info)):
        # Skip missing values (pd.isna is more reliable than `is np.NAN`)
        if pd.isna(df_all_info.loc[i, col]):
            continue
        major = jieba.lcut(df_all_info.loc[i, col])
        # Drop stop words; filtering with a comprehension avoids the bug of
        # removing items from a list while iterating over it
        major = [w for w in major if w not in stop_word_list]
        major_list.append(major)
print(major_list[0], major_list[7])
Compute the similarity between major_recruit and major_person:
from gensim.models import Word2Vec
# Train a skip-gram Word2Vec model (sg=1) on the tokenized majors
model = Word2Vec(major_list, min_count=1, sg=1, window=5)
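A quick sanity check on the trained vectors. The query tokens below are hypothetical; substitute words that actually occur in major_list:

# Hypothetical tokens -- replace with words present in the model's vocabulary.
print(model.wv.similarity('计算机', '软件'))
print(model.wv.most_similar('计算机', topn=5))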
Generate simi_major, computed as the mean pairwise similarity between the words of the two majors. A missing MAJOR_RECRUIT is treated as "no requirement" (similarity 1), a missing MAJOR_PERSON as no match (similarity 0):
for i in range(len(df_all_info)):
    if pd.isna(df_all_info.loc[i, 'MAJOR_RECRUIT']):
        # No required major: treat as a perfect match
        df_all_info.loc[i, 'simi_major'] = 1
    elif pd.isna(df_all_info.loc[i, 'MAJOR_PERSON']):
        df_all_info.loc[i, 'simi_major'] = 0
    else:
        # Tokenize and drop stop words without mutating the list in-place
        major_person_simi = [w for w in jieba.lcut(df_all_info.loc[i, 'MAJOR_PERSON'])
                             if w not in stop_word_list]
        major_recruit_simi = [w for w in jieba.lcut(df_all_info.loc[i, 'MAJOR_RECRUIT'])
                              if w not in stop_word_list]
        if len(major_recruit_simi) == 0:
            df_all_info.loc[i, 'simi_major'] = 1
        elif len(major_person_simi) == 0:
            df_all_info.loc[i, 'simi_major'] = 0
        else:
            simi = 0
            for l in major_person_simi:
                for m in major_recruit_simi:
                    simi = simi + model.wv.similarity(l, m)
            df_all_info.loc[i, 'simi_major'] = simi / (len(major_person_simi) * len(major_recruit_simi))
print(df_all_info['simi_major'].value_counts())
Generate simi_major_max, computed as the maximum pairwise word similarity instead of the mean:
for i in range(len(df_all_info)):
    if pd.isna(df_all_info.loc[i, 'MAJOR_RECRUIT']):
        df_all_info.loc[i, 'simi_major_max'] = 1
    elif pd.isna(df_all_info.loc[i, 'MAJOR_PERSON']):
        df_all_info.loc[i, 'simi_major_max'] = 0
    else:
        major_person_simi = [w for w in jieba.lcut(df_all_info.loc[i, 'MAJOR_PERSON'])
                             if w not in stop_word_list]
        major_recruit_simi = [w for w in jieba.lcut(df_all_info.loc[i, 'MAJOR_RECRUIT'])
                              if w not in stop_word_list]
        if len(major_recruit_simi) == 0:
            df_all_info.loc[i, 'simi_major_max'] = 1
        elif len(major_person_simi) == 0:
            df_all_info.loc[i, 'simi_major_max'] = 0
        else:
            # Keep the best pairwise similarity across the two word sets
            simi = 0
            for l in major_person_simi:
                for m in major_recruit_simi:
                    simi = max(simi, model.wv.similarity(l, m))
            df_all_info.loc[i, 'simi_major_max'] = simi
print(df_all_info['simi_major_max'].value_counts())
Tokenize the SPECILTY (specialty) field and compute its TF-IDF values:
from sklearn.feature_extraction.text import TfidfVectorizer

df_all_info['SPECILTY_JIEBA'] = 0
jieba.load_userdict('D:\\proj\\sodic_2021\\job_title.txt')
for i in range(len(df_all_info)):
    # Skip missing specialties; they get a placeholder token below
    if pd.isna(df_all_info.loc[i, 'SPECILTY']):
        continue
    # Join the tokens with spaces so TfidfVectorizer can split them again
    word_cut = ''
    for j in jieba.cut(df_all_info.loc[i, 'SPECILTY']):
        word_cut = word_cut + ' ' + j.upper()
    df_all_info.loc[i, 'SPECILTY_JIEBA'] = word_cut
Keep the words that satisfy the document-frequency thresholds and compute their TF-IDF weights:
# Rows without a specialty were left as 0; replace with a placeholder token
df_all_info['SPECILTY_JIEBA'] = df_all_info['SPECILTY_JIEBA'].replace(0, 'None')
df_all_info_specilty_jieba_list = df_all_info['SPECILTY_JIEBA'].tolist()
# min_df/max_df are document-frequency bounds: keep words appearing in
# at least 1% and at most 90% of the documents
tfidf = TfidfVectorizer(min_df=0.01, max_df=0.9, stop_words=stop_word_list)
tf_matrix = tfidf.fit_transform(df_all_info_specilty_jieba_list)
word = tfidf.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
weight = tf_matrix.toarray()
print(weight)
weight_pd = pd.DataFrame(weight, columns=word)
print(weight_pd.columns)
print(weight_pd.shape)
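To use these weights as model features, the TF-IDF columns can be joined back onto df_all_info. A sketch (the TFIDF_ prefix is an assumption, added to avoid column-name collisions):

# Row order of weight_pd matches df_all_info, so a positional concat is safe.
weight_pd.columns = ['TFIDF_' + c for c in weight_pd.columns]
df_all_info = pd.concat([df_all_info.reset_index(drop=True), weight_pd], axis=1)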