1. Scrape the resume
import requests
import lxml
from bs4 import BeautifulSoup
import pandas as pd
url = 'http://cv.qiaobutang.com/lp/53994d870cf2dda0896a66b1'   # resume URL
url_wangye = requests.get(url)   # fetch the page source
url_html = BeautifulSoup(url_wangye.text, 'lxml')   # parse the source into an HTML tree
jianli = url_html.find('div', class_="resume").text.strip()   # extract the resume text
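requests.get can fail or mis-detect the page encoding; a slightly more defensive fetch (a minimal sketch, keeping the same div.resume selector, which is taken from the code above and may break if the site's layout changes) could look like this:

resp = requests.get(url, timeout=10)
resp.raise_for_status()                  # raise on HTTP errors instead of parsing an error page
resp.encoding = resp.apparent_encoding   # guard against a mis-detected charset
soup = BeautifulSoup(resp.text, 'lxml')
node = soup.find('div', class_='resume')
jianli = node.text.strip() if node else ''   # avoid AttributeError if the div is missing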
Save locally
# Convert to a DataFrame so it can be written out
b = []
b.append(jianli)
a = pd.DataFrame()
a['col'] = b
a.to_csv('D:/test/jianli.csv', encoding='gb18030')   # gb18030 handles the Chinese text
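A quick round-trip check (a sketch; the path mirrors the one used above) confirms the resume survived the gb18030 encoding:

check = pd.read_csv('D:/test/jianli.csv', encoding='gb18030')
print(check['col'][0][:100])   # first 100 characters of the stored resume text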
2. Content-based recommendation
import pandas as pd
import jieba
from gensim import corpora, models, similarities
from wordcloud import STOPWORDS   # note: an English stopword list, so it only filters English tokens
df = pd.read_csv('jobResult.csv', encoding='gb18030')    # job postings scraped earlier
jianli = pd.read_csv('jianli.csv', encoding='gb18030')   # the resume saved in step 1
2.1 Tokenization
# Tokenize each job description to build the corpus
texts = []
for line in df.description:
    seg = jieba.lcut(line)
    seg_clean = []
    for word in seg:
        if word == '\n' or len(word) < 2:   # drop newlines and single characters
            continue
        elif word in STOPWORDS:             # drop stopwords
            continue
        else:
            seg_clean.append(word)
    texts.append(seg_clean)
# Tokenize the resume the same way
jianli_split = []
for line in jianli.col:
    seg = jieba.lcut(line)
    seg_clean = []
    for word in seg:
        if word == '\n' or len(word) < 2:
            continue
        elif word in STOPWORDS:
            continue
        else:
            seg_clean.append(word)
    jianli_split.append(seg_clean)
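The two loops above repeat the same cleaning rules; the same result can be had with one helper (a sketch, equivalent in behavior to the loops):

def clean_cut(line):
    # jieba tokens, minus newlines, single characters, and (English) stopwords
    return [w for w in jieba.lcut(line)
            if w != '\n' and len(w) >= 2 and w not in STOPWORDS]

texts = [clean_cut(line) for line in df.description]
jianli_split = [clean_cut(line) for line in jianli.col]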
2.2 Map the tokens to vectors
dic = corpora.Dictionary(texts)   # build the dictionary with corpora
corpus = [dic.doc2bow(text) for text in texts]   # map the corpus to count vectors with doc2bow
jianli_target = [dic.doc2bow(j) for j in jianli_split]   # map the resume to a count vector
jianli_target = jianli_target[0]   # unwrap the outer list: there is only one resume
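To see what doc2bow actually produces, a toy example with hypothetical tokens (the exact id assignment is an implementation detail of gensim):

toy_dic = corpora.Dictionary([['python', 'data'], ['python', 'web']])
print(toy_dic.token2id)   # token -> id mapping, e.g. {'data': 0, 'python': 1, 'web': 2}
print(toy_dic.doc2bow(['python', 'python', 'data']))   # sparse counts, e.g. [(0, 1), (1, 2)]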
2.3 Convert count vectors to feature vectors
# Initialise the TF-IDF model, which turns count vectors into TF-IDF feature vectors
tfidf = models.TfidfModel(corpus)
# Convert the corpus to TF-IDF feature vectors
corpus_tfidf = tfidf[corpus]
# Convert the resume to a TF-IDF feature vector
jianli_target_tfidf = tfidf[jianli_target]
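Each TF-IDF vector is a sparse list of (token_id, weight) pairs; the resume's heaviest terms can be inspected like this (a sketch):

for token_id, weight in sorted(jianli_target_tfidf, key=lambda x: -x[1])[:5]:
    print(dic[token_id], round(weight, 4))   # top 5 resume terms by TF-IDF weight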
2.4 Compute similarities
raw_matrix = similarities.MatrixSimilarity(corpus_tfidf)   # build the similarity index
simility = raw_matrix[jianli_target_tfidf]   # cosine similarity of the resume against every posting
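simility is a NumPy array with one cosine score per job posting; a quick sanity check (sketch):

print(len(simility) == len(df))   # one score per posting -> True
print(float(simility.max()))      # best match; in [0, 1] for non-negative TF-IDF vectors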
2.5 Output the most similar results
simility_list = list(enumerate(simility))   # attach row indices so postings can be looked up by row
simility_list_sorted = sorted(simility_list, key=lambda x: x[1], reverse=True)   # sort by similarity
for i in simility_list_sorted[:10]:
    print(df.loc[i[0], ['company', 'title']])
    print('----------')
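If the top matches are needed later, they can be written out with their scores (a sketch; top10.csv is a hypothetical file name):

top10 = pd.DataFrame(
    [(df.loc[idx, 'company'], df.loc[idx, 'title'], float(score))
     for idx, score in simility_list_sorted[:10]],
    columns=['company', 'title', 'similarity'])
top10.to_csv('top10.csv', encoding='gb18030', index=False)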
Approach