Each results page on 前程无忧 (51job) shows only a limited number of postings, the ones ranked at the top are by no means the best, and you have to click through to see any details. As a result, a typical job seeker only ever sees a small fraction of the listings, cannot really filter out a satisfying position, and ends up mass-mailing resumes. After scraping a large amount of job data, I also found that a keyword search on 51job returns a great many completely unrelated postings; there is a lot of junk, and if you only browse casually it is easy to see nothing but junk. One way to cut the junk down is a quick keyword filter over the scraped data, sketched below.
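Here is a minimal sketch of such a filter. It assumes the scraper further down has already run and written its results to test.xlsx with a job column (which it does); the KEYWORDS list itself is an illustrative assumption, not part of the original script.

import pandas as pd

# A sketch of a relevance filter over the scraped results. 'test.xlsx' and the
# 'job' column match the scraper below; KEYWORDS is an illustrative
# assumption -- substitute the terms you actually searched for.
KEYWORDS = ['数据', '分析']

df = pd.read_excel('test.xlsx')
pattern = '|'.join(KEYWORDS)
relevant = df[df['job'].astype(str).str.contains(pattern, na=False)]
print('%d of %d postings look relevant' % (len(relevant), len(df)))
relevant.to_excel('filtered.xlsx', index=False)

The full scraper follows.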
import requests
from lxml import etree
import time
import random
import pandas as pd
import re
def get_detail(detail_url):
    # Fetch a single job posting and pull out the requirements paragraph.
    resp1 = requests.get(detail_url, headers=HEADERS)
    text1 = resp1.content.decode('gbk', errors='ignore')  # 51job pages are GBK-encoded
    html1 = etree.HTML(text1)
    ask = str(html1.xpath("/html/body/div[3]/div[2]/div[3]/div[1]/div/p/text()")).replace(' ', '')
    ask = re.sub(r"\['", '', ask)  # strip the list-literal brackets left by str()
    ask = re.sub(r"']", '', ask)
    return ask
def get_detail_urls(url, HEADERS):
    # Leftover helper from an earlier crawl of the movie site ygdy8.net;
    # it is not used by get_jobs() below (only by the commented-out loop).
    resp = requests.get(url, headers=HEADERS)
    text = resp.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)
    detail_url = html.xpath("//table[@class='tbspan']//a/@href")
    detail_url_1 = html.xpath("//table[@class='tbspan']//a/text()")
    detail_url_2 = []
    for i in range(len(detail_url)):
        detail_url_2.append('http://www.ygdy8.net' + detail_url[i])
    data = pd.DataFrame({'film_name': detail_url_1, 'detail_url': detail_url_2})
    data.to_csv('film.csv', index=False, sep=';', mode='a', header=False)
HEADERS = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
'''
# Driver loop for the ygdy8.net movie crawl above, left commented out:
for i in range(1, 100):
    url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html'.format(i)
    time.sleep(random.random())
    print('Page {}'.format(i))
    get_detail_urls(url, HEADERS)
'''
def get_jobs(HEADERS):
    job_ = []
    salary_ = []
    area_ = []
    company_ = []
    detail_url_ = []
    ask_ = []
    try:
        for page in range(1, 200):
            url = ('https://search.51job.com/list/090200,000000,0000,00,9,99,%2B,2,{}.html'
                   '?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
                   '&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0'
                   '&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00'
                   '&from=&welfare=').format(page)
            resp = requests.get(url, headers=HEADERS)
            text = resp.content.decode('gbk', errors='ignore')
            html = etree.HTML(text)
            print('Page %s' % page)
            for i in range(4, 54):  # rows 4-53 of #resultList hold the 50 postings per page
                job = html.xpath("//*[@id='resultList']/div[{}]/p/span/a/text()".format(i))[0].replace(' ', '')
                job = re.sub(r"\r\n", "", job)
                salary = html.xpath("//*[@id='resultList']/div[{}]/span[3]/text()".format(i))
                salary = salary[0] if salary else ''  # salary is sometimes missing ("negotiable")
                area = html.xpath("//*[@id='resultList']/div[{}]/span[2]/text()".format(i))[0]
                company = html.xpath("//*[@id='resultList']/div[{}]/span[1]/a/text()".format(i))[0].replace(' ', '')
                detail_url = str(html.xpath("//*[@id='resultList']/div[{}]/p/span/a/@href".format(i)))
                detail_url = re.sub(r"\['", '', detail_url)
                detail_url = re.sub(r"']", '', detail_url)
                ask = get_detail(detail_url)
                ask_.append(ask)
                job_.append(job)
                salary_.append(salary)
                area_.append(area)
                company_.append(company)
                detail_url_.append(detail_url)  # store the posting's own link, not the list-page url
    except Exception:
        pass  # stop quietly once a page or row no longer matches the expected layout
    print(job_, '\n', salary_, '\n', company_, '\n', detail_url_)
    print(ask_)
    return job_, salary_, area_, company_, detail_url_, ask_
job_, salary_, area_, company_, detail_url_, ask_ = get_jobs(HEADERS)
# 'area' was collected but never written out; include it in the output.
df1 = pd.DataFrame({'job': job_, 'salary': salary_, 'area': area_, 'company': company_,
                    'ask': ask_, 'detail_url': detail_url_})
df1.to_excel('test.xlsx', sheet_name='Sheet1', index=False)
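One caveat on politeness: get_jobs() issues one listing request plus up to fifty detail requests per page with no pause between them, which risks throttling or a ban. The commented-out movie loop already sleeps between pages; the wrapper below (a sketch using only the requests library; polite_get is my own name, not part of the original script) extends that idea to every fetch.

import random
import time
import requests

def polite_get(url, headers, min_delay=0.5, max_delay=1.5, retries=3):
    # GET with a random delay and a simple retry loop; swap this in for the
    # bare requests.get calls in get_jobs()/get_detail() if desired.
    for attempt in range(retries):
        time.sleep(random.uniform(min_delay, max_delay))  # spread requests out
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()  # surface HTTP errors instead of parsing an error page
            return resp
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # give up after the last retry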