from urllib.request import Request, urlopen
import bs4
import requests
import re
import json
import xlwt
# Excel workbook that accumulates one row per scraped job posting.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('1')
# Header row: column titles (Chinese, matching the scraped 51job fields).
column_titles = ('序号', '工作名称', '公司名称', '工作地区',
                 '公司属性', '职位要求', '工作链接', '职责要求')
for col, title in enumerate(column_titles):
    worksheet.write(0, col, label=title)
z = 1  # next worksheet row to fill (row 0 is the header)
# Scrape the first five result pages of a 51job "大数据" (big data) search in
# area 080200 and write one row per posting into the xlwt worksheet, then
# save the workbook.  `z` is the next free worksheet row (header is row 0).
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                  ' (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'}
for page in range(1, 6):
    # BUG FIX: the original URL contained '°reefrom=99' — the '&deg' prefix
    # of '&degreefrom' had been decoded as the HTML degree-sign entity,
    # corrupting the query string.
    url = ('https://search.51job.com/list/080200,000000,0000,00,9,99,'
           '%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,' + str(page) +
           '.html?lang=c&postchannel=0000&workyear=99&cotype=99'
           '&degreefrom=99&jobterm=99&companysize=99&ord_field=0'
           '&dibiaoid=0&line=&welfare=')
    request = Request(url=url, headers=header)
    soup = bs4.BeautifulSoup(urlopen(request), 'html.parser')
    for script in soup.find_all('script', type="text/javascript"):
        text = script.get_text()
        # Only the inline script holding the search payload is parseable
        # JSON after the prefix strip; the original tried json.loads on
        # every non-empty script and would crash on any other one.
        if 'window.__SEARCH_RESULT__' not in text:
            continue
        payload = json.loads(
            text.replace('window.__SEARCH_RESULT__ = ', '', 1))
        # BUG FIX: the original iterated range(1, len(results)) and
        # silently skipped the first job on every page.
        for job in payload["engine_search_result"]:
            worksheet.write(z, 0, z)
            worksheet.write(z, 1, job['job_name'])
            worksheet.write(z, 2, job['company_name'])
            worksheet.write(z, 3, job['workarea_text'])
            worksheet.write(z, 4, job['companytype_text'])
            worksheet.write(z, 5, job['attribute_text'])
            worksheet.write(z, 6, job['job_href'])
            # Fetch the detail page for the full job-description text;
            # best-effort — on any failure (network error, missing div)
            # write a blank cell instead of aborting the whole scrape.
            try:
                detail_request = Request(job['job_href'], headers=header)
                detail_soup = bs4.BeautifulSoup(
                    urlopen(detail_request), 'html.parser')
                description = detail_soup.find(
                    'div', {"class": 'bmsg job_msg inbox'})
                # Collapse all whitespace runs in the description text.
                worksheet.write(
                    z, 7, label="".join(description.get_text().split()))
            except Exception:  # narrowed from bare except:
                worksheet.write(z, 7, label=' ')
            z += 1
workbook.save('qcwygz.xls')
# 前程无忧 (51job) scraper — columns: link, index, etc.
# (blog residue, originally: 最新推荐文章于 2024-08-01 11:20:11 发布)