爬取招聘信息
脚本如下:
# -*- coding: utf-8 -*-
import re
import urllib.request
import urllib.error
# Search-results page to scrape; the query string is kept exactly as captured
# from the browser.
url = "https://sou.zhaopin.com/?jl=653&jt=9000000000000,9000300000000,9000300110000&kw=Java%E5%BC%80%E5%8F%91&kt=3"

# User-agent string, obtained by entering chrome://version/ in the browser.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'

# Building the Request performs no network I/O; it only bundles URL + headers.
req = urllib.request.Request(url, headers={'User-Agent': user_agent})

# One match per listing, capturing in order:
#   address, job title, salary amount, salary unit, company name.
# re.S lets ".*?" span the newlines between the tags.
# NOTE(review): the class names ("address", "name", "job_salary", "unit",
# "comp?name") are taken from the original pattern — confirm them against the
# live page markup. The "." in "comp.name" matches any single separator.
pattern = re.compile(
    r'<span class="address".*?>(.*?)</span>.*?'
    r'<span class="name".*?>(.*?)</span>.*?'
    r'<p class="job_salary">(.*?)<i class="unit">(.*?)</i>.*?'
    r'<div class="comp.name".*?<a href.*?>(.*?)</a>',
    re.S,
)


def parse_jobs(content):
    """Extract job listings from the HTML text *content*.

    Returns a list of ``(address, job, salary, unit, company)`` string
    tuples, one per listing matched by ``pattern``; an empty list when
    nothing matches.
    """
    return pattern.findall(content)


def main():
    """Download the search page, dump it, and print the parsed listings.

    HTTP/URL failures are reported on stdout (status code and/or reason)
    instead of propagating.
    """
    try:
        # Pass the Request object (not the bare URL) so the User-Agent
        # header is actually sent; the context manager closes the response.
        with urllib.request.urlopen(req) as response:
            content = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError (a URLError subclass) carries .code; both carry .reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return

    print(content)

    dress = "地址"
    jobs = "职位信息"
    salary = "薪资待遇"
    compay = "公司名称"
    print(dress, jobs, salary, compay)
    for address, job, amount, unit, company in parse_jobs(content):
        # Amount and unit together form the single "salary" column, so each
        # row lines up with the four-column header above.
        print(address, job, f"{amount}{unit}", company)


if __name__ == "__main__":
    main()
爬出来一堆数据,感觉还需要优化,欢迎提意见!
(随意截取一段^^)