import re
from urllib.parse import quote

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Default chromedriver location, used when the caller does not pass driver_path.
_DEFAULT_CHROMEDRIVER = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'

# Search URL; {keyword} is the percent-encoded search term placed in the path.
_SEARCH_URL_TEMPLATE = (
    "https://search.51job.com/list/070200,000000,0000,00,9,99,{keyword},2,1.html"
    "?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
    "&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1"
    "&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line="
    "&specialarea=00&from=&welfare="
)

# Captures the contents of a <div class="rt"> element; [\s\S] lets the match span lines.
_RT_DIV_RE = re.compile(r'<div class="rt">([\s\S]*?)</div>', re.IGNORECASE)
_DIGITS_RE = re.compile(r"\d+")


def _extract_position_count(page_source):
    """Return the first run of digits inside the first <div class="rt"> of the HTML."""
    div_match = _RT_DIV_RE.search(page_source)
    if div_match is None:
        # IndexError is what the previous list-indexing code raised here.
        raise IndexError('no <div class="rt"> element found in the page source')
    digits_match = _DIGITS_RE.search(div_match.group(1).strip())
    if digits_match is None:
        raise IndexError('no number found inside the first <div class="rt"> element')
    return digits_match.group(0)


def numberPositionsByKeyword(searchWord, *, driver_path=_DEFAULT_CHROMEDRIVER):
    """Look up the number of positions 51job lists for a search keyword.

    Opens the 51job search page for ``searchWord`` in headless Chrome and
    extracts the first number found in the first ``<div class="rt">`` of the
    rendered page (presumably the total result count -- verify against the
    live page layout).

    Args:
        searchWord: Keyword to search for. It is percent-encoded before being
            placed in the URL path, so characters such as ``#`` or ``/`` do
            not break the URL.
        driver_path: Path to the chromedriver executable.

    Returns:
        The matched number as a string of digits.

    Raises:
        IndexError: If the page has no ``<div class="rt">`` element or that
            element contains no digits.
    """
    chrome_options = Options()
    # Run Chrome without a visible window.
    chrome_options.add_argument("--headless")

    url = _SEARCH_URL_TEMPLATE.format(keyword=quote(searchWord, safe=""))
    browser = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options)
    try:
        # The implicit wait only applies to element lookups, so set it before
        # loading the page and then look the target element up: this blocks
        # (up to 20 s) until the element has been rendered. find_elements
        # returns an empty list on timeout instead of raising.
        browser.implicitly_wait(20)
        browser.get(url)
        browser.find_elements("css selector", 'div[class="rt"]')
        page_source = browser.page_source
    finally:
        # Always shut the driver down, even when loading fails.
        browser.quit()
    return _extract_position_count(page_source)
# Example run: only when executed as a script, so importing this module does
# not launch a browser; the result is printed instead of being discarded.
if __name__ == "__main__":
    print(numberPositionsByKeyword("数据分析师"))
# 利用selenium实现动态网页的爬取
# 最新推荐文章于 2024-01-18 16:45:03 发布