利用selenium实现动态网页的爬取

最新推荐文章于 2024-01-18 16:45:03 发布

wifi连不上

最新推荐文章于 2024-01-18 16:45:03 发布

阅读量323

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/wifi_wuxian/article/details/103717897

版权

python 专栏收录该内容

28 篇文章 2 订阅

订阅专栏

import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def numberPositionsByKeyword(searchWord):
    """Return the number shown in the first ``<div class="rt">`` of a 51job search.

    Loads the 51job search-result page for ``searchWord`` in headless Chrome,
    reads the rendered HTML, and extracts the first run of digits from the
    first ``<div class="rt">`` element. The original author's note describes
    this value as the number of positions found for the keyword.

    Args:
        searchWord: Keyword spliced verbatim into the search URL.

    Returns:
        The first digit sequence inside the matched element, as a string.

    Raises:
        IndexError: If the page contains no ``<div class="rt">`` element, or
            that element contains no digits.
    """
    chrome_options = Options()
    # Pass the plain Chrome switch rather than Options.set_headless(), which
    # is a deprecated convenience wrapper that newer Selenium releases dropped.
    chrome_options.add_argument("--headless")

    url = (
        "https://search.51job.com/list/070200,000000,0000,00,9,99,"
        + searchWord
        + ",2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99"
        "&degreefrom=99&jobterm=99&companysize=99&providesalary=99"
        "&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType="
        "&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
    )

    # Raw string: the Windows path contains backslashes that are not valid
    # escape sequences. NOTE(review): Selenium >= 4.10 no longer accepts
    # executable_path; migrate to a Service object when upgrading.
    browser = webdriver.Chrome(
        executable_path=r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe",
        options=chrome_options,
    )
    try:
        browser.get(url)
        # NOTE(review): an implicit wait governs element lookups; page_source
        # below is read without waiting for any particular element.
        browser.implicitly_wait(20)
        pagestr = browser.page_source
    finally:
        # Always shut the browser down, even if loading the page fails, so no
        # headless Chrome / chromedriver process is left behind.
        browser.quit()

    # Capture only the contents of the first <div class="rt"> element.
    blocks = re.findall(r'<div class="rt">([\s\S]*?)</div>', pagestr, re.IGNORECASE)
    if not blocks:
        raise IndexError('no <div class="rt"> element found in the page source')

    numbers = re.findall(r"\d+", blocks[0])
    if not numbers:
        raise IndexError('no digits found inside the first <div class="rt"> element')
    return numbers[0]

if __name__ == "__main__":
    # Run the sample query only when executed as a script (not on import),
    # and print the result instead of discarding it.
    print(numberPositionsByKeyword("数据分析师"))

wifi连不上

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
利用selenium实现动态网页的爬取

import re from selenium import webdriver from selenium.webdriver.chrome.options import Options # 通过获取关键字职位数量 def numberPositionsByKeyword(searchWord): # 创建chrome参数对象 chrome_options = Options(...
复制链接

扫一扫