python 使用selenium爬取拉勾网

最新推荐文章于 2022-06-10 08:30:00 发布

bajiao7928

最新推荐文章于 2022-06-10 08:30:00 发布

阅读量214

点赞数

文章标签： python

原文链接：http://www.cnblogs.com/a595452248/p/11506171.html

版权

一、爬取方式

　　　用一般的爬取方式会发现得不到任何信息，所以我们选择selenium来爬取数据

二、下面为源码

import json
import time
import urllib.parse

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

class Lagou(object):
    """Scrape job listings from lagou.com search results with headless Chrome.

    Usage: construct the object, call init() (prompts for the search keyword
    and a cookie string, starts the browser and opens the first result page),
    then get_html() for the first page and get_next_page() for the rest.
    Scraped listings are appended to ``lagou.json``, one JSON object per line.
    """

    # Landing page used to establish the cookie domain before adding cookies.
    _HOME_URL = 'https://www.lagou.com/'
    _OUTPUT_FILE = 'lagou.json'

    @staticmethod
    def _parse_cookie_string(cookie):
        """Split a ``k1=v1; k2=v2`` cookie string into (name, value) tuples.

        Only the first ``=`` separates name from value, so values that
        themselves contain ``=`` are kept intact. Empty fragments (for example
        after a trailing ``;``) and fragments without ``=`` are skipped.
        """
        pairs = []
        for item in cookie.split(';'):
            name, sep, value = item.strip().partition('=')
            name = name.strip()
            if not sep or not name:
                continue
            pairs.append((name, value.strip()))
        return pairs

    def _wait_for_all(self, xpath):
        """Wait until at least one element matches *xpath*; return all matches."""
        return self.wait.until(
            EC.presence_of_all_elements_located((By.XPATH, xpath)))

    def init(self):
        """Prompt for the job keyword and cookies, then open the search page.

        This is deliberately a plain method named ``init`` (not ``__init__``):
        callers invoke it explicitly after constructing the object.
        """
        self.flag = True  # cleared to tell get_next_page() to stop
        opt = webdriver.ChromeOptions()
        opt.add_argument('--headless')  # hide the browser window
        self.driver = webdriver.Chrome(options=opt)
        self.wait = WebDriverWait(self.driver, 10)  # explicit-wait timeout
        self.job = input('请输入想了解的职业')
        cookie = input('请输入cookie：')
        # WebDriver only accepts cookies for the domain of the document that
        # is currently loaded, so the site must be opened before add_cookie().
        self.driver.get(self._HOME_URL)
        for name, value in self._parse_cookie_string(cookie):
            self.driver.add_cookie({'name': name, 'value': value})
        self.url = ('https://www.lagou.com/jobs/list_'
                    + urllib.parse.quote(self.job)
                    + '?&cl=false&fromSearch=true&labelWords=&suginput=')
        self.driver.get(self.url)

    def get_html(self):
        """Scrape the listings on the current page and append them to the file.

        Each listing becomes one JSON object on its own line. If anything
        fails (for example a wait times out because the expected elements are
        missing) the error is printed and ``self.flag`` is cleared so that the
        paging loop stops; the exception is not propagated.
        """
        try:
            links = self._wait_for_all('//a[@class="position_link"]')
            titles = self._wait_for_all('//a[@class="position_link"]/h3')
            addresses = self._wait_for_all(
                '//a[@class="position_link"]/span/em')
            salaries = self._wait_for_all(
                '//div[@class="p_bot"]//div[@class="li_b_l"]')
            skills = self._wait_for_all(
                '//div[@class="list_item_bot"]//div[@class="li_b_l"]')
            records = [
                {
                    '链接': link.get_attribute('href'),
                    '职业': title.text,
                    '公司地址': address.text,
                    '工资和要求': salary.text,
                    '技能要求': skill.text,
                }
                for link, title, address, salary, skill
                in zip(links, titles, addresses, salaries, skills)
            ]
            # Open the file once per page; write real JSON (not a Python repr)
            # in UTF-8 so the Chinese text stays readable on every platform.
            with open(self._OUTPUT_FILE, 'a', encoding='utf-8') as f:
                f.writelines(
                    json.dumps(record, ensure_ascii=False) + '\n'
                    for record in records)
        except Exception as exc:
            # Best-effort by design: any failure ends the crawl, but say why.
            print('抓取当前页失败，停止爬取：%r' % (exc,))
            self.flag = False

    def get_next_page(self):
        """Click through the pager, scraping each page until told to stop.

        Assumes the first result page has already been scraped by an earlier
        get_html() call, so numbering of the pages fetched here starts at 2.
        The loop ends when get_html() clears ``self.flag`` or when no usable
        "next page" button is found.
        """
        page = 1
        while self.flag:
            try:
                next_button = self.wait.until(
                    EC.element_to_be_clickable((By.CLASS_NAME, 'pager_next')))
            except TimeoutException:
                print('找不到下一页按钮，停止爬取')
                self.flag = False
                break
            # NOTE(review): the last page presumably marks the button with a
            # 'pager_next_disabled' class - confirm against the live markup.
            # If that class never appears this check is a no-op.
            if 'pager_next_disabled' in (next_button.get_attribute('class') or ''):
                self.flag = False
                break
            next_button.click()
            time.sleep(3)  # give the next page time to render
            page += 1
            print('正在爬取第%d页' % page)
            self.get_html()

if __name__ == '__main__':
    # Scrape the first result page, then follow the pager. The browser is
    # always shut down afterwards so no headless Chrome process is left behind.
    crawler = Lagou()
    try:
        crawler.init()
        crawler.get_html()
        crawler.get_next_page()
    finally:
        # init() may fail before the driver exists, hence the getattr guard.
        driver = getattr(crawler, 'driver', None)
        if driver is not None:
            driver.quit()

转载于:https://www.cnblogs.com/a595452248/p/11506171.html

bajiao7928

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python 使用selenium爬取拉钩网

一、爬去方式　　　用一般的爬取方式会发现得不到任何信息，所以我们选择selenium来爬取数据二、下面为源码from selenium import webdriverfrom selenium.webdriver.support.wait import WebDriverWaitfrom selenium.webdriver.support import expecte...
复制链接

扫一扫