抓取拉勾网中的数据

最新推荐文章于 2021-02-27 18:37:00 发布

mr.ocean

最新推荐文章于 2021-02-27 18:37:00 发布

阅读量342

点赞数 1

分类专栏：爬虫 python 文章标签： selenium

本文链接：https://blog.csdn.net/qq_45020131/article/details/103216353

版权

python 同时被 2 个专栏收录

31 篇文章

订阅专栏

爬虫

11 篇文章

订阅专栏

利用selenium和chromedriver进行数据抓取

在使用selenium之前，我尝试过使用requests库对数据进行抓取。虽然代码也可以运行，但是比较麻烦，而且其中的cookies信息很快就会失效，所以不能够完整地将数据爬取下来，如下图：
在这里插入图片描述
在这里顺便把代码也一并复制过来吧，希望大家可以一起交流讨论。

import requests
import time
import json
from lxml import etree
import re


# Request headers shared by every HTTP call in this script. They make the
# request look like the XHR that the search page issues from desktop Chrome.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/77.0.3865.90 Safari/537.36"
    ),
    # Search-result page the AJAX request claims to originate from.
    "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=",
    "Origin": "https://www.lagou.com",
    # NOTE: the "Anit" spelling is reproduced exactly as the script sends it.
    "X-Anit-Forge-Code": "0",
    "X-Anit-Forge-Token": "None",
    "X-Requested-With": "XMLHttpRequest",
}


def main(pages=20):
    """Crawl the search result pages and print the details of every job found.

    For each result page a fresh session first loads the HTML search page so
    that the server sets its cookies, then posts to the position JSON endpoint
    and passes every job's detail-page URL to ``parse_detail``.

    Args:
        pages: Number of result pages to request, starting at page 1.
    """
    list_url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
    ajax_url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"

    for page in range(1, pages + 1):
        data = {
            "first": True,
            "pn": page,  # page number
            "kd": "python"  # search keyword
        }

        # A new session per page: visit the list page first so the session
        # holds fresh cookies, then request the JSON data with them. The
        # ``with`` block releases the session's connections afterwards.
        with requests.session() as session:
            session.get(list_url, headers=headers, timeout=3)
            time.sleep(2)
            # The session sends its own cookies automatically.
            response = session.post(url=ajax_url, data=data, headers=headers, timeout=5)

        response.encoding = response.apparent_encoding
        result = json.loads(response.text)
        time.sleep(2)

        try:
            positions = result["content"]["positionResult"]["result"]
        except (KeyError, TypeError):
            # The payload has no job list (presumably the request was
            # rejected); skip this page rather than abort the whole crawl.
            print("page %d skipped, unexpected response: %r" % (page, result))
            continue

        for position in positions:
            # Build the detail-page URL from the job id.
            detail_url = "https://www.lagou.com/jobs/%s.html" % position["positionId"]
            parse_detail(detail_url)


def _first_text(html, path):
    """Return the first node matched by XPath ``path``, or "" if none match."""
    found = html.xpath(path)
    return found[0] if found else ""


def parse_detail(url):
    """Download one job detail page, extract its fields and print them.

    A field whose XPath matches nothing is stored as an empty string, so a
    page with an unexpected layout no longer raises ``IndexError``.

    Args:
        url: Address of the job detail page.
    """
    # The timeout keeps one stalled connection from hanging the crawl.
    response = requests.get(url, headers=headers, timeout=5)
    html = etree.HTML(response.text)

    item = {}
    item["title"] = _first_text(html, "//div[@class='job-name']//h1[@class='name']/text()")  # job title
    item["company"] = _first_text(html, "//div[@class='job-name']//h4[@class='company']/text()")  # company
    item["salary"] = _first_text(html, "//dd[@class='job_request']//span[@class='salary']/text()")  # salary

    # These three spans carry surrounding whitespace and "/" separators,
    # which are stripped out.
    for key, path in (
        ("work_place", "//dd[@class='job_request']//span[2]/text()"),  # location
        ("work_years", "//dd[@class='job_request']//span[3]/text()"),  # experience
        ("education", "//dd[@class='job_request']//span[4]/text()"),  # education
    ):
        item[key] = re.sub(r'[\s/]', "", _first_text(html, path))

    # Job description: join all text nodes and drop every whitespace character.
    requirement = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
    item["requirement"] = re.sub(r"\s", "", requirement)
    print(item)


# Start the crawl only when this file is executed as a script.
if __name__ == '__main__':
    main()

因为这种代码的通用性不高，无法像selenium一样模仿浏览器发送请求，所以使用了selenium配合chromedriver的方法。

实现手段： selenium、chromedriver

实现思路：

1. 请求列表页，然后通过抓包获得每个职位的详情页面地址，如下图所示：

在这里插入图片描述
2. 进入职位的详情页，获取对应数据；每当获取完一页数据的时候，找到下一页的按钮，然后进行点击，方法和第一步差不多。
3. 值得注意的是，在进入详情页面时，应该新打开一个窗口，保留原始的列表页；在获取完详情页的数据后，将其关闭，然后再打开新的详情页，反复循环。
4. 在抽取数据的时候，应该将速度放低，不然爬取太快，不仅会让对方的服务器负载过多，还容易让对方识别出自己是一个爬虫。

思路理清之后，就是实现代码啦
这是效果图
在这里插入图片描述

代码如下：

import re
import time

from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class LagouSpider(object):
    """Selenium-driven crawler that walks the job list page by page.

    Every job link found on a list page is opened in a second browser window,
    scraped into a dict that is appended to ``self.portion``, captured as a
    screenshot, and then closed again.
    """

    # Chrome options shared by all instances and handed to the driver in
    # ``__init__``. To route traffic through a proxy, add for example:
    # option.add_argument("--proxy-server=http://118.89.24.136:88")
    option = webdriver.ChromeOptions()

    def __init__(self):
        # Raw string: the Windows path contains backslashes that must not be
        # read as escape sequences.
        self.driver = webdriver.Chrome(r'G:\google\chromedriver.exe', options=self.option)
        self.url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
        self.portion = []  # one dict per scraped job

    def run(self):
        """Crawl list pages until the "next" button is disabled, then quit the browser."""
        try:
            self.driver.get(self.url)
            while True:
                self.get_detail_url(self.driver.page_source)
                # The last <span> in the pager is the "next page" button.
                next_page = self.driver.find_element(
                    By.XPATH, "//div[@class='pager_container']//span[last()]")
                if "pager_next_disabled" in (next_page.get_attribute("class") or ""):
                    break
                next_page.click()
                time.sleep(2)
        finally:
            # Always shut down the browser and the chromedriver process.
            self.driver.quit()

    def get_detail_url(self, source):
        """Extract every job detail URL from a list page and scrape each one.

        Args:
            source: HTML source of a list page.
        """
        html = etree.HTML(source)
        links = html.xpath("//ul[@class='item_con_list']//div[@class='p_top']/a/@href")
        for link in links:
            self.get_source(link)
            print(link)
            print("=" * 40)
            time.sleep(2)  # throttle between detail pages

    def get_source(self, link):
        """Open ``link`` in a new window, scrape it, and return to the list page.

        The detail window is closed and focus goes back to the list window even
        when the page fails to load; a page that times out is skipped.

        Args:
            link: URL of a job detail page.
        """
        list_window = self.driver.current_window_handle
        # Pass the URL as a script argument instead of splicing it into the JS.
        self.driver.execute_script("window.open(arguments[0])", link)
        self.driver.switch_to.window(self.driver.window_handles[-1])
        try:
            WebDriverWait(self.driver, timeout=10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//div[@class='job-name']//h1[@class='name']")))
            self.get_detail(self.driver.page_source)
        except TimeoutException:
            print("timed out waiting for detail page, skipped: %s" % link)
        finally:
            # Close the detail window and switch back to the list page.
            self.driver.close()
            self.driver.switch_to.window(list_window)

    @staticmethod
    def _first_text(html, path):
        """Return the first node matched by XPath ``path``, or "" if none match."""
        found = html.xpath(path)
        return found[0] if found else ""

    def get_detail(self, source):
        """Parse a detail page, store the record, take a screenshot and print it.

        A field whose XPath matches nothing is stored as an empty string.

        Args:
            source: HTML source of a job detail page.
        """
        html = etree.HTML(source)
        item = {}
        item["title"] = self._first_text(html, "//div[@class='job-name']//h1[@class='name']/text()")  # job title
        item["company"] = self._first_text(html, "//div[@class='job-name']//h4[@class='company']/text()")  # company
        item["salary"] = self._first_text(html, "//dd[@class='job_request']//span[@class='salary']/text()")  # salary

        # These three spans carry surrounding whitespace and "/" separators,
        # which are stripped out.
        for key, path in (
            ("work_place", "//dd[@class='job_request']//span[2]/text()"),  # location
            ("work_years", "//dd[@class='job_request']//span[3]/text()"),  # experience
            ("education", "//dd[@class='job_request']//span[4]/text()"),  # education
        ):
            item[key] = re.sub(r'[\s/]', "", self._first_text(html, path))

        # Job description: join all text nodes and drop every whitespace character.
        requirement = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        item["requirement"] = re.sub(r"\s", "", requirement)

        self.portion.append(item)
        # Titles may contain characters that are illegal in file names
        # (e.g. "/"), so replace them before using the title as a file name.
        safe_title = re.sub(r'[\\/:*?"<>|]', "_", item["title"]) or "untitled"
        self.driver.save_screenshot("%s.png" % safe_title)
        print(item)


# Script entry point: build the spider and start crawling.
if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()

欢迎大家一起讨论交流！！！