爬取煎蛋随手拍图

最新推荐文章于 2020-10-15 15:27:09 发布

python_QYF

最新推荐文章于 2020-10-15 15:27:09 发布

阅读量585

点赞数 1

分类专栏： spider 文章标签： spider

本文链接：https://blog.csdn.net/qyf__123/article/details/84865808

版权

spider 专栏收录该内容

13 篇文章 0 订阅

订阅专栏

爬取煎蛋随手拍图

使用 requests + selenium 来进行图片的爬取

爬取结果

（此处原为爬取结果的截图，图片未能加载）

爬取思路

1. 使用 selenium 发起请求，打开页面；
2. 对页面进行数据的提取；
3. 取到页面上每个图片的 url；
4. 使用 requests 对图片 url 发起请求；
5. 将图片保存到本地。

实现代码

import os
import time

import requests
from selenium import webdriver


class JandanPic:
    """Crawl images from the Jandan "ooxx" board.

    Selenium drives a Chrome browser to render each listing page and to
    click through the pagination; ``requests`` downloads the image files
    whose URLs were extracted from the rendered page. Images are written
    to ``./image/jandan/``.
    """

    # Seconds to wait on an image download before giving up, so a single
    # stalled connection cannot hang the whole crawl.
    REQUEST_TIMEOUT = 30

    # Locator strategy passed to ``find_elements``. This is the value of
    # ``selenium.webdriver.common.by.By.XPATH``; using the generic
    # ``find_elements(by, value)`` form works on Selenium 3 and 4, whereas
    # the ``find_elements_by_xpath`` helpers were removed in Selenium 4.3.
    _XPATH = "xpath"

    def __init__(self):
        """Start a Chrome session and set the request headers."""
        self.start_url = "http://jandan.net/ooxx"
        self.driver = webdriver.Chrome()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3608.4 Safari/537.36"
        }

    def parse_get_url(self, url):
        """Download ``url`` and return the raw response body as bytes.

        Raises:
            requests.RequestException: on connection errors, timeouts, or
                a non-2xx HTTP status (so error pages are never returned
                as if they were image data).
        """
        resp = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        return resp.content

    @staticmethod
    def _build_pic_name(img_id, img_url):
        """Return the file name for an image: ``img_id`` plus the URL's extension.

        The extension is taken from the URL path (query string and fragment
        stripped), so ``.jpeg``/``.webp`` are handled as well as three-letter
        extensions. If the URL has no extension, the name is just ``img_id``.
        """
        url_path = img_url.split("?", 1)[0].split("#", 1)[0]
        extension = os.path.splitext(url_path)[1]
        return img_id + extension

    def get_content_list(self):
        """Extract and download the images on the page currently loaded.

        Returns:
            The WebElement of the link to the next page to crawl (the
            anchor with class ``previous-comment-page``), or ``None`` when
            the page has no such link.
        """
        li_list = self.driver.find_elements(self._XPATH, "//ol[@class='commentlist']/li")
        content_list = []
        for li in li_list:
            # Use the plural lookup once per field: it returns [] instead of
            # raising when the element is missing from this <li>.
            id_links = li.find_elements(self._XPATH, ".//span[@class='righttext']/a")
            img_links = li.find_elements(self._XPATH, ".//a[@class='view_img_link']")
            items = {
                "img_id": id_links[0].text if id_links else None,
                "img_url": img_links[0].get_attribute("href") if img_links else None,
            }
            print(items)
            content_list.append(items)

        # Download the images before leaving the page.
        self.save_pic(content_list)

        next_links = self.driver.find_elements(self._XPATH, "//a[@class='previous-comment-page']")
        return next_links[0] if next_links else None

    def save_pic(self, content_list):
        """Download every complete entry of ``content_list`` to ``./image/jandan/``.

        Args:
            content_list: list of dicts with ``"img_id"`` and ``"img_url"``
                keys; entries where either value is ``None`` are skipped.

        A failed download is reported and skipped so that one bad image
        does not abort the rest of the crawl.
        """
        folder_path = "./image/jandan/"
        # makedirs creates the intermediate "./image" directory too;
        # os.mkdir would fail when it does not exist yet.
        os.makedirs(folder_path, exist_ok=True)
        for content in content_list:
            pic_url = content["img_url"]
            img_id = content["img_id"]
            if pic_url is None or img_id is None:
                continue
            pic_name = self._build_pic_name(img_id, pic_url)
            print("开始下载：{}".format(pic_url))
            try:
                img_bytes = self.parse_get_url(pic_url)
            except requests.RequestException as exc:
                print("下载失败：{} ({})".format(pic_url, exc))
                continue
            with open(os.path.join(folder_path, pic_name), 'wb') as f:
                f.write(img_bytes)

    def run(self):
        """Crawl from ``start_url``, following page links until none remain.

        The browser is always closed on exit, including when the crawl is
        interrupted by an exception.
        """
        try:
            # Open the start page in the browser.
            self.driver.get(self.start_url)
            # Fixed pause to give the page time to finish loading.
            time.sleep(6)
            next_url = self.get_content_list()

            # Keep clicking through to the next page while a link exists.
            while next_url is not None:
                next_url.click()
                print("下一页")
                time.sleep(6)
                next_url = self.get_content_list()
        finally:
            self.driver.quit()


if __name__ == '__main__':
    # Script entry point: build the crawler (this launches Chrome) and
    # run the crawl to completion.
    JandanPic().run()