为什么要用selenium去爬取?
1、用requests爬取时,详情页面动态加载的数据总是难以获取(如有更好的方法请多多指教)
2、学习selenium
好了,不多说,上代码
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time
import requests
from hashlib import md5
import os
URL = 'https://www.toutiao.com'
class TouTiaoImg():
    """Crawl image galleries from toutiao.com search results with Selenium.

    Workflow: open the home page, search a keyword, walk the result list,
    open each result's detail page and download every image it contains
    into ``C:/toutiaoImage/<title>/``.
    """

    def __init__(self):
        # One Chrome driver and a shared 10-second explicit wait.
        self.browser = webdriver.Chrome()
        self.wait = WebDriverWait(self.browser, 10)
        self.url = URL

    # Open the home page and submit the search keyword.
    def first_step(self, keyword):
        try:
            self.browser.get(self.url)
            # Search input box and search button on the home page.
            inputs = self.wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, 'tt-input__inner')))
            button = self.wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, '.tt-input-group__append button')))
            inputs.send_keys(keyword)
            button.click()
            time.sleep(2)
            # Results open in a new tab; switch to the list page.
            # (switch_to_window was removed in Selenium 4 — use switch_to.window.)
            self.browser.switch_to.window(self.browser.window_handles[1])
            time.sleep(2)
            self.second_step()
        except TimeoutException:
            # BUG FIX: the retry previously called first_step() without the
            # required `keyword` argument, raising TypeError on any timeout.
            self.first_step(keyword)
        finally:
            self.browser.close()

    # Walk the result list, visiting every image item by its sequential id.
    def second_step(self):
        box = self.wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.feedBox div .sections')))
        i = 0
        while True:
            # Scroll to the bottom so further results are lazily loaded.
            self.browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            try:
                # find_element_by_id was removed in Selenium 4.
                imgItem = box.find_element(By.ID, 'J_section_' + str(i))
            except NoSuchElementException:
                break  # no more result items
            try:
                # A `.lbox` child marks a gallery-style (slideshow) item.
                imgItem.find_element(By.CSS_SELECTOR, 'div div .lbox')
                flag = True
            except NoSuchElementException:
                flag = False
            ele = imgItem.find_element(By.CSS_SELECTOR, 'div div .normal div div a')
            title = ele.find_element(By.CSS_SELECTOR, 'span').text
            ele.click()  # open the selected detail page (new tab)
            time.sleep(2)
            self.browser.switch_to.window(self.browser.window_handles[2])  # detail page
            self.third_step(flag, title)
            self.browser.execute_script('window.close()')  # close the detail tab
            self.browser.switch_to.window(self.browser.window_handles[1])  # back to list
            i += 1

    # Extract image URLs from one detail page and save them under `title`.
    def third_step(self, flag, title):
        print(title)
        try:
            if flag:  # gallery-style page: <img> tags inside <article>
                article = self.wait.until(
                    EC.presence_of_element_located((By.TAG_NAME, 'article')))
                for imgItem in article.find_elements(By.TAG_NAME, 'img'):
                    url = imgItem.get_attribute('src')
                    print(url)
                    self.save_img(url, title)
            else:  # article-style page: anchors inside the .image-list box
                imgBox = self.wait.until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'image-list')))
                for imgItem in imgBox.find_elements(By.CLASS_NAME, 'image-item'):
                    url = imgItem.find_element(By.CSS_SELECTOR, 'div a').get_attribute('href')
                    print(url)
                    self.save_img(url, title)
        except TimeoutException:
            # Video pages contain neither container, so the wait times out.
            print('NO IMAGE')
        print("\n")

    # Download one image into C:/toutiaoImage/<dir>/, named by MD5 of its bytes.
    def save_img(self, url, dir):
        dirPath = 'C:/toutiaoImage/' + dir
        # makedirs creates the missing parent as well and tolerates re-runs
        # (os.mkdir failed when C:/toutiaoImage did not yet exist).
        os.makedirs(dirPath, exist_ok=True)
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return  # skip unreachable images instead of saving an error page
        # MD5-based filename deduplicates identical images automatically.
        save_name = dirPath + '/{}.jpg'.format(md5(response.content).hexdigest())
        if not os.path.exists(save_name):
            with open(save_name, 'wb') as f:
                f.write(response.content)
if __name__ == '__main__':
    # Entry point: change the search keyword to whatever you like.
    crawler = TouTiaoImg()
    crawler.first_step('cosplay')