#-*- coding: utf-8 -*-
import requests
from lxml import etree
import os,time
from bs4 import BeautifulSoup
import csv
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
# Request header for the plain-HTTP downloads (mimics a desktop Chrome/Edge browser).
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"}
# Search-result page for the query 工地安全带 ("construction-site safety harness").
base_url = "https://haokan.baidu.com/web/search/page?query=工地安全带"
# Running counter used to number the downloaded video files (视频1, 视频2, ...).
count = 1

# Selenium (dynamic-page) crawler initialisation.
time.sleep(1)
chrome_options = Options()
# BUG FIX: the original flag was '–headless' with an en-dash; Chrome silently
# ignores unknown flags, so the browser window was actually being shown.
chrome_options.add_argument('--headless')
chromedriver = r"C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chromedriver.exe"
driver = webdriver.Chrome(chromedriver, chrome_options=chrome_options)
def get_request(url):
    """Load *url* in the shared Selenium driver and return a parsed lxml tree.

    Sleeps two seconds after navigation so dynamically rendered content has a
    chance to appear before the page source is captured.

    :param url: page to open in the browser.
    :return: ``lxml.etree`` element tree of the rendered page source.
    """
    driver.get(url)
    time.sleep(2)
    page_source = driver.page_source
    return etree.HTML(page_source)
def send_request(url, timeout=30):
    """Fetch *url* over plain HTTP and return the raw response body.

    A one-second pause before each request throttles the crawler slightly.

    :param url: resource to download.
    :param timeout: socket timeout in seconds. ``requests`` has NO default
        timeout, so without this a stalled server would hang the crawler
        forever; the keyword default keeps the call backward-compatible.
    :return: response body as ``bytes``.
    """
    time.sleep(1)
    response = requests.get(url=url, headers=header, timeout=timeout)
    return response.content
def video_path(url):
    """Open a video detail page and download its first ``<video>`` source.

    :param url: URL of a haokan video detail page.
    """
    video_content = get_request(url)
    videos = video_content.xpath('//div[@class="videos"]//video')
    # Guard against layout changes / pages where the player never rendered:
    # the original code indexed [0] unconditionally, so one bad page raised
    # IndexError and aborted the whole crawl. Skip such pages instead.
    if not videos:
        print('no <video> element found: {}'.format(url))
        return
    sources = videos[0].xpath('./@src')
    if not sources:
        print('<video> element has no src attribute: {}'.format(url))
        return
    download(sources[0])
def download(url):
    """Save the video at *url* into the 工地安全带 folder as 视频N.mp4.

    Uses the module-level ``count`` counter to produce sequential file names,
    incrementing it after each successful write.

    :param url: direct URL of the .mp4 resource.
    """
    global count
    data = send_request(url)
    os.makedirs('./工地安全带', exist_ok=True)
    video_name = '视频{}'.format(count)
    with open('工地安全带/{}.mp4'.format(video_name), 'wb') as out_file:
        out_file.write(data)
    count += 1
def get_idx_url():
    """Walk every result link on the search page and download its video.

    Fetches ``base_url`` with the Selenium driver, extracts each anchor in
    the result list, and hands every href to :func:`video_path`.
    """
    page = get_request(base_url)
    links = page.xpath('//div[@class="message-list"]/a')
    print(links)
    for link in links:
        video_path(link.xpath('./@href')[0])
# Only start crawling when executed as a script — the bare top-level call
# previously kicked off the whole crawl as a side effect of merely importing
# this module.
if __name__ == "__main__":
    get_idx_url()
# NOTE(review): removed four copy/paste artifacts from a blog page
# ("08-07", "1万+", "02-06", "3204") — they were dates/view counts, not code,
# and made the file a SyntaxError.