#-*- coding: utf-8 -*-
import requests
from lxml import etree
import os,time
from bs4 import BeautifulSoup
import csv
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
# Request header for the plain-HTTP downloads (mimics a desktop Chrome/Edge browser).
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"}
# Search-result page for the query 工地安全带 ("construction-site safety harness").
base_url = "https://haokan.baidu.com/web/search/page?query=工地安全带"
# Running counter used to number the downloaded video files (视频1, 视频2, ...).
count = 1

# Selenium (dynamic-page) crawler initialisation.
time.sleep(1)
chrome_options = Options()
# BUG FIX: the original flag was '–headless' with an en-dash; Chrome silently
# ignores unknown flags, so the browser window was actually being shown.
chrome_options.add_argument('--headless')
chromedriver = r"C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chromedriver.exe"
driver = webdriver.Chrome(chromedriver, chrome_options=chrome_options)
def get_request(url):
    """Load *url* in the shared Selenium driver and return a parsed lxml tree.

    Sleeps two seconds after navigation so dynamically rendered content has a
    chance to appear before the page source is captured.

    :param url: page to open in the browser.
    :return: ``lxml.etree`` element tree of the rendered page source.
    """
    driver.get(url)
    time.sleep(2)
    page_source = driver.page_source
    return etree.HTML(page_source)
def send_request(url, timeout=30):
    """Fetch *url* over plain HTTP and return the raw response body.

    A one-second pause before each request throttles the crawler slightly.

    :param url: resource to download.
    :param timeout: socket timeout in seconds. ``requests`` has NO default
        timeout, so without this a stalled server would hang the crawler
        forever; the keyword default keeps the call backward-compatible.
    :return: response body as ``bytes``.
    """
    time.sleep(1)
    response = requests.get(url=url, headers=header, timeout=timeout)
    return response.content
def video_path(url):
    """Open a video detail page and download its first ``<video>`` source.

    :param url: URL of a haokan video detail page.
    """
    video_content = get_request(url)
    videos = video_content.xpath('//div[@class="videos"]//video')
    # Guard against layout changes / pages where the player never rendered:
    # the original code indexed [0] unconditionally, so one bad page raised
    # IndexError and aborted the whole crawl. Skip such pages instead.
    if not videos:
        print('no <video> element found: {}'.format(url))
        return
    sources = videos[0].xpath('./@src')
    if not sources:
        print('<video> element has no src attribute: {}'.format(url))
        return
    download(sources[0])
def download(url):
    """Save the video at *url* into the 工地安全带 folder as 视频N.mp4.

    Uses the module-level ``count`` counter to produce sequential file names,
    incrementing it after each successful write.

    :param url: direct URL of the .mp4 resource.
    """
    global count
    data = send_request(url)
    os.makedirs('./工地安全带', exist_ok=True)
    video_name = '视频{}'.format(count)
    with open('工地安全带/{}.mp4'.format(video_name), 'wb') as out_file:
        out_file.write(data)
    count += 1
def get_idx_url():
    """Walk every result link on the search page and download its video.

    Fetches ``base_url`` with the Selenium driver, extracts each anchor in
    the result list, and hands every href to :func:`video_path`.
    """
    page = get_request(base_url)
    links = page.xpath('//div[@class="message-list"]/a')
    print(links)
    for link in links:
        video_path(link.xpath('./@href')[0])
# Only start crawling when executed as a script — the bare top-level call
# previously kicked off the whole crawl as a side effect of merely importing
# this module.
if __name__ == "__main__":
    get_idx_url()
# NOTE(review): removed four copy/paste artifacts from a blog page
# ("08-07", "1万+", "02-06", "3204") — they were dates/view counts, not code,
# and made the file a SyntaxError.