"""
@coder: github@akin
@since: 2022/6/10上午8:59
@desc: 爬取dy主下所有视频
"""
import requests
import re
from selenium import webdriver
# Start a Chrome session and open the uploader's profile page.
profile_url = "https://www.douyin.com/user/MS4wLjABAAAATItWsjZK6_kivKOsBNcoApqmSuYRkiEUMQPb22ZDDT8GAXBI_67eMywJevVlwUBz"
driver = webdriver.Chrome()
driver.get(profile_url)
# Let element lookups poll for up to 10 seconds before giving up.
driver.implicitly_wait(10)
def drop_down():
    """Simulate scrolling down the page in eight one-second steps.

    Before each step the function sleeps for one second, then sets the
    document's ``scrollTop`` to ``scrollHeight`` multiplied by an increasing
    factor (1/9, 5/9, ... 29/9) via the module-level ``driver``.
    """
    import time

    scroll_script = 'document.documentElement.scrollTop = document.documentElement.scrollHeight * %f'
    for step in range(1, 30, 4):
        time.sleep(1)
        driver.execute_script(scroll_script % (step / 9))
# Scroll through the page before collecting links
# (presumably so that more video entries get loaded -- TODO confirm).
drop_down()
def down_vod(url):
    """Download the video of one Douyin video page into the current directory.

    Fetches the page at ``url``, takes the page ``<title>`` as the file name,
    extracts the first URL-encoded ``src`` entry from the page source and
    saves the response body as ``<title>.mp4``.

    Args:
        url: Address of the video detail page.

    Raises:
        requests.RequestException: If a request fails, times out or returns
            an HTTP error status.
        ValueError: If the title or the video address is not found in the
            page source.
        OSError: If the output file cannot be written.
    """
    # NOTE(review): this cookie appears to embed real session tokens; consider
    # loading it from configuration instead of keeping it in source control.
    headers = {
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36",
        'cookie': 'douyin.com; ttcid=494a0c3b643c46d0b41849fbbe47ea4887; odin_tt=6207c617ff8bda8445129e8cd7bfa1f203f65e0b29373607f0310585b1085900b58b1e96a0014cad012fbab91f2a968ff7c61b84dc064457dfd01b83e9afa6bc; d_ticket=1893b444072fb9f693e148daec8913ece3c1b; sid_guard=09a43a1057cc7f35db23dae168530794%7C1642144859%7C5183999%7CTue%2C+15-Mar-2022+07%3A20%3A58+GMT; __ac_nonce=062a28e99002e37b45060; __ac_signature=_02B4Z6wo00f01a5z71gAAIDDRtu-y00RwnmuV-vAAAk.BMVcycwkcsCm9HyFS8QABqdMAKuNodRsdKF9Vuv-4ff2TDNsytWjcOLFgoYdDP3NhB.F6JAAdcbqCwKdgdcyAlAmD4SFnCYqXUGf33; ttwid=1%7Clyx7xHXpIL15j3RkDLCasnEzsI6v2r4K9pOzWgKYqvo%7C1654820506%7C341a2b1f2ab769807e5a6653342df0703d95a7245eff8652fd2e8fbbca1756a7; douyin.com; strategyABtestKey=1654820507.984; s_v_web_id=verify_l47pek0w_AFtvNkcu_cU37_4NPr_BQrV_p7qsXBQLtwla; passport_csrf_token=572531d33264b7af2d19a89ff5b254cc; passport_csrf_token_default=572531d33264b7af2d19a89ff5b254cc; _tea_utm_cache_2285=undefined; _tea_utm_cache_6383=undefined; _tea_utm_cache_1300=undefined; msToken=C_q08TJik1H6NDQ9G22b2hved1EO8Ban92Jx7yoAK7P3l_bdVIX618yBjJQAIZjNxnmM63e3ArBmXr0AOHfex6rdtvJjmaVTnt__N8YtmXD5FX14E6aI-w==; home_can_add_dy_2_desktop=%221%22; msToken=MklzmRREJgbSLeUjUBOC9pf4N4nU9j2ZxBqerhMesuNCyQW_u1gGR5yqrDzzq9nHwIXtlrgjRdcLAL8fMbpcckUm0g3zi72fEMsazDUG8fD9iphZNEJoeg==; tt_scid=0igyI6wMowxOgJdU6Qsjztd0xqx9AQymBFieuXX9pVlvpx6PBQfQx4BvlqmyeSp-8d43; pwa_guide_count=%222%22; THEME_STAY_TIME=%22173300%22'
    }

    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url=url, headers=headers, timeout=30)
    response.raise_for_status()

    titles = re.findall(r'<title data-react-helmet="true">(.*?)</title>', response.text)
    if not titles:
        raise ValueError('no page title found in %s' % url)
    title2 = titles[0]
    print(title2)

    sources = re.findall(r'src(.*?)%22%7D%2C%7B%22src', response.text)
    if not sources:
        raise ValueError('no video source found in %s' % url)
    # After decoding, the captured text presumably starts with '":"' followed
    # by a scheme-less address; replacing that prefix adds the https scheme.
    video_url = requests.utils.unquote(sources[0]).replace('":"', 'https:')
    print(video_url)

    video_response = requests.get(url=video_url, headers=headers, timeout=60)
    # Without this check an error page would be saved as if it were a video.
    video_response.raise_for_status()

    # Titles are free text: replace characters that are invalid in file names
    # and cap the UTF-8 length so the name stays within common 255-byte limits.
    safe_title = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title2).strip()
    safe_title = safe_title.encode('utf-8')[:240].decode('utf-8', 'ignore') or 'video'

    with open(safe_title + '.mp4', mode='wb') as f:
        f.write(video_response.content)
# The find_element(s)_by_* helpers were removed in Selenium 4.3; the
# By-based locator API works on both Selenium 3 and 4.
from selenium.webdriver.common.by import By

try:
    # Every element with this class is treated as one video entry.
    lis = driver.find_elements(By.CSS_SELECTOR, ".ECMy_Zdt")
    for li in lis:
        try:
            href = li.find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
            print(href, 'downing!')
            down_vod(href)
        except Exception as exc:
            # Best effort: report the failure and continue with the next
            # entry. Catching Exception (not a bare except) keeps Ctrl-C working.
            print('skipped:', exc)
    print(len(lis))
finally:
    # Always close the browser so no Chrome/chromedriver process is left behind.
    driver.quit()
# Scraper version V2: automatically creates the output folder
"""
@coder: github@akin
@since: 2022/6/10上午8:59
@desc: 爬取dy主下所有视频,自动创建文件夹,视频下载后存储到该文件夹
"""
import requests
import re
from selenium import webdriver
# Start a Chrome session and open the uploader's profile page.
profile_url = "https://www.douyin.com/user/MS4wLjABAAAAbtSlJK_BfUcuqyy8ypNouqEH7outUXePTYEcAIpY9rk"
driver = webdriver.Chrome()
driver.get(profile_url)
# Let element lookups poll for up to 10 seconds before giving up.
driver.implicitly_wait(10)
def drop_down():
    """Simulate scrolling down the page in eight one-second steps.

    Before each step the function sleeps for one second, then sets the
    document's ``scrollTop`` to ``scrollHeight`` multiplied by an increasing
    factor (1/9, 5/9, ... 29/9) via the module-level ``driver``.
    """
    import time

    scroll_script = 'document.documentElement.scrollTop = document.documentElement.scrollHeight * %f'
    for step in range(1, 30, 4):
        time.sleep(1)
        driver.execute_script(scroll_script % (step / 9))
# Scroll through the page before collecting links
# (presumably so that more video entries get loaded -- TODO confirm).
drop_down()

# The uploader name entered here names the download folder; down_vod reads
# the module-level ``up_name`` when building the output path.
up_name = input("输入UP主的名称,用作创建文件夹==:")
import os
# exist_ok=True lets the script be re-run for the same uploader instead of
# failing with FileExistsError when the folder is already there.
os.makedirs("./douyin@{}".format(up_name), exist_ok=True)
def down_vod(url):
    """Download the video of one Douyin video page into the uploader folder.

    Fetches the page at ``url``, takes the page ``<title>`` as the file name,
    extracts the first URL-encoded ``src`` entry from the page source and
    saves the response body as ``./douyin@<up_name>/<title>.mp4``, where
    ``up_name`` is the module-level uploader name.

    Args:
        url: Address of the video detail page.

    Raises:
        requests.RequestException: If a request fails, times out or returns
            an HTTP error status.
        ValueError: If the title or the video address is not found in the
            page source.
        OSError: If the output file cannot be written.
    """
    # NOTE(review): this cookie appears to embed real session tokens; consider
    # loading it from configuration instead of keeping it in source control.
    headers = {
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36",
        'cookie': 'douyin.com; ttcid=494a0c3b643c46d0b41849fbbe47ea4887; odin_tt=6207c617ff8bda8445129e8cd7bfa1f203f65e0b29373607f0310585b1085900b58b1e96a0014cad012fbab91f2a968ff7c61b84dc064457dfd01b83e9afa6bc; d_ticket=1893b444072fb9f693e148daec8913ece3c1b; sid_guard=09a43a1057cc7f35db23dae168530794%7C1642144859%7C5183999%7CTue%2C+15-Mar-2022+07%3A20%3A58+GMT; __ac_nonce=062a28e99002e37b45060; __ac_signature=_02B4Z6wo00f01a5z71gAAIDDRtu-y00RwnmuV-vAAAk.BMVcycwkcsCm9HyFS8QABqdMAKuNodRsdKF9Vuv-4ff2TDNsytWjcOLFgoYdDP3NhB.F6JAAdcbqCwKdgdcyAlAmD4SFnCYqXUGf33; ttwid=1%7Clyx7xHXpIL15j3RkDLCasnEzsI6v2r4K9pOzWgKYqvo%7C1654820506%7C341a2b1f2ab769807e5a6653342df0703d95a7245eff8652fd2e8fbbca1756a7; douyin.com; strategyABtestKey=1654820507.984; s_v_web_id=verify_l47pek0w_AFtvNkcu_cU37_4NPr_BQrV_p7qsXBQLtwla; passport_csrf_token=572531d33264b7af2d19a89ff5b254cc; passport_csrf_token_default=572531d33264b7af2d19a89ff5b254cc; _tea_utm_cache_2285=undefined; _tea_utm_cache_6383=undefined; _tea_utm_cache_1300=undefined; msToken=C_q08TJik1H6NDQ9G22b2hved1EO8Ban92Jx7yoAK7P3l_bdVIX618yBjJQAIZjNxnmM63e3ArBmXr0AOHfex6rdtvJjmaVTnt__N8YtmXD5FX14E6aI-w==; home_can_add_dy_2_desktop=%221%22; msToken=MklzmRREJgbSLeUjUBOC9pf4N4nU9j2ZxBqerhMesuNCyQW_u1gGR5yqrDzzq9nHwIXtlrgjRdcLAL8fMbpcckUm0g3zi72fEMsazDUG8fD9iphZNEJoeg==; tt_scid=0igyI6wMowxOgJdU6Qsjztd0xqx9AQymBFieuXX9pVlvpx6PBQfQx4BvlqmyeSp-8d43; pwa_guide_count=%222%22; THEME_STAY_TIME=%22173300%22'
    }

    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url=url, headers=headers, timeout=30)
    response.raise_for_status()

    titles = re.findall(r'<title data-react-helmet="true">(.*?)</title>', response.text)
    if not titles:
        raise ValueError('no page title found in %s' % url)
    title2 = titles[0]
    print(title2)

    sources = re.findall(r'src(.*?)%22%7D%2C%7B%22src', response.text)
    if not sources:
        raise ValueError('no video source found in %s' % url)
    # After decoding, the captured text presumably starts with '":"' followed
    # by a scheme-less address; replacing that prefix adds the https scheme.
    video_url = requests.utils.unquote(sources[0]).replace('":"', 'https:')
    print(video_url)

    video_response = requests.get(url=video_url, headers=headers, timeout=60)
    # Without this check an error page would be saved as if it were a video.
    video_response.raise_for_status()

    # Titles are free text: replace characters that are invalid in file names
    # and cap the UTF-8 length so the name stays within common 255-byte limits.
    safe_title = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title2).strip()
    safe_title = safe_title.encode('utf-8')[:240].decode('utf-8', 'ignore') or 'video'

    with open("./douyin@{}/".format(up_name) + safe_title + '.mp4', mode='wb') as f:
        f.write(video_response.content)
# The find_element(s)_by_* helpers were removed in Selenium 4.3; the
# By-based locator API works on both Selenium 3 and 4.
from selenium.webdriver.common.by import By

try:
    # Every element with this class is treated as one video entry.
    lis = driver.find_elements(By.CSS_SELECTOR, ".ECMy_Zdt")
    for li in lis:
        try:
            href = li.find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
            print(href, 'downing!')
            down_vod(href)
        except Exception as exc:
            # Best effort: report the failure and continue with the next
            # entry. Catching Exception (not a bare except) keeps Ctrl-C working.
            print('skipped:', exc)
    print(len(lis))
finally:
    # Always close the browser so no Chrome/chromedriver process is left behind.
    driver.quit()