Python 抖音(Douyin):爬取 UP 主主页下的所有视频

# -*- encoding=utf-8 -*-
"""
@coder: github@akin
@since: 2022/6/10上午8:59
@desc: 爬取dy主下所有视频
"""

import requests
import re

from selenium import webdriver

# Profile page of the uploader used for testing; every video listed on it is downloaded.
USER_PAGE_URL = "https://www.douyin.com/user/MS4wLjABAAAATItWsjZK6_kivKOsBNcoApqmSuYRkiEUMQPb22ZDDT8GAXBI_67eMywJevVlwUBz"

# Start a Chrome session and open the profile page.
driver = webdriver.Chrome()
driver.get(USER_PAGE_URL)
# Element lookups made later may wait up to 10 seconds for a match to appear.
driver.implicitly_wait(10)

def drop_down():
    """Simulate a user scrolling down the page in eight one-second steps.

    Each step sets ``document.documentElement.scrollTop`` (the scrollbar
    position) to a growing fraction of ``document.documentElement.scrollHeight``
    (the full page height). The height is read again on every step because it
    keeps changing while the page is being scrolled.
    """
    import time

    for step in range(1, 30, 4):  # 1, 5, 9, ..., 29
        time.sleep(1)
        # 1/9, 5/9, 9/9, ... 29/9 -- from the third step on the target is at
        # or past the bottom of the page.
        fraction = step / 9
        script = 'document.documentElement.scrollTop = document.documentElement.scrollHeight * %f' % fraction
        driver.execute_script(script)

# Scroll the profile page before the video elements are collected below.
drop_down()

def down_vod(url):
    """Download the video behind a Douyin video page into the working directory.

    The page HTML is fetched with browser-like headers. The text of its
    ``<title>`` tag becomes the file name and the first URL-encoded ``src``
    entry found in the page becomes the download address.

    NOTE(review): the cookie below is a hard-coded browser session stored in
    plain text; it should be replaced with a fresh one and kept out of version
    control.

    Args:
        url: Address of a single video page, e.g.
            https://www.douyin.com/video/7106168871668894976

    Raises:
        ValueError: If the title or the video address cannot be found in the page.
        requests.RequestException: On network failure, timeout or HTTP error status.
        OSError: If the output file cannot be written.
    """
    timeout = 30  # seconds; without a timeout a stalled connection blocks forever

    # Browser-like headers. Without the cookie the returned HTML is not the real page.
    headers = {
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36",
        'cookie': 'douyin.com; ttcid=494a0c3b643c46d0b41849fbbe47ea4887; odin_tt=6207c617ff8bda8445129e8cd7bfa1f203f65e0b29373607f0310585b1085900b58b1e96a0014cad012fbab91f2a968ff7c61b84dc064457dfd01b83e9afa6bc; d_ticket=1893b444072fb9f693e148daec8913ece3c1b; sid_guard=09a43a1057cc7f35db23dae168530794%7C1642144859%7C5183999%7CTue%2C+15-Mar-2022+07%3A20%3A58+GMT; __ac_nonce=062a28e99002e37b45060; __ac_signature=_02B4Z6wo00f01a5z71gAAIDDRtu-y00RwnmuV-vAAAk.BMVcycwkcsCm9HyFS8QABqdMAKuNodRsdKF9Vuv-4ff2TDNsytWjcOLFgoYdDP3NhB.F6JAAdcbqCwKdgdcyAlAmD4SFnCYqXUGf33; ttwid=1%7Clyx7xHXpIL15j3RkDLCasnEzsI6v2r4K9pOzWgKYqvo%7C1654820506%7C341a2b1f2ab769807e5a6653342df0703d95a7245eff8652fd2e8fbbca1756a7; douyin.com; strategyABtestKey=1654820507.984; s_v_web_id=verify_l47pek0w_AFtvNkcu_cU37_4NPr_BQrV_p7qsXBQLtwla; passport_csrf_token=572531d33264b7af2d19a89ff5b254cc; passport_csrf_token_default=572531d33264b7af2d19a89ff5b254cc; _tea_utm_cache_2285=undefined; _tea_utm_cache_6383=undefined; _tea_utm_cache_1300=undefined; msToken=C_q08TJik1H6NDQ9G22b2hved1EO8Ban92Jx7yoAK7P3l_bdVIX618yBjJQAIZjNxnmM63e3ArBmXr0AOHfex6rdtvJjmaVTnt__N8YtmXD5FX14E6aI-w==; home_can_add_dy_2_desktop=%221%22; msToken=MklzmRREJgbSLeUjUBOC9pf4N4nU9j2ZxBqerhMesuNCyQW_u1gGR5yqrDzzq9nHwIXtlrgjRdcLAL8fMbpcckUm0g3zi72fEMsazDUG8fD9iphZNEJoeg==; tt_scid=0igyI6wMowxOgJdU6Qsjztd0xqx9AQymBFieuXX9pVlvpx6PBQfQx4BvlqmyeSp-8d43; pwa_guide_count=%222%22; THEME_STAY_TIME=%22173300%22'
    }

    # 1. Fetch the video page.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    response.raise_for_status()
    html = response.text

    # 2. Extract the title, e.g. '起风了Yyds#精神甜妹 - 抖音'.
    titles = re.findall('<title data-react-helmet="true">(.*?)</title>', html)
    if not titles:
        raise ValueError('no <title> found in page: %s' % url)
    title2 = titles[0]
    print(title2)

    # 3. Extract the first URL-encoded "src" entry.
    sources = re.findall('src(.*?)%22%7D%2C%7B%22src', html)
    if not sources:
        raise ValueError('no video address found in page: %s' % url)
    # After decoding, the '":"' separator in front of the address is swapped
    # for the 'https:' scheme (presumably the address is protocol-relative,
    # i.e. starts with '//' -- verify against a real page).
    video_url = requests.utils.unquote(sources[0]).replace('":"', 'https:')
    print(video_url)

    # 4. Download the video; an HTTP error must not be saved as an .mp4 file.
    video_response = requests.get(url=video_url, headers=headers, timeout=timeout)
    video_response.raise_for_status()

    # Titles are free text: replace path separators, characters that common
    # file systems reject, and control characters so open() does not fail.
    file_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title2).strip() or 'untitled'
    with open(file_name + '.mp4', mode='wb') as f:
        f.write(video_response.content)

# The find_element(s)_by_* helpers were removed in Selenium 4.3; the By-based
# locator API is available in both Selenium 3 and 4.
from selenium.webdriver.common.by import By

# Collect the elements that wrap the video links on the profile page.
# NOTE(review): ".ECMy_Zdt" looks like a generated class name and may change
# whenever the site is redeployed -- confirm it still matches.
lis = driver.find_elements(By.CSS_SELECTOR, ".ECMy_Zdt")
try:
    for li in lis:
        # Best effort: a failure on one video is reported and skipped so the
        # remaining videos are still downloaded. Only Exception is caught, so
        # Ctrl-C still stops the script.
        try:
            href = li.find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
            print(href, 'downing!')
            down_vod(href)
        except Exception as exc:
            print('skipped:', repr(exc))
    # Number of elements found (not the number of successful downloads).
    print(len(lis))
finally:
    # Close the browser and stop the driver process instead of leaking them.
    driver.quit()




爬取版本 V2:自动创建文件夹,并将下载的视频保存到该文件夹

# -*- encoding=utf-8 -*-
"""
@coder: github@akin
@since: 2022/6/10上午8:59
@desc: 爬取dy主下所有视频,自动创建文件夹,视频下载后存储到该文件夹
"""

import requests
import re

from selenium import webdriver

# Profile page of the uploader used for testing (test uploader: 起雾); every
# video listed on it is downloaded.
USER_PAGE_URL = "https://www.douyin.com/user/MS4wLjABAAAAbtSlJK_BfUcuqyy8ypNouqEH7outUXePTYEcAIpY9rk"

# Start a Chrome session and open the profile page.
driver = webdriver.Chrome()
driver.get(USER_PAGE_URL)
# Element lookups made later may wait up to 10 seconds for a match to appear.
driver.implicitly_wait(10)

def drop_down():
    """Simulate a user scrolling down the page in eight one-second steps.

    Each step sets ``document.documentElement.scrollTop`` (the scrollbar
    position) to a growing fraction of ``document.documentElement.scrollHeight``
    (the full page height). The height is read again on every step because it
    keeps changing while the page is being scrolled.
    """
    import time

    for step in range(1, 30, 4):  # 1, 5, 9, ..., 29
        time.sleep(1)
        # 1/9, 5/9, 9/9, ... 29/9 -- from the third step on the target is at
        # or past the bottom of the page.
        fraction = step / 9
        script = 'document.documentElement.scrollTop = document.documentElement.scrollHeight * %f' % fraction
        driver.execute_script(script)

# Scroll the profile page before the video elements are collected below.
drop_down()

# Ask for the uploader's name; it is only used to name the output folder.
up_name = input("输入UP主的名称,用作创建文件夹==:")
import os
# exist_ok=True lets the script be re-run for the same uploader; os.mkdir
# would raise FileExistsError when the folder is already there.
os.makedirs("./douyin@{}".format(up_name), exist_ok=True)

def down_vod(url):
    """Download the video behind a Douyin video page into the uploader folder.

    The page HTML is fetched with browser-like headers. The text of its
    ``<title>`` tag becomes the file name and the first URL-encoded ``src``
    entry found in the page becomes the download address. The file is written
    to ``./douyin@<up_name>/``, where ``up_name`` is the module-level name
    entered by the user.

    NOTE(review): the cookie below is a hard-coded browser session stored in
    plain text; it should be replaced with a fresh one and kept out of version
    control.

    Args:
        url: Address of a single video page, e.g.
            https://www.douyin.com/video/7106168871668894976

    Raises:
        ValueError: If the title or the video address cannot be found in the page.
        requests.RequestException: On network failure, timeout or HTTP error status.
        OSError: If the output file cannot be written.
    """
    timeout = 30  # seconds; without a timeout a stalled connection blocks forever

    # Browser-like headers. Without the cookie the returned HTML is not the real page.
    headers = {
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36",
        'cookie': 'douyin.com; ttcid=494a0c3b643c46d0b41849fbbe47ea4887; odin_tt=6207c617ff8bda8445129e8cd7bfa1f203f65e0b29373607f0310585b1085900b58b1e96a0014cad012fbab91f2a968ff7c61b84dc064457dfd01b83e9afa6bc; d_ticket=1893b444072fb9f693e148daec8913ece3c1b; sid_guard=09a43a1057cc7f35db23dae168530794%7C1642144859%7C5183999%7CTue%2C+15-Mar-2022+07%3A20%3A58+GMT; __ac_nonce=062a28e99002e37b45060; __ac_signature=_02B4Z6wo00f01a5z71gAAIDDRtu-y00RwnmuV-vAAAk.BMVcycwkcsCm9HyFS8QABqdMAKuNodRsdKF9Vuv-4ff2TDNsytWjcOLFgoYdDP3NhB.F6JAAdcbqCwKdgdcyAlAmD4SFnCYqXUGf33; ttwid=1%7Clyx7xHXpIL15j3RkDLCasnEzsI6v2r4K9pOzWgKYqvo%7C1654820506%7C341a2b1f2ab769807e5a6653342df0703d95a7245eff8652fd2e8fbbca1756a7; douyin.com; strategyABtestKey=1654820507.984; s_v_web_id=verify_l47pek0w_AFtvNkcu_cU37_4NPr_BQrV_p7qsXBQLtwla; passport_csrf_token=572531d33264b7af2d19a89ff5b254cc; passport_csrf_token_default=572531d33264b7af2d19a89ff5b254cc; _tea_utm_cache_2285=undefined; _tea_utm_cache_6383=undefined; _tea_utm_cache_1300=undefined; msToken=C_q08TJik1H6NDQ9G22b2hved1EO8Ban92Jx7yoAK7P3l_bdVIX618yBjJQAIZjNxnmM63e3ArBmXr0AOHfex6rdtvJjmaVTnt__N8YtmXD5FX14E6aI-w==; home_can_add_dy_2_desktop=%221%22; msToken=MklzmRREJgbSLeUjUBOC9pf4N4nU9j2ZxBqerhMesuNCyQW_u1gGR5yqrDzzq9nHwIXtlrgjRdcLAL8fMbpcckUm0g3zi72fEMsazDUG8fD9iphZNEJoeg==; tt_scid=0igyI6wMowxOgJdU6Qsjztd0xqx9AQymBFieuXX9pVlvpx6PBQfQx4BvlqmyeSp-8d43; pwa_guide_count=%222%22; THEME_STAY_TIME=%22173300%22'
    }

    # 1. Fetch the video page.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    response.raise_for_status()
    html = response.text

    # 2. Extract the title, e.g. '起风了Yyds#精神甜妹 - 抖音'.
    titles = re.findall('<title data-react-helmet="true">(.*?)</title>', html)
    if not titles:
        raise ValueError('no <title> found in page: %s' % url)
    title2 = titles[0]
    print(title2)

    # 3. Extract the first URL-encoded "src" entry.
    sources = re.findall('src(.*?)%22%7D%2C%7B%22src', html)
    if not sources:
        raise ValueError('no video address found in page: %s' % url)
    # After decoding, the '":"' separator in front of the address is swapped
    # for the 'https:' scheme (presumably the address is protocol-relative,
    # i.e. starts with '//' -- verify against a real page).
    video_url = requests.utils.unquote(sources[0]).replace('":"', 'https:')
    print(video_url)

    # 4. Download the video; an HTTP error must not be saved as an .mp4 file.
    video_response = requests.get(url=video_url, headers=headers, timeout=timeout)
    video_response.raise_for_status()

    # Titles are free text: replace path separators, characters that common
    # file systems reject, and control characters so open() does not fail.
    file_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title2).strip() or 'untitled'
    with open("./douyin@{}/".format(up_name) + file_name + '.mp4', mode='wb') as f:
        f.write(video_response.content)

# The find_element(s)_by_* helpers were removed in Selenium 4.3; the By-based
# locator API is available in both Selenium 3 and 4.
from selenium.webdriver.common.by import By

# Collect the elements that wrap the video links on the profile page.
# NOTE(review): ".ECMy_Zdt" looks like a generated class name and may change
# whenever the site is redeployed -- confirm it still matches.
lis = driver.find_elements(By.CSS_SELECTOR, ".ECMy_Zdt")
try:
    for li in lis:
        # Best effort: a failure on one video is reported and skipped so the
        # remaining videos are still downloaded. Only Exception is caught, so
        # Ctrl-C still stops the script.
        try:
            href = li.find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
            print(href, 'downing!')
            down_vod(href)
        except Exception as exc:
            print('skipped:', repr(exc))
    # Number of elements found (not the number of successful downloads).
    print(len(lis))
finally:
    # Close the browser and stop the driver process instead of leaking them.
    driver.quit()

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值