Scraping Kuaishou Short Videos with a Crawler

Scrapes Kuaishou short videos and automatically downloads and saves them.

First, enter a search keyword, e.g. "慢摇" (slow groove) or "美女" (beauty).

Then enter a page count (pages are downloaded from the beginning up to that page); for example, entering 5 downloads pages 0 through 5.

Then simply wait for the downloads to finish.

```python
import requests
import os
import re
import json

def get_response(url, keywords, pcursor):
    # Request headers copied from the browser's dev tools. The Cookie
    # expires, so replace it with one from your own session.
    # Content-Length and Accept-Encoding are omitted: requests fills
    # them in itself, and a stale hard-coded length breaks the request.
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json',
        'Cookie': 'kpf=PC_WEB; clientid=3; did=web_713774521487450db89fcfc3892aae65; didv=1705562481178; ktrace-context=1|MS43NjQ1ODM2OTgyODY2OTgyLjQzOTc2MzU1LjE3MDU1NjM4MDkxNTEuNzUzNzYy|MS43NjQ1ODM2OTgyODY2OTgyLjk2MjU0NDIxLjE3MDU1NjM4MDkxNTEuNzUzNzYz|0|graphql-server|webservice|false|NA; kpn=KUAISHOU_VISION',
        'Host': 'www.kuaishou.com',
        'Origin': 'https://www.kuaishou.com',
        'Referer': 'https://www.kuaishou.com/search/video?searchKey=%E6%85%A2%E6%91%87',
        'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    }
    data = {
        "operationName": "visionSearchPhoto",
        "query": "fragment photoContent on PhotoEntity {\n  __typename\n  id\n  duration\n  caption\n  originCaption\n  likeCount\n  viewCount\n  commentCount\n  realLikeCount\n  coverUrl\n  photoUrl\n  photoH265Url\n  manifest\n  manifestH265\n  videoResource\n  coverUrls {\n    url\n    __typename\n  }\n  timestamp\n  expTag\n  animatedCoverUrl\n  distance\n  videoRatio\n  liked\n  stereoType\n  profileUserTopPhoto\n  musicBlocked\n}\n\nfragment recoPhotoFragment on recoPhotoEntity {\n  __typename\n  id\n  duration\n  caption\n  originCaption\n  likeCount\n  viewCount\n  commentCount\n  realLikeCount\n  coverUrl\n  photoUrl\n  photoH265Url\n  manifest\n  manifestH265\n  videoResource\n  coverUrls {\n    url\n    __typename\n  }\n  timestamp\n  expTag\n  animatedCoverUrl\n  distance\n  videoRatio\n  liked\n  stereoType\n  profileUserTopPhoto\n  musicBlocked\n}\n\nfragment feedContent on Feed {\n  type\n  author {\n    id\n    name\n    headerUrl\n    following\n    headerUrls {\n      url\n      __typename\n    }\n    __typename\n  }\n  photo {\n    ...photoContent\n    ...recoPhotoFragment\n    __typename\n  }\n  canAddComment\n  llsid\n  status\n  currentPcursor\n  tags {\n    type\n    name\n    __typename\n  }\n  __typename\n}\n\nquery visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {\n  visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {\n    result\n    llsid\n    webPageArea\n    feeds {\n      ...feedContent\n      __typename\n    }\n    searchSessionId\n    pcursor\n    aladdinBanner {\n      imgUrl\n      link\n      __typename\n    }\n    __typename\n  }\n}\n",
        "variables": {"keyword": keywords, "pcursor": pcursor, "page": "search"}
    }
    # the GraphQL endpoint expects a JSON-encoded body
    data = json.dumps(data)
    response = requests.post(url=url, data=data, headers=headers)
    return response
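
# During development it can help to dump one raw response with pprint to
# see the JSON structure before hard-coding key paths (a debugging hint
# added here, not part of the original script):
#   import pprint
#   pprint.pprint(get_response(url, '美女', '').json())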


def save(url, dir_name, pcursor):
    response = get_response(url, dir_name, pcursor)
    json_data = response.json()
    feed_list = json_data['data']['visionSearchPhoto']['feeds']  # a list of feed items
    for feeds in feed_list:
        try:
            video_url = feeds['photo']['photoUrl']
            title = feeds['photo']['caption']
            # strip characters that are illegal in filenames, keep the first word
            new_title = re.sub(r'[/\\:*?"<>|@#]', '', title).split(' ')[0]
            print(video_url, new_title)
            mp4_data = requests.get(video_url).content
            os.makedirs(dir_name, exist_ok=True)
            with open(dir_name + '/' + new_title + '.mp4', mode='wb') as f:
                f.write(mp4_data)
            print(new_title + ' downloaded successfully')
        except Exception as e:
            print('download failed:', e)

if __name__ == '__main__':
    url = 'https://www.kuaishou.com/graphql'
    dir_name = input('Enter a Kuaishou search keyword: ')
    pages = input('Enter how many pages to download: ')
    # download pages 0 through N (the original looped over the characters
    # of the input string, which only worked for single-digit inputs)
    for i in range(int(pages) + 1):
        save(url, dir_name, str(i))
```
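
One caveat: in the visionSearchPhoto response, `pcursor` is a cursor token the server returns for fetching the next batch, not a plain page index. The script above keeps the original page-index approach; if it stops returning new feeds, a cursor-driven loop along these lines may work better (a minimal sketch reusing `get_response` above; `max_pages` is a made-up safety cap, and the `'no_more'` sentinel is an observed value, not a documented API):

```python
def save_all(url, keyword, max_pages=5):
    pcursor = ''  # an empty cursor fetches the first batch
    for _ in range(max_pages):
        json_data = get_response(url, keyword, pcursor).json()
        result = json_data['data']['visionSearchPhoto']
        for feeds in result['feeds']:
            print(feeds['photo']['caption'])  # or download as in save()
        # feed the server's cursor back to request the next batch
        pcursor = result.get('pcursor') or ''
        if pcursor in ('', 'no_more'):
            break
```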

Since Kuaishou does not offer an open API, another option is to scrape data by driving a real browser. First, install selenium and chromedriver: selenium is a browser-automation framework that can simulate a user's actions in the browser, and chromedriver is the driver that lets code control Chrome. With both installed, the following code logs in to Kuaishou and lists the videos saved under favorites:

```python
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Open the browser
driver = webdriver.Chrome()

# Open the Kuaishou login page
driver.get('https://login.kuaishou.com/web/login')

# Wait for the page to load
time.sleep(5)

# Enter the account name and password
username = 'your_username'
password = 'your_password'
driver.find_element(By.NAME, 'username').send_keys(username)
driver.find_element(By.NAME, 'password').send_keys(password)

# Click the login button
driver.find_element(By.CLASS_NAME, 'login-button').click()

# Wait for the login to complete
time.sleep(5)

# Open the favorites page
driver.get('https://live.kuaishou.com/profile/favorites')

# Wait for the page to load
time.sleep(5)

# Collect all video cards
videos = driver.find_elements(By.CLASS_NAME, 'video-card')

# Print each video's title and link
for video in videos:
    title = video.find_element(By.CLASS_NAME, 'title').text
    link = video.find_element(By.TAG_NAME, 'a').get_attribute('href')
    print(title, link)

# Close the browser
driver.quit()
```

Note that Kuaishou's pages change constantly, so the code above may fail in some situations. If you run into problems, try adjusting the element locators by hand.
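
The fixed `time.sleep(5)` calls are also fragile: too short on a slow network, wasteful on a fast one. A sketch using Selenium's explicit waits instead (assuming the same `video-card` class name as above, which may have changed on the live site):

```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://live.kuaishou.com/profile/favorites')

# Block until at least one video card is present (up to 15 seconds),
# instead of sleeping for a fixed interval.
wait = WebDriverWait(driver, 15)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'video-card')))

for video in driver.find_elements(By.CLASS_NAME, 'video-card'):
    print(video.find_element(By.TAG_NAME, 'a').get_attribute('href'))

driver.quit()
```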