Scraping Kuaishou videos with Python

This post describes a way to scrape the video list from a user's profile page on the Kuaishou platform with Python. It explains how to build each video's page URL, then how to extract the playback address from that page and download the video. The page content is pulled apart with regular expressions and JSON handling.


Getting the video list from a user's profile page

Convert each video into its video-page URL and save the URLs to a file.

https://live.kuaishou.com/profile/XXXXXXX is the user's profile page.

import requests
import re
import json

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36",
    "Cookie": "55kuaishou.live.bfb1s=9b8f70844293bed778aade6e0a8f9942; clientid=3; did=web_e443327b4b4949f5887c62c6deaf03a9; client_key=65890b29; didv=1578472661000; userId=165336717; userId=165336717; kuaishou.live.web_st=ChRrdWFpc2hvdS5saXZlLndlYi5zdBKgAWPr5FbsICKzDZ0kZws3AUW3yUrsS2o0J2E-ObC1HJYv6YoJFua0LzwEBSJxm7yZqUbZEx0XuvKeuoznP3vxy9cLUMGQz28flV0_HXi0ZXYsIRIx1KWvKzML7ViSbuaG5lPhujOykWu17XDL3_AVP6w7rOaPl4XXzp8D1EHdzNU3LtXQTAoifGqUmMFcYU21eE-VfjC2FYiSFG0sYTBnJ3UaEo_d-PiuxE4duU2DjxXdbB5BSiIgpP7OcoI9t9q4JKF4inMtRFIY4ztGxK1hzFhjCLhZ6REoBTAB; kuaishou.live.web_ph=b0fee16b1015d885056bfe536708a2753d4f"
}

response = requests.get("https://live.kuaishou.com/profile/yuhaiying11", headers=headers)

# The page embeds its initial state as a JSON blob between "__=" and ";(function"
match = re.search(r'__=(.*?);\(function', response.text)
data = json.loads(match.group(1))

# The video feed sits inside the GraphQL client cache; serialize it back to a
# string so the feed array can be cut out with a regex
graphql_state = json.dumps(data['clients']['graphqlServerClient'])
match0 = re.search(r'\[{(.*?)}\]', graphql_state)
array = json.loads(match0.group())

print(array[0])

with open('a.txt', 'w') as f:
    for obj in array:
        video_id = obj['id'].replace('VideoFeed:', '')
        f.write('https://live.kuaishou.com/u/yuhaiying11/' + video_id + '?did=web_e443327b4b4949f5887c62c6deaf03a9\n')

This produces a.txt, containing the video-page URL of every video initially shown on the profile page.

Getting the video-page URLs that load as you scroll down the profile page

Some of the parameters below have to be copied manually from the browser.

import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36",
    "Cookie": "55kuaishou.live.bfb1s=9b8f70844293bed778aade6e0a8f9942; clientid=3; did=web_e443327b4b4949f5887c62c6deaf03a9; client_key=65890b29; didv=1578472661000; userId=165336717; userId=165336717; kuaishou.live.web_st=ChRrdWFpc2hvdS5saXZlLndlYi5zdBKgAWPr5FbsICKzDZ0kZws3AUW3yUrsS2o0J2E-ObC1HJYv6YoJFua0LzwEBSJxm7yZqUbZEx0XuvKeuoznP3vxy9cLUMGQz28flV0_HXi0ZXYsIRIx1KWvKzML7ViSbuaG5lPhujOykWu17XDL3_AVP6w7rOaPl4XXzp8D1EHdzNU3LtXQTAoifGqUmMFcYU21eE-VfjC2FYiSFG0sYTBnJ3UaEo_d-PiuxE4duU2DjxXdbB5BSiIgpP7OcoI9t9q4JKF4inMtRFIY4ztGxK1hzFhjCLhZ6REoBTAB; kuaishou.live.web_ph=b0fee16b1015d885056bfe536708a2753d4f",
    "Content-Type": "application/json",
    "accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8"
}

# principalId and pcursor below were copied from the browser
json0 = {
    "operationName": "publicFeedsQuery",
    "variables": {
        "principalId": "yuhaiying11",
        "pcursor": "1.552892860461E12",
        "count": 500
    },
    "query": "query publicFeedsQuery($principalId: String, $pcursor: String, $count: Int) {  publicFeeds(principalId: $principalId, pcursor: $pcursor, count: $count) {    pcursor    live {      user {        id        avatar        name        __typename      }      watchingCount      poster      coverUrl      caption      id      playUrls {        quality        url        __typename      }      quality      gameInfo {        category        name        pubgSurvival        type        kingHero        __typename      }      hasRedPack      liveGuess      expTag      __typename    }    list {      id      thumbnailUrl      poster      workType      type      useVideoPlayer      imgUrls      imgSizes      magicFace      musicName      caption      location      liked      onlyFollowerCanComment      relativeHeight      timestamp      width      height      counts {        displayView        displayLike        displayComment        __typename      }      user {        id        eid        name        avatar        __typename      }      expTag      __typename    }    __typename  }}"
}

response = requests.post("https://live.kuaishou.com/m_graphql", json=json0, headers=headers)
data = response.json()
feed_list = data['data']['publicFeeds']['list']

with open('b.txt', 'w') as f:
    for obj in feed_list:
        f.write('https://live.kuaishou.com/u/yuhaiying11/' + obj['id'] + '?did=web_e443327b4b4949f5887c62c6deaf03a9\n')

This request is paginated, with the page size (count) set to 500.

To fetch the second page, take the pcursor value returned by the first request and send it as the pcursor parameter of the next request; that returns the next 500 items.
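A minimal sketch of that pagination loop, reusing `headers` and `json0` from the script above. The empty initial pcursor and the "no_more" end-of-feed sentinel are assumptions here, worth verifying against a real response:

```python
# Hypothetical pagination loop; assumes an empty pcursor requests the first
# page and the server returns "no_more" (or nothing) when the feed ends.
pcursor = ""
all_items = []
while True:
    json0["variables"]["pcursor"] = pcursor
    resp = requests.post("https://live.kuaishou.com/m_graphql", json=json0, headers=headers)
    feeds = resp.json()['data']['publicFeeds']
    all_items.extend(feeds['list'])
    pcursor = feeds.get('pcursor')
    if not pcursor or pcursor == "no_more":  # assumed end-of-feed sentinel
        break

print(len(all_items), 'videos collected')
```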

This produces the b.txt file.


Now read the two files (a.txt and b.txt), extract the playback URL from each video page, and download the videos.

import requests
import re

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36",
    "Cookie": "55kuaishou.live.bfb1s=ac5f27b3b62895859c4c1622f49856a4; clientid=3; did=web_8fb1ca7b6de24f8cbfd3fb1994bf3a77; client_key=65890b29; didv=1578653936000; userId=165336717; userId=165336717; kuaishou.live.web_st=ChRrdWFpc2hvdS5saXZlLndlYi5zdBKgARTvvKnBFu2hJful3C6EOswrlYnEo1vhjljvRsrSFfn-mZ4qABpG9zVeu1LU3TJYbECqVeKBjktAfIkG71mkFF9zEkGA_tqLqB97uI7fySGeRJcxH7gYHVX8eQKO5JJQb2LbgSv3KAlQkJkfZnq6_K_XAvfkBHPCKTSj9dOrZv2XxHgJQTT2DSmQJztLDJqwNjssx25sEbSkgzz0Zt2rOiIaEqoO82cRG00nqSoI30_iGx2JSCIgODuEtIDjb8f2J5K1FmGOsuomQWw0V5nY9LY9NfKqAGgoBTAB; kuaishou.live.web_ph=c10764d1d4010ff90cdd4682472a327edf35"
}

def get_play_url(page_url):
    # e.g. "https://live.kuaishou.com/u/yuhaiying11/3xj2txve5ntjn8e?did=web_e443327b4b4949f5887c62c6deaf03a9"
    response = requests.get(page_url, headers=headers)
    # The play URL is embedded in the page source as an escaped JSON string
    match = re.search(r'"playUrl":"(http:.*?\.mp4)', response.text)
    url = match.group(1)
    # Decode \uXXXX escapes (e.g. \u002F) back into plain characters
    return url.encode('utf-8').decode('unicode_escape')

for filename in ('a.txt', 'b.txt'):
    with open(filename) as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            play_url = get_play_url(line)
            print(play_url)
            video_id = line.replace('https://live.kuaishou.com/u/yuhaiying11/', '').replace('?did=web_e443327b4b4949f5887c62c6deaf03a9', '')
            print(video_id)
            r = requests.get(play_url)
            path = 'C:\\Users\\Administrator\\Desktop\\pyks\\file\\' + video_id + '.mp4'
            with open(path, 'wb') as out:
                out.write(r.content)
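One caveat: `r.content` buffers the whole video in memory before writing it out. For long videos a streamed download is safer. A minimal sketch (the chunk size is an arbitrary choice):

```python
def download(play_url, path, chunk_size=256 * 1024):
    # Stream the response body to disk instead of buffering it all in memory
    with requests.get(play_url, stream=True) as r:
        r.raise_for_status()
        with open(path, 'wb') as out:
            for chunk in r.iter_content(chunk_size=chunk_size):
                out.write(chunk)
```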

### Scraping Kuaishou videos with a Python crawler

#### Install the required libraries

To do this you first need a few Python libraries. `requests` sends HTTP requests; `lxml` and `BeautifulSoup` help parse HTML documents; `re` is the standard-library regular-expression module.

```bash
pip install requests lxml beautifulsoup4
```

#### Fetch the page content

`requests.get()` sends a GET request to the given URL and returns a response object. The returned body can then be read as bytes through the `.content` attribute[^3].

```python
import requests

url = "https://www.kuaishou.com/some_video_page"
response = requests.get(url)
html_content = response.content.decode('utf-8')
```

#### Parse the page structure

For most sites, downloading the whole page is not the end goal. It is more effective to locate specific tags or attribute values in the source and extract just the useful information. Two common tools for this are XPath expressions and CSS selectors:

- XPath can pinpoint nodes precisely;
- BeautifulSoup combined with CSS selector syntax is more readable.

The examples below show how to use each of them to pull out video links and other metadata (title, description, and so on).

##### Parsing with XPath

```python
from lxml import etree

tree = etree.HTML(html_content)
video_urls_xpath = tree.xpath('//a[contains(@class, "video-link")]/@href')  # assuming this is where video URLs are stored
titles_xpath = tree.xpath('//div[@id="title"]/text()')

print(f"Found {len(video_urls_xpath)} videos.")
for i in range(len(titles_xpath)):
    print(f"{i+1}. Title: {titles_xpath[i]}, URL: https://{video_urls_xpath[i]}")
```

##### Parsing with BeautifulSoup

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_content, 'lxml')
videos_bs = soup.select('.video-item a.video-link')  # again assuming <a> tags with class "video-link" point to video pages
descriptions_bs = [item.text.strip() for item in soup.find_all(id='description')]

for idx, (link, desc) in enumerate(zip(videos_bs, descriptions_bs), start=1):
    href = link['href']
    title = link.string or '<No Title>'
    print(f"\nVideo #{idx}")
    print(f"- Link: {href}\n- Description:\n\t{desc[:70] + '...'}\n- Title: {title}")
```

Note that real scraping work usually involves further details, such as login/authentication and dynamically loaded content. Also make sure to follow each platform's terms of service and applicable laws, and crawl responsibly.
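On the login point: as the scripts earlier in this post show, pages behind authentication can often be fetched by replaying the browser's Cookie header. A minimal sketch with `requests.Session`; the cookie string and profile URL are placeholders you would copy from your own browser's DevTools:

```python
import requests

session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36",
    # Placeholder cookie -- copy the real value from your browser
    "Cookie": "did=web_xxxxxxxxxxxxxxxx; userId=xxxxxxxxx",
})

response = session.get("https://live.kuaishou.com/profile/some_user")
print(response.status_code)
```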