python爬虫——爬取网络各种资源

1.某讯视频
# 1. 某讯视频 —— download an HLS (m3u8) stream: fetch the playlist, strip its
# metadata lines to get the .ts segment names, then download each segment and
# concatenate them into a single output file.
import requests
import re
from tqdm import tqdm

# m3u8 playlist URL, copied from the browser's network panel.
url = "https://apd-vlive.apdcdn.tc.qq.com/defaultts.tc.qq.com/B_JxNyiJmktHRgresXhfyMem1E4_DPhVbhxv28spVNp5Dj6vs6uhjyh7JsYzrUOQcL/svp_50112/ZV6e2op5S_S1AyUVjIbzXsJek1I7zANtM2Tv2peQ2YVY3YFimvlfjsXz1DQmrgxOvXrMl6Vs6HiozYNZAtgUo-JKZKtrgs6Vnubhh-IFRlbEUIcUZOu39XJX7hJt5uDrq9jZ-uScgH0wZi5gJSD03ZA0p0pU32ocepjRtSdPw3Zw-tx5nWAPXVGQZgfcOS3TTPtCNs0qoCwEgtP3z-i0YoIZT-MACU25AB2ILMv_z8HX2bCMw-pYKQ/gzc_1000102_0b53zuabqaaahiae4ebljvrmbtodddfqahca.f322062.ts.m3u8?ver=4"

headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
}

# Fetch the playlist text; timeout so a stalled CDN doesn't hang the script.
response = requests.get(url=url, headers=headers, timeout=30).text

# Drop the '#EXT...' comment/metadata lines; the remaining whitespace-separated
# tokens are the relative .ts segment names.
m3u8_data = re.sub('#.*', '', response).split()

# Base path the segment names resolve against (also taken from the network
# panel — note it is a different host than the playlist's).
ts_base = 'https://ltscsy.qq.com/B_JxNyiJmktHRgresXhfyMeulWsW_l0JzF9NWhW-VqfOrj6vs6uhjyh7JsYzrUOQcL/svp_50112/vDKS4TspZpx8uhYKG9EVBe5I0alPqhW0tx6JBvJ2aS25FDZoNU5KZ6zqkZHI0oluZXeMLWOdHJVJkwU7hTESavdDeIvxTvVGzzDbdV2aXouqP0rqMwh7iS-HBpSSyoJ7-2trKnnldoZQZ49UsJ97yCUsFgW4sYeCBUsR2eKR2-HnO6bayh1rWhDvF63Nr5aLs8_zJIy0ARYOUMGtem6NWCkxgFVaQdLf2-dyEgVe40V1g7FupCtIRw/'

# Open the output ONCE in 'wb'. The original reopened the file in 'ab' on
# every segment, so re-running the script silently appended a second copy
# onto a previous (possibly complete) download and corrupted the file.
with open('葫芦娃.mp4', 'wb') as f:
    for ts in tqdm(m3u8_data):
        # Send the same UA as the playlist request; bound each segment fetch.
        video_data = requests.get(ts_base + ts, headers=headers, timeout=30).content
        f.write(video_data)

2.某音视频
# 2. 某音视频 —— the video is served as one direct mp4 URL, so a single GET
# downloads the whole file.
import requests

url = "https://v3-web.douyinvod.com/f32cffe441fd98a917184c59b4c4e876/65f8333a/video/tos/cn/tos-cn-ve-15/oYBBlyMPtABcQwziAatpj9EgFfecM9iB8DhIAw/?a=6383&ch=5&cr=3&dr=0&lr=all&cd=0%7C0%7C0%7C3&cv=1&br=1024&bt=1024&cs=0&ds=4&ft=LjhJEL998xI7uEPmH0P5H4eaciDXt0YbZ_QEe09-mB~D1Inz&mime_type=video_mp4&qs=0&rc=aTs4aWY8aGk2ODZmNWU1ZkBpMzRxdjQ6ZmtzcTMzNGkzM0AxLmJhLjEtNWMxY15fM2MuYSNsbC1ycjRfMGhgLS1kLTBzcw%3D%3D&btag=e00008000&cquery=100a&dy_q=1710761249&feature_id=46a7bb47b4fd1280f3d3825bf2b29388&l=20240318192728651A9F612C610A0B9193"

headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
}

response = requests.get(url=url, headers=headers, timeout=30)
# Fail loudly on 403/404 instead of silently writing an HTML error page
# into the .mp4 file.
response.raise_for_status()

video_data = response.content

# 'wb', not 'ab': re-running the script must overwrite the file — appending
# a second full copy onto the first produces an unplayable file.
with open('aa.mp4', 'wb') as f:
    f.write(video_data)
3.某站视频
# 3. 某站视频 —— video and audio ship as separate DASH streams embedded in the
# page's __playinfo__ JSON; download both, then mux them with ffmpeg.
import json
import re
import requests
import os
import subprocess

url = "https://www.bilibili.com/video/BV1kC411a7cn/?spm_id_from=333.1007.tianma.2-2-4.click&vd_source=f4be0a001848558927c3212d18de2626"

headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    # NOTE(review): this Cookie embeds a live SESSDATA/bili_jct login token —
    # treat it as a credential; do not commit or share it.
    "Cookie":"CURRENT_FNVAL=4048; DedeUserID=539138603; DedeUserID__ckMd5=671cb5f4af58c4e5; enable_web_push=DISABLE; buvid3=A9DB3A97-A849-C537-2902-0CA83CA2F72A78091infoc; b_nut=1706950178; i-wanna-go-back=-1; b_ut=5; _uuid=112B10126-29107-64D10-4BF1-610B365B5847376377infoc; buvid_fp_plain=undefined; buvid4=98D90CF7-909F-21A5-857B-E2A5352CB3A970761-024031409-FeaNf0N026PHa1xHiEcvIw%3D%3D; FEED_LIVE_VERSION=V8; header_theme_version=CLOSE; is-2022-channel=1; rpdid=|(RlRRR)lRR0J'u~u|R|mY)J; CURRENT_QUALITY=80; fingerprint=d35797e9afc601e4d1f9c94226939e95; buvid_fp=d35797e9afc601e4d1f9c94226939e95; bp_video_offset_539138603=909006115716464659; b_lsid=1A79ABE1_18E5076F703; bmg_af_switch=1; bmg_src_def_domain=i1.hdslb.com; SESSDATA=23faecec%2C1726298960%2C1a4a3%2A31CjBgBTtVdw4XOBT-_73RNdtnfi3F-w5kEs7_tl50_QHZHhu9sQ025YYZXx4OVxkF7GASVmttUFUxdGdCcnZZT2p0Y0VTRmMzVzhKXzRybzhGYkp4ZTQwQWNIQ256MWNwQjRYRnRLaC0wMlE3eVZ4S294Z3NYWVFjam9zMG5sNGJfVnlkYWVMWF9RIIEC; bili_jct=a32fd0781da03d9df2a4e7c79b3bc9ad; sid=75goizpx; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MTEwMDYxNjcsImlhdCI6MTcxMDc0NjkwNywicGx0IjotMX0.jOzBgRd1WLdIwFX7y_xFu1h20PjzadWHz5538s0qCOs; bili_ticket_expires=1711006107; home_feed_column=4; browser_resolution=778-730; PVID=2"
}

response = requests.get(url=url, headers=headers, timeout=30)

# Video title (used for the output filename) and the embedded play-info JSON.
title = re.findall('<h1 title="(.*?)"', response.text)[0]

playinfo = re.findall('<script>window.__playinfo__=(.*?)</script>', response.text)[0]
playinfo = json.loads(playinfo)

audio_url = playinfo['data']['dash']['audio'][0]['baseUrl']
video_url = playinfo['data']['dash']['video'][0]['baseUrl']

# Pass the same headers as the page request (the CDN can reject anonymous
# fetches) and bound both downloads with a timeout.
audio_data = requests.get(audio_url, headers=headers, timeout=60).content
video_data = requests.get(video_url, headers=headers, timeout=60).content
with open('audio.mp3', 'wb') as f:
    f.write(audio_data)
with open('video.mp4', 'wb') as f:
    f.write(video_data)

# The title comes from an untrusted web page — strip characters Windows
# forbids in filenames before using it as one.
safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)

# subprocess.run with an argument list (shell=False) instead of
# os.system(f-string): the original interpolated the page-controlled title
# straight into a shell command — command injection, and it broke on any
# title containing a double quote.
subprocess.run(
    [
        r'D:\PackageDown\ffmpeg-6.0-full_build\bin\ffmpeg.exe',
        '-i', 'audio.mp3',
        '-i', 'video.mp4',
        '-acodec', 'copy',
        '-vcodec', 'copy',
        f'{safe_title}.mp4',
    ],
    check=True,
)
os.remove('video.mp4')
os.remove('audio.mp3')

4.音乐
# 4. 音乐 —— the song-info API returns JSON containing the song name and a
# direct play URL; one GET on that URL downloads the mp3.
import json
import re
import requests


headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
}

url = "https://wwwapi.kugou.com/play/songinfo?srcappid=2919&clientver=20000&clienttime=1710753441574&mid=94dda63306ec019da57becefaf677248&uuid=94dda63306ec019da57becefaf677248&dfid=4FHz9d0RpBdS3oyFkd3iivge&appid=1014&platid=4&encode_album_audio_id=6ts59xd9&token=&userid=0&signature=f9070fa15e1408f6c86a667aecfc7b5a"

response = requests.get(url=url, headers=headers, timeout=30)

data = json.loads(response.text)
song_name = data['data']['song_name']
play_url = data['data']['play_url']  # fixed local-name typo: was 'paly_url'

# The song name is API-controlled — strip path separators and other
# characters that are illegal in filenames before using it as one.
safe_name = re.sub(r'[\\/:*?"<>|]', '_', song_name)

music_data = requests.get(play_url, timeout=30).content
with open(f"{safe_name}.mp3", "wb") as f:
    f.write(music_data)
    
最后
  1. 某讯视频是采用m3u8视频流格式,先找到你所需要爬取电影的m3u8文件的url,然后通过访问这个url得到 .ts文件的地址
  2. 某音和音乐只需要找到视频的链接就可以直接下载
  3. 某站的视频有所不同,某站视频的视频和音频是分开的,先需要搜索 .m4s 文件,分别找到视频和音频的 url。将视频和音频下载下来之后,会是两个文件,一个 .mp3 文件和一个 .mp4 文件。需要借助第三方软件 ffmpeg 将视频和音频合并,这样就得到了完整的视频资源

上面爬取的某讯视频和音乐都是免费的,是会员的就不能爬(如果你充了会员的话那也可以爬),爬虫是可见即可爬,切记爬虫不等于破解!!!

上面代码仅供参考,请勿商用!!!

  • 8
    点赞
  • 54
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值