Preface
In this section we work through a practical case study: scraping data from NetEase Cloud Music.
I. Mini Case Demos
1. Scraping an image
import requests

if __name__ == '__main__':
    url_ = 'https://p1.music.126.net/mkyEgzk2JLS_EWm_bK8RzA==/109951169408323072.jpg?imageView&quality=89'
    headers_ = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1'
    }
    response_ = requests.get(url_, headers=headers_)
    bytes_data = response_.content  # raw bytes of the image
    with open('新造的人.jpg', 'wb') as f:  # binary data, so open in 'wb' mode
        f.write(bytes_data)
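One thing the minimal version skips is error handling: if the server returns an error page, those bytes get written to disk as a broken .jpg. Here is a small hedged sketch of the same download with a status check (raise_for_status() is the standard requests call that raises on 4xx/5xx responses; download_image is a helper name introduced here):

import requests

def download_image(url, path, headers):
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # raise instead of silently saving an error body
    with open(path, 'wb') as f:
        f.write(response.content)

# usage, reusing the URL and headers from the snippet above:
# download_image(url_, '新造的人.jpg', headers_)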
2. Scraping a non-VIP song
import requests

if __name__ == '__main__':
    url_ = 'https://m801.music.126.net/20240313230234/0a80028c73c521aad66fb2afade9647e/jdyyaac/obj/w5rDlsOJwrLDjj7CmsOj/34297600221/ec98/3f91/fa7e/63618915ccae25b8eec9c1ab373d2a8a.m4a'
    headers_ = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1'
    }
    response_ = requests.get(url_, headers=headers_)
    bytes_data = response_.content
    # the URL serves an .m4a (AAC) stream, so use the matching extension rather than .mp3
    with open('隔墙有耳.m4a', 'wb') as f:
        f.write(bytes_data)
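Note that media links like this one expire quickly (the path embeds a timestamp and a hash), so expect to grab a fresh URL from the browser's network panel. Also, the container format is not always obvious from the link; below is a hedged sketch that derives the extension from the Content-Type response header (the mapping is an assumption covering common cases, and save_audio is a helper name introduced here):

import requests

EXTENSIONS = {'audio/mp4': '.m4a', 'audio/mpeg': '.mp3'}  # assumed common cases

def save_audio(url, basename, headers):
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    content_type = response.headers.get('Content-Type', '')
    ext = EXTENSIONS.get(content_type.split(';')[0].strip(), '.bin')
    with open(basename + ext, 'wb') as f:
        f.write(response.content)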
3. Scraping an MV
import requests

if __name__ == '__main__':
    url_ = 'https://ctyunsw2dcdn.qnqcdn.net/vodkgeyttp8.vod.126.net/cloudmusic/IjBiITkyMTEgMSI2JCAwZA==/mv/5404646/41a8e7815c380b8840f3ec86e3e44e48.mp4?wsSecret=96455d7337868ab855315a79dec43ce7&wsTime=1710396215'
    headers_ = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1'
    }
    response = requests.get(url_, headers=headers_)
    bytes_data = response.content
    with open('光年之外.mp4', 'wb') as f:  # note the dot before the extension
        f.write(bytes_data)
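response.content buffers the whole MV in memory before writing, which is wasteful for files that can run to hundreds of megabytes. A hedged sketch of a chunked download using requests' streaming mode (stream=True and iter_content are standard requests features; the 64 KB chunk size is an arbitrary choice):

import requests

def download_video(url, path, headers):
    # stream=True defers the body download until we iterate over it
    with requests.get(url, headers=headers, stream=True, timeout=10) as response:
        response.raise_for_status()
        with open(path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 64):
                f.write(chunk)  # write 64 KB at a time instead of buffering the whole file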
II. Implementing Pagination
To work out how pagination is done, start by comparing the page URLs:
1. https://www.sogou.com/sogou?query=%E9%A3%8E%E6%99%AF&insite=wenwen.sogou.com&pid=sogou-wsse-a9e18cb5dd9d3ab4&rcer=
2. https://www.sogou.com/sogou?query=%E9%A3%8E%E6%99%AF&insite=wenwen.sogou.com&pid=sogou-wsse-a9e18cb5dd9d3ab4&rcer=&page=2
3. https://www.sogou.com/sogou?query=%E9%A3%8E%E6%99%AF&insite=wenwen.sogou.com&pid=sogou-wsse-a9e18cb5dd9d3ab4&rcer=&page=3
# Comparing the three URLs, only the page parameter changes (page 1 simply omits it) — that is the knob pagination turns on
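You can confirm this observation programmatically. A short sketch using the standard-library urllib.parse to decode the query strings (the query value %E9%A3%8E%E6%99%AF percent-decodes to the search term 风景):

from urllib.parse import urlparse, parse_qs

urls = [
    'https://www.sogou.com/sogou?query=%E9%A3%8E%E6%99%AF&insite=wenwen.sogou.com&page=2',
    'https://www.sogou.com/sogou?query=%E9%A3%8E%E6%99%AF&insite=wenwen.sogou.com&page=3',
]
for url in urls:
    params = parse_qs(urlparse(url).query)
    print(params)  # parse_qs decodes the values; only 'page' differs between the URLs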
Code implementation:
import time
import requests

if __name__ == '__main__':
    input_wd = input('Enter what you want to search for: ')
    pages = int(input('Enter how many pages to scrape: '))
    # the URL and headers never change, so build them once outside the loop
    url_ = 'https://www.sogou.com/sogou'
    headers_ = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1'
    }
    for i in range(pages):
        params_ = {
            'query': input_wd,
            'insite': 'wenwen.sogou.com',
            'page': i + 1
        }
        # requests URL-encodes the params dict and appends it to the URL for us
        response = requests.get(url_, headers=headers_, params=params_)
        str_data = response.content.decode('utf-8')
        with open(f'{input_wd}_page_{i + 1}.html', 'w', encoding='utf-8') as f:
            f.write(str_data)
        time.sleep(1)  # pause between requests so frequent access does not trigger anti-scraping
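Since every page comes from the same host, it is slightly cleaner and faster to reuse one connection via requests.Session, which also keeps the headers in one place. A hedged variant of the same loop (fetch_pages is a helper name introduced here; swap in the full User-Agent string from above in real use):

import time
import requests

def fetch_pages(query, pages):
    with requests.Session() as session:
        session.headers['User-Agent'] = 'Mozilla/5.0'  # reuse the full UA from above
        for i in range(pages):
            params = {'query': query, 'insite': 'wenwen.sogou.com', 'page': i + 1}
            response = session.get('https://www.sogou.com/sogou', params=params)
            with open(f'{query}_page_{i + 1}.html', 'w', encoding='utf-8') as f:
                f.write(response.content.decode('utf-8'))
            time.sleep(1)  # throttle to stay polite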
III. Object-Oriented Rewrite
The idea is to encapsulate the crawler's parameters and methods in a class; whenever you need to crawl, you simply create an object and call its methods.
Code demo:
import time
import requests


class QW:
    # runs automatically whenever an object is created
    def __init__(self):
        self.url_ = 'https://www.sogou.com/sogou'
        self.header_ = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1'
        }

    # send the request
    def send_request(self, params_):
        response = requests.get(self.url_, headers=self.header_, params=params_)
        str_data = response.content.decode('utf-8')
        return str_data

    # parse the data (left as a placeholder here)

    # save the data
    def save_data(self, page, str_data, data_):  # saving needs the data passed in
        with open(f'{data_}_page_{page + 1}.html', 'w', encoding='utf-8') as f:
            f.write(str_data)

    # scheduler method: drives the whole workflow
    def run(self):
        data_ = input('Enter what you want to search for: ')
        page_ = int(input('Enter how many pages to scrape: '))
        for page in range(page_):
            param_ = {
                'query': data_,
                'insite': 'wenwen.sogou.com',
                'page': page + 1
            }
            str_data = self.send_request(param_)
            self.save_data(page, str_data, data_)
            time.sleep(1)  # throttle requests to avoid anti-scraping


if __name__ == '__main__':
    qw = QW()
    qw.run()
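The payoff of this structure is reuse: to point the crawler at a different endpoint, you can subclass QW and override only what changes. A purely illustrative sketch (WeixinQW and its URL are hypothetical, not a verified endpoint):

class WeixinQW(QW):
    # hypothetical subclass: only the target URL differs, everything else is inherited
    def __init__(self):
        super().__init__()
        self.url_ = 'https://weixin.sogou.com/weixin'  # illustrative endpoint, not verified

if __name__ == '__main__':
    WeixinQW().run()  # same workflow, different target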
Summary
This article taught through hands-on examples. Practice makes perfect: write these yourself a few times and it will stick.
"The greatest reason for a person's failure is never daring to fully trust his own abilities, even convincing himself that failure is certain." — Benjamin Franklin