一、案例一:酷狗top50
分析:
1.Requests库用于请求网页获取网页数据,BeautifulSoup用于解析网页数据,time库的sleep()方法可以让程序暂停
2.添加User-Agent请求头,将爬虫伪装成浏览器,有利于提高爬虫的稳定性,避免被网站屏蔽。
3.定义get_info()函数,用于获取网页信息并输出信息。
4.程序的主入口利用列表的推导式构造23个URL,并依次调用get_info()函数
代码:
import requests
from bs4 import BeautifulSoup
import time #导入相应的库文件
# Browser-style User-Agent header: sent with every request so the target
# site serves normal pages to the scraper instead of blocking it.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
def get_info(url):
    """Fetch one Kugou chart page and print rank/singer/song/duration per entry.

    Each chart row's anchor text has the form "singer - song"; the two halves
    are separated on the first '-'.
    """
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    times = soup.select('span.pc_temp_tips_r > span')
    # Loop variable renamed from `time` so it no longer shadows the time module.
    for rank, title, duration in zip(ranks, titles, times):
        # BUG FIX: the original used split('-')[0] for BOTH 'singer' and 'song',
        # so the song field duplicated the singer. partition('-') yields the
        # part before the first '-' (singer) and the part after it (song),
        # and never raises when no '-' is present.
        singer, _, song = title.get_text().partition('-')
        data = {
            'rank': rank.get_text().strip(),
            'singer': singer,
            'song': song,
            'time': duration.get_text().strip()
        }
        print(data)  # print one scraped record as a dict
if __name__ == '__main__':
    # Program entry point: scrape chart pages 1..23, pausing between requests.
    url_template = 'http://www.kugou.com/yy/rank/home/{}-8888.html'
    for page in range(1, 24):
        get_info(url_template.format(page))
        time.sleep(1)  # be polite: wait 1 second between page fetches
二、案例二:下载网易云top50
from bs4 import BeautifulSoup
import requests
import json
import os
import time
class Music:
    """Scrape a NetEase Cloud Music artist page and download its songs as mp3 files."""

    def __init__(self, init_url, download):
        self.init_url = init_url  # artist page URL whose song list is scraped
        self.download = download  # local directory the mp3 files are saved into

    def mkdir(self, path):
        """Create directory ``path`` if it does not exist.

        Returns True when the directory was created, False when it already existed.
        """
        path = path.strip()
        if not os.path.exists(path):  # check whether the folder already exists
            print('创建 ', path, '文件夹')
            os.makedirs(path)
            return True
        else:
            print(path, '文件夹已存在,无需创建')
            return False

    def download_video(self, video_url, name):
        """Stream ``video_url`` to ``<download dir>/<name>.mp3``, printing progress.

        Download errors are caught and reported rather than raised, so one bad
        URL does not abort the whole batch.
        """
        # BUG FIX: the original `self.download + "\" + name + '.mp3'` is a
        # SyntaxError — the backslash escapes the closing quote, leaving the
        # string unterminated. os.path.join is correct and portable.
        path = os.path.join(self.download, name + '.mp3')
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36",
        }
        header = {
            "Origin": "http://music.163.com/",
            "Referer": video_url,  # the server rejects requests without a Referer
        }
        headers.update(header)  # merge the Origin/Referer fields into the headers
        size = 0
        start = time.time()
        try:
            # NOTE(review): verify=False disables TLS certificate checking —
            # insecure; kept for behavior compatibility but worth removing.
            result = requests.get(video_url, headers=headers, stream=True, verify=False)
            with open(path, "wb") as f:
                for chunk in result.iter_content(1024):
                    f.write(chunk)
                    f.flush()  # push buffered bytes to disk
                    size = size + len(chunk)
                    print("已下载:%0.2f Mb" % (size / (1024 * 1024)))
        except Exception as e:
            print("url下载错误:%s" % video_url)
            print(e)
        stop = time.time()
        print("下载完成,耗时:%0.2f秒" % (stop - start))

    def spider(self):
        """Scrape song ids/names from the artist page and download each song."""
        r = requests.get(self.init_url).text
        soupObj = BeautifulSoup(r, 'lxml')
        # The page embeds the song list as JSON inside a <textarea> element.
        song_ids = soupObj.find('textarea').text
        jobj = json.loads(song_ids)
        list01 = [{'name': item['name'], 'id': item['id']} for item in jobj]
        print(list01)
        len_list = len(list01)
        print("一共", len_list, "首歌曲")
        self.mkdir(self.download)  # ensure the target folder exists
        print('开始切换文件夹')
        os.chdir(self.download)
        for entry in list01:
            name = entry['name']
            # renamed from `id` so the builtin id() is not shadowed
            song_id = entry['id']
            song_url = "http://music.163.com/song/media/outer/url?id=" + str(song_id) + ".mp3"
            print(song_url)  # the final downloadable mp3 link
            self.download_video(song_url, name)
            len_list = len_list - 1
            print("还剩", len_list, "首歌曲需要下载")
if __name__ == '__main__':
    init_url = 'https://music.163.com/artist?id=30284835'  # artist: 枯木逢春
    # BUG FIX: raw string so the backslashes are literal path separators —
    # '\M' and '\d' are invalid escape sequences (DeprecationWarning now,
    # a SyntaxError in future Python versions).
    download = r'D:\Music\downloads'  # destination folder for the mp3 files
    s = Music(init_url, download)
    s.spider()