小白flag7 python爬虫_网易音乐歌单
准备
import os #存放文件处理
import time #程序运行时间计算
import sys #错误信息返回 预留
from selenium import webdriver #获取渲染后的html页面
from lxml import etree #解析 xpath使用
from urllib.parse import quote # url编码
思路
获取完整页面(网易云音乐的内容嵌在 iframe 里,这是个坑)
解析
获取
分析
保存
代码
import requests
from urllib import request # 请求下载
import os # 存放文件处理
import time # 程序运行时间计算
from selenium import webdriver # 获取渲染后的html页面
from lxml import etree # 解析 xpath使用
from urllib import parse # url编码
# def get_content(url):
# headers = {
# 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.7 Safari/537.36'
# }
# r = requests.get(url=url, headers=headers)
# r.encoding = 'utf-8'
# content = r.text
# print(content)
# return content
'''仅通过 requests 请求拿不到 JS 渲染后的页面内容'''
class Netease_spider:
    """Interactive search-and-download helper for music.163.com.

    Workflow (see ``run``): open the search page in headless Chrome, list the
    ``id -> title`` pairs found in the rendered result frame, let the user
    pick one id and a target directory, then download the track as an mp3.

    Attributes:
        browser: the Selenium Chrome driver used to render the search page.
        url: search URL built from the user's query.
        song: title of the currently selected song ('' when none selected).
        song_dict: mapping of song id (str) to song title from the last parse.
    """

    # Characters that are not allowed in file names on Windows (and "/" on
    # POSIX); each one is replaced by "_" before a title is used as a name.
    _FILENAME_SANITIZER = str.maketrans('\\/:*?"<>|', '_' * 9)

    def __init__(self):
        """Ask for the search term and start a headless Chrome session."""
        # Prompt first so an aborted prompt does not leave a browser running.
        content = input('你要搜索的歌曲')
        quote_url = parse.quote(content)
        self.url = 'https://music.163.com/#/search/m/?s=' + quote_url
        self.song = ''
        self.song_dict = {}

        # ``set_headless()`` and the ``chrome_options=`` keyword no longer
        # exist in Selenium 4; the command-line switch and ``options=`` work
        # on both old and new releases.
        opt = webdriver.ChromeOptions()
        opt.add_argument('--headless')
        self.browser = webdriver.Chrome(options=opt)

    @staticmethod
    def _safe_filename(name):
        """Return *name* with path-hostile characters replaced by ``_``.

        Falls back to ``'unknown'`` when nothing usable is left, so the
        download never targets a file called just ``.mp3``.
        """
        cleaned = name.translate(Netease_spider._FILENAME_SANITIZER).strip()
        return cleaned or 'unknown'

    def get_page(self, url):
        """Load *url* in the browser and return the source of its result frame.

        The search results live inside the ``g_iframe`` frame, so the driver
        switches into that frame before reading ``page_source``.
        """
        self.browser.get(url)
        self.browser.switch_to.frame('g_iframe')
        return self.browser.page_source

    def get_content(self, html):
        """Parse *html*, store the ``id -> title`` mapping and print it.

        The result is kept in ``self.song_dict``; nothing is returned.
        """
        tree = etree.HTML(html.encode('utf-8'))
        song_ids = tree.xpath('//div[@class="hd"]/a[1]/@data-res-id')
        titles = tree.xpath('//div[@class="sn"]/div/a/b/@title')
        self.song_dict = dict(zip(song_ids, titles))
        for song_id, title in self.song_dict.items():
            print(song_id + '\t' + title)

    def select(self):
        """Ask which song id to download and remember its title.

        Returns:
            The chosen id, or ``''`` when the user just pressed Enter.
            Unknown ids are rejected and the prompt is repeated instead of
            raising ``KeyError``.
        """
        while True:
            info = input('输入你想下载的歌的id(默认回车退出)').strip()
            if info == '':
                self.song = ''
                return info
            if info in self.song_dict:
                self.song = self.song_dict[info]
                return info
            print('没有这个id,请重新输入')

    def path(self):
        """Ask for the save directory and return it.

        An empty answer means the current working directory; a directory
        that does not exist yet is created.
        """
        path = input('选择保存路径(回车默认本地)')
        if path == '':
            return os.getcwd()
        if not os.path.exists(path):
            os.makedirs(path)
        return path

    def download(self, info, path):
        """Download the song with id *info* into directory *path*.

        The "outer url" endpoint is requested without following redirects;
        its ``Location`` header is used as the address of the audio file.
        Any failure is reported with the usual "下载失败" message rather
        than an uncaught exception.
        """
        print('你访问的网站:https://music.163.com/#/song?id=' + info)
        url = 'https://music.163.com/song/media/outer/url?id={}.mp3'.format(info)
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.7 Safari/537.36'
        }
        failure = "{}--下载失败".format(self.song)

        try:
            resp = requests.get(url, headers=headers, allow_redirects=False,
                                timeout=10)
        except requests.RequestException:
            print(failure)
            return

        # No redirect means there is nothing to fetch for this id.
        song_url = resp.headers.get('Location')
        if not song_url:
            print(failure)
            return

        target = os.path.join(path, self._safe_filename(self.song) + '.mp3')
        start_time = time.time()
        try:
            request.urlretrieve(song_url, target)
        except (OSError, ValueError):
            # OSError covers URLError/HTTPError and local file errors;
            # ValueError is raised for a malformed redirect URL.
            print(failure)
            return
        end_time = time.time()
        print("{}--下载完成用时{}S".format(self.song, end_time - start_time))

    def run(self):
        """Run one search/select/download session, then close the browser.

        Exits the interpreter (``SystemExit``) when the user declines to
        pick a song, as before. The browser is always shut down, so an
        instance is meant for a single ``run()`` call.
        """
        try:
            html = self.get_page(self.url)
            self.get_content(html)
            info = self.select()
            if info == '':
                # Check before asking for a save path that would go unused.
                print('谢谢使用')
                raise SystemExit
            self.download(info, self.path())
        finally:
            self.browser.quit()
# Script entry point: only start the interactive session when the file is
# executed directly, so importing the module does not prompt for input or
# launch a browser.
if __name__ == '__main__':
    obj = Netease_spider()
    obj.run()
缺点:bug 太多……不够灵活,但好歹完成了
目标:改进