"""Scrape QQ Music radio categories, their cover images, and the tracks they play.

The category metadata is read from the radio landing page with ``requests``;
the per-track data is read from the web player with a headless Chrome driven
by Selenium.  Everything is stored below ``./Data_Info/``.
"""
import csv
import json
import os
import random
import re
import time

import requests
from fake_useragent import UserAgent
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Characters that are path separators or are rejected in file names on common
# file systems.  Titles come from the scraped page, so they must not be able
# to change the directory a file is written to.
_INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


class QQ_Music:
    """Interactive scraper for the QQ Music radio page."""

    def __init__(self):
        self.url = 'https://y.qq.com/n/ryqq/radio'
        self.headers = self.func_get_headers()
        # Run Chrome in headless mode.
        self.options = Options()
        self.options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=self.options)

    def func_get_headers(self):
        """Return request headers carrying a random User-Agent."""
        return {'User-Agent': UserAgent().random}

    @staticmethod
    def _safe_filename(name):
        """Return *name* with path separators and reserved characters replaced by '_'."""
        return _INVALID_FILENAME_CHARS.sub('_', str(name))

    def get_html(self, url, timeout=10):
        """Fetch *url* and return the successful ``requests`` response.

        Args:
            url: Address to request.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The ``requests.Response`` of a successful (2xx) request.

        Raises:
            ConnectionError: If the request fails or the server answers with
                an error status.  The underlying ``requests`` exception is
                attached as the cause.
        """
        try:
            res = requests.get(url=url, headers=self.headers, timeout=timeout)
            # Turn 4xx/5xx answers into an exception instead of returning
            # None, which callers would only trip over later.
            res.raise_for_status()
        except requests.RequestException as e:
            print('异常, Args:{}'.format(e.args))
            raise ConnectionError(
                'Connect Timeout Error: failed to fetch {}'.format(url)
            ) from e
        return res

    def func_get_type(self):
        """Read the category names and ids from the radio landing page.

        Returns:
            A 7-tuple ``(data_list, big_type_list, big_type_item,
            small_type_list, small_type_item, big_id_list, small_id_list)``:
            the decoded ``radio_list`` data, the main-category titles (list and
            wrapped in a dict), the sub-category titles (list and wrapped in a
            dict), and one ``{title: id}`` dict per main / sub category.
        """
        type_list_res = self.get_html(url=self.url).text
        page_tree = etree.HTML(type_list_res)

        # Sub-category titles.
        small_type_list = page_tree.xpath('//ul/li/h4/span/a/text()')
        # Main-category titles.
        big_type_list = page_tree.xpath('//div[@class="mod_radio_sidebar"]/a/text()')
        big_type_item = {'big_title': big_type_list}
        small_type_item = {'small_title': small_type_list}

        # The category ids live in a JSON blob assigned inside the second
        # <script> tag.
        # NOTE(review): taking the 4th '='-separated segment depends on the
        # exact page layout and breaks if the JSON itself contains '=' —
        # confirm against the live page.
        pattern = re.compile(r'<script>(.*?)</script>', re.S)
        id_result = pattern.findall(type_list_res)[1].split('=')[3]
        data_list = json.loads(id_result)['radio_list']

        # One {title: id} dict per main category.
        big_id_list = [{group['title']: group['id']} for group in data_list]

        # One {title: id} dict per sub category.
        small_id_list = []
        for group in data_list:
            for radio in group['list']:
                small_id_list.append({radio['title']: radio['id']})

        return (data_list, big_type_list, big_type_item, small_type_list,
                small_type_item, big_id_list, small_id_list)

    def get_type_image(self, img_list):
        """Map every sub-category title in *img_list* to its picture URL."""
        item = {}
        for group in img_list:
            for radio in group['list']:
                item[radio['title']] = radio['pic_url']
        return item

    def save_img(self, filename, img_data):
        """Write the raw image bytes *img_data* to *filename*."""
        with open(filename, 'wb') as f:
            f.write(img_data)

    def _read_current_track(self, player):
        """Collect singer, title, cover, lyrics and audio URL of the playing track."""
        # The XPaths are absolute, so they address the page as a whole rather
        # than being relative to *player*.
        return {
            'singer': player.find_element(
                By.XPATH, '/html/body/div[2]/div[2]/div[1]/div[1]/h3/a').text,
            'song_name': player.find_element(
                By.XPATH, '/html/body/div[2]/div[2]/div[1]/div[1]/h2/a').text,
            'img_url': player.find_element(
                By.XPATH, '/html/body/div[2]/div[2]/div[1]/div[1]/div[3]/a/img'
            ).get_attribute('src'),
            'song_txt': player.find_element(By.XPATH, '//*[@id="qrc_ctn"]').text,
            'song_url': player.find_element(
                By.XPATH, '/html/body/audio[1]').get_attribute('src'),
        }

    def _append_track_row(self, sec_word, item):
        """Append one track to the category CSV, writing the header only once."""
        path = './Data_Info/音乐数据/'
        os.makedirs(path, exist_ok=True)
        full_file_path = path + '{}.csv'.format(self._safe_filename(sec_word))
        # The file is opened in append mode for every track, so the header
        # must only be written while the file is still new or empty.
        needs_header = (not os.path.exists(full_file_path)
                        or os.path.getsize(full_file_path) == 0)
        fieldnames = ['歌手', '歌曲名称', '封面链接', '歌曲链接']
        with open(full_file_path, 'a', newline='', encoding='gb18030') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            if needs_header:
                writer.writeheader()
            writer.writerow({
                '歌手': item['singer'],
                '歌曲名称': item['song_name'],
                '封面链接': item['img_url'],
                '歌曲链接': item['song_url'],
            })

    def get_music_info_data(self, href_link, sec_word):
        """Walk the radio player at *href_link* and store every track it plays.

        Each track is appended to ``./Data_Info/音乐数据/<sec_word>.csv`` and its
        audio is downloaded through :meth:`down_load`.  The loop has no natural
        end: it stops at the first error (missing element, failed download, …),
        which is reported before the completion message is printed.

        Args:
            href_link: URL of the radio player page.
            sec_word: Sub-category title, used as the CSV file name.
        """
        self.driver.get(href_link)
        # Give the page time to load.
        time.sleep(2)
        try:
            while True:
                players = self.driver.find_elements(
                    By.XPATH, '//div[@class="mod_player"]')
                if not players:
                    # Without a player there is nothing to read; bail out
                    # instead of spinning forever.
                    break
                for player in players:
                    item = self._read_current_track(player)
                    print("测试:", item)
                    # NOTE(review): these two player buttons presumably skip
                    # to the next track and resume playback — confirm against
                    # the page markup.
                    self.driver.find_element(
                        By.XPATH,
                        '/html/body/div[2]/div[2]/div[1]/div[1]/div[2]/a[2]').click()
                    self.driver.find_element(
                        By.XPATH,
                        '/html/body/div[2]/div[2]/div[1]/div[1]/div[2]/a[1]').click()
                    time.sleep(random.randint(1, 2))
                    self._append_track_row(sec_word, item)
                    self.down_load(item['song_url'], item['song_name'])
        except Exception as e:
            # Best effort: any failure ends this category, but the reason is
            # reported instead of being silently discarded.
            print('异常, Args:{!r}'.format(e))
        print('该类别下载完成')

    def down_load(self, song_url, song_name):
        """Download *song_url* to ``./Data_Info/music/<song_name>.mp3``."""
        data = self.get_html(song_url).content
        path = './Data_Info/music/'
        os.makedirs(path, exist_ok=True)
        filename = path + '{}.mp3'.format(self._safe_filename(song_name))
        with open(filename, 'wb') as f:
            f.write(data)

    def run(self):
        """Prompt for a main and a sub category, then scrape the chosen radio."""
        (data_list, big_show, _big_item, small_show, _small_item,
         big_id_list, small_id_list) = self.func_get_type()
        big_show = ' * '.join(big_show)
        show_info = 20 * '*' + 'QQ音乐排行榜' + '*' * 20
        print(show_info.center(100))
        print('\n', big_show)

        one_word = input('\n请输入类别:').strip()
        if not any(one_word in big_id for big_id in big_id_list):
            print('未找到类别: {}'.format(one_word))
            return

        for big_id in big_id_list:
            if one_word not in big_id:
                continue
            print('\n', ' -*- '.join(small_show))
            # Strip like the first prompt so stray whitespace does not
            # prevent a match.
            sec_word = input('\n请输入类别:').strip()
            if not any(sec_word in small_id for small_id in small_id_list):
                print('未找到类别: {}'.format(sec_word))
                continue
            for small_id in small_id_list:
                if sec_word not in small_id:
                    continue
                sec_word_id = small_id.get(sec_word)
                href_link = 'https://y.qq.com/portal/player_radio.html#id={}'.format(
                    sec_word_id)

                # Save the category cover picture.
                img_info = self.get_type_image(data_list)
                one_img_url = img_info.get(sec_word)
                one_img_data = self.get_html(one_img_url).content
                one_img_path = './Data_Info/类别图片/'
                os.makedirs(one_img_path, exist_ok=True)
                one_path = one_img_path + '{}.jpg'.format(
                    self._safe_filename(sec_word))
                self.save_img(one_path, one_img_data)

                # Scrape the tracks of the chosen radio.
                self.get_music_info_data(href_link, sec_word)


if __name__ == '__main__':
    spider = QQ_Music()
    try:
        spider.run()
    finally:
        # quit() ends the whole browser session (close() only closes the
        # window), and the finally clause runs even when run() fails.
        spider.driver.quit()
QQ音乐电台数据获取
最新推荐文章于 2024-06-17 17:35:06 发布