# QQ音乐电台数据获取 (QQ Music radio data scraper)

import csv
import random
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import json
import os
import requests
from fake_useragent import UserAgent
from lxml import etree
import re


class QQ_Music:

    def __init__(self):
        """Store the radio page URL, build request headers and start a headless Chrome driver."""
        self.url = 'https://y.qq.com/n/ryqq/radio'
        self.headers = self.func_get_headers()

        # Run Chrome in headless mode.
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        self.options = chrome_options
        self.driver = webdriver.Chrome(options=chrome_options)

    def func_get_headers(self):
        """Return a headers dict whose User-Agent is picked at random by fake_useragent."""
        return {'User-Agent': UserAgent().random}

    def get_html(self, url, timeout=10):
        """
        Fetch ``url`` with the instance headers and return the response.

        param url: address to request
        param timeout: seconds to wait for the server before giving up
        return: the ``requests`` response when the status code is 200,
                otherwise ``None`` (a diagnostic line is printed)
        raises ConnectionError: when the request itself fails (connection
                error, timeout, invalid URL, ...); the original requests
                exception is attached as the cause
        """
        try:
            # Without a timeout requests waits indefinitely on a stalled server.
            res = requests.get(url=url, headers=self.headers, timeout=timeout)
        except requests.RequestException as e:
            # Only exception instances can be raised; raising a plain string
            # would itself fail with TypeError and hide the real error.
            raise ConnectionError(' Connect Timeout Error') from e

        if res.status_code == 200:
            return res

        # Non-200 responses do not raise in requests; report them explicitly.
        print('异常, Args:{}'.format((res.status_code, url)))
        return None

    def func_get_type(self):
        """Download the radio page and extract category names, ids and raw data.

        return: a 7-tuple of
            - the ``radio_list`` value parsed from the JSON embedded in the page,
            - the top-level category names found by XPath,
            - ``{'big_title': <top-level names>}``,
            - the sub-category names found by XPath,
            - ``{'small_title': <sub-category names>}``,
            - a list of ``{title: id}`` dicts, one per ``radio_list`` entry,
            - a list of ``{title: id}`` dicts, one per item of every entry's ``list``.
        """
        page_source = self.get_html(url=self.url).text
        tree = etree.HTML(page_source)

        # Category names as rendered in the page markup.
        small_type_list = tree.xpath('//ul/li/h4/span/a/text()')
        big_type_list = tree.xpath('//div[@class="mod_radio_sidebar"]/a/text()')
        big_type_item = {'big_title': big_type_list}
        small_type_item = {'small_title': small_type_list}

        # The id data is taken from the second attribute-less <script> match:
        # its text is split on '=' and the fourth piece is parsed as JSON.
        script_bodies = re.compile('<script>(.*?)</script>', re.S).findall(page_source)
        id_result = script_bodies[1].split('=')[3]
        data_list = json.loads(id_result)['radio_list']

        # One {title: id} mapping per top-level entry.
        big_id_list = [{group['title']: group['id']} for group in data_list]

        # One {title: id} mapping per nested item, in document order.
        small_id_list = [
            {entry['title']: entry['id']}
            for group in data_list
            for entry in group['list']
        ]

        return data_list, big_type_list, big_type_item, small_type_list, small_type_item, big_id_list, small_id_list

    def get_type_image(self, img_list):
        """获取类别的图片url"""

        item= {}
        for img in img_list:
            for img_info in img['list']:
                k = img_info['title']
                y = img_info['pic_url']
                item[k] = y

        return item

    def save_img(self, filename, img_data):
        """保存类别图片"""

        with open(filename, 'wb') as f:
            f.write(img_data)

    def get_music_info_data(self, href_link, sec_word):
        """Scrape songs from a radio player page, log them to CSV and download them.

        Opens ``href_link`` in the Selenium driver, then repeatedly reads the
        current song's details, appends one row to
        ``./Data_Info/音乐数据/<sec_word>.csv`` and downloads the audio through
        ``down_load``. The loop has no exit condition of its own: it ends when
        any step raises, at which point the reason and a completion message
        are printed.

        param href_link: URL of the player page to open
        param sec_word: sub-category name, used as the CSV file name
        """
        fieldnames = ['歌手', '歌曲名称', '封面链接', '歌曲链接']
        path = './Data_Info/音乐数据/'
        full_file_path = path + '{}.csv'.format(sec_word)

        self.driver.get(href_link)
        # Give the page time to load.
        time.sleep(2)

        # The output directory does not change between songs; create it once.
        os.makedirs(path, exist_ok=True)

        try:
            while True:
                # Collect artist, title, cover, lyrics and audio link of the current song.
                data_list = self.driver.find_elements(By.XPATH, '//div[@class="mod_player"]')
                item = {}
                for data in data_list:

                    item['singer'] = data.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[1]/div[1]/h3/a').text
                    item['song_name'] = data.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[1]/div[1]/h2/a').text
                    item['img_url'] = data.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[1]/div[1]/div[3]/a/img').get_attribute('src')
                    # NOTE: song_txt is collected but not written to the CSV below.
                    item['song_txt'] = data.find_element(By.XPATH, '//*[@id="qrc_ctn"]').text
                    item['song_url'] = data.find_element(By.XPATH, '/html/body/audio[1]').get_attribute('src')

                print("测试:", item)
                # NOTE(review): presumably the player's next/play controls - confirm against the page.
                self.driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[1]/div[1]/div[2]/a[2]').click()
                self.driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[1]/div[1]/div[2]/a[1]').click()

                time.sleep(random.randint(1, 2))

                # Row to append to the CSV file.
                data_info_list = {
                    '歌手': item['singer'],
                    '歌曲名称': item['song_name'],
                    '封面链接': item['img_url'],
                    '歌曲链接': item['song_url']
                }

                # The file is opened in append mode for every song, so the
                # header must only be written while the file is still empty;
                # otherwise it would be repeated before each row.
                need_header = (
                    not os.path.exists(full_file_path)
                    or os.path.getsize(full_file_path) == 0
                )
                with open(full_file_path, 'a', newline='', encoding='gb18030') as f:
                    writer = csv.DictWriter(f, fieldnames=fieldnames)
                    if need_header:
                        writer.writeheader()
                    writer.writerow(data_info_list)

                # Save the audio file.
                self.down_load(item['song_url'], item['song_name'])

        except Exception as e:
            # Any failure ends the scrape for this category; show why it
            # stopped instead of discarding the error.
            print('异常, Args:{}'.format(e.args))
            print('该类别下载完成')

    def down_load(self, song_url, song_name):
        """将音乐以.mp3格式下载到本地"""

        data = self.get_html(song_url).content
        # 创建保存路径
        path = './Data_Info/music/'
        if not os.path.exists(path):
            os.makedirs(path)
        filename = path + '{}.mp3'.format(song_name)
        # 本地存储
        with open(filename, 'wb') as f:
            f.write(data)

    def run(self):
        """Interactive entry point.

        Prints the top-level category names, asks for one, then prints the
        sub-category names and asks for one of those. For every matching
        sub-category the category picture is saved to
        ``./Data_Info/类别图片/<name>.jpg`` and ``get_music_info_data`` is
        called for its player page. A message is printed when either input
        matches nothing.
        """
        data_list, big_show, _, small_show, _, big_id_list, small_id_list = self.func_get_type()

        show_info = 20 * '*' + 'QQ音乐排行榜' + '*' * 20
        print(show_info.center(100))
        print('\n', ' * '.join(big_show))

        one_word = input('\n请输入类别:').strip()
        big_found = False
        for big_id in big_id_list:
            if one_word not in big_id:
                continue
            big_found = True

            print('\n', ' -*- '.join(small_show))
            # Strip like the first prompt so stray spaces do not break the lookup.
            sec_word = input('\n请输入类别:').strip()

            matches = [small_id for small_id in small_id_list if sec_word in small_id]
            if not matches:
                print('未找到该类别: {}'.format(sec_word))
                continue

            for small_id in matches:
                sec_word_id = small_id.get(sec_word)
                href_link = 'https://y.qq.com/portal/player_radio.html#id={}'.format(sec_word_id)
                one_img_url = self.get_type_image(data_list).get(sec_word)

                # Save the category picture; a failed fetch should not stop
                # the song scrape, so it is reported and skipped.
                one_img_res = self.get_html(one_img_url)
                if one_img_res is None:
                    print('异常, Args:{}'.format((one_img_url,)))
                else:
                    one_img_path = './Data_Info/类别图片/'
                    os.makedirs(one_img_path, exist_ok=True)
                    one_path = one_img_path + '{}.jpg'.format(sec_word)
                    self.save_img(one_path, one_img_res.content)

                # Scrape the songs of this category.
                self.get_music_info_data(href_link, sec_word)

        if not big_found:
            print('未找到该类别: {}'.format(one_word))


if __name__ == '__main__':
    spider = QQ_Music()
    try:
        spider.run()
    finally:
        # quit() ends the whole browser session and its driver process even
        # when run() raised; close() would only close the current window.
        spider.driver.quit()
  • 5
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值