scrapy爬取kw

scrapy爬取kw

# -*- coding: utf-8 -*-
import scrapy
import json

class KuwoSpider(scrapy.Spider):
    """Crawl one kuwo.cn playlist and yield each track with its audio URL.

    Flow: the playlist API response is parsed in ``parse``; for every track
    a follow-up request resolves the mp3 address, and ``parse_music_url``
    yields the track dict extended with ``irc_href`` and ``music_url``.
    """

    name = 'kuwo'
    allowed_domains = ['kuwo.cn']
    # pn is the page number; rn is the number of entries per page (default 30).
    start_urls = ['http://www.kuwo.cn/api/www/playlist/playListInfo?pid=1082685104&pn=1&rn=60']

    def start_requests(self):
        """Yield the initial playlist requests with the headers and cookie set."""
        # The csrf header and the kw_token cookie must carry the same value,
        # so both are derived from a single local.
        token = '3UIQ2GWU3QH'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36',
            'csrf': token,
            'Referer': 'http://www.kuwo.cn',
        }
        cookie = {'kw_token': token}
        for url in self.start_urls:
            yield scrapy.Request(url, cookies=cookie, headers=headers, dont_filter=True)

    def parse(self, response):
        """Read the playlist JSON and request the audio URL of every track."""
        payload = json.loads(response.text)
        music_list = (payload.get('data') or {}).get('musicList')
        if not music_list:
            # Log the URL and stop instead of raising a context-free KeyError.
            self.logger.warning('No musicList in response from %s', response.url)
            return
        for music in music_list:
            # Each rid identifies one song.
            rid = music['rid']
            href = 'http://www.kuwo.cn/url?format=mp3&rid={0}&response=url&type=convert_url3'.format(rid)
            # Lyrics endpoint for the same song.
            irc_href = 'http://m.kuwo.cn/newh5/singles/songinfoandlrc?musicId={0}'.format(rid)
            music['irc_href'] = irc_href
            yield scrapy.Request(
                href,
                callback=self.parse_music_url,
                # Pass a copy so later mutation of the list entry cannot leak
                # into the item carried by this request.
                meta={'music': music.copy()},
            )

    def parse_music_url(self, response):
        """Attach the resolved audio URL to the track and yield it as the item."""
        item = response.meta['music']
        url = json.loads(response.text).get('url')
        if url is None:
            # Without a playable address the item is incomplete; skip it.
            self.logger.warning('No url field in response from %s', response.url)
            return
        item['music_url'] = url
        yield item
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值