Python3+Scrapy爬取百度音乐排行榜

一.编写Item

class MusicItem(scrapy.Item):
    """Item holding one chart entry scraped from Baidu Music."""
    # Chart ranking (chart title concatenated with the position string).
    top = scrapy.Field()
    # Song title.
    music_name = scrapy.Field()
    # Artist name. Field is spelled "songer" in the original schema;
    # kept as-is because pipelines and the spider index by this key.
    songer = scrapy.Field()


二.编写Pipelines

MySQL

class MycrawlPipeline(object):
    """Pipeline that writes scraped MusicItems into a MySQL `Music` table."""

    def __init__(self):
        # Connect to the local MySQL database.
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='1likePython',
                                    db='TESTDB', charset='utf8')
        # BUG FIX: the original code used self.cursor without ever creating it,
        # which raises AttributeError on the very next line.
        self.cursor = self.conn.cursor()
        # Start each crawl from an empty table.
        self.cursor.execute('truncate table Music')
        self.conn.commit()

    def process_item(self, item, spider):
        """Insert one item; on a MySQL error, log the row and keep crawling."""
        try:
            self.cursor.execute("insert into Music (top,music_name,songer) \
                    VALUES (%s,%s,%s)", (item['top'], item['music_name'], item['songer']))
            self.conn.commit()
        except pymysql.Error:
            # Best-effort: report the failing row and continue with the next item.
            print("Error%s,%s,%s" % (item['top'], item['music_name'], item['songer']))
        return item
       

MongoDB

class MycrawlPipeline(object):
    """Pipeline that stores scraped MusicItems as MongoDB documents."""

    def __init__(self):
        # Connect to the local MongoDB server.
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        # Database and collection handles (created lazily by MongoDB on first write).
        self.test = self.client['TESTDB']
        self.post3 = self.test['music']

    def process_item(self, item, spider):
        """Store one item as a document in the 'music' collection."""
        data = dict(item)
        # Collection.insert() is deprecated and was removed in PyMongo 4.x;
        # insert_one() is the supported single-document API.
        self.post3.insert_one(data)
        return item

三.编写Spiders

from scrapy.spiders import Spider
from scrapy.http import Request
from scrapy.selector import Selector

from Mycrawl.items import MusicItem
import requests



class MusicSpider(Spider):
    """Spider that scrapes the song charts on the Baidu Music front page."""
    # Spider name used by `scrapy crawl music`.
    name = 'music'
    # BUG FIX: Scrapy's attribute is `allowed_domains`, not `allow_domains`;
    # with the typo, offsite request filtering is silently disabled.
    allowed_domains = ['music.baidu.com']
    start_urls = ['http://music.baidu.com/']

    def parse(self, response):
        """Yield one MusicItem per song listed in the chart blocks."""
        selector = Selector(response)
        # One chart title per block of ten songs, hence title[i // 10] below.
        title = selector.xpath('//div[@class="hd"]/h2[@class="title"]/text()').extract()
        musics = selector.xpath('//div[@class="bd"]/ul[@class="song-list"]/li')
        for i, music in enumerate(musics):
            top = music.xpath('div[@class="index"]/text()').extract()
            name = music.xpath('div[@class="song-info"]/div[@class="info"]/div[@class="song"]/a/@title').extract()
            songer = music.xpath('div[@class="song-info"]/div[@class="info"]/div[@class="song"]\
                                 /span[@class="artist"]/span/@title').extract()
            if not songer:
                # Some entries nest the artist one level deeper.
                songer = music.xpath('div[@class="song-info"]/div[@class="info"]/div[@class="song"]/\
                                     span[@class="artist"]/''span[@class="author_list"]/a/@title').extract()
            if not top or not name:
                # Skip malformed entries instead of raising IndexError on [0].
                continue
            # BUG FIX: build a fresh item per song; the original reused one
            # mutable MusicItem across all yields, so every reference held
            # downstream would end up pointing at the last song's data.
            item = MusicItem()
            item['top'] = title[i // 10] + top[0]
            item['music_name'] = '《' + name[0] + '》'
            item['songer'] = songer[0] if songer else ''
            yield item


四.数据库显示

爬取成功




评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值