一.编写Item
class MusicItem(scrapy.Item):
    """Container for one chart entry scraped from Baidu Music."""
    # Chart position, prefixed with the chart title by the spider.
    top = scrapy.Field()
    # Song title (the spider wraps it in 《》).
    music_name = scrapy.Field()
    # Artist name. NOTE: the key is spelled "songer" (sic) and is used by the
    # pipelines and the spider, so the spelling must be kept for compatibility.
    songer = scrapy.Field()
二.编写Pipelines
MySQL
class MycrawlPipeline(object):
    """Item pipeline that persists MusicItems into a MySQL table via pymysql."""

    def __init__(self):
        # Connect to the local MySQL server.
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                                    passwd='1likePython', db='TESTDB',
                                    charset='utf8')
        # BUG FIX: the cursor was used below but never created in the original.
        self.cursor = self.conn.cursor()
        # Start every crawl with an empty table.
        self.cursor.execute('truncate table Music')
        self.conn.commit()

    def process_item(self, item, spider):
        """Insert one row per item; log and continue on a MySQL error."""
        try:
            # Parameterized query — values are escaped by the driver.
            self.cursor.execute(
                "insert into Music (top,music_name,songer) VALUES (%s,%s,%s)",
                (item['top'], item['music_name'], item['songer']))
            self.conn.commit()
        except pymysql.Error:
            # Best-effort: report the failing row but keep the crawl running.
            print("Error%s,%s,%s" % (item['top'], item['music_name'], item['songer']))
        return item
MongoDB
class MycrawlPipeline(object):
    """Item pipeline that persists MusicItems into MongoDB via pymongo."""

    def __init__(self):
        # Connect to the local MongoDB server.
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        # Database handle (the original comment called this a "cursor";
        # it is actually the TESTDB database object).
        self.test = self.client['TESTDB']
        # Collection that receives one document per scraped item.
        self.post3 = self.test['music']

    def process_item(self, item, spider):
        """Store the item as a plain dict document."""
        # BUG FIX: Collection.insert() is deprecated and was removed in
        # pymongo 4.x — insert_one() is the supported replacement.
        self.post3.insert_one(dict(item))
        return item
三.编写Spiders
from scrapy.spiders import Spider
from scrapy.http import Request
from scrapy.selector import Selector
from Mycrawl.items import MusicItem
import requests
class MusicSpider(Spider):
    """Spider that scrapes the song charts on the Baidu Music front page."""
    # Spider name used on the command line: `scrapy crawl music`.
    name = 'music'
    # BUG FIX: Scrapy's attribute is `allowed_domains`; the original
    # `allow_domains` misspelling was silently ignored, so the off-site
    # request filter never applied.
    allowed_domains = ['music.baidu.com']
    start_urls = ['http://music.baidu.com/']

    def parse(self, response):
        """Yield one MusicItem per song row in every chart on the page."""
        selector = Selector(response)
        # One chart title per <div class="hd"> block; each chart lists 10 songs.
        title = selector.xpath('//div[@class="hd"]/h2[@class="title"]/text()').extract()
        musics = selector.xpath('//div[@class="bd"]/ul[@class="song-list"]/li')
        for i, music in enumerate(musics):
            # BUG FIX: create a fresh item per row — the original reused one
            # mutable MusicItem instance for every yield, which corrupts data
            # when downstream processing is deferred.
            item = MusicItem()
            top = music.xpath('div[@class="index"]/text()').extract()
            song_title = music.xpath('div[@class="song-info"]/div[@class="info"]'
                                     '/div[@class="song"]/a/@title').extract()
            songer = music.xpath('div[@class="song-info"]/div[@class="info"]'
                                 '/div[@class="song"]/span[@class="artist"]'
                                 '/span/@title').extract()
            if not songer:
                # Fall back to the multi-artist markup variant.
                songer = music.xpath('div[@class="song-info"]/div[@class="info"]'
                                     '/div[@class="song"]/span[@class="artist"]'
                                     '/span[@class="author_list"]/a/@title').extract()
            # Map each row index to its chart title (10 rows per chart).
            item['top'] = title[i // 10] + top[0]
            item['music_name'] = '《' + song_title[0] + '》'
            item['songer'] = songer[0] if songer else ''
            yield item
四.数据库显示
爬取成功