首先抓包分析,所需的整页数据都在"main"下,再细分每一部的信息。
每一部电影的信息在"board-item-main"下。
接下来我们可以去找到目标数据并想好匹配方法了。
我们所需的四种数据就找到了,接下来可以去动手了。
一.编写Item
class MaoyanItem(scrapy.Item):
    """Container for one Maoyan board entry scraped by MaoyanSpider."""
    top = scrapy.Field()          # board label + rank string (built in the spider)
    title = scrapy.Field()        # movie title
    star = scrapy.Field()         # starring actors line, whitespace stripped
    releasetime = scrapy.Field()  # release date; may be '' when absent on the page
    # NOTE: the original trailing `pass` was redundant (the class body is
    # non-empty) and has been removed.
二.编写Pipelines
MySQL
class MycrawlPipeline(object):
    """Scrapy pipeline that persists Maoyan movie items into a MySQL table."""

    def __init__(self):
        # Connect to the local MySQL instance.
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='1likePython',
                                    db='TESTDB', charset='utf8')
        # Create the cursor object. The original code called
        # self.cursor.execute(...) without ever creating self.cursor,
        # which raises AttributeError on the first query.
        self.cursor = self.conn.cursor()
        # Clear previous results. The original truncated the unrelated
        # 'Music' table while every insert targets 'Maoyan'; truncate the
        # table that is actually written to.
        self.cursor.execute('truncate table Maoyan')
        self.conn.commit()

    def process_item(self, item, spider):
        """Insert one movie record; on a DB error, print the offending item."""
        try:
            self.cursor.execute("insert into Maoyan (top,title,star,releasetime) \
                VALUES (%s,%s,%s,%s)", (item['top'], item['title'], item['star'], item['releasetime']))
            self.conn.commit()
        except pymysql.Error:
            print("Error%s,%s,%s,%s" % (item['top'], item['title'], item['star'], item['releasetime']))
        return item

    def close_spider(self, spider):
        # Release DB resources when the spider finishes (the original
        # leaked the connection).
        self.cursor.close()
        self.conn.close()
MongoDB
class MycrawlPipeline(object):
    """Scrapy pipeline that stores Maoyan movie items in MongoDB."""

    def __init__(self):
        # Connect to the local MongoDB instance.
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        # Select database and collection (these are handles, not cursors,
        # despite the original comment).
        self.test = self.client['TESTDB']
        self.post4 = self.test['maoyan']

    def process_item(self, item, spider):
        """Insert one movie document and pass the item through."""
        # Collection.insert() was deprecated in pymongo 3 and removed in
        # pymongo 4; insert_one is the supported single-document API.
        self.post4.insert_one(dict(item))
        return item
三.编写Spider
import scrapy
from scrapy.selector import Selector
from scrapy.http import Request
from Mycrawl.items import MaoyanItem
class MaoyanSpider(scrapy.Spider):
    """Crawl several Maoyan top-movie boards and yield one item per movie."""
    name = 'maoyan'
    # allowed_domains = ['maoyan.com']
    pagelist = [7, 6, 1, 2, 4]  # board ids to crawl

    def start_requests(self):
        """Issue one request per board page."""
        for page_id in self.pagelist:
            # Use a local variable: the original stored the URL on self
            # inside the loop for no reason.
            url = 'http://maoyan.com/board/{page}'.format(page=page_id)
            yield Request(url, callback=self.parse)

    def parse(self, response):
        """Extract rank, title, cast and release date for every movie on the page."""
        selector = Selector(response)
        active = selector.xpath('//ul[@class="navbar"]/li/a[@class="active"]/text()').extract()
        tops = selector.xpath('//dd/i/text()').extract()
        movies = selector.xpath('//div[@class="movie-item-info"]')
        for i, content in enumerate(movies):
            # BUG FIX: create a fresh item per movie. The original built a
            # single MaoyanItem before the loop and mutated it on every
            # iteration, so all yielded references pointed at one object.
            item = MaoyanItem()
            title = content.xpath('p[@class="name"]/a/text()').extract()
            star = content.xpath('p[2]/text()').extract()
            releasetime = content.xpath('p[3]/text()').extract()
            item['top'] = active[-1] + '第' + tops[i]
            # Guard empty extractions (the original guarded only
            # releasetime and would raise IndexError on title/star).
            item['title'] = title[0] if title else ''
            item['star'] = star[0].replace(' ', '').replace('\n', '') if star else ''
            if releasetime:
                item['releasetime'] = releasetime[0].replace(' ', '').replace('\n', '')
            else:
                item['releasetime'] = ''
            yield item
四.结果展示