# 使用 Scrapy 框架爬取猫眼电影，递归跟进分页链接以抓取全部页面。
import scrapy,re
from jobmaoyan.items import JobmaoyanItem
class MaoyanSpider(scrapy.Spider):
    """Spider that crawls Maoyan movie listings (showType=3).

    For every movie on a listing page it schedules a detail-page request
    (``parse_detail``) that fills and yields a ``JobmaoyanItem``; pagination
    links are followed recursively, deduplicated through ``page_set``.
    """

    name = 'maoyan_spider'
    allowed_domains = ['maoyan.com']
    start_urls = ['http://maoyan.com/films?showType=3']
    # Pagination URLs already scheduled. Class-level so the recursive
    # parse() invocations share one set and never request a page twice.
    page_set = set()

    def parse(self, response):
        """Parse one listing page: yield detail requests, then follow pagination."""
        # The active type tag is identical for every movie on the page, so
        # extract it once instead of once per <dd> (it was loop-invariant).
        active_type = response.xpath(
            "//li[@class='tags-line']/ul[@class='tags']"
            "/li[@class='active']/a[starts-with(@href,'javascript')]/text()"
        ).extract_first(default='')
        for dd in response.xpath("//dd"):
            item = JobmaoyanItem()
            # extract_first(default=...) avoids the IndexError that
            # ``extract()[0]`` raises when the node is missing.
            item['title'] = dd.xpath(
                "div[@class='channel-detail movie-item-title']/a/text()"
            ).extract_first(default='')
            item['imgurl'] = dd.xpath(
                "div[@class='movie-item']/a[@target='_blank']"
                "/div[@class='movie-poster']/img[2]/@data-src"
            ).extract_first(default='')
            item['types'] = active_type
            detail_path = dd.xpath(
                "div[@class='movie-item']/a/@href").extract_first()
            if detail_path is None:
                continue  # entry without a link: nothing to follow
            yield scrapy.Request(
                url="http://maoyan.com" + detail_path,
                callback=self.parse_detail,
                meta={"data": item},
            )
        # 获取页码网址递归循环实现获取全部的页码 — follow each pagination
        # link exactly once; new pages re-enter parse() recursively.
        page_hrefs = response.xpath(
            "//a[starts-with(@href,'?showType=3&offset=')]/@href").extract()
        for pageurl in page_hrefs:
            if pageurl not in self.page_set:
                self.page_set.add(pageurl)
                yield scrapy.Request(
                    url='http://maoyan.com/films' + pageurl,
                    callback=self.parse,
                )

    def parse_detail(self, response):
        """Fill detail fields and the first three comments, then yield the item."""
        item = response.meta['data']
        brief = "//div[@class='movie-brief-container']/ul/li[%d]/text()"
        item['d_type'] = response.xpath(brief % 1).extract_first(default='')
        # Country text comes with layout whitespace/newlines; strip them all.
        item['d_country'] = re.sub(
            r"\s", "", response.xpath(brief % 2).extract_first(default=''))
        item['d_stime'] = response.xpath(brief % 3).extract_first(default='')
        item['d_content'] = response.xpath(
            "//div[@class='mod-content']/span[@class ='dra']/text()"
        ).extract_first(default='')
        comment = ("//div[@class='comment-list-container']/ul/li[%d]"
                   "/div[@class='main']/div[@class='comment-content']/text()")
        for i in (1, 2, 3):
            item['comment%d' % i] = response.xpath(
                comment % i).extract_first(default='')
        yield item
# 将数据写入 MySQL 的管道位于 pipelines_mysql.py；
# 需要在 settings.py 的 ITEM_PIPELINES 中注册并启用它，例如：
# Scrapy pipeline registration (belongs in settings.py). Only the MySQL
# pipeline is enabled; the value (0-1000) is the execution order, lower
# numbers run first. Uncomment an alternative line to switch output targets.
ITEM_PIPELINES = {
# 'jobmaoyan.pipelines.JobmaoyanPipeline': 300,
# 'jobmaoyan.pipelines_txt.JobmaoyanPipeline': 300,
'jobmaoyan.pipelines_mysql.JobmaoyanPipeline': 300,
# 'jobmaoyan.pipelines_json.JobmaoyanPipeline': 300,
# 'jobmaoyan.pipelines_xls.JobmaoyanPipeline': 300,
# 'jobmaoyan.pipelines_mongdb.JobmaoyanPipeline': 300,
}
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
class JobmaoyanPipeline(object):
    """Scrapy item pipeline that stores each scraped movie row in MySQL."""

    def process_item(self, item, spider):
        """Insert one item into the ``catmovies`` table and return it.

        Creates the table on first use (``IF NOT EXISTS`` makes the call
        idempotent). A failed insert is rolled back and the item is still
        returned, preserving the original best-effort behaviour.
        """
        # Keyword arguments: pymysql >= 1.0 no longer accepts positional
        # connection parameters.
        db = pymysql.connect(host='127.0.0.1', user='root',
                             password='123456', database='jobbole')
        try:
            cursor = db.cursor()
            cursor.execute(
                "create table if not exists catmovies("
                "id int primary key auto_increment,"
                "types text,title text,imgurl text,type text,country text,"
                "stime text,content text,"
                "comment1 text,comment2 text,comment3 text)")
            # Parameterized query: the driver escapes the values, so titles
            # or comments containing quotes no longer break the SQL, and it
            # removes the injection risk of %-formatted statements.
            insert_sql = (
                "insert into catmovies"
                "(types,title,imgurl,type,country,stime,content,"
                "comment1,comment2,comment3)"
                " values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            try:
                cursor.execute(insert_sql, (
                    item['types'], item['title'], item['imgurl'],
                    item['d_type'], item['d_country'], item['d_stime'],
                    item['d_content'], item['comment1'], item['comment2'],
                    item['comment3']))
                db.commit()
            except pymysql.MySQLError:
                # Narrow catch (was a bare ``except:``): only database
                # errors trigger the rollback; anything else propagates.
                db.rollback()
            cursor.close()
        finally:
            # Always release the connection, even if table creation fails.
            db.close()
        return item