# 使用 Scrapy 框架爬取猫眼电影，递归跟进分页链接以抓取全部页面。
import scrapy,re
from jobmaoyan.items import JobmaoyanItem
class MaoyanSpider(scrapy.Spider):
    """Spider that crawls Maoyan movie listings (showType=3).

    For every movie on a listing page it schedules a detail-page request
    (``parse_detail``) that fills and yields a ``JobmaoyanItem``; pagination
    links are followed recursively, deduplicated through ``page_set``.
    """

    name = 'maoyan_spider'
    allowed_domains = ['maoyan.com']
    start_urls = ['http://maoyan.com/films?showType=3']
    # Pagination URLs already scheduled. Class-level so the recursive
    # parse() invocations share one set and never request a page twice.
    page_set = set()

    def parse(self, response):
        """Parse one listing page: yield detail requests, then follow pagination."""
        # The active type tag is identical for every movie on the page, so
        # extract it once instead of once per <dd> (it was loop-invariant).
        active_type = response.xpath(
            "//li[@class='tags-line']/ul[@class='tags']"
            "/li[@class='active']/a[starts-with(@href,'javascript')]/text()"
        ).extract_first(default='')
        for dd in response.xpath("//dd"):
            item = JobmaoyanItem()
            # extract_first(default=...) avoids the IndexError that
            # ``extract()[0]`` raises when the node is missing.
            item['title'] = dd.xpath(
                "div[@class='channel-detail movie-item-title']/a/text()"
            ).extract_first(default='')
            item['imgurl'] = dd.xpath(
                "div[@class='movie-item']/a[@target='_blank']"
                "/div[@class='movie-poster']/img[2]/@data-src"
            ).extract_first(default='')
            item['types'] = active_type
            detail_path = dd.xpath(
                "div[@class='movie-item']/a/@href").extract_first()
            if detail_path is None:
                continue  # entry without a link: nothing to follow
            yield scrapy.Request(
                url="http://maoyan.com" + detail_path,
                callback=self.parse_detail,
                meta={"data": item},
            )
        # 获取页码网址递归循环实现获取全部的页码 — follow each pagination
        # link exactly once; new pages re-enter parse() recursively.
        page_hrefs = response.xpath(
            "//a[starts-with(@href,'?showType=3&offset=')]/@href").extract()
        for pageurl in page_hrefs:
            if pageurl not in self.page_set:
                self.page_set.add(pageurl)
                yield scrapy.Request(
                    url='http://maoyan.com/films' + pageurl,
                    callback=self.parse,
                )

    def parse_detail(self, response):
        """Fill detail fields and the first three comments, then yield the item."""
        item = response.meta['data']
        brief = "//div[@class='movie-brief-container']/ul/li[%d]/text()"
        item['d_type'] = response.xpath(brief % 1).extract_first(default='')
        # Country text comes with layout whitespace/newlines; strip them all.
        item['d_country'] = re.sub(
            r"\s", "", response.xpath(brief % 2).extract_first(default=''))
        item['d_stime'] = response.xpath(brief % 3).extract_first(default='')
        item['d_content'] = response.xpath(
            "//div[@class='mod-content']/span[@class ='dra']/text()"
        ).extract_first(default='')
        comment = ("//div[@class='comment-list-container']/ul/li[%d]"
                   "/div[@class='main']/div[@class='comment-content']/text()")
        for i in (1, 2, 3):
            item['comment%d' % i] = response.xpath(
                comment % i).extract_first(default='')
        yield item
# 将数据写入 MySQL 的管道位于 pipelines_mysql.py；
# 需要在 settings.py 的 ITEM_PIPELINES 中注册并启用它，例如：
# Scrapy pipeline registration (belongs in settings.py). Only the MySQL
# pipeline is enabled; the value (0-1000) is the execution order, lower
# numbers run first. Uncomment an alternative line to switch output targets.
ITEM_PIPELINES = {
# 'jobmaoyan.pipelines.JobmaoyanPipeline': 300,
# 'jobmaoyan.pipelines_txt.JobmaoyanPipeline': 300,
'jobmaoyan.pipelines_mysql.JobmaoyanPipeline': 300,
# 'jobmaoyan.pipelines_json.JobmaoyanPipeline': 300,
# 'jobmaoyan.pipelines_xls.JobmaoyanPipeline': 300,
# 'jobmaoyan.pipelines_mongdb.JobmaoyanPipeline': 300,
}
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
class JobmaoyanPipeline(object):
    """Scrapy item pipeline that stores each scraped movie row in MySQL."""

    def process_item(self, item, spider):
        """Insert one item into the ``catmovies`` table and return it.

        Creates the table on first use (``IF NOT EXISTS`` makes the call
        idempotent). A failed insert is rolled back and the item is still
        returned, preserving the original best-effort behaviour.
        """
        # Keyword arguments: pymysql >= 1.0 no longer accepts positional
        # connection parameters.
        db = pymysql.connect(host='127.0.0.1', user='root',
                             password='123456', database='jobbole')
        try:
            cursor = db.cursor()
            cursor.execute(
                "create table if not exists catmovies("
                "id int primary key auto_increment,"
                "types text,title text,imgurl text,type text,country text,"
                "stime text,content text,"
                "comment1 text,comment2 text,comment3 text)")
            # Parameterized query: the driver escapes the values, so titles
            # or comments containing quotes no longer break the SQL, and it
            # removes the injection risk of %-formatted statements.
            insert_sql = (
                "insert into catmovies"
                "(types,title,imgurl,type,country,stime,content,"
                "comment1,comment2,comment3)"
                " values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            try:
                cursor.execute(insert_sql, (
                    item['types'], item['title'], item['imgurl'],
                    item['d_type'], item['d_country'], item['d_stime'],
                    item['d_content'], item['comment1'], item['comment2'],
                    item['comment3']))
                db.commit()
            except pymysql.MySQLError:
                # Narrow catch (was a bare ``except:``): only database
                # errors trigger the rollback; anything else propagates.
                db.rollback()
            cursor.close()
        finally:
            # Always release the connection, even if table creation fails.
            db.close()
        return item