Python: A Hands-On Scrapy Crawler Mini-Project

Step 1: Create the project from the command line (cmd)
>> scrapy startproject spidermovie
>> cd spidermovie
>> scrapy genspider moviespider www.yy6080.cn

Creation result:

(screenshot of the generated project structure)
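
For orientation, the spider file produced by `scrapy genspider` starts out roughly like this; the parse method gets filled in during Step 2:

# spidermovie/spiders/moviespider.py (as generated)
# -*- coding: utf-8 -*-
import scrapy


class MoviespiderSpider(scrapy.Spider):
    name = 'moviespider'
    allowed_domains = ['www.yy6080.cn']
    start_urls = ['http://www.yy6080.cn/']

    def parse(self, response):
        pass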

Step 2: Write the spider
# -*- coding: utf-8 -*-
import scrapy
from spidermovie.items import SpidermovieItem


class MoviespiderSpider(scrapy.Spider):
    name = 'moviespider'
    # allowed_domains = ['www.yy6080.cn']
    start_urls = ['http://www.yy6080.cn/vodtypehtml/1.html']

    def parse(self, response):
        # Selector list of all movie cards on the list page
        movItems = response.xpath("//div[@class='movie-item']")
        movLen = len(movItems)
        movCount = 0

        # The "next page" link is the same for every card, so resolve it once
        nextPage = ""
        nextHrefs = response.xpath("//a[@class='pagelink_a']/@href").extract()
        nextTexts = response.xpath("//a[@class='pagelink_a']/text()").extract()
        if len(nextTexts) >= 2 and nextTexts[-2] == "下一页":
            nextPage = "http://www.yy6080.cn" + nextHrefs[-2]

        for movItem in movItems:
            movCount += 1
            sItem = SpidermovieItem()
            # Movie title
            movieName = movItem.xpath("div[@class='meta']/div/a/text()")
            if movieName:
                sItem['movName'] = movieName.extract()[0].strip()
            # Movie score
            movieScore = movItem.xpath("div[@class='meta']/div/span/text()")
            if movieScore:
                sItem['movScore'] = movieScore.extract()[0].strip()
            # Movie type
            movieType = movItem.xpath("div[@class='meta']/div[@class='otherinfo']/text()")
            if movieType:
                sItem['movType'] = movieType.extract()[0].strip()
            # Detail-page link (relative href, so prepend the site root)
            movieLink = movItem.xpath("div[@class='meta']/div/a/@href")
            if movieLink:
                sItem['movLink'] = "http://www.yy6080.cn" + movieLink.extract()[0].strip()

            if nextPage:
                sItem['nextPage1'] = nextPage

            # Only follow the detail page when all list-page fields were found
            if movieName and movieScore and movieType and movieLink:
                yield scrapy.Request(url=sItem['movLink'], callback=self.parsesecond,
                                     meta={'item': sItem, 'movLen': movLen, 'movCount': movCount},
                                     dont_filter=True)

    # Parse the detail (second-level) page
    def parsesecond(self, response):
        sItem = response.meta['item']
        movLen = response.meta['movLen']
        movCount = response.meta['movCount']

        # Each table row holds one label cell (e.g. "导演") and its value cells
        rows = response.xpath("//tbody/tr")
        for row in rows:
            text1 = ""
            text2 = ""
            tt1 = row.xpath("td[@class='span2']/span/text()")
            if tt1:
                text1 = tt1.extract()[0].strip()
            # The value is either a list of links or plain text
            tt2 = row.xpath("td/a/text()")
            if tt2:
                text2 = tt2.extract()
            else:
                tt2 = row.xpath("td/text()")
                if tt2:
                    text2 = tt2.extract()

            # Director
            if text1 == "导演":
                sItem['movie_director'] = " / ".join(t.strip() for t in text2) if text2 else None
            # Screenwriter
            elif text1 == "编剧":
                sItem['movie_screenwriter'] = " / ".join(t.strip() for t in text2) if text2 else None
            # Genre
            elif text1 == "类型":
                sItem['movie_type'] = text2[0] if text2 else None
            # Country of production
            elif text1 == "制片国家":
                sItem['movie_country'] = text2[0] if text2 else None
            # Language
            elif text1 == "语言":
                sItem['movie_language'] = text2[0] if text2 else None
            # Release date (keep the part after the colon, if any)
            elif text1 == "上映时间":
                sItem['movie_showtime'] = text2[-1].split(':')[-1].strip() if text2 else None
            # Score
            elif text1 == "评分":
                sItem['movie_score'] = text2[0] if text2 else None

            # Main cast and plot summary (absolute XPaths, searched on the whole page)
            movie_tostar = row.xpath("//td[@id='casts']/a/text()")
            if movie_tostar:
                sItem['movie_tostar'] = movie_tostar.extract()[0].strip()
            movie_plot = row.xpath("//div[@class='col-md-8']/p/text()")
            if movie_plot:
                sItem['movie_plot'] = movie_plot.extract()[0].strip()

        yield sItem

        # When the last item of the current list page has been handled,
        # move on to the next list page
        if movLen == movCount and sItem.get('nextPage1'):
            print("Crawling the next page! " * 10)
            print("--***--" * 10)
            yield scrapy.Request(sItem['nextPage1'], self.parse, dont_filter=True)
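
Before running the full spider, the XPath expressions above can be tried out interactively with `scrapy shell` (a minimal session, output omitted):

>> scrapy shell "http://www.yy6080.cn/vodtypehtml/1.html"
>>> response.xpath("//div[@class='movie-item']")                 # movie cards on the list page
>>> response.xpath("//a[@class='pagelink_a']/text()").extract()  # pagination link texts, e.g. "下一页"
>>> exit()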
Step 3: Define the items
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class SpidermovieItem(scrapy.Item):
    # Fields scraped from the list page
    movName = scrapy.Field()    # title
    movScore = scrapy.Field()   # score
    movLink = scrapy.Field()    # detail-page link
    movType = scrapy.Field()    # type
    nextPage1 = scrapy.Field()  # URL of the next list page

    # Fields scraped from the detail page
    movie_director = scrapy.Field()      # director
    movie_screenwriter = scrapy.Field()  # screenwriter
    movie_tostar = scrapy.Field()        # main cast
    movie_type = scrapy.Field()          # genre
    movie_country = scrapy.Field()       # country of production
    movie_language = scrapy.Field()      # language
    movie_showtime = scrapy.Field()      # release date
    movie_score = scrapy.Field()         # score
    movie_plot = scrapy.Field()          # plot summary
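
An Item instance behaves like a dict that only accepts the declared fields, which is why every scraped value in the spider goes through one of the names above. A quick illustration:

item = SpidermovieItem()
item['movName'] = 'Example'   # fine: movName is a declared field
# item['foo'] = 1             # would raise KeyError: undeclared field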
Step 4: Write the pipeline (pipelines.py)
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from .movdao.moviepositiondao import MovPositionDao
class SpidermoviePipeline(object):
    def process_item(self, item, spider):
        movPositionDao = MovPositionDao()

        movPositionDao.create((item['movName'],
                               item['movScore'],
                               item['movLink'],
                               item['movType']))

        movPositionDao.createdatil((item['movie_director'], item['movie_screenwriter'],
                                    item['movie_tostar'], item['movie_type'],
                                    item['movie_country'], item['movie_language'],
                                    item['movie_showtime'], item['movie_score'],
                                    item['movie_plot']))
        print("通过管道输出!!!")
        print(item['movName'])
        print(item['movScore'])
        print(item['movLink'])
        print(item['movType'])
        print('*************')
        print(item['movie_director'])
        print(item['movie_screenwriter'])
        print(item['movie_tostar'])
        print(item['movie_type'])
        print(item['movie_country'])
        print(item['movie_language'])
        print(item['movie_showtime'])
        print(item['movie_score'])
        print(item['movie_plot'])

        return item
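
Note that the spider only fills in detail fields when the matching row exists on the detail page, so `item['movie_tostar']` and similar lookups above can raise a KeyError for sparse pages. A hedged variant (the class name `SpidermovieDBPipeline` is only illustrative, and it would need its own entry in `ITEM_PIPELINES` to be used) that stores NULL for missing fields:

from .movdao.moviepositiondao import MovPositionDao


class SpidermovieDBPipeline(object):
    """Like SpidermoviePipeline, but tolerates missing detail fields."""

    def process_item(self, item, spider):
        dao = MovPositionDao()
        # Item supports .get(), so absent fields become None (NULL in MySQL)
        dao.create((item.get('movName'), item.get('movScore'),
                    item.get('movLink'), item.get('movType')))
        dao.createdatil((item.get('movie_director'), item.get('movie_screenwriter'),
                         item.get('movie_tostar'), item.get('movie_type'),
                         item.get('movie_country'), item.get('movie_language'),
                         item.get('movie_showtime'), item.get('movie_score'),
                         item.get('movie_plot')))
        return item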
Step 5: Enable the required pipeline and settings in settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for spidermovie project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'spidermovie'

SPIDER_MODULES = ['spidermovie.spiders']
NEWSPIDER_MODULE = 'spidermovie.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'spidermovie (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'spidermovie.middlewares.SpidermovieSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'spidermovie.middlewares.SpidermovieDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'spidermovie.pipelines.SpidermoviePipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Logging configuration
LOG_LEVEL = 'ERROR'
LOG_FILE = 'jobspider.log'

Step 6: Create the database-access files

Creation result:

(screenshot of the created files)

Create the MovDao class (moviedao.py)
import pymysql
import json
import os


class MovDao():
    def __init__(self, configPath='pymysql.json'):
        self.__connection = None
        self.__cursor = None
        # Load the database connection settings from a JSON config file
        # located next to this module
        self.__config = json.load(open(os.path.dirname(__file__) + os.sep + configPath, 'r'))

    # Get (and lazily create) the database connection
    def getConnection(self):
        # Reuse the existing connection if there is one
        if self.__connection:
            return self.__connection
        # Otherwise open a new connection
        try:
            self.__connection = pymysql.connect(**self.__config)
            return self.__connection
        except pymysql.MySQLError as e:
            print("Exception" + str(e))

    # Generic method for executing SQL; parameters are passed separately
    # so pymysql can escape them (guards against SQL injection)
    def execute(self, sql, params):
        try:
            self.__cursor = self.getConnection().cursor()
            # For DELETE/UPDATE/INSERT, execute() returns the number of affected rows
            if params:
                result = self.__cursor.execute(sql, params)
            else:
                result = self.__cursor.execute(sql)
            return result
        except (pymysql.MySQLError, pymysql.DatabaseError, Exception) as e:
            print("Database access error: " + str(e))
            self.rollback()

    def fetch(self):
        if self.__cursor:
            return self.__cursor.fetchall()

    def commit(self):
        if self.__connection:
            self.__connection.commit()

    def rollback(self):
        if self.__connection:
            self.__connection.rollback()

    def getLastRowId(self):
        if self.__cursor:
            return self.__cursor.lastrowid

    def close(self):
        if self.__cursor:
            self.__cursor.close()
        if self.__connection:
            self.__connection.close()
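
To sanity-check the DAO and the JSON config outside of Scrapy, a tiny standalone run is enough (hypothetical snippet, run from the movdao directory, assuming the database named in pymysql.json already exists):

from moviedao import MovDao

dao = MovDao()
dao.execute("select 1", None)   # just proves the connection and cursor work
print(dao.fetch())              # expected: ((1,),)
dao.close()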

Create the MovPositionDao class (moviepositiondao.py)
from .moviedao import MovDao


# Data-access class for the movie tables
class MovPositionDao(MovDao):
    def __init__(self):
        super().__init__()

    # Insert one row of list-page data
    def create(self, params):
        sql = "insert into mov_position (mov_name, mov_score, mov_link, mov_type) " \
              "values (%s, %s, %s, %s)"
        result = self.execute(sql, params)
        self.commit()
        return result

    # Insert one row of detail-page data
    def createdatil(self, params):
        sql = "insert into movie_position_data (movie_director, movie_screenwriter," \
              " movie_tostar, movie_type, movie_country, movie_language, movie_showtime," \
              " movie_score, movie_plot) " \
              "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
        result = self.execute(sql, params)
        self.commit()
        self.close()
        return result
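
The two tables these INSERT statements target must exist in the `db_mov_data` database beforehand. Only the table and column names come from the DAO above; the column types in this one-off creation sketch are assumptions, so adjust them to your data:

import pymysql

# Assumed schema; only the table and column names are taken from the DAO above.
DDL = [
    """create table if not exists mov_position (
           id int primary key auto_increment,
           mov_name varchar(255), mov_score varchar(32),
           mov_link varchar(512), mov_type varchar(255)
       )""",
    """create table if not exists movie_position_data (
           id int primary key auto_increment,
           movie_director varchar(255), movie_screenwriter varchar(255),
           movie_tostar varchar(255), movie_type varchar(64),
           movie_country varchar(64), movie_language varchar(64),
           movie_showtime varchar(64), movie_score varchar(32),
           movie_plot text
       )""",
]

conn = pymysql.connect(host='127.0.0.1', user='root', password='root',
                       database='db_mov_data', port=3306, charset='utf8')
with conn.cursor() as cursor:
    for stmt in DDL:
        cursor.execute(stmt)
conn.commit()
conn.close()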
Create the JSON file with the database connection settings (pymysql.json)
{"host": "127.0.0.1",
  "user":"root",
  "password" :"root",
  "database":"db_mov_data",
  "port":3306,
  "charset":"utf8"}
Step 7: Create a launcher script for the spider
# Launcher script for the spider
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'moviespider'])
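
An alternative launcher, if running through `scrapy.cmdline` is inconvenient, is Scrapy's `CrawlerProcess`; this sketch assumes it is started from the project root so that `get_project_settings()` can find settings.py:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Equivalent to `scrapy crawl moviespider`, started from plain Python
process = CrawlerProcess(get_project_settings())
process.crawl('moviespider')
process.start()  # blocks until the crawl finishes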
Console output:

(screenshot of the console output)

Database write result:

(screenshots of the two database tables)
