1.0 Creating a crawler
1.1 Create a project
scrapy startproject mySpider
1.2 Create the spiders
# Create a spider based on the scrapy.Spider class
scrapy genspider spiderName "domain.com"
# Create a spider based on the scrapy.spiders.CrawlSpider class
scrapy genspider -t crawl spiderName "domain.com"
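The generated file lands in mySpider/spiders/ and, for the plain Spider template, looks roughly like this (the exact stub varies a little between Scrapy versions):

# mySpider/spiders/spiderName.py -- approximately what `scrapy genspider` produces
import scrapy

class SpidernameSpider(scrapy.Spider):
    name = 'spiderName'
    allowed_domains = ['domain.com']
    start_urls = ['http://domain.com/']

    def parse(self, response):
        # fill in the extraction logic here
        pass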
1.3 File structure
What the main files do (the resulting directory layout is sketched after this list):
- scrapy.cfg: the project's configuration file
- mySpider/: the project's Python package; code is imported from here
- mySpider/items.py: defines the target data structures (Items)
- mySpider/pipelines.py: the project's item pipelines
- mySpider/settings.py: the project's settings
- mySpider/spiders/: the directory that holds the spider code
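The directory layout created by startproject looks like this (newer Scrapy versions also generate middlewares.py):

mySpider/
    scrapy.cfg
    mySpider/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py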
2.0 The spider (SpiderName)
A spider issues requests, receives the responses, and then either follows URLs found in the response or extracts data from the results.
Spiders are usually built on one of two classes (a short sketch contrasting them follows this list):
- Spider
- CrawlSpider
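As a rough, minimal contrast (the class and spider names below are placeholders, not part of this project): a plain Spider follows links by yielding Requests yourself inside parse(), whereas a CrawlSpider declares Rule/LinkExtractor pairs that follow links automatically and should not override parse() at all.

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class PlainSpider(scrapy.Spider):
    name = 'plain_example'                      # placeholder name
    start_urls = ['http://example.com/']

    def parse(self, response):
        # extract data here, then follow links by hand
        yield scrapy.Request('http://example.com/page/2', callback=self.parse)

class RuleSpider(CrawlSpider):
    name = 'rule_example'                       # placeholder name
    start_urls = ['http://example.com/']
    rules = (
        # links matched by the LinkExtractor are requested automatically;
        # use a callback other than parse(), which CrawlSpider reserves for itself
        Rule(LinkExtractor(allow=r'/page/\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        pass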
2.1 A spider based on Spider
# tencent.py
from mySpider.items import TencentItem
import scrapy
import re
class TencentSpider(scrapy.Spider):
    name = "tencent"
    allowed_domains = ["hr.tencent.com"]
    start_urls = [
        "http://hr.tencent.com/position.php?&start=0#a"
    ]

    def parse(self, response):
        for each in response.xpath('//*[@class="even"]'):
            item = TencentItem()
            name = each.xpath('./td[1]/a/text()').extract()[0]
            detailLink = each.xpath('./td[1]/a/@href').extract()[0]
            positionInfo = each.xpath('./td[2]/text()').extract()[0]
            peopleNumber = each.xpath('./td[3]/text()').extract()[0]
            workLocation = each.xpath('./td[4]/text()').extract()[0]
            publishTime = each.xpath('./td[5]/text()').extract()[0]
            # print(name, detailLink, positionInfo, peopleNumber, workLocation, publishTime)
            item['name'] = name.encode('utf-8')
            item['detailLink'] = detailLink.encode('utf-8')
            item['positionInfo'] = positionInfo.encode('utf-8')
            item['peopleNumber'] = peopleNumber.encode('utf-8')
            item['workLocation'] = workLocation.encode('utf-8')
            item['publishTime'] = publishTime.encode('utf-8')
            # build the next page's URL: the start= offset grows by 10 per page
            curpage = re.search(r'(\d+)', response.url).group(1)
            page = int(curpage) + 10
            url = re.sub(r'\d+', str(page), response.url)
            # enqueue a request for the next page, with self.parse as its callback
            # (duplicate requests are filtered out by Scrapy's scheduler)
            yield scrapy.Request(url, callback=self.parse)
            # hand the extracted data to the pipeline
            yield item
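The spider above imports TencentItem from mySpider/items.py. That class is not shown in this note, but judging from the fields assigned above it would presumably look something like this (an assumed sketch, not the original file):

# items.py -- assumed TencentItem definition matching the fields used above
import scrapy

class TencentItem(scrapy.Item):
    name = scrapy.Field()
    detailLink = scrapy.Field()
    positionInfo = scrapy.Field()
    peopleNumber = scrapy.Field()
    workLocation = scrapy.Field()
    publishTime = scrapy.Field()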
2.2 A spider based on CrawlSpider
Scraping the Douban Movie Top 250 list as an example.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from mySpider.items import doubanItem
class Douban250Spider(CrawlSpider):
    # spider name
    name = 'douban250'
    # allowed domains
    allowed_domains = ['douban.com']
    # start URL
    start_urls = ['https://movie.douban.com/top250']

    # LinkExtractor that picks out the pagination links, using a regex
    pageLink = LinkExtractor(allow=(r'start=\d+&filter='))

    # rules for handling the extracted links: each sets a callback and whether to keep following
    rules = (
        Rule(pageLink, callback='parse_item', follow=True),
        # Rule(pageLink, callback='parse_item', process_links='deal_links', follow=True),
    )

    # parse one result page; receives the response
    def parse_item(self, response):
        # grab every movie entry with XPath
        content = response.xpath('//div[@class="item"]')
        # loop over the entries
        for i in content:
            # doubanItem is defined in ../items.py
            item = doubanItem()
            # extract the fields with XPath
            item['rank'] = i.xpath('div/em/text()').extract()[0]
            item['title'] = i.xpath('div//div[@class="hd"]/a/span/text()').extract()[0]
            item['url'] = i.xpath('div//div[@class="hd"]/a/@href').extract()[0]
            item['rating_num'] = i.xpath('div//span[@class="rating_num"]/text()').extract()[0]
            item['comment'] = i.xpath('div//span[@class="inq"]/text()').extract()
            text = i.xpath('div/div[@class="bd"]/p/text()').extract()
            item['date_year'] = text[1].split("/")[0].strip()[:4]
            item['country'] = text[1].split("/")[-2].strip().replace(' ', "、")
            item['genre'] = text[1].split("/")[-1].strip().replace(' ', "、")
            # simple post-processing: fall back to a placeholder when there is no quote
            if len(item['comment']) == 0:
                item['comment'] = "暂无简介!"  # i.e. "no quote available"
            else:
                item['comment'] = item['comment'][0]
            # yield each item to the pipeline
            yield item
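Either spider is started from the project root with scrapy crawl and the spider's name; the built-in feed export (-o) is a quick way to inspect the items before a pipeline is wired up:

scrapy crawl tencent
scrapy crawl douban250 -o douban250.json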
3.0 items.py
Defines the data models, i.e. the structure in which the scraped data is stored.
import scrapy
class doubanItem(scrapy.Item):
    rank = scrapy.Field()
    date_year = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
    country = scrapy.Field()
    genre = scrapy.Field()
    rating_num = scrapy.Field()
    comment = scrapy.Field()
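A scrapy.Item behaves much like a dict, except that only declared Fields can be set; a quick illustration (the values here are placeholders):

item = doubanItem()
item['title'] = 'example title'   # fine: 'title' is a declared Field
# item['director'] = '...'        # would raise KeyError: no such field declared
print(dict(item))                 # {'title': 'example title'}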
4.0 Pipelines: pipelines.py
# -*- coding: utf-8 -*-
# codecs lets files be opened with an explicit encoding
import codecs
import json
import MySQLdb


class doubanPipline(object):
    def __init__(self):
        # open the output file
        # self.filename = codecs.open('./outfiles/doubanTop250.json', 'w', encoding='utf-8')
        # open the database connection
        self.db = MySQLdb.connect(host='localhost', port=3306, user='root', passwd='root',
                                  db='spider', charset='utf8', use_unicode=True)
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        # write to MySQL (`rank` is backquoted because it is a reserved word in newer MySQL versions)
        self.cursor.execute(
            """insert into doubantop250 (`rank`,title,url,rating_num,comment,date_year,country,genre)
               values (%s,%s,%s,%s,%s,%s,%s,%s)""",
            [int(item['rank']), item['title'], item['url'], item['rating_num'],
             item['comment'], item['date_year'], item['country'], item['genre']])
        self.db.commit()
        # write to a file instead
        # content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        # self.filename.write(content)
        return item

    def close_spider(self, spider):
        # close_spider is called automatically by Scrapy when the spider finishes
        # close the database connection
        self.db.close()
        # close the file
        # self.filename.close()
5.0 Configuration: settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for mySpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'mySpider'
SPIDER_MODULES = ['mySpider.spiders']
NEWSPIDER_MODULE = 'mySpider.spiders'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'mySpider.middlewares.MyspiderSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'mySpider.middlewares.MyspiderDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'mySpider.pipelines.MyspiderPipeline': 300,
    # "mySpider.pipelines.ItcastJsonPipeline": 300,
    # "mySpider.pipelines.tencentJsonPipline": 300,
    # "mySpider.pipelines.sunJsonPipeline": 300,
    "mySpider.pipelines.doubanPipline": 300,
}
#
# LOG_FILE = "../Log/SUN.log"
# LOG_LEVEL = "DEBUG"
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'