Python 链接提取器 CrawlSpider

最新推荐文章于 2024-07-20 17:12:48 发布

岚天、

最新推荐文章于 2024-07-20 17:12:48 发布

阅读量489

点赞数

分类专栏： python 文章标签： python 爬虫 scrapy

本文链接：https://blog.csdn.net/weixin_46310452/article/details/126099412

版权

python 专栏收录该内容

7 篇文章 0 订阅

订阅专栏

作用：提取这种不知道有多少页的，数据的链接有规律的，

优点：可以用正则表达式，xpath，css等去获取有规则的url进行解析

简单Demo:

from scrapy.linkextractors import LinkExtractor
link = LinkExtractor(allow=r'book/1188_\d+.html')  # \d表示数字 \d+：表示多个数字
# xpath:    link1 = LinkExtrcator(restrict_xpaths=r'//div[@id=su]')
print(link.extract_links(response))

实际案例

1.创建项目

# 创建项目
scrapy staratproject dushuproject
# 跳转到spiders路径
cd dushuproject/dushuproject/spiders
# 创建爬虫类，项目
scrapy genspider -t crawl read www.dushu.com
# 设置配置 settings    创建实体:items   配置管道piplines

# allow：正则表达式 匹配链接url路径
# follow:限制页数，如果是True就是不限制，每一次循环的界面都找，相当于找到所有匹配的，而False:就匹配第一页response中的匹配链接来解析

read.py

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from read_book.items import ReadBookItem


class ReadSpider(CrawlSpider):
    name = 'read'
    allowed_domains = ['www.dushu.com']
    # 注意，缺少主页是因为表达式rules 正则不匹配，
    start_urls = ['https://www.dushu.com/book/1107_1.html']

    rules = (
        # allow：正则表达式 匹配链接url路径
        # follow:限制页数，如果是True就是不限制，每一次循环的界面都找，相当于找到所有匹配的，而False:就匹配第一页response中的匹配链接来解析
        Rule(LinkExtractor(allow=r'/book/1107_\d+.html'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        item = {}
        # print("++++++++++++++++++")     #//div[@class='bookslist']/ul/li//div//img/@src   data-original
        # url_list = response.xpath("//div[@class='bookslist']/ul/li//div//img/@data-original").extract()
        # name_list = response.xpath("//div[@class='bookslist']/ul/li//div//img/@alt").extract()
        img_list = response.xpath("//div[@class='bookslist']/ul/li//div//img")
        for img in img_list:
            src = img.xpath("./@data-original").extract_first()
            if src is None:
                src = img.xpath("./@src").extract_first()
            name = img.xpath("./@alt").extract_first()
            book = ReadBookItem(name=name,src=src)
            yield book

settings.py 开启写入mysql管道配置mysql信息

# Scrapy settings for read_book project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'read_book'

SPIDER_MODULES = ['read_book.spiders']
NEWSPIDER_MODULE = 'read_book.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'read_book (+http://www.yourdomain.com)'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'read_book.middlewares.ReadBookSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'read_book.middlewares.ReadBookDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'read_book.pipelines.ReadBookPipeline': 300,
   'read_book.pipelines.MysqlPipline': 301,
}

#Mysql配置
DB_HOST = "127.0.0.1"
DB_PORT = 3306
DB_USER = "root"
DB_PASSWORD = "123456"
DB_CHARSET = "utf8"
DB_NAME = "spider"


# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

pipeline.py

import pymysql     # 导入mysql的依赖包
from scrapy.utils.project import get_project_settings   #导入可以获取settings.py中配置

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql
from scrapy.utils.project import get_project_settings

class ReadBookPipeline:
    def open_spider(self,spider):
        print("爬虫开始。。。。。。。。。。。。。",spider)
        self.fp = open("book.json","w",encoding="utf-8")

    def process_item(self, item, spider):
        self.fp.write(str(item))
        return item

    def close_spider(self,spider):
        print("爬虫结束。。。。。。。。。。",spider)
        self.fp.close()

# 存入mysql
class MysqlPipline:
    def open_spider(self,spider):
        settings = get_project_settings()
        self.host = user = settings['DB_HOST']
        self.port = user = settings['DB_PORT']
        self.user = user = settings['DB_USER']
        self.password = user = settings['DB_PASSWORD']
        self.name = user = settings['DB_NAME']
        self.charset = user = settings['DB_CHARSET']
        # self.conn = pymysql.connect(host=host, prot=prot, user =user, password=password, charset=charset, database=name)
        self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user, password=self.password, charset=self.charset, database=self.name)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Mysql配置
        # DB_HOST = "127.0.0.1"
        # DB_PORT = 3306
        # DB_USER = "root"
        # DB_PASSWORD = "123456"
        # DB_CHARSET = "utf8"
        # DB_NAME = "spider"
        sql = "insert into book(src,name) values ('{}','{}')".format(item['src'],item['name'])
        self.cursor.execute(sql)
        self.conn.commit()
        return item

    def close_spider(self,spider):
        self.conn.close()
        self.cursor.close()

items.py


import scrapy


class ReadBookItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # pass
    # url name
    name = scrapy.Field()
    src = scrapy.Field()