Python Crawler 23 Scrapy Framework (9): Scrapy Crawlers in Practice

Contents

Practice 1: Crawl Jianshu pages and save to the database synchronously
    Project directory
    spider/js.py
    items.py
    pipelines.py
    settings.py
    start.py
    MySQL table schema
    Results
    Code

Practice 2: Crawl Jianshu pages and save to the database asynchronously with twisted
    pipelines.py
    settings.py
    Code

Practice 3: Integrate selenium + chromedriver into Scrapy, crawl and save to the database asynchronously
    Project directory
    spider/js.py
    items.py
    middlewares.py
    pipelines.py
    settings.py
    start.py
    Code

Jianshu Site Crawler

Practice 1: Crawl Jianshu pages and save to the database synchronously

Practice 2: Crawl Jianshu pages and save to the database asynchronously with twisted

Practice 3: Integrate selenium + chromedriver into Scrapy, crawl and save to the database asynchronously

Practice 1: Crawl Jianshu pages and save to the database synchronously

Using the Scrapy framework, crawl the articles linked from the Jianshu home page and extract each article's title, content, author, publish time, avatar URL, article id and page URL.

Project directory:
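The project tree itself is not shown here; reconstructed from the files covered below and the standard Scrapy layout, it is roughly:

jianshu/
    scrapy.cfg
    start.py
    jianshu/
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            js.py    (written as spider/js.py in the headings below)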

spider/js.py

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu.items import ArticleItem


class JsSpider(CrawlSpider):
    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']

    rules = (
        # https://www.jianshu.com/p/907c9f3d8f5b
        # article ids are 12 lowercase alphanumerics; the class must include '0'
        # (the sample URL above contains one), hence [0-9a-z] rather than [1-9a-z]
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        # title, avatar and author are filled in later by client-side script
        title = response.xpath("//h1[@class='_1RuRku']/text()").get()
        avatar = response.xpath("//a[@class='_1qp91i _1OhGeD']/img/@src").get()
        author = response.xpath("//section[1]/div/div/div/div/span/a/text()").get()
        pub_time = response.xpath("//div[@class='s-dsoj']/time/text()").get()
        url = response.url
        url1 = url.split("?")[0]
        article_id = url1.split("/")[-1]
        content = response.xpath("//article[@class='_2rhmJa']").get()

        item = ArticleItem(
            title=title,
            avatar=avatar,
            author=author,
            pub_time=pub_time,
            origin_url=url,
            article_id=article_id,
            content=content
        )
        print(item)
        yield item
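
As a quick sanity check (a standalone snippet, not part of the project), the link pattern can be tested against the sample article URL from the comment above:

import re

pattern = re.compile(r'.*/p/[0-9a-z]{12}.*')

# the sample article id 907c9f3d8f5b contains a '0', which is why the
# character class has to be [0-9a-z] rather than [1-9a-z]
assert pattern.match("https://www.jianshu.com/p/907c9f3d8f5b")
assert not pattern.match("https://www.jianshu.com/u/907c9f3d8f5b")
print("link pattern OK")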

items.py

import scrapy


class ArticleItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()
    article_id = scrapy.Field()
    origin_url = scrapy.Field()
    author = scrapy.Field()
    avatar = scrapy.Field()
    pub_time = scrapy.Field()

pipelines.py

from itemadapter import ItemAdapter
import pymysql


class JianshuPipeline:

    def __init__(self):
        dbparams = {
            'host': "127.0.0.1",
            'port': 3306,
            'user': 'root',
            'password': 'root',
            'database': 'runoob',
            'charset': 'utf8'
        }
        self.conn = pymysql.connect(**dbparams)
        self.cursor = self.conn.cursor()
        self._sql = None

    def process_item(self, item, spider):
        self.cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'],
                                       item['pub_time'], item['article_id'], item['origin_url']))
        self.conn.commit()
        return item

    @property
    def sql(self):
        # lazily build the INSERT statement the first time it is needed
        if not self._sql:
            self._sql = """
            insert into article(id, title, content, author, avatar, pub_time, article_id, origin_url)
            values (null, %s, %s, %s, %s, %s, %s, %s)
            """
        return self._sql
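
The pipeline above never closes the connection. A small addition worth making (a sketch, not part of the original post) is a close_spider hook inside JianshuPipeline; Scrapy calls it automatically when the spider finishes:

    def close_spider(self, spider):
        # release the MySQL cursor and connection once crawling is done
        self.cursor.close()
        self.conn.close()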

settings.py

BOT_NAME = 'jianshu'

SPIDER_MODULES = ['jianshu.spiders']
NEWSPIDER_MODULE = 'jianshu.spiders'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = 3

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER",

}

ITEM_PIPELINES = {
   'jianshu.pipelines.JianshuPipeline': 300,
}

start.py

from scrapy import cmdline

cmdline.execute("scrapy crawl js".split())
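
start.py only exists so the crawl can be launched from an IDE instead of typing scrapy crawl js in a shell. An equivalent approach (a sketch, not from the original post) is CrawlerProcess, which loads the project settings explicitly:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl("js")  # spider name, same as `scrapy crawl js`
process.start()      # blocks until the crawl finishes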

MySQL table schema

DROP TABLE IF EXISTS `article`;
CREATE TABLE `article` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(255) DEFAULT NULL,
  `content` longtext,
  `author` varchar(255) DEFAULT NULL,
  `avatar` varchar(255) DEFAULT NULL,
  `pub_time` datetime DEFAULT NULL,
  `article_id` varchar(20) DEFAULT NULL,
  `origin_url` varchar(255) DEFAULT NULL,
  `read_count` varchar(255) DEFAULT '0',
  `word_count` varchar(255) DEFAULT '0',
  `like_count` int(11) DEFAULT '0',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=62 DEFAULT CHARSET=utf8;

Results:

Code

Practice 2: Crawl Jianshu pages and save to the database asynchronously with twisted

Building on Practice 1, replace pipelines.py with the following:

pipelines.py

from pymysql import cursors
from twisted.enterprise import adbapi


# TODO: write up notes on adbapi.ConnectionPool
class JianShuTwistedPipeline(object):
    def __init__(self):
        dbparams = {
            'host': "127.0.0.1",
            'port': 3306,
            'user': 'root',
            'password': 'root',
            'database': 'runoob',
            'charset': 'utf8',
            "cursorclass": cursors.DictCursor
        }
        self.dbpool = adbapi.ConnectionPool("pymysql", **dbparams)
        self._sql = None

    @property
    def sql(self):
        # lazily build the INSERT statement the first time it is needed
        if not self._sql:
            self._sql = """
            insert into article(id, title, content, author, avatar, pub_time, article_id, origin_url)
            values (null, %s, %s, %s, %s, %s, %s, %s)
            """
        return self._sql

    def process_item(self, item, spider):
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'],
                            item['pub_time'], item['article_id'], item['origin_url']))

    def handle_error(self, error, item, spider):
        print("=" * 10 + "error begin" + "=" * 10)
        print(error)
        print("=" * 10 + "error end" + "=" * 10)

settings.py

Update the pipeline configuration to use the new class:

ITEM_PIPELINES = {
   'jianshu.pipelines.JianShuTwistedPipeline': 300,
}

Code

Practice 3: Integrate selenium + chromedriver into Scrapy, crawl and save to the database asynchronously

Define a SeleniumDownloadMiddleware downloader middleware: it loads each URL with selenium's webdriver.Chrome, wraps the rendered page source in an HtmlResponse, and returns that to the Scrapy engine.

Project directory:

spider/js.py

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu.items import ArticleItem


class JsSpider(CrawlSpider):
    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']

    rules = (
        # https://www.jianshu.com/p/907c9f3d8f5b
        # article ids are 12 lowercase alphanumerics; the class must include '0'
        # (the sample URL above contains one), hence [0-9a-z] rather than [1-9a-z]
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        # title, avatar and author are filled in later by client-side script
        title = response.xpath("//h1[@class='_1RuRku']/text()").get()
        avatar = response.xpath("//a[@class='_1qp91i _1OhGeD']/img/@src").get()
        author = response.xpath("//section[1]/div/div/div/div/span/a/text()").get()
        pub_time = response.xpath("//div[@class='s-dsoj']/time/text()").get()
        url = response.url
        url1 = url.split("?")[0]
        article_id = url1.split("/")[-1]
        content = response.xpath("//article[@class='_2rhmJa']").get()

        read_count = response.xpath("//div/div[@class='s-dsoj']/span[3]/text()").get()
        word_count = response.xpath("//div/div[@class='s-dsoj']/span[2]/text()").get()
        like_count = response.xpath("//span[@class='_3tCVn5']/span/text()").get()

        item = ArticleItem(
            title=title,
            avatar=avatar,
            author=author,
            pub_time=pub_time,
            origin_url=url,
            article_id=article_id,
            content=content,
            read_count=read_count,
            word_count=word_count,
            like_count=like_count,
        )
        print(item)
        yield item

items.py

import scrapy


class ArticleItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()
    article_id = scrapy.Field()
    origin_url = scrapy.Field()
    author = scrapy.Field()
    avatar = scrapy.Field()
    pub_time = scrapy.Field()
    read_count = scrapy.Field()
    word_count = scrapy.Field()
    like_count = scrapy.Field()

middlewares.py

import time
from selenium import webdriver
from scrapy.http.response.html import HtmlResponse

class SeleniumDownloadMiddleware(object):
    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")

    def process_request(self, request, spider):
        print("running SeleniumDownloadMiddleware.process_request")
        self.driver.get(request.url)  # load the page in a real browser
        time.sleep(1)
        try:
            # keep clicking "show more" until the button can no longer be found,
            # so that every collection the article belongs to is rendered
            while True:
                show_more = self.driver.find_element_by_class_name("show-more")
                show_more.click()
        except Exception:
            pass
        source = self.driver.page_source  # the fully rendered HTML
        response = HtmlResponse(
                    url=self.driver.current_url,
                    body=source,
                    request=request,
                    encoding="utf-8"
                )
        return response
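
The Chrome instance opened in __init__ is never shut down. A small addition (a sketch, not in the original; it assumes "from scrapy import signals" is added to the imports) is to connect the spider_closed signal in from_crawler so the driver quits when the crawl ends:

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        # quit Chrome once the spider has finished crawling
        self.driver.quit()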

pipelines.py

from pymysql import cursors
from twisted.enterprise import adbapi


# TODO: write up notes on adbapi.ConnectionPool
class JianShuTwistedPipeline(object):
    def __init__(self):
        dbparams = {
            'host': "127.0.0.1",
            'port': 3306,
            'user': 'root',
            'password': 'root',
            'database': 'runoob',
            'charset': 'utf8',
            "cursorclass": cursors.DictCursor
        }
        self.dbpool = adbapi.ConnectionPool("pymysql", **dbparams)
        self._sql = None

    @property
    def sql(self):
        # lazily build the INSERT statement the first time it is needed
        if not self._sql:
            self._sql = """
            insert into article(id, title, content, author, avatar, pub_time, article_id,
                                origin_url, read_count, word_count, like_count)
            values (null, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """
        return self._sql

    def process_item(self, item, spider):
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'],
                            item['pub_time'], item['article_id'], item['origin_url'],
                            item['read_count'], item['word_count'], item['like_count']))

    def handle_error(self, error, item, spider):
        print("=" * 10 + "error begin" + "=" * 10)
        print(error)
        print("=" * 10 + "error end" + "=" * 10)

settings.py

BOT_NAME = 'jianshu'

SPIDER_MODULES = ['jianshu.spiders']
NEWSPIDER_MODULE = 'jianshu.spiders'

ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = 3

DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER",
}

DOWNLOADER_MIDDLEWARES = {
   'jianshu.middlewares.SeleniumDownloadMiddleware': 543,
}

ITEM_PIPELINES = {
   'jianshu.pipelines.JianShuTwistedPipeline': 300,
}

start.py

from scrapy import cmdline

cmdline.execute("scrapy crawl js".split())

Code

 
