Scraping Jianshu
- artical_spider.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu_spider.items import JianshuSpiderItem
class ArticalSpiderSpider(CrawlSpider):
name = 'artical_spider'
allowed_domains = ['www.jianshu.com']
start_urls = ['http://www.jianshu.com/']
rules = (
# The "recommended reading" links on every article page have hrefs of the form /p/<12-char hex id>, so the allow pattern is .*/p/[0-9a-f]{12}
Rule(LinkExtractor(allow=r'.*/p/[0-9a-f]{12}.*'), callback='parse_item', follow=True),
)
def parse_item(self, response):
title = response.xpath('//h1[@class="_1RuRku"]/text()').get()
article_id = response.url.split('?')[0].split('/')[-1]
origin_url = response.url
content = response.xpath('//article[@class="_2rhmJa"]').get()
avatar = response.xpath('//img[@class="_13D2Eh"]/@src').get()
print(avatar)
author = response.xpath('//span[@class="FxYr8x"]/a[@class="_1OhGeD"]/text()').get()
print(author)
some = response.xpath('//div[@class="s-dsoj"]')[0]
print(some)
pub_time = some.xpath('.//time/text()').get()
word_count = some.xpath('.//span[2]/text()').get()
read_count = some.xpath('.//span[3]/text()').get()
like_count_origin = some.xpath('//span[@class="_1LOh_5"]/text()').get()
like_count = like_count_origin.split('人')[0]
subjects_origin = response.xpath('//div[@class="_2Nttfz"]/a[@class="_3s5t0Q _1OhGeD"]/span[@class="_2-Djqu"]/text()').getall()
subjects = ','.join(subjects_origin)
item = JianshuSpiderItem(title=title, article_id=article_id, origin_url=origin_url, content=content,
avatar=avatar, author=author, pub_time=pub_time, word_count=word_count,
read_count=read_count, like_count=like_count, subjects=subjects)
#item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
#item['name'] = response.xpath('//div[@id="name"]').get()
#item['description'] = response.xpath('//div[@id="description"]').get()
return item
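The spider is started with `scrapy crawl artical_spider` from the project root. As a quick sanity check of the Rule's allow pattern, the same regex can be tried outside Scrapy; a minimal sketch (the sample URLs below are made up for illustration):
import re

# the same pattern passed to LinkExtractor(allow=...)
article_pattern = re.compile(r'.*/p/[0-9a-f]{12}.*')

samples = [
    'https://www.jianshu.com/p/0123456789ab',                 # article page: matches
    'https://www.jianshu.com/p/0123456789ab?utm_source=rec',  # article with query string: matches
    'https://www.jianshu.com/u/0123456789ab',                 # user profile page: no match
]
for url in samples:
    print(url, '->', bool(article_pattern.match(url)))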
- items.py
import scrapy
class JianshuSpiderItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
avatar = scrapy.Field()
author = scrapy.Field()
article_id = scrapy.Field()
pub_time = scrapy.Field()
word_count = scrapy.Field()
read_count = scrapy.Field()
content = scrapy.Field()
subjects = scrapy.Field()
origin_url = scrapy.Field()
like_count = scrapy.Field()
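JianshuSpiderItem behaves like a dict, which is how the pipelines below read its fields; a minimal illustration (the field values are placeholders):
from jianshu_spider.items import JianshuSpiderItem

item = JianshuSpiderItem(title='demo title', article_id='0123456789ab')
print(item['title'])  # 'demo title'
print(dict(item))     # {'title': 'demo title', 'article_id': '0123456789ab'}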
- pipelines.py
import pymysql
from twisted.enterprise import adbapi  # Twisted's asynchronous database API
from pymysql import cursors
class JianshuSpiderPipeline(object):
def __init__(self):
dbparams = {
'host': '150.109.61.206',
'port': 3306,
'user': 'root',
'password': '111111',
'database': 'jianshu',
'charset': 'utf8'
}
# **dbparams unpacks the dict so each key/value pair is passed as a keyword argument
self.conn = pymysql.connect(**dbparams)
self.cursor = self.conn.cursor()
self._sql = None
def process_item(self, item, spider):
# self.cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'], item['pub_time'],
# item['origin_url'], item['article_id']))
self.cursor.execute(self.sql, (item['title'], item['content'], item['origin_url'], item['article_id']))
self.conn.commit()
return item
@property
def sql(self):
if not self._sql:
# self._sql = """
# insert into article(id, title, content, author, avatar, pub_time, origin_url, article_id) values(null, %s, %s, %s, %s, %s, %s, %s)
# """
self._sql = """
insert into article(id, title, content, origin_url, article_id) values(null, %s, %s, %s, %s)
"""
return self._sql
return self._sql
class JianshuTwistedPipeline(object):
"""
Asynchronous version: writes items to MySQL through Twisted's adbapi connection pool.
"""
def __init__(self):
dbparams = {
'host': '150.109.61.206',
'port': 3306,
'user': 'root',
'password': '111111',
'database': 'jianshu',
'charset': 'utf8',
'cursorclass': cursors.DictCursor  # cursor class used by the pooled connections
}
self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
self._sql = None
@property
def sql(self):
if not self._sql:
# self._sql = """
# insert into article(id, title, content, author, avatar, pub_time, origin_url, article_id) values(null, %s, %s, %s, %s, %s, %s, %s)
# """
self._sql = """
insert into article(id, title, content, author, avatar, pub_time, origin_url, article_id, word_count, read_count, like_count, subjects)
values(null, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""
return self._sql
return self._sql
def process_item(self, item, spider):
# self.cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'], item['pub_time'], item['origin_url'], item['article_id']))
# self.conn.commit()
# return item
# runInteraction() runs insert_item() on the connection pool's thread pool, so the insert is asynchronous;
# calling insert_item() directly here would block the crawl (synchronous).
defer = self.dbpool.runInteraction(self.insert_item, item)
defer.addErrback(self.handle_error, item, spider)
return item
def insert_item(self, cursor, item):
# sql is defined with the @property decorator, so it is accessed without parentheses
cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'], item['pub_time'],
item['origin_url'], item['article_id'], item['word_count'], item['read_count'],
item['like_count'], item['subjects']))
# cursor.execute(self.sql, (item['title'], item['content'], item['origin_url'], item['article_id']))
def handle_error(self, error, item, spider):
print('='*10+'error'+'='*10)
print(error)
print('='*10+'error'+'='*10)
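As the comment in __init__ notes, pymysql.connect(**dbparams) unpacks the dict into keyword arguments; the sketch below shows the two equivalent forms with the same connection parameters as above. Note also that the column list in sql has to match the number of values handed to cursor.execute(); a mismatch surfaces as a MySQL error in handle_error().
import pymysql

dbparams = {
    'host': '150.109.61.206',
    'port': 3306,
    'user': 'root',
    'password': '111111',
    'database': 'jianshu',
    'charset': 'utf8'
}

# the dict-unpacking form used in the pipelines ...
conn_a = pymysql.connect(**dbparams)
# ... is equivalent to spelling out every keyword argument
conn_b = pymysql.connect(host='150.109.61.206', port=3306, user='root',
                         password='111111', database='jianshu', charset='utf8')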
- settings.py
import random
import os
BOT_NAME = 'jianshu_spider'
SPIDER_MODULES = ['jianshu_spider.spiders']
NEWSPIDER_MODULE = 'jianshu_spider.spiders'
ROBOTSTXT_OBEY = True
DOWNLOAD_DELAY = random.randint(1, 3)  # evaluated once at import time, so the delay is fixed for the whole crawl (see the note below)
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'User-Agent': 'ELinks/0.13.GIT (textmode; Linux 2.6.24-1-686 i686; 175x65-2)'
}
SPIDER_MIDDLEWARES = {
'jianshu_spider.middlewares.JianshuSpiderSpiderMiddleware': 542,
# 'jianshu_spider.middlewares.SeleniumDownloadMiddleWare': 544
}
DOWNLOADER_MIDDLEWARES = {
'jianshu_spider.middlewares.JianshuSpiderDownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
'jianshu_spider.pipelines.JianshuTwistedPipeline': 300,
# 'jianshu_spider.pipelines.JianshuSpiderPipeline': 300,
}
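Because random.randint(1, 3) runs only once when settings.py is imported, the delay is a single fixed value for the whole crawl. Scrapy can randomize the wait itself: with RANDOMIZE_DOWNLOAD_DELAY enabled (it is on by default), every delay is multiplied by a random factor between 0.5 and 1.5, so a plain constant is usually enough. A sketch of that alternative:
DOWNLOAD_DELAY = 2                # base delay in seconds
RANDOMIZE_DOWNLOAD_DELAY = True   # Scrapy default: actual wait is between 0.5 * 2 and 1.5 * 2 seconds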
- middlewares.py
from scrapy import signals
class JianshuSpiderSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
import random
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from scrapy import signals
from scrapy.http.response.html import HtmlResponse
class JianshuSpiderDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
USER_AGENTS = [
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)',
'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; Acoo Browser; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; Avant Browser)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
'Mozilla/4.0 (compatible; Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729); Windows NT 5.1; Trident/4.0)',
'Mozilla/4.0 (compatible; Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB6; Acoo Browser; .NET CLR 1.1.4322; .NET CLR 2.0.50727); Windows NT 5.1; Trident/4.0; Maxthon; .NET CLR 2.0.50727; .NET CLR 1.1.4322; InfoPath.2)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser; GTB6; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB6; Acoo Browser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/4.0; Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; GTB5; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Acoo Browser; InfoPath.2; .NET CLR 2.0.50727; Alexa Toolbar)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Acoo Browser; .NET CLR 2.0.50727; .NET CLR 1.1.4322)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Acoo Browser; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR 2.0.50727; FDM; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; InfoPath.2)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Acoo Browser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
'Mozilla/4.0 (compatible; MSIE 6.0; America Online Browser 1.1; Windows NT 5.1)',
'Mozilla/4.0 (compatible; MSIE 6.0; America Online Browser 1.1; Windows NT 5.0)',
'Mozilla/4.0 (compatible; MSIE 6.0; America Online Browser 1.1; Windows 98)',
'Mozilla/4.0 (compatible; MSIE 7.0; AOL 8.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
'Mozilla/4.0 (compatible; MSIE 7.0; AOL 8.0; Windows NT 5.1; .NET CLR 1.0.3705; .NET CLR 1.1.4322)',
'Mozilla/4.0 (compatible; MSIE 7.0; AOL 8.0; Windows NT 5.1; .NET CLR 1.0.3705)',
'Mozilla/4.0 (compatible; MSIE 7.0; AOL 8.0; Windows NT 5.1)',
'Mozilla/4.0 (compatible; MSIE 6.0; AOL 7.0; Windows NT 5.1; SV1; .NET CLR 1.0.3705)',
'Mozilla/4.0 (compatible; MSIE 6.0; AOL 7.0; Windows NT 5.1; SV1)',
'Mozilla/4.0 (compatible; MSIE 6.0; AOL 7.0; Windows NT 5.1; Q312461; YComp 5.0.0.0)',
'Mozilla/4.0 (compatible; MSIE 6.0; AOL 7.0; Windows NT 5.1; Q312461)',
'Mozilla/4.0 (compatible; MSIE 6.0; AOL 7.0; Windows NT 5.1; Hotbar 4.2.8.0)',
'Mozilla/4.0 (compatible; MSIE 6.0; AOL 7.0; Windows NT 5.1; Hotbar 4.1.7.0)',
'Mozilla/4.0 (compatible; MSIE 6.0; AOL 7.0; Windows NT 5.1; .NET CLR 1.0.3705)',
'Mozilla/5.0 (X11; U; OpenBSD ppc; en-US; rv:1.8.1.9) Gecko/20070223 BonEcho/2.0.0.9',
'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.8.1.9) Gecko/20071103 BonEcho/2.0.0.9',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.9) Gecko/20071113 BonEcho/2.0.0.9',
'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en; rv:1.8.1.12) Gecko/20080206 Camino/1.5.5',
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X Mach-O; en; rv:1.8.1.12) Gecko/20080206 Camino/1.5.5',
'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; pl-PL; rv:1.0.1) Gecko/20021111 Chimera/0.6',
'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; pl-PL; rv:1.0.1) Gecko/20021111 Chimera/0.6',
'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en-US; rv:1.0.1) Gecko/20021111 Chimera/0.6',
'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en-US; rv:1.0.1) Gecko/20021104 Chimera/0.6',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Crazy Browser 3.0.5) ; .NET CLR 3.0.04506.30; InfoPath.2; InfoPath.3; .NET CLR 1.1.4322; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; InfoPath.2; Crazy Browser 3.0.5)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; Crazy Browser 2.0.1)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; InfoPath.1; Crazy Browser 2.0.1)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Crazy Browser 2.0.1)',
'ELinks/0.13.GIT (textmode; Linux 2.6.29 i686; 119x51-2)',
'ELinks/0.13.GIT (textmode; Linux 2.6.27-rc6.git i686; 175x65-3)',
'ELinks/0.13.GIT (textmode; Linux 2.6.26-rc7.1 i686; 119x68-3)',
'ELinks/0.13.GIT (textmode; Linux 2.6.24-1-686 i686; 175x65-2)'
]
def __init__(self):
self.driver = webdriver.Chrome(executable_path=r'D:\downloads\chromedriver_win32\chromedriver.exe')
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
user_agent = random.choice(self.USER_AGENTS)
request.headers['User-Agent'] = user_agent
# driver = webdriver.Chrome(executable_path=r'D:\downloads\chromedriver_win32\chromedriver.exe')
# while True:
print(request.url)
self.driver.get(request.url)
# time.sleep(2)
# print(driver.current_url)
# print(driver.page_source)
# avatar_tag = WebDriverWait(driver, 10).until(
# EC.presence_of_all_elements_located((By.XPATH, '//img[@class="_13D2Eh"'))
# )
# if avatar_tag == True:
source = self.driver.page_source
print(self.driver.current_url)
# print(source)
response = HtmlResponse(url=self.driver.current_url, body=source, request=request, encoding='utf8')
print(response)
# try:
# while True:
# show_more = self.driver.find_element(By.CLASS_NAME, 'H7E3vT')
# show_more.click()
# time.sleep(1)
# if not show_more:
# break
#
# except:
# pass
return response
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
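Returning an HtmlResponse from process_request() makes Scrapy skip its own downloader for that request, so the spider parses the Selenium-rendered page instead. The commented-out WebDriverWait block above is also missing the closing ] in its XPath; a working version of that guard might look like the sketch below (the _13D2Eh class name is the one used in the spider and will break whenever Jianshu rotates its obfuscated class names):
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def wait_for_article(driver, timeout=10):
    # block until the author avatar is present, i.e. the article page has finished rendering
    WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located((By.XPATH, '//img[@class="_13D2Eh"]'))
    )

# inside process_request(), after self.driver.get(request.url):
#     wait_for_article(self.driver)
#     source = self.driver.page_source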
MySQL table setup
- Database: jianshu
- Table: article
Column | Type | Length | Decimals | Not null | Auto increment |
---|---|---|---|---|---|
id | int | | | 1 | 1 |
title | varchar | 255 | | | |
content | longtext | | | | |
author | varchar | 255 | | | |
avatar | varchar | 255 | | | |
pub_time | datetime | | | | |
article_id | varchar | 20 | | | |
origin_url | varchar | 255 | | | |
read_count | int | 11 | | | |
like_count | int | 11 | | | |
word_count | int | 11 | | | |
subjects | text | | | | |

Note: a 1 in the "Not null" or "Auto increment" column means that option is checked; an empty cell means it is left unchecked.
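The same table can also be created from code instead of a GUI client. The sketch below mirrors the columns above with pymysql, assuming id is the primary key (MySQL requires a key on an AUTO_INCREMENT column); the connection parameters are the ones from pipelines.py:
import pymysql

CREATE_ARTICLE_TABLE = """
CREATE TABLE IF NOT EXISTS article (
    id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255),
    content LONGTEXT,
    author VARCHAR(255),
    avatar VARCHAR(255),
    pub_time DATETIME,
    article_id VARCHAR(20),
    origin_url VARCHAR(255),
    read_count INT,
    like_count INT,
    word_count INT,
    subjects TEXT
) DEFAULT CHARSET=utf8
"""

conn = pymysql.connect(host='150.109.61.206', port=3306, user='root',
                       password='111111', database='jianshu', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute(CREATE_ARTICLE_TABLE)
conn.commit()
conn.close()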