Scrapy


1.1 Getting started with Scrapy

1. Create a Scrapy project named mySpider
scrapy startproject mySpider
2. Generate a spider named demo that crawls demo.cn
scrapy genspider demo "demo.cn"
3. Extract the data
Flesh out the spider, using XPath selectors and the like
4. Save the data
Save the data in a pipeline

Running the spider

1. Run the spider from the command line
scrapy crawl db  # db is the spider name
2. Run the spider from PyCharm
from scrapy import cmdline
cmdline.execute("scrapy crawl db".split())
3. Get the spider name inside a pipeline
spider.name
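
For example, a minimal sketch (the pipeline class name is a placeholder) of reading the spider name inside process_item:

class DemoPipeline:
    def process_item(self, item, spider):
        # spider is the running Spider instance, so its name is available here
        print(spider.name)
        return item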

db.py

import scrapy
from scrapy.http.response.html import HtmlResponse
from scrapy.selector.unified import SelectorList

class DbSpider(scrapy.Spider):
    name = 'db'
    # allowed crawl scope: douban.com
    allowed_domains = ['douban.com']
    # starting URL of the crawl
    start_urls = ['http://douban.com/']

    def parse(self, response):
        li_list=response.xpath("//div[@class='side-links nav-anon']/ul/li")
        # an XPath query returns Selector objects; to pull the data out of them,
        # e.g. for <a>这是一个文字</a>:
        # 1. selecting 'a' and calling .get()/.extract_first() returns the whole tag: <a>这是一个文字</a>
        # 2. selecting 'a/text()' and calling .get()/.extract_first() returns just the text: 这是一个文字
        # .getall()/.extract() return all matches as a list instead of only the first one
        # define a dict to hold the data
        item={}
        for li in li_list:
            item['name']=li.xpath('a/text()').extract_first()
            # print(item)
            yield item
        # print(li_list)

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import json

class MyspiderPipeline:
    def __init__(self):
        self.f=open('demo.json','w',encoding='utf-8')

    def open_spider(self,spider):
        print('spider opened')

    def process_item(self, item, spider):
        print(item)
        # json.dumps() serializes a Python object to a JSON string
        # ensure_ascii=False keeps non-ASCII characters instead of escaping them to ASCII
        item_json=json.dumps(item,ensure_ascii=False)
        self.f.write(item_json+'\n')
        return item

    def close_spider(self,spider):
        print('spider closed')
        self.f.close()


class MyspiderPipeline1:
    def process_item(self, item, spider):
        # print(item)
        return item

settings.py

# Scrapy settings for myspider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

'''bot (project) name'''
BOT_NAME = 'myspider'
'''module names'''
SPIDER_MODULES = ['myspider.spiders']
NEWSPIDER_MODULE = 'myspider.spiders'

# log level
LOG_LEVEL = 'WARNING'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'myspider (+http://www.yourdomain.com)'

# Obey robots.txt rules
'''robots.txt protocol'''
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

'''default request headers'''
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
  'Accept-Language': 'en',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'myspider.middlewares.MyspiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'myspider.middlewares.MyspiderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
'''enable the item pipelines'''
ITEM_PIPELINES = {
    'myspider.pipelines.MyspiderPipeline': 300,
    'myspider.pipelines.MyspiderPipeline1': 301,
}


# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

1.2 Using the logging module

Configuring Python's logging module with basicConfig

logging.basicConfig(level=log_level,
                    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S',
                    filename='parser_result.log',
                    filemode='w')
Parameters of logging.basicConfig:
filename: name of the log file
filemode: open mode of the log file, 'w' or 'a' (same meaning as for open())
format: format of the output; it can include many useful fields, as in the example above:
 %(levelno)s: numeric log level
 %(levelname)s: log level name
 %(pathname)s: path of the running program, i.e. sys.argv[0]
 %(filename)s: file name of the running program
 %(funcName)s: function that issued the log call
 %(lineno)d: line number of the log call
 %(asctime)s: time of the log call
 %(thread)d: thread ID
 %(threadName)s: thread name
 %(process)d: process ID
 %(message)s: the log message
datefmt: time format, same as time.strftime()
level: log level, defaults to logging.WARNING
stream: output stream for the log, e.g. sys.stderr, sys.stdout or a file; defaults to sys.stderr. If both stream and filename are given, stream is ignored.

logging output functions:

logging.debug('This is debug message')
logging.info('This is info message')
logging.warning('This is warning message')
To send Scrapy's own log output to a file, set this in settings.py:
LOG_FILE = './log.log'
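
Inside a spider, a per-module logger works with these settings; a minimal sketch (module and spider names are placeholders):

import logging
import scrapy

# one logger per module; its name shows up in each log line
logger = logging.getLogger(__name__)

class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['http://demo.cn/']

    def parse(self, response):
        # routed through Scrapy's logging settings (LOG_LEVEL, LOG_FILE, ...)
        logger.warning('parsed %s', response.url)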

1.3 Example: crawling the Tencent Careers site

hr.py

import scrapy
import json
class HrSpider(scrapy.Spider):
    name = 'hr'
    allowed_domains = ['tencent.com']
    # listing-page API
    one_urls = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1598451564725&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'

    # detail-page API
    two_url='https://careers.tencent.com/tencentcareer/api/post/ByRelated?timestamp=1598451725389&postId={}&num=7&language=zh-cn'
    start_urls=[one_urls.format(1)]

    def parse(self, response):
        for page in range(1, 3):
            # request each listing page (only the first two here)
            url = self.one_urls.format(page)
            yield scrapy.Request(
                url=url,
                # callback that handles the response for this url
                callback=self.parse_one
            )

    def parse_one(self, response):
        # response is the result of the request sent above
        data=json.loads(response.text)
        # print(data)
        for job in data['Data']['Posts']:
            # dict to hold the scraped fields
            item={}
            item['zh_name']=job['RecruitPostName']
            item['zh_type']=job['CategoryName']
            post_id=job['PostId']
            # build the detail-page url
            detail_url=self.two_url.format(post_id)
            yield scrapy.Request(
                url=detail_url,
                meta={'item':item},
                callback=self.parse_two
            )

    def parse_two(self,response):
        # item=response.meta['item']
        item=response.meta.get('item')
        data = json.loads(response.text)
        # print(data)

        item['zh_duty']=data['Data'][0]['Responsibility']
        print(item)

1.4 Crawling the Sunshine Hotline petition platform (阳光热线问政平台)

yg.py

import scrapy
from sun.items import SunItem
import json
import time
class YgSpider(scrapy.Spider):
    name = 'yg'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/political/index/politicsNewest?id=1']

    def parse(self, response):
        li_list=response.xpath('//ul[@class="title-state-ul"]/li')
        for li in li_list:
            # initialize the item
            item=SunItem()
            # title and detail-page address
            item['title']=li.xpath("./span[3]/a/text()").extract_first()
            item['href']='http://wz.sun0769.com'+li.xpath('./span[3]/a/@href').extract_first()
            print(item)
            yield scrapy.Request(
                url=item['href'],
                callback=self.parse_detail,
                meta={'item':item},
                dont_filter=True
            )

        # get the next page
        next_url='http://wz.sun0769.com'+response.xpath("//div[@class='mr-three paging-box']/a[2]/@href").extract_first()
        print(next_url)
        time.sleep(3)
        if next_url is not None:
            yield scrapy.Request(
                url=next_url,
                callback=self.parse,
                dont_filter=True
            )
    def parse_detail(self,response):
        item = response.meta.get('item')
        item['content'] = response.xpath("//div[@class='details-box']/pre/text()").extract_first()
        print(item)
        yield item

items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class SunItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title=scrapy.Field()
    href=scrapy.Field()
    content=scrapy.Field()

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

import re
class SunPipeline:
    def process_item(self, item, spider):
        item['content']=self.parse_content(item['content'])
        return item

    # helper that strips the \r\n formatting noise from the content
    def parse_content(self,content):
        content=re.sub(r'\r\n','',content)
        return content

1.5 When a callback is never called

I had defined a callback, but it was never called. After some digging it turned out that the Offsite spider middleware (OffsiteMiddleware) filters out requests whose URLs are not covered by the allowed_domains list. So if the domains you request are fixed, add them to allowed_domains; otherwise set dont_filter=True on the Request.
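
A minimal sketch of the two fixes (spider name, domain and XPath are placeholders):

import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'
    # fix 1: list every domain you actually request
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/political/index/politicsNewest?id=1']

    def parse(self, response):
        for href in response.xpath('//a/@href').getall():
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self.parse_detail,
                # fix 2: dont_filter=True also skips the offsite check
                dont_filter=True,
            )

    def parse_detail(self, response):
        yield {'url': response.url}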

1.6 The settings file

# Scrapy settings for myspider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

'''bot (project) name'''
BOT_NAME = 'myspider'
'''module names'''
SPIDER_MODULES = ['myspider.spiders']
NEWSPIDER_MODULE = 'myspider.spiders'

# log level
LOG_LEVEL = 'WARNING'
LOG_FILE='./log.log'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'myspider (+http://www.yourdomain.com)'

# Obey robots.txt rules
'''robots.txt protocol'''
ROBOTSTXT_OBEY = False

'''maximum concurrent requests'''
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32


# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
'''download delay'''
#DOWNLOAD_DELAY = 3

# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

'''cookies (enabled by default)'''
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

'''Telnet console (enabled by default)'''
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

'''default request headers'''
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
  'Accept-Language': 'en',
}

'''spider middlewares'''
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'myspider.middlewares.MyspiderSpiderMiddleware': 543,
#}

'''downloader middlewares'''
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'myspider.middlewares.MyspiderDownloaderMiddleware': 543,
#}

'''extensions'''
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}


# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
'''enable the item pipelines'''
ITEM_PIPELINES = {
    'myspider.pipelines.MyspiderPipeline': 300,
    'myspider.pipelines.MyspiderPipeline1': 301,
}


# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

1.7 Using pipelines

Since ITEM_PIPELINES is a dict, more than one pipeline can be configured, and indeed multiple pipelines can be defined.

Why multiple pipelines:

1. There may be several spiders, and different pipelines handle the items of different spiders

2. The items of one spider may need different treatments, for example being stored in different databases

Notes:

1. The smaller a pipeline's weight (its number in ITEM_PIPELINES), the higher its priority

2. The process_item method of a pipeline must keep exactly that name
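
A minimal sketch of handling different spiders in different pipelines (class and spider names are placeholders): each pipeline checks spider.name and returns the item so the next pipeline still receives it, and the numbers in ITEM_PIPELINES decide the order.

class JsonPipeline:
    def process_item(self, item, spider):
        # only handle items from the 'db' spider; everything else passes straight through
        if spider.name == 'db':
            pass  # e.g. write the item to a JSON file here
        return item

class MysqlPipeline:
    def process_item(self, item, spider):
        if spider.name == 'hr':
            pass  # e.g. write the item to a database here
        return item

# settings.py (the lower the number, the earlier the pipeline runs)
# ITEM_PIPELINES = {
#     'myspider.pipelines.JsonPipeline': 300,
#     'myspider.pipelines.MysqlPipeline': 301,
# }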

1.8 CrawlSpider

1. Create a CrawlSpider with: scrapy genspider -t crawl <spider_name> <domain>
2. When a CrawlSpider needs its own callback, do not name it parse (CrawlSpider uses parse internally)
3. The Rule objects decide when to follow links and how the callback is wired up

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class CygSpider(CrawlSpider):
    name = 'cyg'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/political/index/politicsNewest?id=1']
    # rules for extracting URLs
    rules = (
        # LinkExtractor: which URLs to extract
        # callback: the response of an extracted URL is handed to this callback
        # follow=True: keep extracting new URLs from the responses of extracted URLs
        # listing pages
        Rule(LinkExtractor(allow=r'http://wz.sun0769.com/political/index/politicsNewest\?id=\d+'),follow=True),
        # detail pages
        Rule(LinkExtractor(allow=r'http://wz.sun0769.com/political/politics/index\?id=\d+'), callback='parse_item'),
    )

    def parse_item(self, response):
        item = {}
        #item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        #item['name'] = response.xpath('//div[@id="name"]').get()
        #item['description'] = response.xpath('//div[@id="description"]').get()
        # data from the detail page
        item['content'] = response.xpath("//div[@class='details-box']/pre/text()").extract_first()
        print(item)
        return item

1.9 CrawlSpider: crawling the WeChat mini-program community

http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1

items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class SpItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title=scrapy.Field()
    author=scrapy.Field()
    pub_data = scrapy.Field()  # publish date

wxapp.py

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sp.items import SpItem

class WxappSpider(CrawlSpider):
    name = 'wxapp'
    allowed_domains = ['wxapp-union.com']
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

    rules = (
        # listing pages
        Rule(LinkExtractor(allow=r'http://www.wxapp-union.com/portal.php\?mod=list&catid=2&page=\d+'), follow=True),
        # detail pages
        Rule(LinkExtractor(allow=r'http://www.wxapp-union.com/article-\d+-1.html'), callback='parse_item'),
    )

    def parse_item(self, response):
        # item = {}
        #item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        #item['name'] = response.xpath('//div[@id="name"]').get()
        #item['description'] = response.xpath('//div[@id="description"]').get()

        item=SpItem()
        item['title']=response.xpath('//h1[@class="ph"]/text()').extract_first()
        item['author']=response.xpath("//p[@class='authors']/a/text()").extract_first()
        item['pub_data']=response.xpath("//p[@class='authors']/span/text()").extract_first()

        return item

1.10 Logging in by carrying cookies

  • Either set the Cookie request header, or
  • split the cookie string into a dict and pass it via the cookies argument
import scrapy


class RenrenSpider(scrapy.Spider):
    name = 'renren'
    allowed_domains = ['renren.com']
    start_urls = ['http://www.renren.com/975003376/profile']

    # override start_requests to log in with cookies
    def start_requests(self):
        # raw cookie string copied from the browser
        cookie = 'your cookie'
        headers = {
            'Cookie': cookie
        }

        # split the cookie string into a dict, which is what the cookies argument expects
        cookies = {i.split('=')[0]: i.split('=')[1] for i in cookie.split('; ')}
        # send the request
        yield scrapy.Request(
            url=self.start_urls[0],
            # handle the response
            callback=self.parse,
            # either pass the Cookie header...
            # headers=headers,
            # ...or pass the cookie dict
            cookies=cookies,
        )
    def parse(self, response):
        print(response.text)
        with open('renren.html','w',encoding='utf-8') as f:
            f.write(response.text)

1.11 Sending a POST request

# -*- coding: utf-8 -*-
import scrapy

'''
commit: Sign in
authenticity_token: ay/QHPxeCTKwPlks4/0QoVvp2CttEF5NRJ/mimgV7xv7N+d1ONDn5IRbtNxCoG1JpdCASZ8Sw669MMmNm9GwFg==
ga_id: 287622012.1592305586
login: LogicJerry
password: 123456
webauthn-support: supported
webauthn-iuvpaa-support: supported
return_to: 
required_field_2fbe: 
timestamp: 1593524450720
timestamp_secret: 02ae98af552a04d667ca9ae3afb11bbb763332685c2b8cf12daeef6f9f26b22f
'''

class GithubSpider(scrapy.Spider):
    name = 'github'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    def parse(self, response):
        commit = 'Sign in'
        authenticity_token = response.xpath("//input[@name='authenticity_token']/@value").extract_first()
        # ga_id = response.xpath("//input[@name='ga_id']/@value").extract_first()
        login = 'LogicJerry'
        password = '12122121zxl'
        timestamp = response.xpath("//input[@name='timestamp']/@value").extract_first()
        timestamp_secret = response.xpath("//input[@name='timestamp_secret']/@value").extract_first()

        # print(authenticity_token)

        # form data to submit
        data = {
            'commit':commit,
            'authenticity_token': authenticity_token,
            # 'ga_id': ga_id,
            'login': login,
            'password': password,
            'webauthn-support': 'supported',
            'webauthn-iuvpaa-support': 'unsupported',
            'timestamp': timestamp,
            'timestamp_secret': timestamp_secret,
        }

        yield scrapy.FormRequest(
            # URL the form is posted to
            url='https://github.com/session',
            # the form data
            formdata=data,
            # callback for the response
            callback=self.after_login,
        )

    def after_login(self,response):

        # save the page to check whether the login succeeded
        with open('github.html','w',encoding='utf-8') as f:
            f.write(response.body.decode())
        # print(response)

1.12 Simulating a GitHub login with FormRequest.from_response

# -*- coding: utf-8 -*-
import scrapy


class Github2Spider(scrapy.Spider):
    name = 'github2'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    def parse(self, response):

        yield scrapy.FormRequest.from_response(
            # the response that contains the login form (hidden fields are filled in automatically)
            response=response,
            # form data to submit
            formdata={'login': 'username', 'password': 'password'},
            # callback
            callback=self.after_login

        )

    def after_login(self,response):

        # save the page
        with open('github2.html','w',encoding='utf-8') as f:
            f.write(response.body.decode())
        # pass

1.13 Setting a random User-Agent and a proxy IP

  • Modify the settings file

    DOWNLOADER_MIDDLEWARES = {
       'mw.middlewares.MwDownloaderMiddleware': 543,
       'mw.middlewares.RandomUserAgent': 544,
    }
    
  • Add the middleware class in middlewares.py

    # downloader middleware that sets a random User-Agent
    class RandomUserAgent(object):
        def process_request(self, request, spider):
            from fake_useragent import UserAgent
            user_agent = UserAgent().random
            request.headers['User-Agent'] = user_agent

            # to use a proxy IP:
            # request.meta['proxy'] = 'http://ip:port'
    
  • Method 2: set the User-Agent directly in the default request headers in settings.py (note that UserAgent().random then runs only once, at startup, so every request shares the same UA)

    from fake_useragent import UserAgent
    DEFAULT_REQUEST_HEADERS = {
      'User-Agent': UserAgent().random,
      'Accept-Language': 'en',
    }
    

1.14 Using Scrapy's built-in file download pipelines

  • 1: avoids re-downloading data that was downloaded recently
  • 2: makes it easy to specify where files are stored
  • 3: can convert downloaded images to a common format such as png or jpg
  • 4: can generate thumbnails
  • 5: can check image width and height to make sure they meet a minimum size
  • 6: downloads asynchronously, which is very efficient

1.14.1 Files Pipeline for downloading files

To download files with the Files Pipeline, follow these steps (a sketch follows this list):

  • Define an Item with two fields, file_urls and files. file_urls holds the URLs of the files to download and must be a list
  • When the downloads finish, the download info (storage path, original URL, file checksum, etc.) is stored in the item's files field
  • Set FILES_STORE in settings.py; it is the directory files are downloaded to
  • Enable the pipeline: add scrapy.pipelines.files.FilesPipeline: 1 to ITEM_PIPELINES
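
A minimal sketch of those steps (item name, file URL and storage path are placeholders):

import scrapy

class FileItem(scrapy.Item):
    file_urls = scrapy.Field()  # list of file URLs to download
    files = scrapy.Field()      # filled in by FilesPipeline after the download

# settings.py
# FILES_STORE = './downloads'
# ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}

# in a spider callback:
# yield FileItem(file_urls=['https://example.com/some.pdf'])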

1.14.2 Images Pipeline for downloading images

Steps for downloading images with the Images Pipeline:

  • Define an Item with two fields, image_urls and images. image_urls holds the URLs of the images to download and must be a list
  • When the downloads finish, the download info (storage path, original URL, image checksum, etc.) is stored in the item's images field
  • Set IMAGES_STORE in settings.py; it is the directory images are downloaded to
  • Enable the pipeline: add scrapy.pipelines.images.ImagesPipeline: 1 to ITEM_PIPELINES

audi.py

# -*- coding: utf-8 -*-
import scrapy
from pic.items import PicItem

class AudiSpider(scrapy.Spider):
    name = 'audi'
    allowed_domains = ['car.autohome.com.cn']
    start_urls = ['https://car.autohome.com.cn/photolist/series/44501/5729675.html#pvareaid=3454450']
    # https://car.autohome.com.cn/photolist/series/18/p2/
    # https://car.autohome.com.cn/photolist/series/18/p3/
    def parse(self, response):

        # the li elements under ul#imgList
        ul = response.xpath('//ul[@id="imgList"]/li')

        for li in ul:

            # item = {}

            item = PicItem()

            # item['src'] = 'https:'+ li.xpath("./a/img/@src").extract_first() # approach 1: custom pipeline
            item['image_urls'] = ['https:'+ li.xpath("./a/img/@src").extract_first()] # approach 2: ImagesPipeline

            print(item)
            yield item
        pass

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# approach 1: take the image url from the item and save the file manually
from urllib import request # used to download and save the image
import os
from scrapy.pipelines.images import ImagesPipeline
class PicPipeline:
    def process_item(self, item, spider):

        # the image url
        src = item['src']

        # split the url to get the image file name
        img_name = src.split('__')[-1]

        # save the image into an images/ directory next to the project package
        file_path = os.path.join(os.path.dirname(os.path.dirname(__file__)),'images')

        print(img_name)
        request.urlretrieve(src,file_path+'/'+img_name)
        return item

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class PicItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    image_urls = scrapy.Field()  # list of image URLs to download
    images = scrapy.Field()      # download info (saved path etc.) filled in by ImagesPipeline
    pass

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for pic project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'pic'

SPIDER_MODULES = ['pic.spiders']
NEWSPIDER_MODULE = 'pic.spiders'

LOG_LEVEL = 'WARNING'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'pic (+http://www.yourdomain.com)'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'pic.middlewares.PicSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'pic.middlewares.PicDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   # 'pic.pipelines.PicPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline':1
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
import os
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)),'images')
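
If the default hash-based file names are not wanted, ImagesPipeline can be subclassed and file_path overridden; a minimal sketch (class name is a placeholder, signature as in recent Scrapy versions), which would then replace the stock ImagesPipeline entry in ITEM_PIPELINES:

from scrapy.pipelines.images import ImagesPipeline

class RenamedImagesPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # keep the last segment of the image URL as the file name, under IMAGES_STORE/full/
        name = request.url.split('/')[-1]
        return f'full/{name}'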

1.15 Crawling Suning books

# -*- coding: utf-8 -*-
import scrapy
from copy import deepcopy
import re


class SnbSpider(scrapy.Spider):
    name = 'snb'
    allowed_domains = ['suning.com']
    start_urls = ['https://book.suning.com/']

    def parse(self, response):
        # top-level categories
        dl_list = response.xpath("//div[@class='menu-item']/dl")
        for dl in dl_list:
            item = {}
            item["b_cate"] = dl.xpath("./dt/h3/a/text()").extract_first()
            dd_list = dl.xpath("./dd/a")
            for dd in dd_list:
                # sub-categories
                item["s_cate"] = dd.xpath("./text()").extract_first()
                item["s_href"] = dd.xpath("./@href").extract_first()
                item["h_2"] = item["s_href"][26:32]

                # request the sub-category listing page
                yield scrapy.Request(
                    url=item["s_href"],
                    callback=self.parse_book_list,
                    meta={"item": deepcopy(item)}
                )

    def parse_book_list(self, response):
        item = response.meta["item"]
        ul_list = response.xpath("//ul[@class='clearfix']/li")
        for li in ul_list:
            item["book_name"] = li.xpath(".//div[@class='res-img']//a/img/@alt").extract_first()
            # anti-scraping: lazily loaded images keep the real url in @src2
            item["book_image"] = li.xpath(".//div[@class='res-img']//a/img/@src").extract_first()
            if item["book_image"] is None:
                item["book_image"] = li.xpath(".//div[@class='res-img']//a/img/@src2").extract_first()
            item["book_url"] = li.xpath(".//div[@class='res-img']//a/@href").extract_first()
            if item["book_url"]:
                item["book_url"] = "https:" + item["book_url"]
                yield scrapy.Request(
                    url=item["book_url"],
                    callback=self.parse_book_detail,
                    meta={"item": deepcopy(item)}
                )

        # pagination: cp = page number, ci = category id
        # https://list.suning.com/emall/showProductList.do?ci=502320&pg=03&cp=3
        # https://list.suning.com/emall/showProductList.do?ci=502320&pg=03&cp=4

        currentPage = int(re.findall('param.currentPage = "(.*?)";', response.text)[0])
        # param.pageNumbers = "100";
        pageNumbers = int(re.findall('param.pageNumbers = "(.*?)";', response.text)[0])
        print(currentPage, pageNumbers)
        if currentPage < pageNumbers:
            next_url = "https://list.suning.com/emall/showProductList.do?ci={}&pg=03&cp={}".format(item['h_2'], currentPage+1)
            print(next_url)
            yield scrapy.Request(
                url=next_url,
                callback=self.parse_book_list,
                meta={"item": response.meta['item']}
            )

    def parse_data(self, data):
        return data.replace("\n", "").replace("\t", "").replace("\r", "")

    def parse_book_detail(self, response):
        item = response.meta["item"]
        item["book_author"] = response.xpath("//ul[@class='bk-publish clearfix']/li[1]/text()").extract_first()
        if item["book_author"] == None:
            item["book_author"] = "无"
        else:
            item["book_author"] = self.parse_data(item["book_author"])
        item["book_press"] = response.xpath("//ul[@class='bk-publish clearfix']/li[2]/text()").extract_first()
        # item["book_press"] = self.parse_data(item["book_press"])
ROBOTSTXT_OBEY = False

1.16 Full notes

Beginner: https://www.yuque.com/qiucheng-8htms/haqumr/hxyenk#57o68

Everything: https://www.yuque.com/books/share/9611f921-b1a8-4e5a-9588-22337077ba53?#

Chapter 2: Distributed Scrapy

2.1 The Redis in-memory database

2.1.1 Background

With the arrival of the Internet-plus-big-data era, traditional relational databases can no longer keep up with the ever-growing traffic and data volumes of medium and large websites. A component that can read and write data quickly is needed to relieve the I/O pressure on the database servers and remove the system's performance bottleneck.

2.1.2 A short history of database architectures

1. Before the Internet-plus-big-data era, a single database instance was enough for a company's internal information systems: single database instance

2. As users and data grew, a single instance could no longer handle the read load: cache (memcache) + single database instance

3. A cache relieves the read pressure, but the write load keeps growing: cache + master/slave databases + read/write splitting

4. Data grows again and, even with read/write splitting, the master's write load becomes the bottleneck: cache + master/slave database cluster + read/write splitting + sharding by database and table

5. In the Internet-plus-big-data era, relational databases cannot handle highly concurrent, real-time data with flexible formats well:

NoSQL + master/slave database cluster + read/write splitting + sharding by database and table

2.1.3 What is Redis?

Redis is a high-performance, open-source, key-value NoSQL database written in C.

NoSQL: "not only SQL", the umbrella term for non-relational databases, e.g. Redis/MongoDB/HBase (Hadoop)

Relational databases: MySQL, Oracle, SQL Server

2.1.4 Comparing NoSQL and SQL databases

  • Different use cases: SQL databases suit queries over data with very complex relationships; NoSQL is the opposite
  • Transactions: SQL databases have mature transaction support, while most NoSQL stores barely support transactions
  • The two keep borrowing strengths from each other

2.1.5 Redis features

  • Redis supports persistence: the in-memory data can be saved to disk and loaded again after a restart
  • Redis supports not only simple key-value strings but also data types such as lists and sets
  • Redis supports data backup

2.1.6 What is Redis for?

Redis's main job: fast reads and writes

2.1.7 Typical Redis use cases

Likes, flash sales, the online-friends list of a live-streaming platform, product leaderboards, single sign-on

2.1.8 How do you use Redis?

Official site: https://redis.io/

Command reference: http://doc.redisfans.com/

Redis's five main data types and their use cases:

string/list/set/hash/zset

2.1.9 Installing and starting Redis

sudo apt-get install redis-server

Show the help:
redis-server --help

Edit the Redis configuration file:
sudo vim /etc/redis/redis.conf
Change daemonize no to daemonize yes, then save and quit

Start the server:
redis-server

Start the service:
sudo service redis start

Stop the service:
sudo service redis stop

2.1.10 The Redis configuration file

/etc/redis/redis.conf

When Redis runs as a daemon it writes its pid to /var/run/redis.pid.
daemonize no

Port to listen on, 6379 by default; if set to 0, Redis will not listen on any TCP socket.
port 6379

Number of databases.
databases 16

Save the dataset to disk based on how many keys changed within a given time span.
The example below means:
save after 900 seconds if at least 1 key changed
save after 300 seconds if at least 10 keys changed
save after 60 seconds if at least 10000 keys changed

save 900 1
save 300 10
save 60 10000

By default Redis only accepts local connections, not connections from other machines.
bind 127.0.0.1

More configuration options: https://www.cnblogs.com/kreo/p/4423362.html

2.1.11 Basic Redis commands

DBSIZE       number of keys in the current database
keys *       list the keys
FLUSHDB      delete all keys of the current database
FLUSHALL     delete the keys of every database (use with care)
exists key   check whether a key exists

2.2 Redis data types

1. redis string

A string is the most basic Redis type: one key maps to one value

A string value can hold any data, up to 512 MB

1.set/get/del/append/strlen

set    ---- set a value
get    ---- get a value
mset   ---- set several values
mget   ---- get several values
append ---- append to a value
del    ---- delete a key
strlen ---- return the length of the string
expire ---- set an expiry

2.incr/decr/incrby/decrby

incr   ---- increment
decr   ---- decrement
incrby ---- increment by a given amount
decrby ---- decrement by a given amount

3.getrange/setrange

getrange ---- get the value within a given range, similar to between ... and
getrange name 0 -1
setrange ---- overwrite starting at a given position; indexing starts at zero
0 -1 means the whole string

2. redis list (one key, many values)

List

A list is a simple list of strings sorted by insertion order; elements can be pushed onto the head (left) or the tail (right)

Under the hood it is a linked list

1.lpush/rpush/lrange

lpush/rpush/lrange ---- push from the left / push from the right / get a range
lpush list01  1 2 3 4 5  stored in reverse order
rpush list02  1 2 3 4 5  stored in insertion order
lrange list01 0 -1       get all values of list01

2.lpop/rpop

lpop/rpop ---- remove the leftmost / rightmost element
lpop list01   removes the element 5
rpop list01   removes the element 1

3.lindex: get an element by index (top to bottom)

lrange list01 0 -1
lindex list01 1

4.llen: length of the list

llen list01

5.lrem key

remove N occurrences of a value
lrem list01 2 1   remove two 1s from list01

6.ltrim key

ltrim ---- keep only the elements between a start index and an end index and assign them back to the key
ltrim list01 0 2   keep elements 0 to 2 of list01 and assign them back to list01

7.rpoplpush list1 list2: pop the last element of list1 and push it onto the front of list2

lrange list01 0 -1
lrange list02 0 -1
rpoplpush list01 list02

8.lset key index value

lset list01 0 x   replace the first element of list01 with x

9.linsert key before/after

linsert list01 before x php   insert the value php before x

3. redis hash

A hash is a collection of field-value pairs

A hash is a map of string fields to string values; it is well suited to storing objects

1.hset/hget/hmset/hmget/hgetall/hdel

set a field / get a field / set several fields / get several fields / get all fields / delete a field
hset user id 11
hget user id
hmset customer id 11 name juran age 26
hmget customer id name age      returns only the requested values
hgetall customer                returns everything
hdel user id                    deletes the id field

2.hlen

number of fields in the hash
hlen customer

3.hexists key

hexists ---- whether a field exists in the key
returns 1 if it exists, 0 if it does not

4.hkeys/hvals

hkeys students
hvals students

4. redis set (no duplicates)

Set

A set is an unordered collection of strings

1.sadd/smembers/sismember

sadd/smembers/sismember ---- add / list the members / check membership
sadd set01 1 2 2 3 3   duplicates are dropped on insert
smembers set01         list the members of set01
sismember set01 1      returns 1 if the member exists, 0 if it does not

2.scard

scard ---- number of members in the set
scard set01

3.srem key value

srem ---- remove a member from the set
srem set01 3
SMEMBERS set01   3 has been removed

4.srandmember key

srandmember ---- return a few random members
sadd set02  1 2 3 4 5 6 7 8
srandmember set02  2

5.spop key

spop ---- pop a random member
spop set01

6.smove key1 key2

sadd set03 x y z
smove set01 set03 2   move the member 2 from set01 to set03

7.set operations

sadd set01 1 2 3 4 5
sadd set02 1 2 3 a b
difference
SDIFF set01 set02   returns 4 5, the members in the first set but not the second
intersection
SINTER set01 set02  returns 1 2 3
union
SUNION set01 set02  returns the members of set01 and set02 with duplicates removed

5. redis zset

Zset (sorted set)

1.zadd/zrange

zadd zset01 60 v1 70 v2 80 v3 90 v4 100 v5
zrange zset01 0 -1 [withscores]
add withscores to return the scores as well

2.zrangebyscore key start end

zrangebyscore key start end ---- get the members whose score lies between start and end
zrangebyscore zset01 60 70

zrangebyscore zset01 60 (90           ( means the bound 90 is excluded

zrangebyscore zset01 60 90 limit 1 2  skip one result, then return two

3.zrem key

zrem key value ---- remove the given member (and its score), i.e. delete the element
zrem zset01 v1

4.zcard / zcount key min max / zrank key member

zcard zset01          total number of members
zcount zset01 60 90   number of members with a score between 60 and 90
zrank zset01 v2       returns 1, the member's rank (starting at 0)

Operating Redis from Python

Installing and connecting with redis-py

Install:

pip install redis

Connect:

r = redis.StrictRedis(host='localhost',port=6379,db=0,decode_responses=True)

String operations

import redis

class TestString(object):
    def __init__(self):
        self.r = redis.StrictRedis(host='192.168.75.130', port=6379)

    # set a value
    def test_set(self):
        res = self.r.set('user1', 'juran-1')
        print(res)

    # get a value
    def test_get(self):
        res = self.r.get('user1')
        print(res)

    # set several values
    def test_mset(self):
        d = {
            'user2': 'juran-2',
            'user3': 'juran-3'
        }
        res = self.r.mset(d)

    # get several values
    def test_mget(self):
        l = ['user2', 'user3']
        res = self.r.mget(l)
        print(res)

    # delete a value
    def test_del(self):
        self.r.delete('user2')

List operations

class TestList(object):
    def __init__(self):
        self.r = redis.StrictRedis(host='192.168.75.130', port=6379)

    # push values
    def test_push(self):
        res = self.r.lpush('common', '1')
        res = self.r.rpush('common', '2')
        # res = self.r.rpush('jr', '123')

    # pop values
    def test_pop(self):
        res = self.r.lpop('common')
        res = self.r.rpop('common')

    # get a range of values
    def test_range(self):
        res = self.r.lrange('common', 0, -1)
        print(res)

Set operations

class TestSet(object):
    def __init__(self):
        self.r = redis.StrictRedis(host='192.168.75.130', port=6379)

    # add members
    def test_sadd(self):
        res = self.r.sadd('set01', '1', '2')
        lis = ['Cat', 'Dog']
        res = self.r.sadd('set02', *lis)  # unpack the list into individual members

    # remove a member
    def test_del(self):
        res = self.r.srem('set01', 1)

    # pop a random member
    def test_pop(self):
        res = self.r.spop('set02')

Hash operations

class TestHash(object):
    def __init__(self):
        self.r = redis.StrictRedis(host='192.168.75.130', port=6379)

    # set several fields at once
    def test_hset(self):
        dic = {
            'id': 1,
            'name': 'huawei'
        }
        res = self.r.hmset('mobile', dic)

    # get all fields
    def test_hgetall(self):
        res = self.r.hgetall('mobile')

    # check whether a field exists: returns 1 if it does, 0 if not
    def test_hexists(self):
        res = self.r.hexists('mobile', 'id')
        print(res)

Scrapy-Redis distributed crawling

What is scrapy_redis?

scrapy_redis: Redis-based components for Scrapy

GitHub: https://github.com/rmax/scrapy-redis

Getting scrapy_redis

Clone the scrapy_redis source from GitHub:
git clone https://github.com/rolando/scrapy-redis.git

The settings file of the scrapy_redis example project

# Scrapy settings for example project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/topics/settings.html
#
SPIDER_MODULES = ['example.spiders']
NEWSPIDER_MODULE = 'example.spiders'

USER_AGENT = 'scrapy-redis (+https://github.com/rolando/scrapy-redis)'

# which dedup class to use for filtering duplicate requests
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# use the scrapy_redis scheduler, whose request queue lives in Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# keep the data in Redis after the crawl; if False, the queues are cleared when the spider closes
SCHEDULER_PERSIST = True
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

ITEM_PIPELINES = {
    'example.pipelines.ExamplePipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

LOG_LEVEL = 'DEBUG'

# Introduce an artifical delay to make use of parallelism. to speed up the
# crawl.
DOWNLOAD_DELAY = 1

# connecting to Redis

# two ways to configure the Redis connection
# REDIS_URL = 'redis://127.0.0.1:6379'

# REDIS_HOST = '127.0.0.1'
# REDIS_PORT = '6379'

Running scrapy_redis

allowed_domains = ['dmoztools.net']
start_urls = ['http://www.dmoztools.net/']

scrapy crawl dmoz

After the run, three extra keys show up in Redis:

dmoz:requests   the pending Request objects still to be crawled
dmoz:items      the scraped items
dmoz:dupefilter fingerprints of the requests that were already made

Converting a normal spider into a distributed one

# converting to a distributed spider
# 1. adapt the spider
# 1.1 import the class
from scrapy_redis.spiders import RedisSpider
# 1.2 inherit from it
class JdSpider(RedisSpider):
# 1.3 comment out start_urls (and set redis_key instead)
# 2. adapt the settings file
# Scrapy settings for example project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/topics/settings.html
#
SPIDER_MODULES = ['example.spiders']
NEWSPIDER_MODULE = 'example.spiders'

USER_AGENT = 'scrapy-redis (+https://github.com/rolando/scrapy-redis)'

# which dedup class to use for filtering duplicate requests
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# use the scrapy_redis scheduler, whose request queue lives in Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# keep the data in Redis after the crawl; if False, the queues are cleared when the spider closes
SCHEDULER_PERSIST = True
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

ITEM_PIPELINES = {
    'example.pipelines.ExamplePipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

LOG_LEVEL = 'DEBUG'

# Introduce an artifical delay to make use of parallelism. to speed up the
# crawl.
DOWNLOAD_DELAY = 1

# connecting to Redis

# two ways to configure the Redis connection
# REDIS_URL = 'redis://127.0.0.1:6379'

# REDIS_HOST = '127.0.0.1'
# REDIS_PORT = '6379'

4. Push the start URL into Redis (the key must match the spider's redis_key)
lpush jd https://book.jd.com/booksort.html

jd.py

# -*- coding: utf-8 -*-
import scrapy
from copy import deepcopy
from scrapy_redis.spiders import RedisSpider


# converting to a distributed spider
# 1. adapt the spider
# 1.1 import the class
# 1.2 inherit from it
# 1.3 comment out start_urls
# 2. adapt the settings file


class JdSpider(RedisSpider):
    name = 'jd'
    allowed_domains = ['jd.com']
    # start_urls = ['https://book.jd.com/booksort.html']
    redis_key = "jd"

    def parse(self, response):
        dt_list = response.xpath("//div[@class='mc']/dl/dt")

        for dt in dt_list:
            item = {}
            item['b_cate'] = dt.xpath("./a/text()").extract_first()
            # //a[@id='3']/following-sibling::a[1]
            em_list = dt.xpath("./following-sibling::dd[1]/em")
            for em in em_list:
                item['s_cate'] = em.xpath("./a/text()").extract_first()
                item['s_href'] = em.xpath("./a/@href").extract_first()
                # https://list.jd.com/1713-3258-3297.html
                # https://list.jd.com/1713,3258,3297.html
                # item['s_href'] = item['s_href'].replace("-", ",").split("/")
                # item['s_href'] = item['s_href'][2] + "/list.html?cat=" + item['s_href'][3].replace(".html", "")
                # print(item)
                # https://list.jd.com/list.html?cat=1713,3258,3297
                if item['s_href'] is not None:
                    item['s_href'] = "https:" + item['s_href']
                    yield scrapy.Request(
                        url=item['s_href'],
                        callback=self.parse_book_list,
                        meta={"item": deepcopy(item)}
                    )

    def parse_book_list(self, response):
        item = response.meta.get('item')
        # print(item)
        li_list = response.xpath("//div[@id='J_goodsList']/ul/li")
        for li in li_list:
            item['book_img'] = li.xpath(".//div[@class='p-img']/a/img/@src").extract_first()
            item["book_name"] = li.xpath(".//div[@class='p-name']/a/em/text()").extract_first()
            item["book_press"] = li.xpath(".//span[@class='p-bi-store']/a/text()").extract_first()
            # print(item)
            item["book_price"] = li.xpath(".//div[@class='p-price']/strong/i/text()").extract_first()
            # print(item)
            yield item

Crawling Dangdang books

# Scrapy settings for spiders project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'spiders'

SPIDER_MODULES = ['spiders.spiders']
NEWSPIDER_MODULE = 'spiders.spiders'


# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
}


# which dedup class to use for filtering duplicate requests
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# use the scrapy_redis scheduler, whose request queue lives in Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# keep the data in Redis after the crawl; if False, the queues are cleared when the spider closes
SCHEDULER_PERSIST = True

ITEM_PIPELINES = {
    'spiders.pipelines.SpidersPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

import scrapy
from copy import deepcopy
from scrapy_redis.spiders import RedisSpider
class DangdangSpider(scrapy.Spider):
    name = 'dangdang'
    allowed_domains = ['dangdang.com']
    start_urls = ['http://book.dangdang.com/']
    # redis_key='dangdang'

    def parse(self, response):
        div_list=response.xpath("//div[@class='con flq_body']/div")
        for div in div_list:
            item={}
            item['b_cate']=div.xpath("./dl/dt//text()").extract()
            item['b_cate'] =[i.strip() for i in item['b_cate'] if len(i.strip())>0]

            # print(item)

            dl_list=div.xpath(".//dl[@class='inner_dl']")
            for dl in dl_list:
                item['m_cate']=dl.xpath("./dt//text()").extract()
                item['m_cate'] = [i.strip() for i in item['m_cate'] if len(i.strip()) > 0][0]
                a_list=dl.xpath("./dd/a")
                for a in a_list:
                    item['s_cate']=a.xpath("./text()").extract_first()
                    item['s_href']=a.xpath("./@href").extract_first()
                    # print(item)
                    if item['s_href'] is not None:
                        yield scrapy.Request(
                            url=item['s_href'],
                            callback=self.parse_book_list,
                            meta={'item':deepcopy(item)}
                        )

    def parse_book_list(self,response):
        print(6666)
        item=response.meta['item']
        li_list=response.xpath("//ul[@class='bigimg']/li")
        print(li_list)
        for li in li_list:
            print('$$$$$')
            item['book_img'] = li.xpath('./a[@class="pic"]/img/@src').extract_first()
            # if item['book_img'] == 'xxxxx':
            #     pass
            item['book_name'] = li.xpath('./p[@class="name"]/a/@title').extract_first()
            print(item)
            yield item
        # next_url=response.xpath("//li[@class='next']/a/@href").extract_first()
        # if next_url is not None:
        #     new_next_url='http://category.dangdang.com'+next_url
        #     print("下一页", new_next_url)
        #
        #     yield scrapy.Request(
        #         url=new_next_url,
        #         callback=self.parse_book_list,
        #         meta={"item":deepcopy(item)}
        #     )


# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

import csv
class SpidersPipeline:
    def process_item(self, item, spider):
        print("=============================")
        print(item)
        datas = [item]
        header = ['b_cate', 'm_cate', 's_cate', 's_href', 'book_img', 'book_name']
        with open('test.csv', 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=header)  # declare the column names so each dict value lands in the right column
            writer.writeheader()  # write the header row (note: this runs once per item, so the header repeats)
            writer.writerows(datas)  # write the data rows
            print("written successfully")
        return item
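
Because process_item above reopens the file and rewrites the header for every item, a variant that opens the file once is sketched below (class name is a placeholder; it assumes the same field names):

import csv

class CsvPipeline:
    header = ['b_cate', 'm_cate', 's_cate', 's_href', 'book_img', 'book_name']

    def open_spider(self, spider):
        # open the file once and write the header a single time
        self.f = open('test.csv', 'w', newline='', encoding='utf-8')
        self.writer = csv.DictWriter(self.f, fieldnames=self.header)
        self.writer.writeheader()

    def process_item(self, item, spider):
        self.writer.writerow(dict(item))
        return item

    def close_spider(self, spider):
        self.f.close()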