Scrapy帮助

最新推荐文章于 2022-03-15 18:07:19 发布

HukDog

最新推荐文章于 2022-03-15 18:07:19 发布

阅读量264

点赞数

分类专栏： Python 文章标签：码农之路

本文链接：https://blog.csdn.net/qq_39090704/article/details/81670101

版权

Python 专栏收录该内容

19 篇文章 0 订阅

订阅专栏

爬虫文件目录spider下，创建任意名称的.py文件写入如下内容，右键运行，可在pycharm中模拟命令行终端

# Helper script: run a spider from the IDE as if typing the command in a terminal.
from scrapy import cmdline

# Replace "blabla" with the `name` attribute of the spider to run
cmdline.execute("scrapy crawl blabla".split())

爬虫文件中的name,allowed_domains和start_urls:

name = ''  # name of the spider, used by `scrapy crawl <name>`

allowed_domains = ['']  # domains the spider is allowed to crawl

# start_urls is the attribute you will usually edit
start_urls = ['']  # URLs the crawl starts from; several URLs are requested concurrently

爬取网站过程

1.items.py文件定义数据模型，也就是用于存储主爬虫程序爬取的内容的字段

2.然后在爬虫文件中导入from ..items import 自动生成的爬虫item类

3.初始化数据模型对象，对数据模型字段赋值，yield逐条返回item

4.返回的item进入管道文件pipelines 进行处理和返回，以及自定义管道

5.middlewares中间件多用于请求数据，同时自定义设置请求头信息，自定义中间件

6.settings文件用于配置项目信息，包括请求信息，管道调用和优先级，延迟delay,中间件等等

主爬虫文件中的def parse(self,response)函数

# -*- coding: utf-8 -*-
import scrapy


class BaiduspiderSpider(scrapy.Spider):
    """Demo spider showing how to read a response and yield items from ``parse``.

    NOTE(review): the snippet this replaces looped over an undefined
    ``ul_list`` and assigned a misspelled ``retult``. This version yields one
    item per extracted ``cover-text`` value -- confirm that this matches the
    intended page structure.
    """

    # Spider name (the value passed to `scrapy crawl`)
    name = 'baiduSpider'
    # Domains the spider may crawl (left disabled here)
    # allowed_domains = ['baidu.com']
    start_urls = ['http://www.baidu.com/']

    def parse(self, response):
        """Yield one ``MokoItem`` per ``cover-text`` attribute found in the page.

        ``response`` is the result returned for a request to one of
        ``start_urls``. Useful attributes while debugging:

        * ``response.text``    -- response text
        * ``response.body``    -- response body (``response.body.decode()`` gives text)
        * ``response.headers`` -- response headers
        * ``response.status``  -- status code
        """
        # Import the item class declared in the project's items.py
        from ..items import MokoItem

        # xpath() can be called directly on the response; the result is a
        # list of selectors (each shown as selector=<...>, data=<...>).
        covers = response.xpath('.//div[@class="cover"]/@cover-text')
        # extract() converts the selectors into a plain list of strings.
        # For a single value, use instead:
        #     response.xpath('...').extract_first('')
        titles = covers.extract()

        for title in titles:
            # Create a fresh item and fill in its fields
            item = MokoItem()
            item['title'] = title

            # Hand the items over one at a time
            yield item

在控制台运行爬虫时可以同时保存返回的item为本地文件

# Export the scraped items to a file; the feed format follows the file extension (e.g. json, jsonlines, csv, xml)
scrapy crawl meikong -o meikong.xml

# Set the export encoding so non-ASCII text is written as UTF-8
scrapy crawl meikong -o mei.json -s FEED_EXPORT_ENCODING=UTF-8

settings部分配置说明

BOT_NAME = 'baidu'

# Modules where the project's spiders live, and where new spiders are created
SPIDER_MODULES = ['baidu.spiders']
NEWSPIDER_MODULE = 'baidu.spiders'

# Whether to obey robots.txt rules (False: robots.txt is ignored)
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Default request headers; a User-Agent entry can be added here
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Spider middlewares
SPIDER_MIDDLEWARES = {
    # The number is the order value: lower values sit closer to the engine,
    # higher values closer to the spider
    'baidu.middlewares.BaiduSpiderMiddleware': 543,
}

# Downloader middlewares
# (the setting name must be spelled DOWNLOADER_MIDDLEWARES, otherwise Scrapy
# does not pick it up)
DOWNLOADER_MIDDLEWARES = {
    # The number is the order value: lower values sit closer to the engine,
    # higher values closer to the downloader
    'baidu.middlewares.BaiduDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Item pipelines
ITEM_PIPELINES = {
    # Items pass through pipelines from lower to higher order values
    'baidu.pipelines.BaiduPipeline': 1,
}

主爬虫文件中图片和文件链接字段【】和回调函数callback=

在 settings.py 文件中启用 Scrapy 自带的文件、图片处理管道，用来下载图片和文件

spider.py

# -*- coding: utf-8 -*-
import scrapy
from ..items import ImagenetItem

class ImageSpider(scrapy.Spider):
    """Collect image URLs from the pic.netbian.com gallery, following pagination."""

    name = 'image'
    allowed_domains = ['pic.netbian.com']
    # URL the crawl starts from
    start_urls = ['http://pic.netbian.com/4kmeishi/']

    def parse(self, response):
        """Yield one ``ImagenetItem`` per image on the page, then the next page.

        Each item carries the absolute image URL in ``img_src``. When a
        "next page" link is present, a request for it is yielded with this
        same method as the callback.
        """
        # Every <img src> inside the gallery list
        sources = response.xpath('//ul[@class="clearfix"]/li/a/img/@src').extract()
        for src in sources:
            # Item model declared in items.py
            item = ImagenetItem()
            # The download URL must be wrapped in a list; urljoin resolves
            # the href against the page URL instead of concatenating strings.
            item['img_src'] = [response.urljoin(src)]
            # NOTE(review): the earlier version also set item['fileUrl'] from
            # a variable that was never defined (NameError). Assign it here,
            # again wrapped in a list, once a real file link is extracted:
            #     item['fileUrl'] = [file_url]
            # Hand the item over to the pipelines
            yield item

        next_href = response.xpath(
            '//div[@class="page"]/a[text()="下一页"]/@href'
        ).extract_first()
        if next_href:
            # Request the next page and process it with self.parse again
            yield scrapy.Request(response.urljoin(next_href), callback=self.parse)

settings.py

# Pipelines enabled for this project, keyed by class path with their order value
ITEM_PIPELINES = {
    # 'imageNet.pipelines.ImagenetPipeline': 300,
    # Scrapy's pipeline dedicated to downloading images
    'scrapy.pipelines.images.ImagesPipeline': 1,
    # Scrapy's pipeline for downloading files
    'scrapy.pipelines.files.FilesPipeline': 2,
}

# Directory where downloaded images are stored
IMAGES_STORE = 'D:/imageDownLoad'
# Name of the item field that holds the image URLs to download
IMAGES_URLS_FIELD = 'img_src'

# Directory where downloaded files are stored, and the item field holding their URLs
FILES_STORE = 'd:/qishu/book/'
FILES_URLS_FIELD = 'fileUrl'

自定义管道文件：将 items 字段保存到 sqlite 数据库（注意要在 settings.py 文件中启用该管道）

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import sqlite3

class HongxiutianxiangPipeline(object):
    """Pass-through pipeline: hands every item on unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` untouched; ``spider`` is not used."""
        return item

class HongXiuDBPipeline(object):
    """Store each item's name, author, img and intro in a SQLite table."""

    def open_spider(self, spider):
        """Open the ``hongxiuDB`` database and create ``bookTable`` if missing."""
        self.con = sqlite3.connect('hongxiuDB')
        self.cursor = self.con.cursor()
        self.cursor.execute(
            'create table if not exists bookTable'
            '(name text,author text,img text,intro text)'
        )
        self.con.commit()

    def process_item(self, item, spider):
        """Insert one row built from ``item`` and return the item unchanged.

        Raises ``KeyError`` if ``item`` lacks any of the four fields.
        """
        print('--------------------------------------')
        # Bound parameters instead of string formatting: scraped text may
        # contain quotes, which would otherwise break or alter the statement.
        self.cursor.execute(
            'insert into bookTable VALUES (?,?,?,?)',
            (item['name'], item['author'], item['img'], item['intro']),
        )
        self.con.commit()
        return item

    def close_spider(self, spider):
        """Close the cursor and the database connection."""
        self.cursor.close()
        self.con.close()

自定义管道保存items到json文件中

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# 用来打开指定文件,并且对文件进行转码 防止出现乱码问题
import codecs
import json
import os
class XiaoshuoPipeline(object):
    """Write every item into ``book.json`` as ``{"list":[item, item, ...]}``."""

    def __init__(self):
        # 'w' creates the file if needed and truncates it otherwise; UTF-8 is
        # set explicitly so non-ASCII text is not garbled.
        self.file = open('book.json', mode='w', encoding='utf-8')
        # Open the enclosing object so the finished file is valid JSON
        self.file.write('{"list":[')
        # Tracks whether a separator is needed before the next item
        self._first_item = True

    def process_item(self, item, spider):
        """Append ``item`` to the JSON list and return it unchanged."""
        # A dict cannot be written to a file directly; serialize it first.
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        serialized = json.dumps(dict(item), ensure_ascii=False)
        # The separator goes before every item except the first, so no
        # trailing comma has to be removed when the spider closes.
        if not self._first_item:
            self.file.write(',\n')
        self.file.write(serialized)
        self._first_item = False
        return item

    def open_spider(self, spider):
        """Announce that the crawl has started."""
        print('爬虫开始')

    def close_spider(self, spider):
        """Close the JSON list and object, then close the file."""
        try:
            self.file.write(']}')
        finally:
            self.file.close()
        print('爬虫结束')

HukDog

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
Scrapy帮助

爬虫文件目录spider下，创建任意名称的.py文件写入如下内容，右键运行，可在pycharm中模拟命令行终端from scrapy import cmdline# blabla为爬虫文件的`name`值cmdline.execute("scrapy crawl blabla".split())爬虫文件中的name,allowed_domains和start_urls:na...
复制链接

扫一扫