Installation
pip install scrapy
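If the install succeeded, the scrapy command is now on your PATH; a quick way to check:
scrapy version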
Getting started
1. Create a Scrapy project
scrapy startproject <project_name>
scrapy startproject myspider
2. Generate a spider
scrapy genspider <spider_name> <allowed_domain>
The allowed crawling scope is a domain name.
For example:
https://fanyi.baidu.com/?aldtype=16047#en/zh/
For this site, the allowed crawling scope is fanyi.baidu.com.
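The matching spider could then be generated like this (the spider name fanyi is just an illustration):
scrapy genspider fanyi fanyi.baidu.com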
3. Extract the data
Flesh out the spider, using XPath and similar selectors.
4. Save the data
Save the data in a pipeline.
5. Run the spider
scrapy crawl <spider_name>
Getting started with Scrapy
Create a Scrapy project
Create a spider
Flesh out the spider
Use a pipeline
Example
Use the Scrapy framework to crawl the following site:
http://www.itcast.cn/channel/teacher.shtml#ajavaee
About the site
The page is a list of teacher profiles; for each teacher we want the name and the title.
Picking the start_urls address
http://www.itcast.cn/channel/teacher.shtml#ajavaee
First, check whether the response for this URL actually contains the data we want:
select the page, right-click, and choose "View page source".
We can see that the response for this URL does contain the data we are after,
so we use it as our start_urls address.
Writing the XPath expressions
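A convenient way to try XPath expressions before writing any spider code is Scrapy's interactive shell; the session below is a sketch that reuses the selector from the spider further down:
scrapy shell "http://www.itcast.cn/channel/teacher.shtml"
>>> response.xpath('//div[@class="main_bot"]/h2/text()').extract_first()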
Code preview
File tree
This is the final file tree:
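(The layout below is the standard one Scrapy generates for the commands in this walkthrough; middlewares.py is created automatically even though it is not edited here.)
Myspider/
├── scrapy.cfg
└── Myspider/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── itcast.py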
cmd
scrapy startproject Myspider
cd Myspider
scrapy genspider itcast itcast.cn
# target page: http://www.itcast.cn/channel/teacher.shtml#ajavaee
scrapy crawl itcast
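As a side note, for a quick look at the results you can skip the pipeline entirely and let Scrapy's feed exports write the items to a file:
scrapy crawl itcast -o teachers.json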
itcast.py
# -*- coding: utf-8 -*-
import scrapy

from ..items import MyspiderItem


class ItcastSpider(scrapy.Spider):
    name = 'itcast'  # the spider's name
    allowed_domains = ['itcast.cn']  # allowed crawling scope: the site's domain
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml#ajavaee']  # the first URL to request
    # A spider must be given at least one URL telling it where to start crawling.
    # `scrapy genspider` fills in a default start URL derived from the domain we passed it,
    # but the response for that generated URL usually does not contain the data we want,
    # so in practice we almost always replace it by hand, as we did here.

    def parse(self, response):
        # Note: this method must keep the name `parse`; it is the default callback
        # for start_urls, and without it the base class raises NotImplementedError.
        # It handles the response for the start_urls address: once we have
        # `response`, we can extract data from it, e.g.:
        # result = response.xpath('//ul[@class="clears"]//div[@class="main_mask"]/h2/text()')
        # print(result)
        # Get the teachers' names:
        # teacher_name = response.xpath('//ul[@class="clears"]//div[@class="main_mask"]/h2/text()').extract()
        # print(teacher_name)
        """
        Why can we call extract() on a "list"?
        Because xpath() does not return a plain list but a SelectorList,
        a list-like type defined by Scrapy itself.
        """
        # Extract the data group by group
        li_list = response.xpath('//div[@class="maincon"]/ul[@class="clears"]//li')  # all the li tags
        for li in li_list:
            item = MyspiderItem()
            # item["name"] = li.xpath('.//div[@class="main_bot"]/h2/text()').extract()[0]
            item["name"] = li.xpath('.//div[@class="main_bot"]/h2/text()').extract_first()
            # item["title"] = li.xpath('.//div[@class="main_bot"]/h2/span/text()').extract()[0]
            item["title"] = li.xpath('.//div[@class="main_bot"]/h2/span/text()').extract_first()
            # .extract_first() vs .extract()[0]:
            # .extract_first() returns the first string in the result,
            # so .extract_first() == .extract()[0] whenever there is a match.
            """
            When the XPath is wrong or matches nothing:
            - .extract_first() returns None
            - .extract()[0] raises an IndexError
            """
            # print(item)
            # How does the item reach the pipelines? We yield it:
            yield item
            # yielding item by item keeps memory usage low
            # (a spider callback must return Request, BaseItem, dict or None)
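To see the extract()/extract_first() behaviour outside a running spider, here is a minimal standalone sketch using Scrapy's Selector on made-up HTML:
from scrapy.selector import Selector

html = '<ul><li><h2>Alice</h2></li><li><h2>Bob</h2></li></ul>'  # made-up sample

sel = Selector(text=html)
print(sel.xpath('//h2/text()'))                  # a SelectorList, Scrapy's list-like type
print(sel.xpath('//h2/text()').extract())        # ['Alice', 'Bob']
print(sel.xpath('//h2/text()').extract_first())  # 'Alice'
print(sel.xpath('//h3/text()').extract_first())  # None: no match, no exception
# sel.xpath('//h3/text()').extract()[0]          # this would raise IndexError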
items.py
# -*- coding: utf-8 -*-
"""
Here we declare up front which fields we are going to scrape;
the fields are defined in this items.py file.
"""
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class MyspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    title = scrapy.Field()
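One benefit of declaring the fields is that scrapy.Item rejects anything undeclared, so a typo in a field name fails immediately (a small sketch):
item = MyspiderItem()
item['name'] = 'Alice'  # fine: declared above
item['age'] = 30        # raises KeyError: MyspiderItem does not support field: age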
pipelines.py
# -*- coding: utf-8 -*-
"""
Pipelines: where the scraped data is processed and saved.
"""
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# (i.e. the pipeline must be enabled in settings.py)
import json  # used to serialize the data


class MyspiderPipeline1:
    def process_item(self, item, spider):
        # `item` is the object the spider yielded; every yielded item lands here.
        # Note: this method must keep the name process_item,
        # otherwise Scrapy cannot call it.
        print(item)
        # item["你好"] = "hello"  # we could also add key/value pairs to the item here
        # save the data locally
        self.save_item(dict(item))
        return item
        # For the data to travel through several pipelines, process_item must
        # return the item; otherwise the next pipeline receives None.

    def save_item(self, item):
        """Append the item to a local file, one JSON object per line."""
        # open with an explicit utf-8 encoding, since ensure_ascii=False
        # writes the Chinese text as-is
        with open("temp.txt", "a", encoding="utf-8") as f:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")


class MyspiderPipeline2:
    def process_item(self, item, spider):
        # same rule: the method must be named process_item
        return item
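Re-opening the file for every item works, but pipelines also offer open_spider/close_spider hooks that run once per crawl; a possible variant (the class name JsonLinePipeline is made up here) looks like this:
import json


class JsonLinePipeline:
    def open_spider(self, spider):
        # called once, when the spider starts
        self.f = open("teachers.jl", "a", encoding="utf-8")

    def close_spider(self, spider):
        # called once, when the spider finishes
        self.f.close()

    def process_item(self, item, spider):
        # one JSON object per line ("JSON lines" format)
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item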
settings.py
What changed
(relative to the generated template: LOG_LEVEL was added, USER_AGENT was set to a real browser string, and ITEM_PIPELINES was enabled)
Full code
# -*- coding: utf-8 -*-
"""这是整个项目的设置的文件"""
# Scrapy settings for Myspider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'Myspider'
SPIDER_MODULES = ['Myspider.spiders']
NEWSPIDER_MODULE = 'Myspider.spiders'
LOG_LEVEL = "WARN"
# 比"WARN"等级要小的日志都不会再显示在终端中
# log4j定义了8个级别的log(除去OFF和ALL,可以说分为6个级别),优先级从高到低依次为:OFF、FATAL、ERROR、WARN、INFO、DEBUG、TRACE、 ALL。
# Log4j建议只使用四个级别,优先级从高到低分别是ERROR、WARN、INFO、DEBUG。
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# True  = obey the site's robots.txt rules
# False = ignore them
# the default is True, i.e. robots.txt is obeyed
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'Myspider.middlewares.MyspiderSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'Myspider.middlewares.MyspiderDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Myspider.pipelines.MyspiderPipeline1': 300,  # runs first
    # 'Myspider.pipelines.MyspiderPipeline2': 301,
}
# key:   the pipeline class's import path, relative to the project root
# value: the pipeline's "distance from the engine"; the smaller the number,
#        the closer to the engine and the earlier it runs
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Food for thought
Why do we need to give the spider an allowed crawling scope at all?
Because crawling with the Scrapy framework is very fast; without this limit, the spider could end up pulling pages from other sites we never intended to crawl.
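Concretely: with allowed_domains = ['itcast.cn'], Scrapy's built-in OffsiteMiddleware drops every request a callback yields to any other domain and logs a "Filtered offsite request" line, so even a sloppy link extractor cannot wander off. A small sketch of what gets through:
def parse(self, response):
    # inside itcast.cn: scheduled as usual
    yield scrapy.Request("http://www.itcast.cn/channel/teacher.shtml")
    # example.com is not in allowed_domains: filtered by OffsiteMiddleware
    yield scrapy.Request("http://www.example.com/")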