Scrapy Notes
What is Scrapy
- Scrapy is an asynchronous web-crawling framework (its networking layer is built on Twisted)
Environment setup
Linux/Mac:
pip install scrapy
Windows:
pip install wheel
- Download the Twisted wheel matching your Python version and system:
http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
- cd into the download directory and install the wheel, e.g.:
pip install Twisted-20.3.0-cp36-cp36m-win_amd64.whl
pip install pywin32
pip install scrapy
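- To confirm the install worked, a quick import is enough; a minimal check:
# Sanity check: import Scrapy and print the installed version.
import scrapy

print(scrapy.__version__)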
Basic usage
- Create a project:
scrapy startproject ProName
- Enter the project directory:
cd ProName
- Create a spider file:
scrapy genspider spiderName www.xx.com
- Run the project:
scrapy crawl spiderName
- Save the values yielded by parse to a local file (supported extensions include .json, .jsonl, .csv, .xml):
scrapy crawl spiderName -o filename.ext
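- The exporter serializes whatever parse yields, so the spider only needs to yield dicts (or items). A minimal sketch of a spider whose output can be saved this way (the spider name, site, and XPath expressions are placeholders):
# Export with, e.g.:  scrapy crawl demo -o out.json
import scrapy


class DemoSpider(scrapy.Spider):
    name = 'demo'                        # hypothetical spider name
    start_urls = ['http://www.xx.com/']  # placeholder URL

    def parse(self, response):
        # Every dict yielded here becomes one record in the output file.
        for li in response.xpath('//li'):
            yield {'title': li.xpath('./a/text()').get()}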
settings.py configuration
- Set the User-Agent:
USER_AGENT = '...'
- Disable robots.txt compliance:
ROBOTSTXT_OBEY = False
- Set the log level:
LOG_LEVEL = 'ERROR'
- Register item pipelines (the number is the priority; lower runs first):
ITEM_PIPELINES = {
    'ProName.pipelines.ProNamePipeline': 300,
}
- Set the image download directory:
IMAGES_STORE = 'img'
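- All of these can also be overridden for a single spider through the custom_settings class attribute; a minimal sketch (the spider name is hypothetical):
import scrapy


class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['http://www.xx.com/']
    # Per-spider overrides; these take precedence over settings.py.
    custom_settings = {
        'ROBOTSTXT_OBEY': False,
        'LOG_LEVEL': 'ERROR',
    }

    def parse(self, response):
        pass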
Persistent storage example
- Create the project:
scrapy startproject one
- Enter the project:
cd one
- Create the spider file:
scrapy genspider first www.xx.com
- Directory layout:
one
│  items.py
│  middlewares.py
│  pipelines.py
│  settings.py
│  __init__.py
│
├─spiders
│  │  first.py
│  │  __init__.py
- Spider file:
# first.py
import scrapy
from one.items import OneItem


class FirstSpider(scrapy.Spider):
    name = 'first'
    # allowed_domains = ['www.xx.com']
    start_urls = ['http://www.xx.com/']  # fill in the real URL
    url = 'http://www.xx.com/%d/'  # page URL template for multi-page crawling
    page = 2  # next page number

    def parse(self, response):
        li_list = response.xpath("...")
        for li in li_list:
            title = li.xpath('...').extract_first()
            detail_url = li.xpath('...').extract_first()  # detail page URL
            item = OneItem()  # instantiate an OneItem object
            item['title'] = title
            # pass the item along to the detail callback via meta
            yield scrapy.Request(detail_url, callback=self.parse_detail,
                                 meta={'item': item})
        if self.page <= 50:
            new_url = self.url % self.page
            self.page += 1
            # send the next-page request manually (recursive crawl)
            yield scrapy.Request(new_url, callback=self.parse)

    def parse_detail(self, response):
        item = response.meta['item']
        url = response.xpath('...').extract_first()
        item['url'] = url
        yield item
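- As an aside, the meta-based hand-off above works, but Scrapy 1.7+ also offers cb_kwargs, which delivers values straight into the callback's signature and keeps meta free for middleware data. A sketch of the same hand-off (URLs and selectors are placeholders):
import scrapy
from one.items import OneItem


class FirstSpider(scrapy.Spider):
    name = 'first'
    start_urls = ['http://www.xx.com/']  # placeholder URL

    def parse(self, response):
        item = OneItem()
        item['title'] = response.xpath('...').extract_first()
        # cb_kwargs delivers item straight into parse_detail's signature.
        yield scrapy.Request('http://www.xx.com/detail/',  # placeholder
                             callback=self.parse_detail,
                             cb_kwargs={'item': item})

    def parse_detail(self, response, item):
        item['url'] = response.xpath('...').extract_first()
        yield item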
- Item definition:
# items.py
import scrapy


class OneItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    url = scrapy.Field()
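- Item objects behave like restricted dicts: declared fields are read and written with subscript access, and undeclared keys raise KeyError. A quick illustration:
from one.items import OneItem

item = OneItem()
item['title'] = 'example'           # declared field: OK
item['url'] = 'http://www.xx.com/'
print(dict(item))                   # {'title': 'example', 'url': 'http://www.xx.com/'}
# item['other'] = '...'             # undeclared field: raises KeyError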
- Database setup:
mysql -uroot -p123  # log in to MySQL
create database scrapy;  # create the database
use scrapy;  # switch to it
create table first (title varchar(50), url varchar(128));  # create the table
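- The connection parameters can be verified up front with a short standalone script, assuming the database and table created above:
# Standalone check that the scrapy database and first table exist.
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='123', database='scrapy', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('show tables')
    print(cursor.fetchall())  # expect (('first',),)
conn.close()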
- Persistence pipelines:
# pipelines.py
import pymysql


class TextPipeline:
    fp = None  # file handle

    def open_spider(self, spider):
        """Called once when the spider opens."""
        print("spider started")
        self.fp = open('a.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.fp.write(item['title'] + ": " + item['url'] + '\n')
        return item

    def close_spider(self, spider):
        """Called once when the spider closes."""
        print("spider finished")
        self.fp.close()


class MysqlPipeline:
    conn = None
    cursor = None

    def open_spider(self, spider):
        """Called once when the spider opens."""
        print("spider started (MySQL storage)")
        self.conn = pymysql.Connect(
            host='127.0.0.1',
            port=3306,
            database='scrapy',
            user='root',
            password='123',
            charset='utf8'
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # parameterized query avoids quoting and injection problems
        sql = 'insert into first values (%s, %s)'
        try:
            self.cursor.execute(sql, (item['title'], item['url']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        """Called once when the spider closes."""
        print("spider finished (MySQL storage)")
        self.cursor.close()
        self.conn.close()
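- Hard-coding credentials is fine for notes, but Scrapy pipelines also support a from_crawler hook that can read them from settings.py instead. A sketch of that variant (the MYSQL_* setting names are made up for the example):
import pymysql


class MysqlPipeline:
    def __init__(self, host, user, password, database):
        self.host, self.user = host, user
        self.password, self.database = password, database
        self.conn = None

    @classmethod
    def from_crawler(cls, crawler):
        # Pull connection details from settings.py (hypothetical keys).
        s = crawler.settings
        return cls(s.get('MYSQL_HOST', '127.0.0.1'),
                   s.get('MYSQL_USER', 'root'),
                   s.get('MYSQL_PASSWORD', '123'),
                   s.get('MYSQL_DATABASE', 'scrapy'))

    def open_spider(self, spider):
        self.conn = pymysql.connect(host=self.host, user=self.user,
                                    password=self.password,
                                    database=self.database, charset='utf8')

    def process_item(self, item, spider):
        with self.conn.cursor() as cursor:
            cursor.execute('insert into first values (%s, %s)',
                           (item['title'], item['url']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()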
- settings.py configuration:
# Add or modify the following settings
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
ITEM_PIPELINES = {
    # lower number = higher priority (valid range 0-1000)
    'one.pipelines.MysqlPipeline': 300,
    'one.pipelines.TextPipeline': 301,
}
- Run the project:
scrapy crawl first
Image download example
- Create the project:
scrapy startproject one
- Enter the project:
cd one
- Create the spider file:
scrapy genspider first www.xx.com
- Directory layout:
one
│  items.py
│  middlewares.py
│  pipelines.py
│  settings.py
│  __init__.py
│
├─spiders
│  │  first.py
│  │  __init__.py
- Spider file:
# first.py
import scrapy
from one.items import OneItem


class FirstSpider(scrapy.Spider):
    name = 'first'
    # allowed_domains = ['www.xx.com']
    start_urls = ['http://www.xx.com/']  # fill in the real URL
    url = 'http://www.xx.com/%d/'  # page URL template for multi-page crawling
    page = 2  # next page number

    def parse(self, response):
        li_list = response.xpath("...")
        for li in li_list:
            title = li.xpath('...').extract_first() + '.jpg'
            detail_url = li.xpath('...').extract_first()  # detail page URL
            item = OneItem()  # instantiate an OneItem object
            item['title'] = title
            # pass the item along to the detail callback via meta
            yield scrapy.Request(detail_url, callback=self.parse_detail,
                                 meta={'item': item})
        if self.page <= 50:
            new_url = self.url % self.page
            self.page += 1
            # send the next-page request manually (recursive crawl)
            yield scrapy.Request(new_url, callback=self.parse)

    def parse_detail(self, response):
        item = response.meta['item']
        url = response.xpath('...').extract_first()
        item['url'] = url
        yield item
- Item definition:
# items.py
import scrapy


class OneItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    url = scrapy.Field()
- Image download pipeline:
# pipelines.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class ImgPipeline(ImagesPipeline):
    # issue a request for each image URL
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['url'], meta={'item': item})

    # return the file path (relative to IMAGES_STORE) for the downloaded image
    def file_path(self, request, response=None, info=None, *, item=None):
        item = request.meta['item']
        title = item['title']
        return title
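- ImagesPipeline also exposes an item_completed hook that reports each download's outcome; a sketch that drops items whose image could not be fetched:
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class ImgPipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        # results is a list of (success, detail) tuples, one per media request.
        if not any(ok for ok, detail in results):
            raise DropItem('image download failed: %s' % item['url'])
        return item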
- settings.py configuration:
# Add or modify the following settings
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
IMAGES_STORE = 'img'  # directory where downloaded images are stored
ITEM_PIPELINES = {
    # lower number = higher priority
    'one.pipelines.ImgPipeline': 300,
}
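- Note: ImagesPipeline relies on the Pillow library for image handling, so it must be installed as well:
pip install Pillow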
- Run the project:
scrapy crawl first