Python 3 Scrapy Crawler Framework (Scrapy / scrapy-redis)
Written by Luzhuo. Please keep this notice when reposting.
Original post: https://blog..net/Rozol/article/details/80010173
Scrapy
Scrapy is written in Python and is mainly used to crawl website data; links that have already been crawled are filtered out automatically.
It is built on the Twisted asynchronous networking framework.
Official site: https://scrapy.org/
Documentation: https://docs.scrapy.org/en/latest/
Chinese documentation: http://scrapy-chs.readthedocs.io/zh_CN/latest/index.html
Install: pip install Scrapy
Installed packages: PyDispatcher-2.0.5 Scrapy-1.5.0 asn1crypto-0.24.0 cffi-1.11.5 cryptography-2.2.2 cssselect-1.0.3 parsel-1.4.0 pyOpenSSL-17.5.0 pyasn1-0.4.2 pyasn1-modules-0.2.1 pycparser-2.18 queuelib-1.5.0 service-identity-17.0.0 w3lib-1.19.0
Additional dependency (on Windows): pywin32-223
Common commands
Create a project: scrapy startproject mySpider
Spiders
Create a spider:
scrapy genspider tieba tieba.baidu.com
scrapy genspider -t crawl tieba tieba.baidu.com
Run a spider: scrapy crawl tieba
Distributed crawling:
Run a spider: scrapy runspider tieba.py
Push the start command: lpush tieba:start_urls http://tieba.baidu.com/f/index/xxx
Architecture
Scrapy Engine: handles the communication between the Spider, Item Pipeline, Downloader, and Scheduler
Scheduler: receives requests from the Engine, queues them, and hands them back to the Engine when it asks for the next one
Downloader: downloads every request the Engine sends it and returns the responses to the Engine, which passes them to the Spider for processing
Spider: extracts data from the responses, fills in the Item fields, and submits any URLs that need to be followed to the Engine, which forwards them to the Scheduler
Item Pipeline: processes and stores the Items produced by the Spider
Downloader Middlewares: components that extend the download functionality
Spider Middlewares: components that extend the communication between the Engine and the Spider (see the minimal sketch after this list)
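As a rough illustration (not taken from the original post), the snippet below sketches how these pieces meet in code: Items yielded by the Spider travel through the Engine to the Item Pipeline, while yielded Requests travel through the Engine to the Scheduler and then the Downloader. The spider name and URL are placeholders.
import scrapy

class FlowDemoSpider(scrapy.Spider):
    name = 'flow_demo'                      # hypothetical spider name
    start_urls = ['http://example.com/']    # placeholder start URL

    def parse(self, response):
        # Extracted data: yielded Items go Engine -> Item Pipeline.
        yield {'title': response.xpath('//title/text()').extract_first()}
        # Follow-up links: yielded Requests go Engine -> Scheduler -> Downloader.
        for href in response.xpath('//a/@href').extract():
            yield response.follow(href, callback=self.parse)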
Project layout
scrapy.cfg: project configuration file
mySpider: the project package
mySpider/items.py: item definitions (the data to extract)
mySpider/pipelines.py: item pipelines
mySpider/settings.py: project settings
mySpider/spiders: directory holding the spider code
Using Scrapy
Steps
Create the project
scrapy startproject mySpider
Define what to extract (items.py)
class TiebaItem(scrapy.Item):
    # Fields to store
    # Tieba (forum) name
    name = scrapy.Field()
    # Description
    summary = scrapy.Field()
    # Total number of members
    person_sum = scrapy.Field()
    # Total number of posts
    text_sum = scrapy.Field()
Create the spider
cd into the project directory (mySpider)
scrapy genspider tieba tieba.baidu.com
The spider file is created at mySpider/spiders/tieba.py
Write the spider
# -*- coding: utf-8 -*-
import scrapy
from mySpider.items import TiebaItem

class TiebaSpider(scrapy.Spider):
    name = 'tieba'  # spider name
    allowed_domains = ['tieba.baidu.com']  # domains the spider is allowed to crawl
    page = 1
    page_max = 2  # 30
    url = 'http://tieba.baidu.com/f/index/forumpark?cn=%E5%86%85%E5%9C%B0%E6%98%8E%E6%98%9F&ci=0&pcn=%E5%A8%B1%E4%B9%90%E6%98%8E%E6%98%9F&pci=0&ct=1&st=new&pn='
    start_urls = [url + str(page)]  # start URL(s)

    # Handle the response
    def parse(self, response):
        # Scrapy's built-in selectors
        # .css('title::text') / .re(r'Quotes.*') / .xpath('//title')
        tieba_list = response.xpath('//div[@class="ba_content"]')  # root node of each record
        for tieba in tieba_list:
            # Extract the needed fields from the page
            name = tieba.xpath('./p[@class="ba_name"]/text()').extract_first()  # .extract_first() returns the first match as a string
            summary = tieba.xpath('./p[@class="ba_desc"]/text()').extract_first()
            person_sum = tieba.xpath('./p/span[@class="ba_m_num"]/text()').extract_first()
            text_sum = tieba.xpath('./p/span[@class="ba_p_num"]/text()').extract_first()
            item = TiebaItem()
            item['name'] = name
            item['summary'] = summary
            item['person_sum'] = person_sum
            item['text_sum'] = text_sum
            # Hand the item over to the pipelines
            yield item
        if self.page < self.page_max:
            self.page += 1
            # Hand the new request to the scheduler for downloading
            # yield scrapy.Request(next_page, callback=self.parse)
            yield response.follow(self.url + str(self.page), callback=self.parse)  # the callback can be a new method or the same parse
Create the pipeline to store the data (pipelines.py)
Edit settings.py
# Item pipelines (priority: lower number = higher priority)
ITEM_PIPELINES = {
    # 'mySpider.pipelines.MyspiderPipeline': 300,
    'mySpider.pipelines.TiebaPipeline': 100,
}
Write the pipeline code
import json

class TiebaPipeline(object):
    # Initialization
    def __init__(self):
        self.file = open('tieba.json', 'w', encoding='utf-8')

    # Called when the spider is opened
    def open_spider(self, spider):
        pass

    # Required method; processes each item
    def process_item(self, item, spider):
        jsontext = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(jsontext)
        return item

    # Called when the spider is closed
    def close_spider(self, spider):
        self.file.close()
Run the spider
Start
scrapy crawl tieba
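If you only need the items dumped to a file, Scrapy's built-in feed export can do that without a custom pipeline, for example:
scrapy crawl tieba -o tieba.json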
Request and Response
# -*- coding: utf-8 -*-
import scrapy

class TiebaSpider(scrapy.Spider):
    name = 'reqp'
    allowed_domains = ['www.baidu.com']
    # start_urls = ['http://www.baidu.com/1']

    # start_urls is fetched with GET requests by default; override this method
    # (and comment out start_urls) to customize the very first request.
    def start_requests(self):
        # Send form data
        return [scrapy.FormRequest(url='http://www.baidu.com/1', formdata={"key1": "value1", "key2": "value2"}, callback=self.parse), ]

    def parse(self, response):
        '''
        Response
        def __init__(self, url, status=200, headers=None, body='', flags=None, request=None):
            url,           # final URL
            status=200,    # status code
            headers=None,  # response headers
            body='',       # response body
            flags=None,
            request=None
        '''
        '''
        Request
        def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                     cookies=None, meta=None, encoding='utf-8', priority=0,
                     dont_filter=False, errback=None):
            url = 'http://www.baidu.com/2'  # URL
            callback = self.parse           # response callback
            method = "GET"                  # "GET" (default), "POST", etc.
            headers = None                  # request headers (a default set is provided) >> headers = {"key": "value"}
            body = None                     # request body
            cookies = None                  # cookies >> cookies={'key1': 'value1', 'key2': 'value2'}
            meta = None                     # data carried with the request (meta={'metakey': 'metavalue'}); read it back in the callback (metavalue = response.meta['metakey'])
            encoding = 'utf-8'              # character encoding
            priority = 0                    # priority
            dont_filter = False             # do not filter duplicate URLs (default False)
            errback = None                  # error callback
        '''
        yield scrapy.Request('http://www.baidu.com/2', callback=self.parse_meta, meta={'metakey': 'metavalue'})
        # Send a form (POST request)
        yield scrapy.FormRequest(url='http://www.baidu.com/3',
                                 formdata={'key': 'value'},  # form data
                                 callback=self.parse)
        # Same parameters as scrapy.Request
        yield response.follow('http://www.baidu.com/4', callback=None, method='GET', headers=None, body=None, cookies=None, meta=None, encoding='utf-8',
                              priority=0, dont_filter=False, errback=None)

    def parse_meta(self, response):
        print("====== {} ======".format(response.meta['metakey']))
settings.py configuration
# Whether the crawler obeys robots.txt
ROBOTSTXT_OBEY = False  # True
# Number of concurrent requests (default 16)
#CONCURRENT_REQUESTS = 32
# Download delay (default: 0)
#DOWNLOAD_DELAY = 3
# Whether cookies are enabled (default True)
COOKIES_ENABLED = False
# Default request headers
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/4.0 xxx',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}
# Spider middlewares
#SPIDER_MIDDLEWARES = {
#    'mySpider.middlewares.MyspiderSpiderMiddleware': 543,
#}
# Downloader middlewares (priority: lower number = higher priority, range [0, 1000])
#DOWNLOADER_MIDDLEWARES = {
#    'mySpider.middlewares.MyspiderDownloaderMiddleware': 543,
#}
# Item pipelines (priority works the same way)
#ITEM_PIPELINES = {
#    'mySpider.pipelines.MyspiderPipeline': 300,
#}
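These settings apply project-wide. If a single spider needs different values, Scrapy also lets you override them per spider through the custom_settings class attribute; a minimal sketch (the spider name is made up):
import scrapy

class PoliteSpider(scrapy.Spider):
    name = 'polite'  # hypothetical spider name
    # Overrides applied only while this spider runs.
    custom_settings = {
        'DOWNLOAD_DELAY': 3,
        'COOKIES_ENABLED': False,
    }

    def parse(self, response):
        pass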
Downloading files (images)
settings.py configuration
# Item pipelines (priority as above)
ITEM_PIPELINES = {
    # 'mySpider.pipelines.MyspiderPipeline': 300,
    'mySpider.pipelines.TiebaPipeline': 100,
    'mySpider.pipelines.MyFilesPipeline': 1,
}
FILES_STORE = r"C:\Code\Python_Vir\mySpider\static"  # raw string so the backslashes are not treated as escapes
Spider code
# -*- coding: utf-8 -*-
import scrapy
from mySpider.items import FileItem

class ImageSpider(scrapy.Spider):
    name = 'image'  # spider name
    allowed_domains = ['tieba.baidu.com']  # domains the spider is allowed to crawl
    page = 1
    page_max = 30
    url = 'http://tieba.baidu.com/f/index/forumpark?cn=%E5%86%85%E5%9C%B0%E6%98%8E%E6%98%9F&ci=0&pcn=%E5%A8%B1%E4%B9%90%E6%98%8E%E6%98%9F&pci=0&ct=1&st=new&pn='
    start_urls = [url + str(page)]  # start URL(s)

    # Handle the response
    def parse(self, response):
        # Scrapy's built-in selectors
        # .css('title::text') / .re(r'Quotes.*') / .xpath('//title')
        tieba_list = response.xpath('//a[@rel="noopener"]')  # root node of each record
        for tieba in tieba_list:
            # Extract the needed fields from the page
            file_url = tieba.xpath('./img[@class="ba_pic"]/@src').extract_first()
            file_name = tieba.xpath('./div[@class="ba_content"]/p[@class="ba_name"]/text()').extract_first()
            if file_url is not None and file_name is not None:
                item = FileItem()
                item['file_url'] = file_url
                item['file_name'] = file_name
                yield item
        if self.page < self.page_max:
            self.page += 1
            yield response.follow(self.url + str(self.page), callback=self.parse)
Item code
class FileItem(scrapy.Item):
    # image URL
    file_url = scrapy.Field()
    # image name
    file_name = scrapy.Field()
    # saved image path
    file_path = scrapy.Field()
Pipeline code (TiebaPipeline is identical to the one in the steps above and just saves the JSON data, so it is not repeated here)
import scrapy
from scrapy.utils.project import get_project_settings  # read values from settings.py
from scrapy.conf import settings  # read values from settings.py >> settings["FILES_STORE"]
# from scrapy.pipelines.images import ImagesPipeline  # pipeline for downloading images
from scrapy.pipelines.files import FilesPipeline  # pipeline for downloading files
import os

class MyFilesPipeline(FilesPipeline):
    # Read the value from settings.py
    IMAGES_STORE = get_project_settings().get("FILES_STORE")

    # Override: issue the download requests
    def get_media_requests(self, item, info):
        file_url = item['file_url']
        yield scrapy.Request(file_url)  # yielding nothing means there is no file to download

    # Override: called when the downloads have finished
    def item_completed(self, results, item, info):
        '''
        results
        [(success: True on success, False on failure,
          {'checksum': 'MD5 of the file',
           'path': 'path the file was stored under',
           'url': 'URL the file was downloaded from'
          },  # a dict on success, a Failure on failure
        )]
        [(True,
          {'checksum': '2b00042f7481c7b056c4b410d28f33cf',
           'path': 'full/7d97e98f8af710c7e7fe703abc8f639e0ee507c4.jpg',
           'url': 'http://www.example.com/images/product1.jpg'}),]
        '''
        file_paths = [x['path'] for ok, x in results if ok]
        # Rename the downloaded file
        name_new = self.IMAGES_STORE + os.sep + item["file_name"]
        os.rename(self.IMAGES_STORE + os.sep + file_paths[0], name_new + ".jpg")
        item['file_path'] = name_new
        return item
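As an alternative to renaming files after the fact, FilesPipeline also lets you decide the storage path up front by overriding file_path. The sketch below passes the desired name through request.meta; this pattern is my own illustration, not from the original post:
import scrapy
from scrapy.pipelines.files import FilesPipeline

class NamedFilesPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        # Carry the desired file name along with the download request.
        yield scrapy.Request(item['file_url'], meta={'file_name': item['file_name']})

    def file_path(self, request, response=None, info=None):
        # The path returned here is relative to FILES_STORE.
        return '%s.jpg' % request.meta['file_name']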
Following links with CrawlSpider
Extracts links and follows them (note: some anti-crawling mechanisms may serve fake URLs)
Create command: scrapy genspider -t crawl tieba tieba.baidu.com
Spider code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from mySpider.items import TiebaItem

class TiebaCrawlSpider(CrawlSpider):
    name = 'tieba_crawl'
    allowed_domains = ['tieba.baidu.com']
    start_urls = ['http://tieba.baidu.com/f/index/forumpark?cn=%E5%86%85%E5%9C%B0%E6%98%8E%E6%98%9F&ci=0&pcn=%E5%A8%B1%E4%B9%90%E6%98%8E%E6%98%9F&pci=0&ct=1&st=new&pn=1']
    # Rule for extracting pagination URLs
    page_lx = LinkExtractor(allow=(r'pn=\d+'))
    # Rule for extracting content URLs
    # content_lx = LinkExtractor(allow=('xxx'))
    '''
    Multiple Rule objects can be defined.
    LinkExtractor (used to extract links):
        allow = (),             # extract links matching the regex; empty extracts everything
        deny = (),              # exclude links matching the regex
        allow_domains = (),     # only extract links in these domains
        deny_domains = (),      # exclude links in these domains
        deny_extensions = None,
        restrict_xpaths = (),   # only extract links inside these XPath regions
        restrict_css = (),      # only extract links inside these CSS regions
        tags = ('a', 'area'),   # tags to consider
        attrs = ('href'),       # attributes to consider
        canonicalize = True,    # whether to canonicalize the links
        unique = True,          # whether to filter duplicate links
        process_value = None
    Rule:
        link_extractor: the LinkExtractor object (defines which links to collect)
        callback = None: callback invoked for each extracted link's response (do not use the parse method)
        cb_kwargs = None,
        follow = None: whether to keep following links from the extracted pages
        process_links = None: filter the extracted links
        process_request = None: filter the requests
    '''
    rules = (
        # Follow pagination links
        Rule(page_lx, callback='parse_item', follow=True),
        # Do not follow content links
        # Rule(page_lx, callback='content_item', follow=False),
    )

    def parse_item(self, response):
        tieba_list = response.xpath('//div[@class="ba_content"]')
        for tieba in tieba_list:
            name = tieba.xpath('./p[@class="ba_name"]/text()').extract_first()
            summary = tieba.xpath('./p[@class="ba_desc"]/text()').extract_first()
            person_sum = tieba.xpath('./p/span[@class="ba_m_num"]/text()').extract_first()
            text_sum = tieba.xpath('./p/span[@class="ba_p_num"]/text()').extract_first()
            item = TiebaItem()
            item['name'] = name
            item['summary'] = summary
            item['person_sum'] = person_sum
            item['text_sum'] = text_sum
            yield item

    def content_item(self, response):
        pass
Saving data to a database
import pymongo
from scrapy.conf import settings  # read values from settings.py

# Using MongoDB
class MongoPipeline(object):
    def __init__(self):
        # Read the parameters from settings.py
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        sheetname = settings["MONGODB_SHEETNAME"]
        # MongoDB
        client = pymongo.MongoClient(host=host, port=port)  # connect
        mydb = client[dbname]  # database
        self.sheet = mydb[sheetname]  # collection

    def process_item(self, item, spider):
        data = dict(item)
        self.sheet.insert(data)  # insert the document
        return item
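The MONGODB_* keys above are custom settings this pipeline expects; a minimal settings.py sketch (the database and collection names are placeholders of my own):
# settings.py (the key names only have to match what MongoPipeline reads)
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'tieba_db'      # hypothetical database name
MONGODB_SHEETNAME = 'tieba'      # hypothetical collection name

ITEM_PIPELINES = {
    'mySpider.pipelines.MongoPipeline': 300,
}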
Downloader middlewares
settings.py configuration
DOWNLOADER_MIDDLEWARES = {
    # 'mySpider.middlewares.MyspiderDownloaderMiddleware': 543,
    'mySpider.middlewares.RandomUserAgent': 100,
    'mySpider.middlewares.RandomProxy': 200,
}
Middleware code
import random
import base64

class RandomUserAgent(object):
    def __init__(self):
        self.USER_AGENTS = [
            'Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50',
        ]

    # Required method
    def process_request(self, request, spider):
        useragent = random.choice(self.USER_AGENTS)
        # Modify the request header (takes precedence over the one configured in settings.py)
        request.headers.setdefault("User-Agent", useragent)

class RandomProxy(object):
    def __init__(self):
        self.PROXIES = [
            # {"ip_port": "http://115.215.56.138:22502", "user_passwd": b"user:passwd"},
            {"ip_port": "http://115.215.56.138:22502", "user_passwd": b""}
        ]

    def process_request(self, request, spider):
        proxy = random.choice(self.PROXIES)
        # Proxy that needs no authentication
        if len(proxy['user_passwd']) <= 0:
            request.meta['proxy'] = proxy['ip_port']
        # Proxy that requires authentication
        else:
            request.meta['proxy'] = proxy['ip_port']
            userpasswd_base64 = base64.b64encode(proxy['user_passwd'])
            request.headers['Proxy-Authorization'] = b'Basic ' + userpasswd_base64  # header value in the required "Basic <base64>" format
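Instead of hard-coding the lists inside the middleware, you could keep them in settings.py and read them the same way FILES_STORE was read earlier; a small sketch (the USER_AGENTS key name is my own choice, not a built-in setting):
# middlewares.py
import random
from scrapy.utils.project import get_project_settings

class SettingsUserAgentMiddleware(object):
    def __init__(self):
        # USER_AGENTS is a custom key expected in settings.py (a list of UA strings).
        self.user_agents = get_project_settings().get('USER_AGENTS', [])

    def process_request(self, request, spider):
        if self.user_agents:
            request.headers['User-Agent'] = random.choice(self.user_agents)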
Miscellaneous
Saving logs
Configure in settings.py
# Save the log output
LOG_FILE = "tieba.log"
LOG_LEVEL = "DEBUG"
Logging settings
LOG_ENABLED = True  # enable logging (default True)
LOG_ENCODING = 'utf-8'  # log file encoding
LOG_FILE = "tieba.log"  # log file name
LOG_LEVEL = "DEBUG"  # log level (default DEBUG)
LOG_STDOUT = False  # if True, all standard output of the process is written to the log file (default False)
Log levels (from highest to lowest severity: 1 -> 5)
1. CRITICAL (critical errors)
2. ERROR (errors)
3. WARNING (warnings)
4. INFO (informational messages)
5. DEBUG (debug messages)
Using logging in code
import logging
logging.warning("This is a warning")
logging.log(logging.WARNING, "This is a warning")
logger = logging.getLogger()
logger.warning("This is a warning")
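Inside a spider it is usually more convenient to use the logger Scrapy attaches to each spider; a small sketch with made-up names:
import scrapy

class LoggingDemoSpider(scrapy.Spider):
    name = 'logging_demo'                 # hypothetical spider name
    start_urls = ['http://example.com/']  # placeholder start URL

    def parse(self, response):
        # self.logger is a standard logging.Logger named after the spider.
        self.logger.info('Got status %s for %s', response.status, response.url)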
Scrapy Shell
The Scrapy shell is an interactive console that lets you try out and debug your scraping code without running a spider.
Start
scrapy shell "http://www.baidu.com"
Inspecting the response
Body: response.body
Text (for parsing): response.text
Status code: response.status
Response headers: response.headers
Final URL: response.url
View the response in a browser: view(response)
Extract the matches as strings: response.xpath('//div[@class="ba_content"]').extract()
Extract the first match as a string: response.xpath('//div[@class="ba_content"]').extract_first()
Selecting
xpath: response.xpath('//div[@class="ba_content"]')
Running code
from scrapy.linkextractors import LinkExtractor
link_extractor = LinkExtractor(allow=(r"\d+"))
links_list = link_extractor.extract_links(response)
fetch('http://www.baidu.com')
—
Distributed crawling with scrapy-redis
Scrapy does not support distributed crawling by itself; scrapy-redis provides a set of Redis-based components that make Scrapy distributed.
https://github.com/rolando/scrapy-redis.git
Architecture
Scheduler:
Scrapy's request queue is replaced with a Redis database, so multiple spiders fetch the requests to crawl from the same redis-server.
The scheduler enqueues new requests and hands the next request to crawl to the spider; this is done by scrapy-redis's scheduler component.
Duplication Filter (fingerprint deduplication):
In scrapy-redis, deduplication is handled by the Duplication Filter component, which relies on the uniqueness of a Redis set.
The scrapy-redis scheduler receives requests from the engine, stores each request's fingerprint in the set to check for duplicates, and writes only non-duplicate requests to the Redis request queue.
Item Pipeline:
The engine hands scraped Items to the Item Pipeline; scrapy-redis's Item Pipeline stores them in the Redis items queue.
Base Spider:
RedisSpider inherits from both Spider and RedisMixin; RedisMixin is the class that reads URLs from Redis.
When we write a spider that inherits from RedisSpider, setup_redis is called to connect to the Redis database and register the signals:
the spider-idle signal calls spider_idle, which keeps the spider alive;
the item-scraped signal calls item_scraped, which fetches the next request.
Scrapy-Redis distribution strategy
Master (core server): runs the Redis database; it generally does not crawl and is only responsible for request deduplication, request distribution, and data storage.
Slaves (crawler nodes): run the spider programs, take tasks from the Master to scrape data, and submit new requests to the Master while running.
The tasks the Master schedules through scrapy-redis are Request objects (containing the url, callback, headers, and so on); they occupy a lot of Redis storage, which can slow the crawl down and puts fairly high demands on hardware (memory, network).
Preparation
Redis (only needs to run on the Master)
Official site: https://redis.io/download
GUI client: https://redisdesktop.com/download
Redis commands (cd into the Redis installation directory):
Start: redis-server redis.windows.conf
Connect: run redis-cli
Connect to another host: redis-cli -h 192.168.1.2
Switch database: select 1
List keys: keys *
Set a value: set [key] [value]
Get a value: get [key]
Delete a key: del [key]
Edit the Master's redis.windows.conf:
Comment out bind 127.0.0.1
Optional on Ubuntu (run as a background process): daemonize yes
Disable protected mode (protected-mode no) or set a password
Then start redis-server with the modified configuration.
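Before starting any spiders, it is worth checking that the Slave machines can actually reach the Master's Redis, for example:
redis-cli -h 192.168.1.2 ping
A reachable server replies PONG.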
scrapy-redis (the component)
Install: pip install scrapy-redis
scrapy-redis-0.6.8
Download the example code: git clone https://github.com/rolando/scrapy-redis.git
Steps
Create the project (same as above)
scrapy startproject myRedisSpider
Define what to store (items.py) (same as above, apart from recording where the data came from)
class TiebaItem(scrapy.Item):
    name = scrapy.Field()
    summary = scrapy.Field()
    person_sum = scrapy.Field()
    text_sum = scrapy.Field()
    # Data source
    from_url = scrapy.Field()
    from_name = scrapy.Field()
    time = scrapy.Field()
Create the spider (same as above, apart from recording where the data came from)
cd into the project directory (myRedisSpider)
scrapy genspider tieba tieba.baidu.com
The spider file is created at myRedisSpider/spiders/tieba.py
Write the spider
# -*- coding: utf-8 -*-
import scrapy
# 1. Import RedisSpider
from scrapy_redis.spiders import RedisSpider
from myRedisSpider.items import TiebaItem

# Steps [1-4] turn an ordinary Spider into a distributed RedisSpider
# 2. Inherit from RedisSpider instead of scrapy.Spider
class TiebaSpider(RedisSpider):
    name = 'tieba'
    allowed_domains = ['tieba.baidu.com']
    page = 1
    page_max = 30
    url = 'http://tieba.baidu.com/f/index/forumpark?cn=%E5%86%85%E5%9C%B0%E6%98%8E%E6%98%9F&ci=0&pcn=%E5%A8%B1%E4%B9%90%E6%98%8E%E6%98%9F&pci=0&ct=1&st=new&pn='
    # 3. Comment out start_urls
    # start_urls = [url + str(page)]
    # 4. The Redis key used to kick off all spider nodes (spider_name:start_urls)
    redis_key = 'tieba:start_urls'

    def parse(self, response):
        tieba_list = response.xpath('//div[@class="ba_content"]')  # root node of each record
        for tieba in tieba_list:
            # Extract the needed fields from the page
            name = tieba.xpath('./p[@class="ba_name"]/text()').extract_first()
            summary = tieba.xpath('./p[@class="ba_desc"]/text()').extract_first()
            person_sum = tieba.xpath('./p/span[@class="ba_m_num"]/text()').extract_first()
            text_sum = tieba.xpath('./p/span[@class="ba_p_num"]/text()').extract_first()
            item = TiebaItem()
            item['name'] = name
            item['summary'] = summary
            item['person_sum'] = person_sum
            item['text_sum'] = text_sum
            # Record where the data came from
            item['from_url'] = response.url
            yield item
        if self.page < self.page_max:
            self.page += 1
            yield response.follow(self.url + str(self.page), callback=self.parse)
Configure settings.py (add the following settings; everything else is the same as above)
ITEM_PIPELINES = {
    'myRedisSpider.pipelines.MyredisspiderPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,  # store items in Redis (must have lower priority, i.e. a higher number, than the other pipelines)
}
# scrapy_redis deduplication component
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# scrapy_redis scheduler component
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Allow pausing mid-crawl without clearing the queued information
SCHEDULER_PERSIST = True
# Default queue mode: priority queue
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# Queue mode: first in, first out
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# Stack mode: last in, first out
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"
# Redis host
REDIS_HOST = '192.168.1.104'
REDIS_PORT = '6379'
REDIS_ENCODING = "utf-8"
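If the Redis server requires a password, scrapy-redis also accepts a single connection URL in place of the host/port pair; a sketch (adjust the credentials to your own setup):
# Alternative to REDIS_HOST / REDIS_PORT
REDIS_URL = 'redis://:yourpassword@192.168.1.104:6379'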
Pipeline that fills in the remaining fields
from datetime import datetime

class MyredisspiderPipeline(object):
    def process_item(self, item, spider):
        item['time'] = datetime.utcnow()  # UTC (Greenwich) time
        item['from_name'] = spider.name
        return item
Run
Start the spider on the Slaves: (scrapy runspider <spider_file>.py)
cd into the myRedisSpider\spiders directory
scrapy runspider tieba.py
Push the start command from the Master: (lpush <redis_key> <start URL>)
cd into the Redis installation directory, open redis-cli, and enter the command
lpush tieba:start_urls http://tieba.baidu.com/f/index/forumpark?cn=%E5%86%85%E5%9C%B0%E6%98%8E%E6%98%9F&ci=0&pcn=%E5%A8%B1%E4%B9%90%E6%98%8E%E6%98%9F&pci=0&ct=1&st=new&pn=1
Following links with CrawlSpider
Create command: scrapy genspider -t crawl tieba_crawl tieba.baidu.com
Spider code
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
# from scrapy.spiders import CrawlSpider, Rule
from scrapy.spiders import Rule
# 1. Import RedisCrawlSpider instead of CrawlSpider
from scrapy_redis.spiders import RedisCrawlSpider
from myRedisSpider.items import TiebaItem

# Steps [1-4] turn an ordinary CrawlSpider into a distributed RedisCrawlSpider
# class TiebaSpider(CrawlSpider):
# 2. Inherit from RedisCrawlSpider instead of CrawlSpider
class TiebaSpider(RedisCrawlSpider):
    name = 'tieba_crawl'
    # Do not use dynamic allowed domains; if no domain is available every request gets dropped with "Filtered offsite request to 'tieba.baidu.com'"
    allowed_domains = ['tieba.baidu.com']
    # 3. Comment out start_urls
    # start_urls = ['http://tieba.baidu.com/f/index/forumpark?cn=%E5%86%85%E5%9C%B0%E6%98%8E%E6%98%9F&ci=0&pcn=%E5%A8%B1%E4%B9%90%E6%98%8E%E6%98%9F&pci=0&ct=1&st=new&pn=1']
    # 4. The Redis key used to kick off all spider nodes (spider_name:start_urls)
    redis_key = 'tiebacrawl:start_urls'

    rules = (
        Rule(LinkExtractor(allow=(r'pn=\d+')), callback='parse_item', follow=True),  # st=new&
    )

    def parse_item(self, response):
        tieba_list = response.xpath('//div[@class="ba_content"]')
        for tieba in tieba_list:
            name = tieba.xpath('./p[@class="ba_name"]/text()').extract_first()
            summary = tieba.xpath('./p[@class="ba_desc"]/text()').extract_first()
            person_sum = tieba.xpath('./p/span[@class="ba_m_num"]/text()').extract_first()
            text_sum = tieba.xpath('./p/span[@class="ba_p_num"]/text()').extract_first()
            item = TiebaItem()
            item['name'] = name
            item['summary'] = summary
            item['person_sum'] = person_sum
            item['text_sum'] = text_sum
            # Record where the data came from
            item['from_url'] = response.url
            item['from_name'] = self.name
            yield item
Run the spider (same as above)
scrapy runspider tieba_crawl.py
Push the start command (same as above)
lpush tiebacrawl:start_urls http://tieba.baidu.com/f/index/forumpark?cn=%E5%86%85%E5%9C%B0%E6%98%8E%E6%98%9F&ci=0&pcn=%E5%A8%B1%E4%B9%90%E6%98%8E%E6%98%9F&pci=0&ct=1&st=new&pn=1
Saving the Redis data to a local database
MySQL
Create the database and table:
Create the table: create table tieba (name varchar(1000), summary varchar(1000), person_sum varchar(1000), text_sum varchar(1000), from_url varchar(1000), from_name varchar(1000), time varchar(1000));
Insert test data: insert into tieba(name, summary, person_sum, text_sum, from_url, from_name, time) values("name", "summary", "111", "222", "http://tieba.baidu.com", "tieba", "xxx");
Python implementation
#!/usr/bin/env python
# coding=utf-8
# pip install redis
import redis
# pip install pymysql
import pymysql
import json
import sys

def process_item():
    # Create the database connections
    redisc = redis.Redis(host="192.168.1.104", port=6379, db=0, encoding='utf-8')
    mysql = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='root', db='test', charset='utf8mb4')
    cursor = None
    while True:
        try:
            # Fetch one item from Redis (blocks until one is available)
            source, data = redisc.blpop("tieba:items")
            item = json.loads(data.decode('utf-8'))
            # Save it to MySQL
            cursor = mysql.cursor()
            sql = 'insert into tieba(name, summary, person_sum, text_sum, from_url, from_name, time) values(%s, %s, %s, %s, %s, %s, %s)'
            cursor.execute(sql, [item["name"], item["summary"], item["person_sum"], item["text_sum"], item["from_url"], item["from_name"], item["time"]])
            mysql.commit()
            print(end=">")
        except:
            types, value, back = sys.exc_info()  # capture the exception info
            sys.excepthook(types, value, back)  # print the exception
        finally:
            if cursor is not None:
                cursor.close()

if __name__ == "__main__":
    process_item()
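This script can run on any machine that can reach both the Redis server and MySQL (typically the Master). Because blpop blocks until an item arrives, it simply waits while the spiders are idle. Save it under any name, for example save_to_mysql.py (the file name is arbitrary), and run it with:
python save_to_mysql.py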