
Python3 Scrapy Crawler Framework (Scrapy / scrapy-redis)

This article was written by Luzhuo; please keep this notice if you repost it.

Original post: https://blog..net/Rozol/article/details/80010173

Scrapy

Scrapy is written in Python and is mainly used to crawl website data; links that have already been crawled are filtered out automatically.

It is built on the Twisted asynchronous networking framework.

Official site: https://scrapy.org/

Documentation: https://docs.scrapy.org/en/latest/

Chinese documentation: http://scrapy-chs.readthedocs.io/zh_CN/latest/index.html

Install: pip install Scrapy

PyDispatcher-2.0.5 Scrapy-1.5.0 asn1crypto-0.24.0 cffi-1.11.5 cryptography-2.2.2 cssselect-1.0.3 parsel-1.4.0 pyOpenSSL-17.5.0 pyasn1-0.4.2 pyasn1-modules-0.2.1 pycparser-2.18 queuelib-1.5.0 service-identity-17.0.0 w3lib-1.19.0

Other dependency (on Windows): pywin32-223

Common commands

Create a project: scrapy startproject mySpider

Spiders

Create a spider:

scrapy genspider tieba tieba.baidu.com

scrapy genspider -t crawl tieba tieba.baidu.com

Run a spider: scrapy crawl tieba

Distributed spiders:

Run a spider: scrapy runspider tieba.py

Push the start URL: lpush tieba:start_urls http://tieba.baidu.com/f/index/xxx

Architecture

(Scrapy architecture diagram)

Scrapy Engine: handles the communication between the Spider, Item Pipeline, Downloader, and Scheduler.

Scheduler: receives requests from the Engine, queues them, and hands them back to the Engine when it asks for the next one.

Downloader: downloads every request the Engine sends it and returns the responses to the Engine, which passes them on to the Spider.

Spider: extracts data from the responses, fills in the fields the Item needs, and submits any URLs to follow up to the Engine, which forwards them to the Scheduler.

Item Pipeline: receives the Items produced by the Spider, processes them, and stores them.

Downloader Middlewares: components that extend the download step.

Spider Middlewares: components that extend the communication with the Spider.

Project layout


scrapy.cfg: the project configuration file

mySpider: the project package

mySpider/items.py: item definitions (what to extract)

mySpider/pipelines.py: item pipelines

mySpider/settings.py: project settings

mySpider/spiders: the directory holding the spider code

Using Scrapy

Workflow

Create the project

scrapy startproject mySpider

Define what to extract (items.py)

class TiebaItem(scrapy.Item):
    # Fields to store
    # forum (tieba) name
    name = scrapy.Field()
    # description
    summary = scrapy.Field()
    # total number of members
    person_sum = scrapy.Field()
    # total number of posts
    text_sum = scrapy.Field()
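scrapy.Item behaves like a dict, so the fields above can be filled and read with dict-style access; a quick sketch (the values are made up):

item = TiebaItem(name='example tieba')   # keyword construction
item['summary'] = 'a short description'  # dict-style assignment
print(item['name'])                      # dict-style access
print(dict(item))                        # convert to a plain dict (the JSON pipeline below does exactly this)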

Create the spider

cd into the project directory (mySpider)

scrapy genspider tieba tieba.baidu.com

The spider file is created at mySpider/spiders/tieba.py

Write the spider

# -*- coding: utf-8 -*-
import scrapy
from mySpider.items import TiebaItem

class TiebaSpider(scrapy.Spider):
    name = 'tieba'  # spider name
    allowed_domains = ['tieba.baidu.com']  # domains the spider is allowed to crawl
    page = 1
    page_max = 2  # 30
    url = 'http://tieba.baidu.com/f/index/forumpark?cn=%E5%86%85%E5%9C%B0%E6%98%8E%E6%98%9F&ci=0&pcn=%E5%A8%B1%E4%B9%90%E6%98%8E%E6%98%9F&pci=0&ct=1&st=new&pn='
    start_urls = [url + str(page)]  # start URL(s)

    # Handle the downloaded response
    def parse(self, response):
        # Scrapy's built-in selectors
        # .css('title::text') / .re(r'Quotes.*') / .xpath('//title')
        tieba_list = response.xpath('//div[@class="ba_content"]')  # root nodes of the data
        for tieba in tieba_list:
            # Pull the fields we need out of the page
            name = tieba.xpath('./p[@class="ba_name"]/text()').extract_first()  # .extract_first() converts the selector to a string
            summary = tieba.xpath('./p[@class="ba_desc"]/text()').extract_first()
            person_sum = tieba.xpath('./p/span[@class="ba_m_num"]/text()').extract_first()
            text_sum = tieba.xpath('./p/span[@class="ba_p_num"]/text()').extract_first()

            item = TiebaItem()
            item['name'] = name
            item['summary'] = summary
            item['person_sum'] = person_sum
            item['text_sum'] = text_sum
            # Hand the item to the pipelines
            yield item

        if self.page < self.page_max:
            self.page += 1
            # Hand the new request to the scheduler for downloading
            # yield scrapy.Request(next_page, callback=self.parse)
            yield response.follow(self.url + str(self.page), callback=self.parse)  # the callback can be a new method or the same parse

Write the pipeline that stores the items (pipelines.py)

Register it in settings.py

# Item pipelines (priority: the lower the number, the higher the priority)
ITEM_PIPELINES = {
    # 'mySpider.pipelines.MyspiderPipeline': 300,
    'mySpider.pipelines.TiebaPipeline': 100,
}

Pipeline code

import json

class TiebaPipeline(object):
    # Initialisation
    def __init__(self):
        self.file = open('tieba.json', 'w', encoding='utf-8')

    # Called when the spider opens
    def open_spider(self, spider):
        pass

    # Required method: processes every item
    def process_item(self, item, spider):
        jsontext = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(jsontext)
        return item

    # Called when the spider closes
    def close_spider(self, spider):
        self.file.close()

Run the spider

Start it:

scrapy crawl tieba
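Besides the scrapy crawl command, a spider can also be started from a plain Python script through Scrapy's CrawlerProcess API; a minimal sketch (the file name run.py is arbitrary):

# run.py -- place it next to scrapy.cfg and run it with: python run.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # load the project's settings.py
process.crawl('tieba')                            # same spider name as in "scrapy crawl tieba"
process.start()                                   # blocks until the crawl finishes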

Request and Response

# -*- coding: utf-8 -*-
import scrapy

class TiebaSpider(scrapy.Spider):
    name = 'reqp'
    allowed_domains = ['www.baidu.com']
    # start_urls = ['http://www.baidu.com/1']

    # start_urls is fetched with GET by default; override this method (and comment out
    # start_urls) to customize the very first request.
    def start_requests(self):
        # Send form data with the first request
        return [scrapy.FormRequest(url='http://www.baidu.com/1',
                                   formdata={"key1": "value1", "key2": "value2"},
                                   callback=self.parse), ]

    def parse(self, response):
        '''
        Response
        def __init__(self, url, status=200, headers=None, body='', flags=None, request=None):
            url,           # final URL
            status=200,    # status code
            headers=None,  # response headers
            body='',       # response body
            flags=None,
            request=None
        '''
        '''
        Request
        def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                     cookies=None, meta=None, encoding='utf-8', priority=0,
                     dont_filter=False, errback=None):
            url = 'http://www.baidu.com/2'  # URL
            callback = self.parse           # response callback
            method = "GET"                  # "GET" (default), "POST", etc.
            headers = None                  # request headers (sensible defaults exist) >> headers = {"key1": "value1", "key2": "value2"}
            body = None                     # request body
            cookies = None                  # cookies >> cookies={'key1': 'value1', 'key2': 'value2'}
            meta = None                     # data passed along (meta={'metakey': 'metavalue'}), read in the callback via response.meta['metakey']
            encoding = 'utf-8'              # character set
            priority = 0                    # priority
            dont_filter = False             # skip the duplicate filter for this URL (default False)
            errback = None                  # error callback
        '''
        yield scrapy.Request('http://www.baidu.com/2', callback=self.parse_meta, meta={'metakey': 'metavalue'})

        # Send a form (POST request)
        yield scrapy.FormRequest(url='http://www.baidu.com/3',
                                 formdata={'key': 'value'},  # form data
                                 callback=self.parse)

        # Same parameters as scrapy.Request
        yield response.follow('http://www.baidu.com/4', callback=None, method='GET', headers=None, body=None,
                              cookies=None, meta=None, encoding='utf-8',
                              priority=0, dont_filter=False, errback=None)

    def parse_meta(self, response):
        print("====== {} ======".format(response.meta['metakey']))

settings.py configuration

# Obey robots.txt?
ROBOTSTXT_OBEY = False  # the generated settings default to True

# Maximum concurrent requests (default 16)
#CONCURRENT_REQUESTS = 32

# Download delay in seconds (default 0)
#DOWNLOAD_DELAY = 3

# Enable cookies? (default True)
COOKIES_ENABLED = False

# Default request headers
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/4.0 xxx',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}

# Spider middlewares
#SPIDER_MIDDLEWARES = {
#    'mySpider.middlewares.MyspiderSpiderMiddleware': 543,
#}

# Downloader middlewares (priority: the lower the number, the higher the priority, range [0, 1000])
#DOWNLOADER_MIDDLEWARES = {
#    'mySpider.middlewares.MyspiderDownloaderMiddleware': 543,
#}

# Item pipelines (same priority rule)
#ITEM_PIPELINES = {
#    'mySpider.pipelines.MyspiderPipeline': 300,
#}

Downloading files (images)

settings.py configuration

# Item pipelines (priority: the lower the number, the higher the priority)
ITEM_PIPELINES = {
    # 'mySpider.pipelines.MyspiderPipeline': 300,
    'mySpider.pipelines.TiebaPipeline': 100,
    'mySpider.pipelines.MyFilesPipeline': 1,
}

FILES_STORE = r"C:\Code\Python_Vir\mySpider\static"  # raw string so the backslashes are kept as-is

Spider code

# -*- coding: utf-8 -*-
import scrapy
from mySpider.items import FileItem

class ImageSpider(scrapy.Spider):
    name = 'image'  # spider name
    allowed_domains = ['tieba.baidu.com']  # domains the spider is allowed to crawl
    page = 1
    page_max = 30
    url = 'http://tieba.baidu.com/f/index/forumpark?cn=%E5%86%85%E5%9C%B0%E6%98%8E%E6%98%9F&ci=0&pcn=%E5%A8%B1%E4%B9%90%E6%98%8E%E6%98%9F&pci=0&ct=1&st=new&pn='
    start_urls = [url + str(page)]  # start URL(s)

    # Handle the downloaded response
    def parse(self, response):
        # Scrapy's built-in selectors
        # .css('title::text') / .re(r'Quotes.*') / .xpath('//title')
        tieba_list = response.xpath('//a[@rel="noopener"]')  # root nodes of the data
        for tieba in tieba_list:
            # Pull the fields we need out of the page
            file_url = tieba.xpath('./img[@class="ba_pic"]/@src').extract_first()
            file_name = tieba.xpath('./div[@class="ba_content"]/p[@class="ba_name"]/text()').extract_first()

            if file_url is not None and file_name is not None:
                item = FileItem()
                item['file_url'] = file_url
                item['file_name'] = file_name
                yield item

        if self.page < self.page_max:
            self.page += 1
            yield response.follow(self.url + str(self.page), callback=self.parse)

Item code

class FileItem(scrapy.Item):
    # image URL
    file_url = scrapy.Field()
    # image name
    file_name = scrapy.Field()
    # image path on disk
    file_path = scrapy.Field()

Pipeline code (TiebaPipeline is identical to the one in the workflow above -- it just saves JSON -- so it is not repeated here)

import scrapy
from scrapy.utils.project import get_project_settings  # read values from settings.py
from scrapy.conf import settings  # read values from settings.py >> settings["FILES_STORE"]
# from scrapy.pipelines.images import ImagesPipeline  # pipeline for downloading images
from scrapy.pipelines.files import FilesPipeline  # pipeline for downloading files
import os

class MyFilesPipeline(FilesPipeline):
    # Read the storage directory from settings.py
    IMAGES_STORE = get_project_settings().get("FILES_STORE")

    # Override: issue a request for each file URL
    def get_media_requests(self, item, info):
        file_url = item['file_url']
        yield scrapy.Request(file_url)  # yielding nothing means there is nothing to download

    # Override: called once the download has finished
    def item_completed(self, results, item, info):
        '''
        results:
        [(success,              # True on success, False on failure
          {'checksum': 'MD5 of the file',
           'path': 'path the file was stored under',
           'url': 'URL the file was downloaded from',
          },                    # a dict on success, a Failure on failure
        )]
        e.g.
        [(True,
          {'checksum': '2b00042f7481c7b056c4b410d28f33cf',
           'path': 'full/7d97e98f8af710c7e7fe703abc8f639e0ee507c4.jpg',
           'url': 'http://www.example.com/images/product1.jpg'}), ]
        '''
        file_paths = [x['path'] for ok, x in results if ok]
        # Rename the downloaded file
        name_new = self.IMAGES_STORE + os.sep + item["file_name"]
        os.rename(self.IMAGES_STORE + os.sep + file_paths[0], name_new + ".jpg")
        item['file_path'] = name_new
        return item
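As the commented import above hints, Scrapy also ships a stock ImagesPipeline. If you use it instead of the FilesPipeline subclass, it needs Pillow installed and, by default, reads the URLs from an image_urls field and writes the results to an images field; a sketch of the corresponding settings (the path is just an example):

# settings.py -- using the built-in ImagesPipeline instead (requires: pip install Pillow)
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = r"C:\Code\Python_Vir\mySpider\static"  # where the images are saved
# by default the stock pipeline reads item['image_urls'] and fills item['images']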

Link-following spiders: CrawlSpider

Extracts links and follows them (note: some anti-crawling mechanisms may serve fake URLs)

Create it with: scrapy genspider -t crawl tieba tieba.baidu.com

Spider code:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from mySpider.items import TiebaItem

class TiebaCrawlSpider(CrawlSpider):
    name = 'tieba_crawl'
    allowed_domains = ['tieba.baidu.com']
    start_urls = ['http://tieba.baidu.com/f/index/forumpark?cn=%E5%86%85%E5%9C%B0%E6%98%8E%E6%98%9F&ci=0&pcn=%E5%A8%B1%E4%B9%90%E6%98%8E%E6%98%9F&pci=0&ct=1&st=new&pn=1']

    # Rule for extracting pagination URLs
    page_lx = LinkExtractor(allow=(r'pn=\d+',))
    # Rule for extracting content URLs
    # content_lx = LinkExtractor(allow=('xxx'))

    '''
    Several Rule objects can be defined.

    LinkExtractor (extracts links):
        allow = (),             # extract links matching these regexes; empty means extract everything
        deny = (),              # skip links matching these regexes
        allow_domains = (),     # only extract links from these domains
        deny_domains = (),      # never extract links from these domains
        deny_extensions = None,
        restrict_xpaths = (),   # only extract links inside these xpaths
        restrict_css = (),      # only extract links matching these css selectors
        tags = ('a', 'area'),   # tags to consider
        attrs = ('href',),      # attributes to consider
        canonicalize = True,    # canonicalize the links?
        unique = True,          # filter duplicate links?
        process_value = None

    Rule:
        link_extractor: the LinkExtractor defining which links to pick up
        callback = None: called for each extracted link's response (do not use the parse function here)
        cb_kwargs = None,
        follow = None: whether to keep following links from the extracted pages
        process_links = None: filters the extracted links
        process_request = None: filters the generated requests
    '''
    rules = (
        # Follow pagination links
        Rule(page_lx, callback='parse_item', follow=True),
        # Do not follow content links
        # Rule(content_lx, callback='content_item', follow=False),
    )

    def parse_item(self, response):
        tieba_list = response.xpath('//div[@class="ba_content"]')
        for tieba in tieba_list:
            name = tieba.xpath('./p[@class="ba_name"]/text()').extract_first()
            summary = tieba.xpath('./p[@class="ba_desc"]/text()').extract_first()
            person_sum = tieba.xpath('./p/span[@class="ba_m_num"]/text()').extract_first()
            text_sum = tieba.xpath('./p/span[@class="ba_p_num"]/text()').extract_first()

            item = TiebaItem()
            item['name'] = name
            item['summary'] = summary
            item['person_sum'] = person_sum
            item['text_sum'] = text_sum
            yield item

    def content_item(self, response):
        pass

Saving data to a database

import pymongo
from scrapy.conf import settings  # read values from settings.py (settings["KEY"])

# Using a MongoDB database
class MongoPipeline(object):
    def __init__(self):
        # Read the connection parameters from settings.py
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        sheetname = settings["MONGODB_SHEETNAME"]
        # MongoDB database
        client = pymongo.MongoClient(host=host, port=port)  # connect
        mydb = client[dbname]         # database
        self.sheet = mydb[sheetname]  # collection

    def process_item(self, item, spider):
        data = dict(item)
        self.sheet.insert(data)  # insert the data (insert_one in newer pymongo versions)
        return item
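The MONGODB_* names above are not built-in Scrapy settings; the pipeline assumes you define them yourself in settings.py, for example (values are placeholders):

# settings.py -- custom keys read by MongoPipeline above
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'test'
MONGODB_SHEETNAME = 'tieba'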

Downloader middlewares

settings.py configuration

DOWNLOADER_MIDDLEWARES = {
    # 'mySpider.middlewares.MyspiderDownloaderMiddleware': 543,
    'mySpider.middlewares.RandomUserAgent': 100,
    'mySpider.middlewares.RandomProxy': 200,
}

Middleware code

import random
import base64

class RandomUserAgent(object):
    def __init__(self):
        self.USER_AGENTS = [
            'Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50',
        ]

    # Required method
    def process_request(self, request, spider):
        useragent = random.choice(self.USER_AGENTS)
        # Modify the request headers (takes precedence over the ones configured in settings.py)
        request.headers.setdefault("User-Agent", useragent)

class RandomProxy(object):
    def __init__(self):
        self.PROXIES = [
            # {"ip_port": "http://115.215.56.138:22502", "user_passwd": b"user:passwd"},
            {"ip_port": "http://115.215.56.138:22502", "user_passwd": b""}
        ]

    def process_request(self, request, spider):
        proxy = random.choice(self.PROXIES)
        if len(proxy['user_passwd']) <= 0:
            # Proxy that needs no authentication
            request.meta['proxy'] = proxy['ip_port']
        else:
            # Proxy that requires authentication
            request.meta['proxy'] = proxy['ip_port']
            userpasswd_base64 = base64.b64encode(proxy['user_passwd']).decode()  # b64encode returns bytes, decode before concatenating
            request.headers['Proxy-Authorization'] = 'Basic ' + userpasswd_base64  # header in the expected format
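A downloader middleware can also implement process_response, which is handy when a proxy gets blocked. The sketch below is illustrative only: the class name and the status codes treated as "blocked" are assumptions, and it would still need to be registered in DOWNLOADER_MIDDLEWARES.

class RetryOnBlocked(object):  # hypothetical middleware, not part of the original project
    # Called for every response that comes back through the downloader
    def process_response(self, request, response, spider):
        if response.status in (403, 503):             # treat these statuses as "blocked" (example choice)
            return request.replace(dont_filter=True)  # returning a Request re-schedules it, bypassing the duplicate filter
        return response                               # otherwise pass the response through unchanged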

Miscellaneous

Saving logs

Configure in settings.py

# Write the log to a file
LOG_FILE = "tieba.log"
LOG_LEVEL = "DEBUG"

Logging settings

LOG_ENABLED = True      # enable logging (default True)
LOG_ENCODING = 'utf-8'  # log file encoding
LOG_FILE = "tieba.log"  # log file name
LOG_LEVEL = "DEBUG"     # log level (default DEBUG)
LOG_STDOUT = False      # True redirects all standard output of the process to the log file (default False)

Logging levels (from most to least severe, 1 -> 5)

1. CRITICAL (critical errors)

2. ERROR (errors)

3. WARNING (warnings)

4. INFO (informational messages)

5. DEBUG (debug messages)

Using logging in code

import logging

logging.warning("This is a warning")

logging.log(logging.WARNING, "This is a warning")

logger = logging.getLogger()

logger.warning("This is a warning")
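Inside a spider, Scrapy also exposes a per-spider logger (named after the spider) that goes through the same logging settings:

# inside any spider method, e.g. parse()
self.logger.info("crawling page %s", self.page)
self.logger.warning("This is a warning")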

Scrapy Shell

The Scrapy shell is an interactive console for trying out and debugging your scraping code without starting a spider.

Start it

scrapy shell "http://www.baidu.com"

Inspecting the response

Body (bytes): response.body

Text (for parsing): response.text

Status code: response.status

Response headers: response.headers

Final URL: response.url

Open the response in a browser: view(response)

Extract matches as strings: response.xpath('//div[@class="ba_content"]').extract()

Extract the first match as a string: response.xpath('//div[@class="ba_content"]').extract_first()

Selecting

xpath: response.xpath('//div[@class="ba_content"]')

Running code

from scrapy.linkextractors import LinkExtractor
link_extractor = LinkExtractor(allow=(r"\d+",))
links_list = link_extractor.extract_links(response)

fetch('http://www.baidu.com')  # download another page inside the shell

Distributed crawling: scrapy-redis

Scrapy itself has no built-in support for distributed crawling; scrapy-redis provides a set of Redis-based components that add it.

https://github.com/rolando/scrapy-redis.git

Architecture

(scrapy-redis architecture diagram)

Scheduler:

Scrapy's own request queue is replaced by a Redis database, so multiple spiders fetch the requests to crawl from the same redis-server.

The Scheduler enqueues new requests and hands the next request to crawl back to the spider, using the scheduler component provided by scrapy-redis.

Duplication Filter:

In scrapy-redis, de-duplication is handled by the Duplication Filter component, which relies on the no-duplicates property of a Redis set.

The scrapy-redis scheduler receives requests from the engine, stores each request's fingerprint in the set to check for duplicates, and only pushes requests that are not duplicates into the Redis request queue. (A minimal sketch of this fingerprint-plus-set idea follows after this component list.)

Item Pipeline:

The engine hands scraped Items to the Item Pipeline; the scrapy-redis Item Pipeline stores them in the Redis items queue.

Base Spider:

RedisSpider inherits from both Spider and RedisMixin; RedisMixin is the class that reads URLs from Redis.

A Spider class that inherits from RedisSpider calls setup_redis to connect to the Redis database and register the signals:

the signal fired when the spider is idle calls spider_idle, which keeps the spider alive;

the signal fired when an item is scraped calls item_scraped, which fetches the next request.
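The following is a minimal, stand-alone sketch of the fingerprint-plus-set idea the Duplication Filter is built on, using Scrapy's request_fingerprint helper and a plain Redis set (the key name is illustrative; scrapy-redis manages its own keys internally):

import redis
from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

r = redis.Redis(host='192.168.1.104', port=6379)
req = Request('http://tieba.baidu.com/f/index/forumpark?pn=1')

fp = request_fingerprint(req)            # stable hash of the request (method, URL, body)
if r.sadd('tieba:dupefilter', fp) == 0:  # sadd returns 0 when the fingerprint is already in the set
    print('duplicate request, skip it')
else:
    print('new request, schedule it')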

Scrapy-Redis distribution strategy

(distribution strategy diagram)

Master (core server): runs the Redis database; it generally does not crawl itself, it only de-duplicates Requests, hands them out, and stores the data.

Slaver (crawler nodes): run the spider programs, take tasks from the Master, and submit any new Requests back to the Master while crawling.

The tasks the Master schedules through Scrapy-Redis are Request objects (carrying url, callback, headers, and so on), which take up a lot of Redis storage and slow the crawl down, so the hardware requirements (memory, network) are relatively high.

Setup

Redis (only the Master needs to run it)

Official site: https://redis.io/download

GUI client: https://redisdesktop.com/download

Redis commands (cd into the Redis installation directory):

Start the server: redis-server redis.windows.conf

Connect: run redis-cli

Connect to another host: redis-cli -h 192.168.1.2

Switch database: select 1

List keys: keys *

Set a value: set [key] [value]

Read a value: get [key]

Delete data: del [key/*]

Edit the Master's redis.windows.conf:

Comment out bind 127.0.0.1

Optional on Ubuntu (run as a background daemon): daemonize yes

Disable protected mode (protected-mode no) or set a password

Start it

scrapy-redis (the component)

Install: pip install scrapy-redis

scrapy-redis-0.6.8

Download the example code: git clone https://github.com/rolando/scrapy-redis.git

Workflow

Create the project (same as before)

scrapy startproject myRedisSpider

Define what to store (items.py) (same as before, plus fields recording where the data came from)

class TiebaItem(scrapy.Item):
    name = scrapy.Field()
    summary = scrapy.Field()
    person_sum = scrapy.Field()
    text_sum = scrapy.Field()
    # where the data came from
    from_url = scrapy.Field()
    from_name = scrapy.Field()
    time = scrapy.Field()

Create the spider (same as before, apart from recording the data source)

cd into the project directory (myRedisSpider)

scrapy genspider tieba tieba.baidu.com

The spider file is created at myRedisSpider/spiders/tieba.py

Write the spider

# -*- coding: utf-8 -*-
import scrapy
# 1. Import RedisSpider
from scrapy_redis.spiders import RedisSpider
from myRedisSpider.items import TiebaItem

# Turning a plain Spider into a distributed RedisSpider takes steps 1-4
# 2. Inherit from RedisSpider instead of scrapy.Spider
class TiebaSpider(RedisSpider):
    name = 'tieba'
    allowed_domains = ['tieba.baidu.com']
    page = 1
    page_max = 30
    url = 'http://tieba.baidu.com/f/index/forumpark?cn=%E5%86%85%E5%9C%B0%E6%98%8E%E6%98%9F&ci=0&pcn=%E5%A8%B1%E4%B9%90%E6%98%8E%E6%98%9F&pci=0&ct=1&st=new&pn='
    # 3. Comment out start_urls
    # start_urls = [url + str(page)]
    # 4. The key used to kick off every crawler node (spider name:start_urls)
    redis_key = 'tieba:start_urls'

    def parse(self, response):
        tieba_list = response.xpath('//div[@class="ba_content"]')  # root nodes of the data
        for tieba in tieba_list:
            # Pull the fields we need out of the page
            name = tieba.xpath('./p[@class="ba_name"]/text()').extract_first()
            summary = tieba.xpath('./p[@class="ba_desc"]/text()').extract_first()
            person_sum = tieba.xpath('./p/span[@class="ba_m_num"]/text()').extract_first()
            text_sum = tieba.xpath('./p/span[@class="ba_p_num"]/text()').extract_first()

            item = TiebaItem()
            item['name'] = name
            item['summary'] = summary
            item['person_sum'] = person_sum
            item['text_sum'] = text_sum
            # Record where the data came from
            item['from_url'] = response.url
            yield item

        if self.page < self.page_max:
            self.page += 1
            yield response.follow(self.url + str(self.page), callback=self.parse)

Configure settings.py (add the following; everything else is as before)

ITEM_PIPELINES = {
    'myRedisSpider.pipelines.MyredisspiderPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,  # stores items in Redis (give it a lower priority, i.e. a higher number, than the other pipelines)
}

# scrapy_redis duplicate filter
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# scrapy_redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Allow pausing mid-crawl without clearing the queues
SCHEDULER_PERSIST = True

# Default queue mode: priority queue
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# FIFO queue
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# LIFO stack
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"

# Redis server
REDIS_HOST = '192.168.1.104'
REDIS_PORT = '6379'
REDIS_ENCODING = "utf-8"
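If the Master's Redis has a password set (see the protected-mode note above), scrapy-redis also accepts a single connection URL in place of REDIS_HOST/REDIS_PORT; the credentials below are placeholders:

# alternative to REDIS_HOST / REDIS_PORT when authentication is enabled
# REDIS_URL = 'redis://:yourpassword@192.168.1.104:6379/0'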

The pipeline that fills in those source fields

from datetime import datetime

class MyredisspiderPipeline(object):
    def process_item(self, item, spider):
        item['time'] = datetime.utcnow()  # UTC time
        item['from_name'] = spider.name
        return item

Run it

Start the spider on each Slaver: (scrapy runspider <spider file>.py)

cd into the myRedisSpider\spiders directory

scrapy runspider tieba.py

Send the start command from the Master: (lpush redis_key <start url>)

cd into the Redis installation directory, open redis-cli, and run:

lpush tieba:start_urls http://tieba.baidu.com/f/index/forumpark?cn=%E5%86%85%E5%9C%B0%E6%98%8E%E6%98%9F&ci=0&pcn=%E5%A8%B1%E4%B9%90%E6%98%8E%E6%98%9F&pci=0&ct=1&st=new&pn=1

Link-following spiders: CrawlSpider

Create it with: scrapy genspider -t crawl tieba_crawl tieba.baidu.com

Spider code

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
# from scrapy.spiders import CrawlSpider, Rule
from scrapy.spiders import Rule
# 1. Import RedisCrawlSpider instead of CrawlSpider
from scrapy_redis.spiders import RedisCrawlSpider
from myRedisSpider.items import TiebaItem

# Turning a plain CrawlSpider into a distributed RedisCrawlSpider takes steps 1-4
# class TiebaSpider(CrawlSpider):
# 2. Inherit from RedisCrawlSpider instead of CrawlSpider
class TiebaSpider(RedisCrawlSpider):
    name = 'tieba_crawl'
    # Do not use dynamic domains here: if the domain cannot be resolved, every request gets
    # dropped with "Filtered offsite request to 'tieba.baidu.com'"
    allowed_domains = ['tieba.baidu.com']
    # 3. Comment out start_urls
    # start_urls = ['http://tieba.baidu.com/f/index/forumpark?cn=%E5%86%85%E5%9C%B0%E6%98%8E%E6%98%9F&ci=0&pcn=%E5%A8%B1%E4%B9%90%E6%98%8E%E6%98%9F&pci=0&ct=1&st=new&pn=1']
    # 4. The key used to kick off every crawler node (spider name:start_urls)
    redis_key = 'tiebacrawl:start_urls'

    rules = (
        Rule(LinkExtractor(allow=(r'pn=\d+',)), callback='parse_item', follow=True),  # st=new&
    )

    def parse_item(self, response):
        tieba_list = response.xpath('//div[@class="ba_content"]')
        for tieba in tieba_list:
            name = tieba.xpath('./p[@class="ba_name"]/text()').extract_first()
            summary = tieba.xpath('./p[@class="ba_desc"]/text()').extract_first()
            person_sum = tieba.xpath('./p/span[@class="ba_m_num"]/text()').extract_first()
            text_sum = tieba.xpath('./p/span[@class="ba_p_num"]/text()').extract_first()

            item = TiebaItem()
            item['name'] = name
            item['summary'] = summary
            item['person_sum'] = person_sum
            item['text_sum'] = text_sum
            # Record where the data came from
            item['from_url'] = response.url
            item['from_name'] = self.name
            yield item

Run the spider (as above)

scrapy runspider tieba_crawl.py

Send the start command (as above)

lpush tiebacrawl:start_urls http://tieba.baidu.com/f/index/forumpark?cn=%E5%86%85%E5%9C%B0%E6%98%8E%E6%98%9F&ci=0&pcn=%E5%A8%B1%E4%B9%90%E6%98%8E%E6%98%9F&pci=0&ct=1&st=new&pn=1

Saving the Redis data to a local database

MySQL

Create the database and table:

Create the table: create table tieba (name varchar(1000), summary varchar(1000), person_sum varchar(1000), text_sum varchar(1000), from_url varchar(1000), from_name varchar(1000), time varchar(1000));

Insert a row (example): insert into tieba(name, summary, person_sum, text_sum, from_url, from_name, time) values("some name", "some summary", "111", "222", "http://tieba.baidu.com", "tieba", "xxx");

Python implementation

#!/usr/bin/env python
# coding=utf-8

# pip install redis
import redis
# pip install pymysql
import pymysql
import json
import sys

def process_item():
    # Open the database connections
    redisc = redis.Redis(host="192.168.1.104", port=6379, db=0, encoding='utf-8')
    mysql = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='root', db='test', charset='utf8mb4')
    cursor = None
    while True:
        try:
            # Pop one item from Redis (blocks until one is available)
            source, data = redisc.blpop("tieba:items")
            item = json.loads(data)
            # Save it to MySQL
            cursor = mysql.cursor()
            sql = 'insert into tieba(name, summary, person_sum, text_sum, from_url, from_name, time) values(%s, %s, %s, %s, %s, %s, %s)'
            cursor.execute(sql, [item["name"], item["summary"], item["person_sum"], item["text_sum"], item["from_url"], item["from_name"], item["time"]])
            mysql.commit()
            print(end=">")
        except:
            types, value, back = sys.exc_info()  # capture the exception
            sys.excepthook(types, value, back)   # print it
        finally:
            if cursor is not None:
                cursor.close()

if __name__ == "__main__":
    process_item()
