I. Method One
- Suited to scraping the same set of fields from multiple websites and storing them the same way
1. Project structure
The project contains three spiders in total.
The commands folder holds the custom command, which works together with run.py to start all three spiders at once (see the layout sketch below):
- The custom command must be registered in settings.py
- run.py must sit at the same level as scrapy.cfg, otherwise it will not start
- Note that the commands folder sits at the same level as the spiders folder
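For reference, a directory layout consistent with the COMMANDS_MODULE setting below; the bbb/ccc spider file names are illustrative placeholders:

scrapy_source/
├── scrapy.cfg
├── run.py
└── scrapy_source/
    ├── __init__.py
    ├── settings.py
    ├── pipelines.py
    ├── commands/
    │   ├── __init__.py      # required so the module can be imported
    │   └── crawlall.py
    └── spiders/
        ├── __init__.py
        ├── aaa_spider.py
        ├── bbb_spider.py    # placeholder
        └── ccc_spider.py    # placeholder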
2. Custom command code (start all spiders with one command)
settings.py
Register the custom command module:
COMMANDS_MODULE = 'scrapy_source.commands'  # bind the custom command module
LOG_LEVEL = 'WARNING'  # log level
LOG_FILE = './log.log'  # log file location
crawlall.py
import os

from scrapy.commands import ScrapyCommand
from scrapy.utils.conf import arglist_to_dict
from scrapy.utils.python import without_none_values
from scrapy.exceptions import UsageError


class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return "[options] <spider>"

    def short_desc(self):
        return "Run all spiders"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
                          help="set spider argument (may be repeated)")
        parser.add_option("-o", "--output", metavar="FILE",
                          help="dump scraped items into FILE (use - for stdout)")
        parser.add_option("-t", "--output-format", metavar="FORMAT",
                          help="format to use for dumping items with -o")

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)
        try:
            opts.spargs = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
        if opts.output:
            if opts.output == '-':
                self.settings.set('FEED_URI', 'stdout:', priority='cmdline')
            else:
                self.settings.set('FEED_URI', opts.output, priority='cmdline')
            feed_exporters = without_none_values(
                self.settings.getwithbase('FEED_EXPORTERS'))
            valid_output_formats = feed_exporters.keys()
            if not opts.output_format:
                opts.output_format = os.path.splitext(opts.output)[1].replace(".", "")
            if opts.output_format not in valid_output_formats:
                raise UsageError("Unrecognized output format '%s', set one"
                                 " using the '-t' switch or as a file extension"
                                 " from the supported list %s" % (opts.output_format,
                                                                  tuple(valid_output_formats)))
            self.settings.set('FEED_FORMAT', opts.output_format, priority='cmdline')

    def run(self, args, opts):
        # Collect the names of all spiders in the project
        spd_loader_list = self.crawler_process.spider_loader.list()
        print(spd_loader_list)
        # Queue every spider (falling back to names passed on the command line)
        for spname in spd_loader_list or args:
            self.crawler_process.crawl(spname, **opts.spargs)
            print('Starting spider: ' + spname)
        self.crawler_process.start()
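With the command registered via COMMANDS_MODULE, scrapy crawlall works like any built-in command, and -a NAME=VALUE arguments are passed to every queued spider. One caveat: this code targets the optparse-based command API that was current when this was written; newer Scrapy releases (2.6+) moved commands to argparse (parser.add_argument) and replaced FEED_URI/FEED_FORMAT with the FEEDS setting, so it would need adapting there.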
run.py
# -*- coding: utf-8 -*-
# @Time : 2021-04-02 15:33
# @Author : XuGuangJun
# @FileName: run.py
# @Software: PyCharm
from scrapy import cmdline
cmdline.execute('scrapy crawlall'.split())
3. pipelines.py
- Database naming: take the prefix of the spider file, e.g. aaa from aaa_spider.py, and name the database positions_aaa (see the naming sketch below)
- Table naming: tb_job_ plus the current year and month
- Each spider is automatically stored into its matching database and table
- Sends a summary email when the crawl finishes
- Writes run statistics to the log
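A quick standalone illustration of the naming rule, using the sample spider aaa_spider from section 4 (the month value is whatever the current date gives):

import time

spider_name = 'aaa_spider'                              # sample spider from section 4
db_name = 'positions_' + spider_name.rsplit('_', 1)[0]  # -> 'positions_aaa'
table_name = 'tb_job_' + time.strftime('%Y%m')          # e.g. 'tb_job_202104'
print(db_name, table_name)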
import logging
import smtplib  # connect to the QQ SMTP server
import time
from email.mime.text import MIMEText  # email body

import pymysql


class ScrapySourcePipeline:
    start_time = time.time()  # program start time
    item_flag = 0             # total number of items processed
    success_item = 0          # number of items stored successfully
    false_item = 0            # number of items that failed to store
    t_c = time.strftime('%Y%m', time.localtime(time.time()))  # table-name suffix, e.g. tb_job_202012
    HOST = ''  # database IP
    # HOST = ''  # external server IP

    def open_spider(self, spider):
        self.conn = pymysql.connect(
            host=self.HOST,
            user='',
            password='',
            port=3306,
            autocommit=True)
        self.cursor = self.conn.cursor()
        # Derive the database name from the incoming spider's name, e.g. lg_spider -> positions_lg
        db_name = 'positions_' + spider.name.rsplit('_', 1)[0]
        # create database
        self.cursor.execute(
            f"CREATE DATABASE IF NOT EXISTS {db_name} DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci")
        self.cursor.execute(f'USE {db_name}')
        # create table
        sql_create_tb = f"""CREATE TABLE IF NOT EXISTS `tb_job_{self.t_c}` (
            `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'ID',
            `code` varchar(100) DEFAULT NULL COMMENT 'job code',
            `name` varchar(50) DEFAULT NULL COMMENT 'job title',
            `address` varchar(20) DEFAULT NULL COMMENT 'work location',
            `income` varchar(20) DEFAULT NULL COMMENT 'salary',
            `time` varchar(20) DEFAULT NULL COMMENT 'posting date',
            `experience` varchar(20) DEFAULT NULL COMMENT 'experience required',
            `education` varchar(10) DEFAULT NULL COMMENT 'education required',
            `count` varchar(10) DEFAULT NULL COMMENT 'number of openings',
            `type` varchar(50) DEFAULT NULL COMMENT 'job category',
            `company` varchar(50) DEFAULT NULL COMMENT 'company name',
            `nature` varchar(20) DEFAULT NULL COMMENT 'company type',
            `scale` varchar(20) DEFAULT NULL COMMENT 'company size',
            `business` varchar(50) DEFAULT NULL COMMENT 'industry',
            `keyword` varchar(100) DEFAULT NULL COMMENT 'keywords',
            `responsibility` varchar(2048) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT 'job responsibilities',
            `requirement` varchar(2048) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT 'job requirements',
            PRIMARY KEY (`id`),
            UNIQUE KEY `index1` (`code`,`name`,`address`,`income`,`experience`,`education`,`count`,`type`,`company`,`nature`,`scale`,`business`) USING BTREE
            ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
            """
        self.cursor.execute(sql_create_tb)
    def process_item(self, item, spider):
        db_name = 'positions_' + spider.name.rsplit('_', 1)[0]
        self.cursor.execute(f'USE {db_name}')  # reselect the right database for this spider
        self.item_flag += 1
        sql = "insert into tb_job_{}(code, name, address, income, " \
              "time, experience, education, count, type, " \
              "company, nature, scale, business, keyword, responsibility, requirement) " \
              "values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)".format(self.t_c)
        try:
            # ping the connection and reconnect automatically if it has dropped
            self.conn.ping(reconnect=True)
            self.cursor.execute(f'USE {db_name}')  # reselect the database after a possible reconnect
            self.cursor.execute(sql, (item["code"], item["name"], item["address"], item["income"], item["time"],
                                      item["experience"], item["education"], item["count"], item["type"],
                                      item["company"], item["nature"], item["scale"], item["business"],
                                      item["keyword"],
                                      item["responsibility"], item["requirement"]))
            self.success_item += 1
            print('total items:', self.item_flag, '√√√√ stored:', self.success_item)
        except Exception as e:
            self.false_item += 1
            print('total items:', self.item_flag, 'xxxx failed:', self.false_item)
            # logging.warning(e)
            print(e)
        return item  # return the item so any later pipelines still receive it

    def close_spider(self, spider):
        pro_name = spider.name.rsplit('_', 1)[0]
        self.cursor.close()
        self.conn.close()
        total_time = time.time() - self.start_time  # total run time
        logging.warning('-------------- pipelines ----------------')
        logging.warning("Finished. Total time: {} s".format(total_time))
        logging.warning("Items scraped: {}".format(self.item_flag))
        logging.warning("Throughput: {} item/s".format(self.item_flag / total_time))
        content = f"""
        Finished. Total time: {total_time} s \n\r
        Items scraped: {self.item_flag} \n\r
        Stored successfully: {self.success_item} \n\r
        Failed to store: {self.false_item} \n\r
        Throughput: {self.item_flag / total_time} item/s
        """
        subject = pro_name + '_scrapy project'
        send_email(content, subject)
def send_email(content, subject=None,
               receive='1095581956@qq.com',   # recipients; separate several with ';'
               msg_from='1095581956@qq.com',  # sender
               password=''):                  # SMTP authorization code
    """
    Send an email.
    :param content: email body
    :param subject: email subject
    :param receive: recipients, formatted as 'addr1;addr2;...' for more than one
    :param msg_from: sender
    :param password: authorization code
    """
    msg = MIMEText(content, 'plain', 'utf-8')
    msg['From'] = msg_from
    msg['To'] = receive
    msg['Subject'] = subject
    try:
        # create the SMTP client
        smtpobj = smtplib.SMTP()
        # connect to QQ Mail's SMTP server
        smtpobj.connect('smtp.qq.com')
        # log in with the mailbox address and its authorization code
        smtpobj.login(msg_from, password)
        # sendmail(sender, recipient list, message serialized as a string)
        smtpobj.sendmail(msg_from, msg['To'].split(';'), msg.as_string())
        print('Email sent successfully!')
    except Exception as e:
        print('Failed to send email!', e)
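Two practical notes: the password used for login is the SMTP authorization code generated under QQ Mail's account settings, not the mailbox login password. And if the plain SMTP connection is refused, QQ Mail also accepts SSL connections, e.g. smtplib.SMTP_SSL('smtp.qq.com', 465).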
4. Spider test code
- items.py is not used here; the spider yields plain dicts directly for testing
import scrapy


class AaaSpiderSpider(scrapy.Spider):
    name = 'aaa_spider'
    # allowed_domains = ['www']
    # start_urls = ['http://www/']

    def start_requests(self):
        yield scrapy.Request(url='https://www.baidu.com/', callback=self.parse)

    def parse(self, response):
        item = {}
        item["code"] = 'a'
        item["name"] = ''
        item["address"] = ''
        item["income"] = ''
        item["time"] = '2021-03-04'
        item["experience"] = ''
        item["education"] = ''
        item["count"] = ''
        item["type"] = ''
        item["company"] = ''
        item["nature"] = ''
        item["scale"] = ''
        item["business"] = ''
        item['keyword'] = ''
        item["responsibility"] = ''
        item["requirement"] = ''
        yield item
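With the three spiders in place, running python run.py from the project root (or scrapy crawlall directly) starts them all in one process; the test spider above should leave its rows in the positions_aaa database, in the tb_job_ table for the current month.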
II. Method Two
- Suited when the spiders' pipelines store different kinds of data, e.g. one stores structured fields while another stores files
- If every spider stores the same fields (say, the same 16 fields as above), Method One is more convenient
Map each spider to its own pipeline for storage:
pipelines.py
- Simply create two pipeline classes (a minimal skeleton follows)
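A minimal sketch of what pipelines.py could look like, assuming the project is named ycwl as in the settings below; the storage bodies are placeholders:

class JzycPipeline:
    """Pipeline for the Jianzhu Yingcai spider (field-type storage)."""

    def process_item(self, item, spider):
        # field-oriented storage goes here, e.g. the MySQL logic from Method One
        return item


class HgycPipeline:
    """Pipeline for the Huagong Yingcai spider (file-type storage)."""

    def process_item(self, item, spider):
        # file-oriented storage goes here (downloads, attachments, ...)
        return item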
settings.py
- Register both classes in the project settings:
ITEM_PIPELINES = {
    'ycwl.pipelines.JzycPipeline': 300,  # Jianzhu Yingcai (construction jobs)
    'ycwl.pipelines.HgycPipeline': 301,  # Huagong Yingcai (chemical jobs)
}
hgyc_spider.py
- Use custom_settings to select the pipeline for this spider, as sketched below
jzyc_spider.py works the same way, pointing at JzycPipeline instead.
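A minimal sketch of hgyc_spider.py under this scheme; the class name, start URL, and parse body are assumptions:

import scrapy


class HgycSpider(scrapy.Spider):
    name = 'hgyc_spider'
    # Per-spider settings override the project-wide ITEM_PIPELINES,
    # so only HgycPipeline handles this spider's items.
    custom_settings = {
        'ITEM_PIPELINES': {'ycwl.pipelines.HgycPipeline': 301},
    }

    def start_requests(self):
        yield scrapy.Request(url='https://example.com/', callback=self.parse)  # placeholder URL

    def parse(self, response):
        yield {}  # placeholder item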