准备要抓取的数据和得到链接的xpath
创建项目
scrapy startproject pythonjob
创建爬虫
scrapy genspider job 51job.com
job.py中代码:
# -*- coding: utf-8 -*-
import scrapy
from scrapy_redis.spiders import RedisSpider
from pythonjob.items import PythonjobItem
class JobSpider(RedisSpider):
name = 'job'
# 存于redis
redis_key = 'jobspider:start_urls'
allowed_domains = ['51job.com']
page = 1
url = "http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=010000%2C00&keyword=python&curr_page="
# start_urls = [url + str(page)]
def parse(self, response):
href_list = response.xpath('//div[@class="el"]/p/span/a/@href').extract()
for href in href_list:
yield scrapy.Request(href, callback=self.detal_page_parse)
def detal_page_parse(self, response):
item = PythonjobItem()
# 职位链接
url = response.url
# 招聘标题
title = response.xpath('//div[@class="cn"]/h1/text()').extract()[0]
# 工资情况
salary = response.xpath('//div[@class="cn"]/strong/text()').extract()
salary = ''.join(salary)
# 招聘内容
content = response.xpath('//div[@class="bmsg job_msg inbox"]/p/text()|//div[@class="bmsg job_msg inbox"]//li/text()|//div[@class="bmsg job_msg inbox"]//text()').extract()
content = ''.join(content).replace('\t','').replace('\r\n','').replace(' ','').replace('\xa0','')
# 招聘职位
pos = response.xpath('//span[@class="lname"]/text()').extract()[0]
# 招聘人数
count = response.xpath('//div[@class="t1"]/span[3]/text()').extract()[0]
print('count=====',count)
if len(content) < 1:
content = title
item['url'] = url
item['title'] = title
item['salary'] = salary
item['content'] = content
item['pos'] = pos
return item
添加运行分布式 test_redis.py 文件:
from scrapy import cmdline
cmdline.execute('scrapy runspider job.py'.split())
pipelines.py 文件(管道文件,用于对数据的处理以及存储):
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from datetime import datetime
#新添加的
import pymongo
import pymysql
from pythonjob.settings import MONGO_HOST, MONGO_PORT, MONG0_DBNAME, SHEET_NAME
class ExamplePipeline(object):
# 存于redis中增加两个字段
def process_item(self, item, spider):
item["crawled"] = datetime.utcnow()
item["spider"] = 'PC'
return item
class PythonjobPipeline(object):
# 用于存储到本地
def __init__(self):
self.file = open('job.josn','w',encoding='utf-8')
def process_item(self, item, spider):
# josn_str = json.dumps(dict(item),ensure_ascii=False)+'\n'
# self.file.write(josn_str)
return item
def close_file(self, spider):
self.file.close()
class job_mongo(object):
# 用于存储到mongodb数据库
def __init__(self):
mongo_host = MONGO_HOST
mongo_port = MONGO_PORT
sheet_name = SHEET_NAME
db_name = MONG0_DBNAME
print("打印mongo的信息===", mongo_host, mongo_port, db_name, sheet_name)
client = pymongo.MongoClient(host=mongo_host,port=mongo_port)
db_name = client[db_name]
self.sheet_name = db_name[sheet_name]
def process_item(self, item, spider):
python_dict = dict(item)
self.sheet_name.insert(python_dict)
return item
class job_mysql(object):
# 用于存储到MySQL数据库
def __init__(self):
mysql_port = 3306
mysql_host = '127.0.0.1'
dbname = 'python_jobdb'
user = 'afu'
password = '123456'
sheetname = 'job_items'
print("打印mysql的信息===", mysql_host, mysql_port, dbname, sheetname)
self.con = pymysql.connect(host=mysql_host, user=user, password=password,
database=dbname, port=mysql_port, charset='utf8')
# 创建表
self.cursor = self.con.cursor()
def process_item(self, item, spider):
args = [item["url"], item["title"], item["salary"], item["content"], item["pos"], item["crawled"], item["spider"]]
sql = "INSERT INTO job_items(url,title,salary,content,pos,crawled,spider) VALUES (%s,%s,%s,%s,%s,%s,%s)"
self.cursor.execute(sql, args)
# 事务提交
self.con.commit()
return item
def close_spider(self, spider):
self.cursor.close()
self.con.close()
items中:
import scrapy
class PythonjobItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
# 职位链接
url = scrapy.Field()
# 招聘标题
title = scrapy.Field()
# 工资情况
salary = scrapy.Field()
# 招聘内容
content = scrapy.Field()
# 招聘职位
pos = scrapy.Field()
# 增加到redis两个字段(记录时间和分机姓名)
crawled = scrapy.Field()
spider = scrapy.Field()
settings.py配置中:
BOT_NAME = 'pythonjob'
SPIDER_MODULES = ['pythonjob.spiders']
NEWSPIDER_MODULE = 'pythonjob.spiders'
# scrapy_redis相关配置
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
#使用scrapy-redis自己的组件去重,不使用scrapy默认的去重
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
#使用scrapy-redis自己调度器,不使用scrapy默认的调度器
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
#调度状态持久化,不清理redis缓存,允许暂停/启动爬虫
SCHEDULER_PERSIST = True
#按照sorted 排序顺序出队列,建议使用某一个,这样才能在redis数据库中看到,其实可以不写不影响结果
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
COOKIES_ENABLED = False
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# 记录日志
LOG_LEVEL = 'DEBUG'
# 增加爬出延迟时间,减轻服务器压力
DOWNLOAD_DELAY = 1
ITEM_PIPELINES = {
'pythonjob.pipelines.ExamplePipeline': 290,
'pythonjob.pipelines.PythonjobPipeline': 291,
# 数据存到MySQL,mongodb,redis数据库没有先后顺序,互不干扰
'pythonjob.pipelines.job_mysql': 292,
'pythonjob.pipelines.job_mongo': 293,
# 下面这个管道是必须要启用的--支持数据存储到redis数据库里
'scrapy_redis.pipelines.RedisPipeline': 400,
}
# 变量名可以随意设置
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
MONGO_HOST = '127.0.0.1'
MONGO_PORT = 27017
MONG0_DBNAME = 'python_job'
SHEET_NAME = 'job'
redis存储数据情况:
mongdb存储数据情况:
存储MySQL情况: