创建项目:
scrapy startproject ScrapyDemo
cd ScrapyDemo
scrapy genspider bigqcwy msearch.51job.com
items.py文件添加爬取信息:
class ScrapydemoItem(scrapy.Item):
    """Container for one scraped 51job posting (fields filled in by the spider)."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Job title
    name = scrapy.Field()
    # Salary level
    salary = scrapy.Field()
    # Hiring company
    company = scrapy.Field()
    # Work location
    jobPlace = scrapy.Field()
    # Work-experience requirement
    jobExperience = scrapy.Field()
    # Education requirement
    education = scrapy.Field()
    # Job content (job responsibilities)
    # jobContent = scrapy.Field()
    # Job requirements (skill requirements)
    jobRequirement = scrapy.Field()
编辑spider文件bigqcwy.py:
对薪资简单做了清洗
# -*- coding: utf-8 -*-
import scrapy
import time
from ScrapyDemo.items import ScrapydemoItem
import re
class BigqcwySpider(scrapy.Spider):
name = 'bigqcwy'
allowed_domains = ['msearch.51job.com']
custom_settings = {
"DEFAULT_REQUEST_HEADERS": {
'Cookie':'设置你的cookie',
},
"AUTOTHROTTLE_ENABLED": True,
# "DOWNLOAD_DELAY": 1,
# "ScrapyDemo.pipelines.ScrapydemoPipeline": 300,
}
start_urls = ['https://msearch.51job.com/']
def start_requests(self):
# 搜索关键词列表
list = ['0100%2C7700%2C7200%2C7300%2C7800', '7400%2C2700%2C7900%2C7500%2C6600', '8000%2C6100%2C2600%2C2800%2C3300']
for i in list:
# 每个关键词有2000页
for j in range(1, 2001):