1.安装 pip install scrapy
报错解决:离线安装twisted pip install xxx-twisted.whl(百度搜索twisted,点击twisted pypi进去下载相应的whl文件)
pip install pywin32(https://www.lfd.uci.edu/~gohlke/pythonlibs/,下载对应的whl文件)
2 切换到自己pycharm文件的位置
。创建项目:scrapy startproject 项目名字
。创建爬虫:scrapy genspider 爬虫名字 域名
。编写爬虫
。运行爬虫 scrapy crawl 爬虫名字
。保存为csv文件 scrapy crawl job -o job.csv
#核心代码job.py
# -*- coding: utf-8 -*-
import scrapy
class JobSpider(scrapy.Spider):
    """Crawl Python job listings from 51job.com search results.

    Requests the search-result page in ``start_urls``, follows every
    job-posting link found on it, and yields one dict per posting with
    salary, job title and company name (keys are the CSV column headers).
    """

    name = 'job'
    allowed_domains = ['51job.com']
    # FIX: the original URL contained "°reefrom=99" — the "&deg" prefix of
    # "&degreefrom" had been rendered as the HTML entity "°". Restored to
    # the literal query parameter so the request hits the intended search.
    start_urls = ['https://search.51job.com/list/030000%252C200000%252C130000%252C070000%252C140000,000000,0000,00,9,99,python,2,2.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=']

    def parse(self, response):
        """Parse the search-result page (the response for ``start_urls``).

        :param response: page returned by the site
        :return: yields a ``scrapy.Request`` for each job's detail page
        """
        # Each result row on the listing page is a div.el element.
        rows = response.xpath('//div[@class="el"]')
        for row in rows:
            # Detail-page URL; header rows have no link, so guard on None.
            detail_url = row.xpath('./p/span/a/@href').get()
            if detail_url:
                # yield makes parse() a generator: schedule a follow-up
                # request whose response is handled by parseDetail.
                yield scrapy.Request(detail_url, callback=self.parseDetail)

    def parseDetail(self, response):
        """Extract one job posting from its detail page.

        :param response: the detail page
        :return: yields a dict (Scrapy item) with salary, title and company
        """
        # div.cn/strong holds the salary, div.cn/h1 the job title, and
        # p.cname/a@title the company name; default='' keeps the CSV
        # well-formed when a field is missing.
        salary = response.xpath('//div[@class="cn"]/strong/text()').get(default='')
        title = response.xpath('//div[@class="cn"]/h1/text()').get(default='')
        company = response.xpath('//p[@class="cname"]/a/@title').get(default='')
        # Keys are kept exactly as before: they become the CSV column names.
        yield {
            '薪资:': salary,
            '职位:': title,
            '公司:': company,
        }
setting的代码:
将 ROBOTSTXT_OBEY = True 改为 ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS 改为:
# Default headers attached to every request so the crawl presents itself as
# an ordinary Firefox 71 browser session (51job blocks Scrapy's default UA).
# FIX: the 'Accept' and 'User-Agent' values were truncated with "…" by a
# copy/paste; restored to the full standard Accept header and the complete
# Firefox 71 user-agent string (the same UA noted in the spider's comments).
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'search.51job.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
}
效果图
保存至当前目录的csv文件