分析
1.分析网页,确定数据爬取规则
2.创建项目
3.创建数据模型Item
4.创建爬虫Spider,进行数据爬取
5.创建Item Pipeline,进行数据处理
6.按需求设置配置文件
源码
items.py
class TencentPositionItem(scrapy.Item):
    """Item modeling a single job posting scraped from Tencent's recruiting site."""
    title = scrapy.Field()     # job title
    type = scrapy.Field()      # job category
    count = scrapy.Field()     # number of openings
    location = scrapy.Field()  # work location
    pubtime = scrapy.Field()   # publication date
spiders/tencent_position_spider.py
# !/usr/bin/env python
# -*- coding:utf-8 -*-
import scrapy
from myscrapy.items import TencentPositionItem
class PositionSpider(scrapy.Spider):
    """Spider that walks every page of Tencent's job board.

    Exercises:
      1. The two outcomes of ``parse()``: yield an item to the Item
         Pipeline, or yield a new Request to follow another page.
      2. Basic Item Pipeline usage.
    """
    name = 'tencent_position'
    allowed_domains = ['hr.tencent.com']
    offset = 0
    base_url = 'http://hr.tencent.com/position.php?&start='
    start_urls = [base_url + str(offset)]

    def parse(self, response):
        # Every listing row on the current page (rows alternate two CSS classes).
        rows = response.xpath('//tr[@class="even"]|//tr[@class="odd"]')
        for row in rows:
            item = TencentPositionItem()
            item['title'] = row.xpath('./td[position()=1]/a/text()').extract()[0]
            # Page 6 has one listing with no category cell; fall back to a placeholder.
            category = row.xpath('./td[position()=2]/text()').extract()
            item['type'] = category[0] if category else '马化腾二大爷'
            item['count'] = row.xpath('./td[position()=3]/text()').extract()[0]
            item['location'] = row.xpath('./td[position()=4]/text()').extract()[0]
            item['pubtime'] = row.xpath('./td[position()=5]/text()').extract()[0]
            # Hand the populated item off to the Item Pipeline.
            yield item
        # Keep paging until past the last page.
        if self.offset <= 2680:
            self.offset += 10
            yield scrapy.Request(url=self.base_url + str(self.offset), callback=self.parse)
pipelines.py
class TencentPositionPipeline(object):
    """Item Pipeline that appends each Tencent job item to a JSON file.

    Each item is written as one pretty-printed JSON object followed by a
    newline, UTF-8 encoded, into ``data/tencentposition.json``.
    """

    def __init__(self):
        import os
        # Fix: the original open() raised IOError when the 'data' directory
        # did not exist; create it first. (os.path.isdir guard instead of
        # exist_ok to stay compatible with older Python.)
        if not os.path.isdir('data'):
            os.makedirs('data')
        self.f = open('data/tencentposition.json', mode='wb+')

    def process_item(self, item, spider):
        import json
        # ensure_ascii=False keeps the Chinese field values human-readable.
        data = json.dumps(dict(item), ensure_ascii=False, indent=4)
        self.f.write(data.encode('utf-8') + b'\n')
        # Return the item so any later pipelines can also process it.
        return item

    def close_spider(self, spider):
        # Release the output file when the spider finishes.
        self.f.close()
settings.py
# Enable the project's Item Pipeline. The value (0-1000) is the execution
# priority when several pipelines are registered — lower numbers run first.
ITEM_PIPELINES = {
'myscrapy.pipelines.TencentPositionPipeline': 300,
}