项目名/items.py(定义Item):
# -*- coding: utf-8 -*-
import scrapy
# Define the Item by subclassing scrapy.Item. A scrapy.Item behaves like a
# dict: values are read and written with [], but fields that were not declared
# on the class cannot be added dynamically.
class TencentItem(scrapy.Item):
    """Scraped record exposing three declared fields.

    Only the fields declared below may be assigned on an instance,
    e.g. ``item['title'] = ...``.
    """

    title = scrapy.Field()
    position = scrapy.Field()
    publish_date = scrapy.Field()
项目名/spiders/爬虫名.py(爬虫,提取数据并封装到 Item 中):
# -*- coding: utf-8 -*-
import scrapy
from tencent.items import TencentItem # 导入
class HrSpider(scrapy.Spider):
name = 'hr' # 爬虫名
allowed_domains = ['tencent.com']
start_urls = ['http://hr.tencent.com/position.php']
def parse(self, response):
tr_list = response.xpath("//table[@class='tablelist']/tr")[1:-1]
for tr in tr_list:
# 实例化Item对象